diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile index b99d5b451..a8db883a3 100644 --- a/EXTERNAL_HEADERS/Makefile +++ b/EXTERNAL_HEADERS/Makefile @@ -34,6 +34,9 @@ KERNEL_FILES = \ stddef.h \ stdint.h +KERNEL_FILES += \ + ptrauth.h + INSTALL_MI_LIST = INSTALL_MI_DIR = . diff --git a/EXTERNAL_HEADERS/corecrypto/cc.h b/EXTERNAL_HEADERS/corecrypto/cc.h index 7790a4faa..5493e41c9 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc.h +++ b/EXTERNAL_HEADERS/corecrypto/cc.h @@ -12,9 +12,14 @@ #define _CORECRYPTO_CC_H_ #include +#include #include #include +/* Provide a general purpose macro concat method. */ +#define cc_concat_(a, b) a##b +#define cc_concat(a, b) cc_concat_(a, b) + /* Manage asserts here because a few functions in header public files do use asserts */ #define cc_assert(x) assert(x) #if CC_KERNEL @@ -25,6 +30,10 @@ #include #endif +/* Provide a static assert that can be used to create compile-type failures. */ +#define cc_static_assert(e,m) \ + ;enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) } + /* Declare a struct element with a guarenteed alignment of _alignment_. The resulting struct can be used to create arrays that are aligned by a certain amount. */ @@ -61,12 +70,12 @@ uint8_t b[_alignment_]; \ @param len number of bytes to be cleared in dst @param dst input array */ -CC_NONNULL2 +CC_NONNULL((2)) void cc_clear(size_t len, void *dst); #define cc_copy(_size_, _dst_, _src_) memcpy(_dst_, _src_, _size_) -CC_INLINE CC_NONNULL2 CC_NONNULL3 CC_NONNULL4 +CC_INLINE CC_NONNULL((2, 3, 4)) void cc_xor(size_t size, void *r, const void *s, const void *t) { uint8_t *_r=(uint8_t *)r; const uint8_t *_s=(const uint8_t *)s; @@ -84,7 +93,7 @@ void cc_xor(size_t size, void *r, const void *s, const void *t) { @param ptr2 input array @return returns 0 if the num bytes starting at ptr1 are identical to the num bytes starting at ptr2 and 1 if they are different or if num is 0 (empty arrays). 
*/ -CC_NONNULL2 CC_NONNULL3 +CC_NONNULL((2, 3)) int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2); /* Exchange S and T of any type. NOTE: Both and S and T are evaluated diff --git a/EXTERNAL_HEADERS/corecrypto/cc_config.h b/EXTERNAL_HEADERS/corecrypto/cc_config.h index 044c8e168..fbdb2c61c 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_config.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_config.h @@ -12,9 +12,9 @@ #define _CORECRYPTO_CC_CONFIG_H_ /* A word about configuration macros: - + Conditional configuration macros specific to corecrypto should be named CORECRYPTO_xxx - or CCxx_yyy and be defined to be either 0 or 1 in this file. You can add an + or CCxx_yyy and be defined to be either 0 or 1 in this file. You can add an #ifndef #error construct at the end of this file to make sure it's always defined. They should always be tested using the #if directive, never the #ifdef directive. @@ -23,23 +23,23 @@ Configuration Macros that are defined outside of corecrypto (eg: KERNEL, DEBUG, ...) shall only be used in this file to define CCxxx macros. - + External macros should be assumed to be either undefined, defined with no value, or defined as true or false. We shall strive to build with -Wundef whenever possible, so the following construct should be used to test external macros in this file: - + #if defined(DEBUG) && (DEBUG) #define CORECRYPTO_DEBUG 1 #else #define CORECRYPTO_DEBUG 0 #endif - + It is acceptable to define a conditional CC_xxxx macro in an implementation file, to be used only in this file. - + The current code is not guaranteed to follow those rules, but should be fixed to. - + Corecrypto requires GNU and C99 compatibility. Typically enabled by passing --gnu --c99 to the compiler (eg. 
armcc) @@ -52,17 +52,6 @@ #define CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT 0 #define CORECRYPTO_HACK_FOR_WINDOWS_DEVELOPMENT 0 //to be removed after port corecrypto to Windows -//this macro is used to turn on/off usage of transparent union in corecrypto -//it should be commented out in corecrypto and be used only by the software that use corecrypto -//#define CORECRYPTO_DONOT_USE_TRANSPARENT_UNION -#if defined(__cplusplus) -#define CORECRYPTO_USE_TRANSPARENT_UNION 0 -#elif defined(CORECRYPTO_DONOT_USE_TRANSPARENT_UNION) - #define CORECRYPTO_USE_TRANSPARENT_UNION !CORECRYPTO_DONOT_USE_TRANSPARENT_UNION -#else - #define CORECRYPTO_USE_TRANSPARENT_UNION 1 -#endif - #if (defined(DEBUG) && (DEBUG)) || defined(_DEBUG) //MSVC defines _DEBUG /* CC_DEBUG is already used in CommonCrypto */ #define CORECRYPTO_DEBUG 1 @@ -99,6 +88,12 @@ #define CC_RTKIT 0 #endif +#if defined(RTKITROM) && (RTKITROM) +#define CC_RTKITROM 1 +#else +#define CC_RTKITROM 0 +#endif + #if defined(USE_SEPROM) && (USE_SEPROM) #define CC_USE_SEPROM 1 #else @@ -170,11 +165,22 @@ // warning: pointer of type 'void *' used in arithmetic #pragma GCC diagnostic ignored "-Wpointer-arith" #endif // __arm__ +#define CC_SMALL_CODE 1 + #endif // CC_BASEBAND +#if CC_RTKIT || CC_RTKITROM +#define CC_SMALL_CODE 1 +#endif + + +#ifndef CC_SMALL_CODE +#define CC_SMALL_CODE 0 +#endif + //CC_XNU_KERNEL_AVAILABLE indicates the availibity of XNU kernel functions, //like what we have on OSX, iOS, tvOS, Watch OS -#if defined(__APPLE__) && defined(__MACH__) +#if defined(__APPLE__) && defined(__MACH__) #define CC_XNU_KERNEL_AVAILABLE 1 #else #define CC_XNU_KERNEL_AVAILABLE 0 @@ -186,7 +192,7 @@ #endif #if !defined(CCN_UNIT_SIZE) - #if defined(__arm64__) || defined(__x86_64__) || defined(_WIN64) + #if defined(__arm64__) || defined(__x86_64__) || defined(_WIN64) #define CCN_UNIT_SIZE 8 #elif defined(__arm__) || defined(__i386__) || defined(_WIN32) #define CCN_UNIT_SIZE 4 @@ -221,7 +227,7 @@ #if defined(_MSC_VER) #if 
defined(__clang__) - #define CC_ALIGNED(x) __attribute__ ((aligned(x))) //clang compiler + #define CC_ALIGNED(x) __attribute__ ((aligned(x))) //clang compiler #else #define CC_ALIGNED(x) __declspec(align(x)) //MS complier #endif @@ -235,7 +241,7 @@ #if defined(__arm__) //this is copied from , because is not available on SEPROM environment - #if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7S__) || defined (__ARM_ARCH_7F__) || defined (__ARM_ARCH_7K__) +#if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7S__) || defined (__ARM_ARCH_7F__) || defined (__ARM_ARCH_7K__) || defined(__ARM_ARCH_7EM__) #define _ARM_ARCH_7 #endif @@ -250,7 +256,7 @@ #elif defined(__x86_64__) || defined(__i386__) #define CCN_IOS 0 #define CCN_OSX 1 -#endif +#endif #if CC_USE_L4 || CC_USE_S3 /* No dynamic linking allowed in L4, e.g. avoid nonlazy symbols */ @@ -259,7 +265,7 @@ #endif #if !defined(CC_USE_HEAP_FOR_WORKSPACE) - #if CC_USE_S3 || CC_USE_SEPROM || CC_RTKIT + #if CC_USE_S3 || CC_USE_SEPROM || CC_RTKITROM #define CC_USE_HEAP_FOR_WORKSPACE 0 #else #define CC_USE_HEAP_FOR_WORKSPACE 1 @@ -288,16 +294,23 @@ #define CC_DISABLE_RSAKEYGEN 0 /* default */ #endif +// see rdar://problem/26636018 +#if (CCN_UNIT_SIZE == 8) && !( defined(_MSC_VER) && defined(__clang__)) +#define CCEC25519_CURVE25519DONNA_64BIT 1 +#else +#define CCEC25519_CURVE25519DONNA_64BIT 0 +#endif + //- functions implemented in assembly ------------------------------------------ //this the list of corecrypto clients that use assembly and the clang compiler -#if !(CC_XNU_KERNEL_AVAILABLE || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG +#if !(CC_XNU_KERNEL_AVAILABLE || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG #warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform" #endif // Use this macro to 
strictly disable assembly regardless of cpu/os/compiler/etc. // Our assembly code is not gcc compatible. Clang defines the __GNUC__ macro as well. #if !defined(CC_USE_ASM) - #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) + #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_RTKIT || CC_RTKITROM #define CC_USE_ASM 0 #else #define CC_USE_ASM 1 @@ -306,7 +319,7 @@ //-(1) ARM V7 #if defined(_ARM_ARCH_7) && __clang__ && CC_USE_ASM - #define CCN_DEDICATED_SQR 1 + #define CCN_DEDICATED_SQR CC_SMALL_CODE #define CCN_MUL_KARATSUBA 0 // no performance improvement #define CCN_ADD_ASM 1 #define CCN_SUB_ASM 1 @@ -321,7 +334,7 @@ #define CCN_SHIFT_RIGHT_ASM 1 #define CCAES_ARM_ASM 1 #define CCAES_INTEL_ASM 0 - #if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_USE_SEPROM || CC_USE_S3 + #if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3 #define CCAES_MUX 0 #else #define CCAES_MUX 1 @@ -341,7 +354,7 @@ //-(2) ARM 64 #elif defined(__arm64__) && __clang__ && CC_USE_ASM - #define CCN_DEDICATED_SQR 1 + #define CCN_DEDICATED_SQR CC_SMALL_CODE #define CCN_MUL_KARATSUBA 1 // 4*n CCN_UNIT extra memory required. #define CCN_ADD_ASM 1 #define CCN_SUB_ASM 1 @@ -404,7 +417,7 @@ #define CCSHA2_VNG_ARMV7NEON 0 #define CCSHA256_ARMV6M_ASM 0 -//-(4) disable assembly +//-(4) disable assembly #else #if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH #define CCN_DEDICATED_SQR 1 @@ -437,25 +450,11 @@ #define CC_INLINE static inline -#if CORECRYPTO_USE_TRANSPARENT_UNION -// Non null for transparent unions is ambiguous and cause problems -// for most tools (GCC and others: 23919290). 
- #define CC_NONNULL_TU(N) -#else - #define CC_NONNULL_TU(N) CC_NONNULL(N) -#endif - #ifdef __GNUC__ #define CC_NORETURN __attribute__((__noreturn__)) #define CC_NOTHROW __attribute__((__nothrow__)) #define CC_NONNULL(N) __attribute__((__nonnull__ N)) - #define CC_NONNULL1 __attribute__((__nonnull__(1))) - #define CC_NONNULL2 __attribute__((__nonnull__(2))) - #define CC_NONNULL3 __attribute__((__nonnull__(3))) - #define CC_NONNULL4 __attribute__((__nonnull__(4))) - #define CC_NONNULL5 __attribute__((__nonnull__(5))) - #define CC_NONNULL6 __attribute__((__nonnull__(6))) - #define CC_NONNULL7 __attribute__((__nonnull__(7))) + #define CC_NONNULL4 CC_NONNULL((4)) #define CC_NONNULL_ALL __attribute__((__nonnull__)) #define CC_SENTINEL __attribute__((__sentinel__)) #define CC_CONST __attribute__((__const__)) @@ -468,24 +467,12 @@ #define CC_UNUSED /*! @parseOnly */ #define CC_NONNULL(N) -/*! @parseOnly */ - #define CC_NORETURN -/*! @parseOnly */ - #define CC_NOTHROW -/*! @parseOnly */ - #define CC_NONNULL1 -/*! @parseOnly */ - #define CC_NONNULL2 -/*! @parseOnly */ - #define CC_NONNULL3 /*! @parseOnly */ #define CC_NONNULL4 /*! @parseOnly */ - #define CC_NONNULL5 -/*! @parseOnly */ - #define CC_NONNULL6 + #define CC_NORETURN /*! @parseOnly */ - #define CC_NONNULL7 + #define CC_NOTHROW /*! @parseOnly */ #define CC_NONNULL_ALL /*! @parseOnly */ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_debug.h b/EXTERNAL_HEADERS/corecrypto/cc_debug.h index 80e61a7b3..8cd85e279 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_debug.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_debug.h @@ -26,7 +26,7 @@ #if !CONFIG_EMBEDDED extern int printf(const char *format, ...) __printflike(1,2); #endif -#elif CC_USE_S3 || CC_IBOOT || CC_RTKIT +#elif CC_USE_S3 || CC_IBOOT || CC_RTKIT || CC_RTKITROM #include #define cc_printf(x...) 
printf(x) #elif defined(__ANDROID_API__) diff --git a/EXTERNAL_HEADERS/corecrypto/cc_error.h b/EXTERNAL_HEADERS/corecrypto/cc_error.h new file mode 100644 index 000000000..57b8ec70c --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/cc_error.h @@ -0,0 +1,124 @@ +/* + * cc_error.h + * corecrypto + * + * Created on 11/14/2017 + * + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CC_ERROR_H_ +#define _CORECRYPTO_CC_ERROR_H_ + +enum { + CCERR_OK = 0, + + /* the default error code */ + CCERR_INTERNAL = -1, + + CCERR_INTEGRITY = -2, + + CCERR_DEVICE = -3, + CCERR_INTERRUPTS = -4, + CCERR_CRYPTO_CONFIG = -5, + CCERR_PERMS = -6, + CCERR_PARAMETER = -7, + CCERR_MEMORY = -8, + CCERR_FILEDESC = -9, + CCERR_OUT_OF_ENTROPY = -10, + CCERR_ATFORK = -11, + CCERR_OVERFLOW = -12, + + CCERR_MEMORY_ALLOC_FAIL = -13, + + CCEC_GENERATE_KEY_DEFAULT_ERR = -14, + CCEC_GENERATE_KEY_TOO_MANY_TRIES = -15, + CCEC_GENERATE_KEY_MULT_FAIL = -16, + CCEC_GENERATE_KEY_AFF_FAIL = -17, + CCEC_GENERATE_KEY_CONSISTENCY = -18, + CCEC_GENERATE_NOT_ON_CURVE = -19, + CCEC_GENERATE_NOT_ENOUGH_ENTROPY = -20, + CCEC_GENERATE_NOT_SUPPORTED = -21, + CCEC_GENERATE_INVALID_INPUT = -22, + + // Program error: buffer too small or encrypted message is too small + CCRSA_INVALID_INPUT = -23, + // Invalid crypto configuration: Hash length versus RSA key size + CCRSA_INVALID_CONFIG = -24, + CCRSA_ENCODING_ERROR = -25, + CCRSA_DECODING_ERROR = -26, + + // The data is invalid (we won't say more for security) + CCRSA_PRIVATE_OP_ERROR = -27, + CCRSA_KEY_ERROR = -28, + + // Key generation specific + CCRSA_KEYGEN_PRIME_NOT_FOUND = -29, + CCRSA_KEYGEN_PRIME_NEED_NEW_SEED = -30, + CCRSA_KEYGEN_PRIME_TOO_MANY_ITERATIONS = -31, + CCRSA_KEYGEN_PRIME_SEED_GENERATION_ERROR = -32, + CCRSA_KEYGEN_MODULUS_CRT_INV_ERROR = -33, + CCRSA_KEYGEN_NEXT_PRIME_ERROR = -34, + CCRSA_KEYGEN_SEED_X_ERROR = -35, + CCRSA_KEYGEN_SEED_r_ERROR = -36, + CCRSA_KEYGEN_KEYGEN_CONSISTENCY_FAIL = -37, + 
CCRSA_KEYGEN_R1R2_SIZE_ERROR = -38, + CCRSA_KEYGEN_PQ_DELTA_ERROR = -39, + + CCRSA_FIPS_KEYGEN_DISABLED = -40, + + CCZP_INV_ERROR = -41, + CCZP_INV_NO_INVERSE = -42, + CCZP_INV_INVALID_INPUT = -43, + + CCZ_INVALID_INPUT_ERROR = -44, + CCZ_INVALID_RADIX_ERROR = -45, + + CCDH_ERROR_DEFAULT = -46, + CCDH_GENERATE_KEY_TOO_MANY_TRIES = -47, + CCDH_NOT_SUPPORTED_CONFIGURATION = -48, + CCDH_SAFETY_CHECK = -49, + CCDH_PUBLIC_KEY_MISSING = -50, + CCDH_INVALID_DOMAIN_PARAMETER = -51, + CCDH_INVALID_INPUT = -52, + CCDH_DOMAIN_PARAMETER_MISMATCH = -53, + CCDH_GENERATE_KEY_CONSISTENCY = -54, + + CCSRP_ERROR_DEFAULT = -55, + CCSRP_GENERATE_KEY_TOO_MANY_TRIES = -56, + CCSRP_NOT_SUPPORTED_CONFIGURATION = -57, + CCSRP_SAFETY_CHECK = -58, + CCSRP_PUBLIC_KEY_MISSING = -59, + CCSRP_INVALID_DOMAIN_PARAMETER = -60, + + CCDRBG_STATUS_ERROR = -61, + CCDRBG_STATUS_NEED_RESEED = -62, + CCDRBG_STATUS_PARAM_ERROR = -63, + // If this value is returned, the caller must abort or panic the process for + // security reasons. for example in the case of catastrophic error in + // http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf + // ccdrbg calls abort() or panic(), if they are available in the system. 
+ CCDRBG_STATUS_ABORT = -64, + + CCKPRNG_NEED_ENTROPY = -65, + CCKPRNG_ABORT = -66, + + CCMODE_INVALID_INPUT = -67, + CCMODE_INVALID_CALL_SEQUENCE = -68, + CCMODE_INTEGRITY_FAILURE = -69, + CCMODE_NOT_SUPPORTED = -70, + CCMODE_INTERNAL_ERROR = -71, + + // Configuration or unexpected issue + CCPOST_GENERIC_FAILURE = -72, + CCPOST_LIBRARY_ERROR = -73, + CCPOST_INTEGRITY_ERROR = -74, + // Output of the algo is not as expected + CCPOST_KAT_FAILURE = -75, +}; + +#define CCDRBG_STATUS_OK CCERR_OK +#define CCKPRNG_OK CCERR_OK + +#endif /* _CORECRYPTO_CC_ERROR_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_priv.h b/EXTERNAL_HEADERS/corecrypto/cc_priv.h index 55e0eb2b8..0a51e66ee 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_priv.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_priv.h @@ -471,6 +471,12 @@ void cc_mux2p(int s, void **r_true, void **r_false, const void *a, const void *b r = (~_cond&(a))|(_cond&(b)); \ } -int cc_is_compiled_with_tu(void); +/* + Unfortunately, since we export this symbol, this declaration needs + to be in a public header to satisfy TAPI. + + See fipspost_trace_priv.h for more details. 
+*/ +extern const void *fipspost_trace_vtable; #endif /* _CORECRYPTO_CC_PRIV_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h b/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h index 0064c6ca6..0d7ac5289 100644 --- a/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h +++ b/EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h @@ -23,6 +23,7 @@ #define CC_HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0) #define CC_HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0) #define CC_HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0) + #define CC_HAS_AVX512_AND_IN_KERNEL() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0) #elif CC_XNU_KERNEL_AVAILABLE # include @@ -36,11 +37,13 @@ #define CC_HAS_SupplementalSSE3() (_cpu_capabilities & kHasSupplementalSSE3) #define CC_HAS_AVX1() (_cpu_capabilities & kHasAVX1_0) #define CC_HAS_AVX2() (_cpu_capabilities & kHasAVX2_0) + #define CC_HAS_AVX512_AND_IN_KERNEL() 0 #else #define CC_HAS_AESNI() 0 #define CC_HAS_SupplementalSSE3() 0 #define CC_HAS_AVX1() 0 #define CC_HAS_AVX2() 0 + #define CC_HAS_AVX512_AND_IN_KERNEL() 0 #endif #endif /* !(defined(__x86_64__) || defined(__i386__)) */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccaes.h b/EXTERNAL_HEADERS/corecrypto/ccaes.h index ec119b9b6..281c99d22 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccaes.h +++ b/EXTERNAL_HEADERS/corecrypto/ccaes.h @@ -45,6 +45,9 @@ extern const struct ccmode_ofb ccaes_arm_ofb_crypt_mode; #endif #if CCAES_MUX +/* Runtime check to see if hardware should be used */ +int ccaes_ios_hardware_enabled(int operation); + extern const struct ccmode_cbc ccaes_ios_hardware_cbc_encrypt_mode; extern const struct ccmode_cbc ccaes_ios_hardware_cbc_decrypt_mode; @@ -86,6 +89,15 @@ extern const struct ccmode_xts ccaes_intel_xts_decrypt_opt_mode; extern const struct ccmode_xts ccaes_intel_xts_decrypt_aesni_mode; #endif +#if CC_USE_L4 +extern const struct ccmode_cbc ccaes_skg_cbc_encrypt_mode; 
+extern const struct ccmode_cbc ccaes_skg_cbc_decrypt_mode; + +extern const struct ccmode_ecb ccaes_skg_ecb_encrypt_mode; +extern const struct ccmode_ecb ccaes_skg_ecb_decrypt_mode; + +extern const struct ccmode_ecb ccaes_trng_ecb_encrypt_mode; +#endif /* Implementation Selectors: */ const struct ccmode_ecb *ccaes_ecb_encrypt_mode(void); diff --git a/EXTERNAL_HEADERS/corecrypto/ccasn1.h b/EXTERNAL_HEADERS/corecrypto/ccasn1.h index 28fba4eef..75aac6e68 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccasn1.h +++ b/EXTERNAL_HEADERS/corecrypto/ccasn1.h @@ -69,26 +69,16 @@ enum { CCASN1_CONSTRUCTED_SEQUENCE = CCASN1_SEQUENCE | CCASN1_CONSTRUCTED, }; -#if CORECRYPTO_USE_TRANSPARENT_UNION -typedef union { - const unsigned char * oid; -} __attribute__((transparent_union)) ccoid_t; -#define CCOID(x) ((x).oid) -#else - typedef const unsigned char * ccoid_t; +typedef const unsigned char * ccoid_t; #define CCOID(oid) (oid) -#endif - -/* Returns *der iff *der points to a DER encoded oid that fits within *der_len. */ -ccoid_t ccoid_for_der(size_t *der_len, const uint8_t **der); /* Returns the size of an oid including it's tag and length. */ -CC_INLINE CC_PURE CC_NONNULL_TU((1)) +CC_INLINE CC_PURE CC_NONNULL((1)) size_t ccoid_size(ccoid_t oid) { return 2 + CCOID(oid)[1]; } -CC_INLINE CC_PURE CC_NONNULL_TU((1)) CC_NONNULL_TU((2)) +CC_INLINE CC_PURE CC_NONNULL((1, 2)) bool ccoid_equal(ccoid_t oid1, ccoid_t oid2) { return (ccoid_size(oid1) == ccoid_size(oid2) && memcmp(CCOID(oid1), CCOID(oid2), ccoid_size(oid1))== 0); diff --git a/EXTERNAL_HEADERS/corecrypto/ccchacha20poly1305.h b/EXTERNAL_HEADERS/corecrypto/ccchacha20poly1305.h index 3e76b81b4..4ca59e63b 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccchacha20poly1305.h +++ b/EXTERNAL_HEADERS/corecrypto/ccchacha20poly1305.h @@ -107,8 +107,6 @@ struct ccchacha20poly1305_info { }; -extern const struct ccchacha20poly1305_info ccchacha20poly1305_info_default; - const struct ccchacha20poly1305_info *ccchacha20poly1305_info(void); /*! 
diff --git a/EXTERNAL_HEADERS/corecrypto/cccmac.h b/EXTERNAL_HEADERS/corecrypto/cccmac.h index 63a892fd6..d2e018143 100644 --- a/EXTERNAL_HEADERS/corecrypto/cccmac.h +++ b/EXTERNAL_HEADERS/corecrypto/cccmac.h @@ -17,30 +17,6 @@ #define CMAC_BLOCKSIZE 16 -#if CORECRYPTO_USE_TRANSPARENT_UNION -struct cccmac_ctx { - uint8_t b[8]; -} CC_ALIGNED(8); - -typedef struct cccmac_ctx_hdr { - uint8_t k1[CMAC_BLOCKSIZE]; - uint8_t k2[CMAC_BLOCKSIZE]; - uint8_t block[CMAC_BLOCKSIZE]; - size_t block_nbytes; // Number of byte occupied in block buf - size_t cumulated_nbytes; // Total size processed - const struct ccmode_cbc *cbc; - uint8_t ctx[8]; -} CC_ALIGNED(8) cccmac_ctx_hdr; - - -typedef union { - struct cccmac_ctx *b; - cccmac_ctx_hdr *hdr; -} cccmac_ctx_t __attribute__((transparent_union)); -#define cccmac_hdr_size sizeof(struct cccmac_ctx_hdr) - -#else - struct cccmac_ctx { uint8_t k1[CMAC_BLOCKSIZE]; uint8_t k2[CMAC_BLOCKSIZE]; @@ -55,8 +31,6 @@ typedef struct cccmac_ctx* cccmac_ctx_t; #define cccmac_hdr_size sizeof(struct cccmac_ctx) -#endif - #define cccmac_iv_size(_mode_) ((_mode_)->block_size) #define cccmac_cbc_size(_mode_) ((_mode_)->size) @@ -67,15 +41,9 @@ typedef struct cccmac_ctx* cccmac_ctx_t; #define cccmac_mode_decl(_mode_, _name_) cc_ctx_decl(struct cccmac_ctx, cccmac_ctx_size(_mode_), _name_) #define cccmac_mode_clear(_mode_, _name_) cc_clear(cccmac_ctx_size(_mode_), _name_) -#if CORECRYPTO_USE_TRANSPARENT_UNION -/* Return a cccbc_ctx * which can be accesed with the macros in ccmode.h */ -#define cccmac_mode_ctx_start(_mode_, HC) (((HC).hdr)->ctx) -#define CCCMAC_HDR(HC) (((cccmac_ctx_t)(HC)).hdr) -#else /* Return a cccbc_ctx * which can be accesed with the macros in ccmode.h */ #define cccmac_mode_ctx_start(_mode_, HC) (HC->ctx) #define CCCMAC_HDR(HC) (HC) -#endif #define cccmac_mode_sym_ctx(_mode_, HC) (cccbc_ctx *)(cccmac_mode_ctx_start(_mode_, HC)) #define cccmac_mode_iv(_mode_, HC) (cccbc_iv *)(cccmac_mode_ctx_start(_mode_, HC)+cccmac_cbc_size(_mode_)) 
diff --git a/EXTERNAL_HEADERS/corecrypto/ccder.h b/EXTERNAL_HEADERS/corecrypto/ccder.h index 6e2c504be..5bd102962 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccder.h +++ b/EXTERNAL_HEADERS/corecrypto/ccder.h @@ -134,21 +134,21 @@ size_t ccder_sizeof_uint64(uint64_t value); /* Encode a tag backwards, der_end should point to one byte past the end of destination for the tag, returns a pointer to the first byte of the tag. Returns NULL if there is an encoding error. */ -CC_NONNULL2 +CC_NONNULL((2)) uint8_t *ccder_encode_tag(ccder_tag tag, const uint8_t *der, uint8_t *der_end); /* Returns a pointer to the start of the len field. returns NULL if there is an encoding error. */ -CC_NONNULL2 +CC_NONNULL((2)) uint8_t * ccder_encode_len(size_t len, const uint8_t *der, uint8_t *der_end); /* der_end should point to the first byte of the content of this der item. */ -CC_NONNULL3 +CC_NONNULL((3)) uint8_t * ccder_encode_tl(ccder_tag tag, size_t len, const uint8_t *der, uint8_t *der_end); -CC_PURE CC_NONNULL2 +CC_PURE CC_NONNULL((2)) uint8_t * ccder_encode_body_nocopy(size_t size, const uint8_t *der, uint8_t *der_end); @@ -163,7 +163,7 @@ ccder_encode_constructed_tl(ccder_tag tag, const uint8_t *body_end, /* Encodes oid into der and returns der + ccder_sizeof_oid(oid). 
*/ -CC_NONNULL_TU((1)) CC_NONNULL2 +CC_NONNULL((1, 2)) uint8_t *ccder_encode_oid(ccoid_t oid, const uint8_t *der, uint8_t *der_end); CC_NONNULL((3, 4)) @@ -175,12 +175,12 @@ CC_NONNULL((2, 3)) uint8_t *ccder_encode_integer(cc_size n, const cc_unit *s, const uint8_t *der, uint8_t *der_end); -CC_NONNULL3 +CC_NONNULL((3)) uint8_t *ccder_encode_implicit_uint64(ccder_tag implicit_tag, uint64_t value, const uint8_t *der, uint8_t *der_end); -CC_NONNULL2 +CC_NONNULL((2)) uint8_t *ccder_encode_uint64(uint64_t value, const uint8_t *der, uint8_t *der_end); @@ -206,7 +206,7 @@ uint8_t *ccder_encode_raw_octet_string(size_t s_size, const uint8_t *s, size_t ccder_encode_eckey_size(size_t priv_size, ccoid_t oid, size_t pub_size); -CC_NONNULL2 CC_NONNULL5 CC_NONNULL6 CC_NONNULL7 +CC_NONNULL((2, 5, 6, 7)) uint8_t *ccder_encode_eckey(size_t priv_size, const uint8_t *priv_key, ccoid_t oid, size_t pub_size, const uint8_t *pub_key, @@ -216,7 +216,7 @@ uint8_t *ccder_encode_eckey(size_t priv_size, const uint8_t *priv_key, It's inefficient – especially when you already have to convert to get to the form for the body. 
see encode integer for the right way to unify conversion and insertion */ -CC_NONNULL3 +CC_NONNULL((3)) uint8_t * ccder_encode_body(size_t size, const uint8_t* body, const uint8_t *der, uint8_t *der_end); @@ -291,16 +291,16 @@ const uint8_t *ccder_decode_uint64(uint64_t* r, CC_NONNULL((2, 3, 5)) const uint8_t *ccder_decode_seqii(cc_size n, cc_unit *r, cc_unit *s, const uint8_t *der, const uint8_t *der_end); -CC_NONNULL_TU((1)) CC_NONNULL((3)) +CC_NONNULL((1, 3)) const uint8_t *ccder_decode_oid(ccoid_t *oidp, const uint8_t *der, const uint8_t *der_end); -CC_NONNULL((1,2,4)) +CC_NONNULL((1, 2, 4)) const uint8_t *ccder_decode_bitstring(const uint8_t **bit_string, size_t *bit_length, const uint8_t *der, const uint8_t *der_end); -CC_NONNULL_TU((4)) CC_NONNULL((1,2,3,5,6,8)) +CC_NONNULL((1, 2, 3, 4, 5, 6, 8)) const uint8_t *ccder_decode_eckey(uint64_t *version, size_t *priv_size, const uint8_t **priv_key, ccoid_t *oid, diff --git a/EXTERNAL_HEADERS/corecrypto/ccdes.h b/EXTERNAL_HEADERS/corecrypto/ccdes.h index b4925bd14..31b5dadbf 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdes.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdes.h @@ -17,12 +17,8 @@ #define CCDES_BLOCK_SIZE 8 #define CCDES_KEY_SIZE 8 -extern const struct ccmode_ecb ccdes_ltc_ecb_decrypt_mode; -extern const struct ccmode_ecb ccdes_ltc_ecb_encrypt_mode; - extern const struct ccmode_ecb ccdes3_ltc_ecb_decrypt_mode; extern const struct ccmode_ecb ccdes3_ltc_ecb_encrypt_mode; -extern const struct ccmode_ecb ccdes168_ltc_ecb_encrypt_mode; const struct ccmode_ecb *ccdes_ecb_decrypt_mode(void); const struct ccmode_ecb *ccdes_ecb_encrypt_mode(void); @@ -61,8 +57,8 @@ int ccdes_key_is_weak( void *key, size_t length); void ccdes_key_set_odd_parity(void *key, size_t length); uint32_t -ccdes_cbc_cksum(void *in, void *out, size_t length, - void *key, size_t keylen, void *ivec); +ccdes_cbc_cksum(const void *in, void *out, size_t length, + const void *key, size_t key_nbytes, const void *ivec); #endif /* _CORECRYPTO_CCDES_H_ */ 
diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest.h b/EXTERNAL_HEADERS/corecrypto/ccdigest.h index a1b178a60..52ee15123 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdigest.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest.h @@ -16,35 +16,6 @@ /* To malloc a digest context for a given di, use malloc(ccdigest_di_size(di)) and assign the result to a pointer to a struct ccdigest_ctx. */ -#if CORECRYPTO_USE_TRANSPARENT_UNION -struct ccdigest_ctx { - union { - uint8_t u8; - uint32_t u32; - uint64_t u64; - cc_unit ccn; - } state; -} CC_ALIGNED(8); - -typedef union { - struct ccdigest_ctx *hdr; -} ccdigest_ctx_t __attribute__((transparent_union)); - -struct ccdigest_state { - union { - uint8_t u8; - uint32_t u32; - uint64_t u64; - cc_unit ccn; - } state; -} CC_ALIGNED(8); - -typedef union { - struct ccdigest_state *hdr; - struct ccdigest_ctx *_ctx; - ccdigest_ctx_t _ctxt; -} ccdigest_state_t __attribute__((transparent_union)); -#else //======================================================= struct ccdigest_ctx { union { uint8_t u8; @@ -66,8 +37,6 @@ struct ccdigest_state { } CC_ALIGNED(8); typedef struct ccdigest_state *ccdigest_state_t; -#endif //======================================================= - struct ccdigest_info { size_t output_size; @@ -99,40 +68,22 @@ struct ccdigest_info { #define ccdigest_di_clear(_di_, _name_) cc_clear(ccdigest_di_size(_di_), _name_) /* Digest context field accessors. Consider the implementation private. 
*/ -#if CORECRYPTO_USE_TRANSPARENT_UNION -#define ccdigest_state(_di_, _ctx_) ((struct ccdigest_state *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + sizeof(uint64_t))) -#else #define ccdigest_state(_di_, _ctx_) ((struct ccdigest_state *)(&((ccdigest_ctx_t)(_ctx_))->state.u8 + sizeof(uint64_t))) -#endif #define ccdigest_state_u8(_di_, _ctx_) ccdigest_u8(ccdigest_state((_di_), (_ctx_))) #define ccdigest_state_u32(_di_, _ctx_) ccdigest_u32(ccdigest_state((_di_), (_ctx_))) #define ccdigest_state_u64(_di_, _ctx_) ccdigest_u64(ccdigest_state((_di_), (_ctx_))) #define ccdigest_state_ccn(_di_, _ctx_) ccdigest_ccn(ccdigest_state((_di_), (_ctx_))) -#if CORECRYPTO_USE_TRANSPARENT_UNION -#define ccdigest_nbits(_di_, _ctx_) (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8))[0]) -#define ccdigest_data(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t)) -#define ccdigest_num(_di_, _ctx_) (((unsigned int *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t) + (_di_)->block_size))[0]) -#else #define ccdigest_nbits(_di_, _ctx_) (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_))->state.u8))[0]) #define ccdigest_data(_di_, _ctx_) (&((ccdigest_ctx_t)(_ctx_))->state.u8 + (_di_)->state_size + sizeof(uint64_t)) #define ccdigest_num(_di_, _ctx_) (((unsigned int *)(&((ccdigest_ctx_t)(_ctx_))->state.u8 + (_di_)->state_size + sizeof(uint64_t) + (_di_)->block_size))[0]) -#endif -#if CORECRYPTO_USE_TRANSPARENT_UNION -/* Digest state field accessors. Consider the implementation private. */ -#define ccdigest_u8(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u8) -#define ccdigest_u32(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u32) -#define ccdigest_u64(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.u64) -#define ccdigest_ccn(_state_) (&((ccdigest_state_t)(_state_)).hdr->state.ccn) -#else /* Digest state field accessors. Consider the implementation private. 
*/ #define ccdigest_u8(_state_) (&((ccdigest_state_t)(_state_))->state.u8) #define ccdigest_u32(_state_) (&((ccdigest_state_t)(_state_))->state.u32) #define ccdigest_u64(_state_) (&((ccdigest_state_t)(_state_))->state.u64) #define ccdigest_ccn(_state_) (&((ccdigest_state_t)(_state_))->state.ccn) -#endif /* We could just use memcpy instead of this special macro, but this allows us to use the optimized ccn_set() assembly routine if we have one, which for @@ -156,23 +107,6 @@ void ccdigest_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned void ccdigest(const struct ccdigest_info *di, size_t len, const void *data, void *digest); -/* test functions */ -int ccdigest_test(const struct ccdigest_info *di, size_t len, - const void *data, const void *digest); - -int ccdigest_test_chunk(const struct ccdigest_info *di, size_t len, - const void *data, const void *digest, size_t chunk); - -struct ccdigest_vector { - size_t len; - const void *message; - const void *digest; -}; - -int ccdigest_test_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v); -int ccdigest_test_chunk_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v, size_t chunk); - - #define OID_DEF(_VALUE_) ((const unsigned char *)_VALUE_) #define CC_DIGEST_OID_MD2 OID_DEF("\x06\x08\x2A\x86\x48\x86\xF7\x0D\x02\x02") diff --git a/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h b/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h index e888a734d..9d42de519 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h @@ -14,14 +14,7 @@ #include #include -void ccdigest_final_common(const struct ccdigest_info *di, - ccdigest_ctx_t ctx, void *digest); -void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t, - unsigned char *digest); -void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t, - unsigned char *digest); - -CC_INLINE CC_NONNULL_TU((1)) +CC_INLINE CC_NONNULL((1)) bool ccdigest_oid_equal(const 
struct ccdigest_info *di, ccoid_t oid) { if(di->oid == NULL && CCOID(oid) == NULL) return true; if(di->oid == NULL || CCOID(oid) == NULL) return false; diff --git a/EXTERNAL_HEADERS/corecrypto/ccdrbg.h b/EXTERNAL_HEADERS/corecrypto/ccdrbg.h index af5b010a9..7717d0c03 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccdrbg.h +++ b/EXTERNAL_HEADERS/corecrypto/ccdrbg.h @@ -21,20 +21,10 @@ #include #include -/* error codes */ -#define CCDRBG_STATUS_OK 0 -#define CCDRBG_STATUS_ERROR (-1) -#define CCDRBG_STATUS_NEED_RESEED (-2) -#define CCDRBG_STATUS_PARAM_ERROR (-3) -// If this value is returned, the caller must abort or panic the process for security reasons. -// for example in the case of catastrophic error in -// http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf -// ccdrbg calls abort() or panic(), if they are available in the system. -#define CCDRBG_STATUS_ABORT (-4) /* - * The maximum length of the entropy_input, additional_input (max_additional_input_length) , personalization string + * The maximum length of the entropy_input, additional_input (max_additional_input_length), personalization string * (max_personalization_string_length) and max_number_of_bits_per_request are implementation dependent - * but shall fit in a 32 bit register and be be less than or equal to the specified maximum length for the + * but shall fit in a 32 bit register and be less than or equal to the specified maximum length for the * selected DRBG mechanism (NIST 800-90A Section 10). 
*/ @@ -87,9 +77,9 @@ CC_INLINE void ccdrbg_done(const struct ccdrbg_info *info, info->done(drbg); } -CC_INLINE size_t ccdrbg_context_size(const struct ccdrbg_info *drbg) +CC_INLINE size_t ccdrbg_context_size(const struct ccdrbg_info *info) { - return drbg->size; + return info->size; } @@ -110,8 +100,6 @@ void ccdrbg_factory_nistctr(struct ccdrbg_info *info, const struct ccdrbg_nistct * NIST SP 800-90 HMAC_DRBG * the maximum security strengh of drbg is half of output size of the input hash function and it internally is limited to 256 bits */ -extern struct ccdrbg_info ccdrbg_nistdigest_info; - struct ccdrbg_nisthmac_custom { const struct ccdigest_info *di; int strictFIPS; @@ -119,10 +107,4 @@ struct ccdrbg_nisthmac_custom { void ccdrbg_factory_nisthmac(struct ccdrbg_info *info, const struct ccdrbg_nisthmac_custom *custom); - -/* - * Dummy DRBG - */ -extern struct ccdrbg_info ccdrbg_dummy_info; - #endif /* _CORECRYPTO_CCDRBG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cchmac.h b/EXTERNAL_HEADERS/corecrypto/cchmac.h index 81c1ab835..048c0de14 100644 --- a/EXTERNAL_HEADERS/corecrypto/cchmac.h +++ b/EXTERNAL_HEADERS/corecrypto/cchmac.h @@ -19,14 +19,7 @@ struct cchmac_ctx { uint8_t b[8]; } CC_ALIGNED(8); -#if CORECRYPTO_USE_TRANSPARENT_UNION -typedef union { - struct cchmac_ctx *hdr; - ccdigest_ctx_t digest; -} cchmac_ctx_t __attribute__((transparent_union)); -#else typedef struct cchmac_ctx* cchmac_ctx_t; -#endif #define cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE) (ccdigest_ctx_size(STATE_SIZE, BLOCK_SIZE) + (STATE_SIZE)) #define cchmac_di_size(_di_) (cchmac_ctx_size((_di_)->state_size, (_di_)->block_size)) @@ -39,43 +32,25 @@ typedef struct cchmac_ctx* cchmac_ctx_t; #define cchmac_di_clear(_di_, _name_) cchmac_ctx_clear((_di_)->state_size, (_di_)->block_size, _name_) /* Return a ccdigest_ctx_t which can be accesed with the macros in ccdigest.h */ -#if CORECRYPTO_USE_TRANSPARENT_UNION -#define cchmac_digest_ctx(_di_, HC) (((cchmac_ctx_t)(HC)).digest) -#else #define 
cchmac_digest_ctx(_di_, HC) ((ccdigest_ctx_t)(HC)) -#endif /* Accesors for ostate fields, this is all cchmac_ctx_t adds to the ccdigest_ctx_t. */ -#if CORECRYPTO_USE_TRANSPARENT_UNION -#define cchmac_ostate(_di_, HC) ((struct ccdigest_state *)(((cchmac_ctx_t)(HC)).hdr->b + ccdigest_di_size(_di_))) -#else #define cchmac_ostate(_di_, HC) ((struct ccdigest_state *)(((cchmac_ctx_t)(HC))->b + ccdigest_di_size(_di_))) -#endif #define cchmac_ostate8(_di_, HC) (ccdigest_u8(cchmac_ostate(_di_, HC))) #define cchmac_ostate32(_di_, HC) (ccdigest_u32(cchmac_ostate(_di_, HC))) #define cchmac_ostate64(_di_, HC) (ccdigest_u64(cchmac_ostate(_di_, HC))) #define cchmac_ostateccn(_di_, HC) (ccdigest_ccn(cchmac_ostate(_di_, HC))) /* Convenience accessors for ccdigest_ctx_t fields. */ -#if CORECRYPTO_USE_TRANSPARENT_UNION -#define cchmac_istate(_di_, HC) ccdigest_state(_di_, ((cchmac_ctx_t)(HC)).digest) -#else #define cchmac_istate(_di_, HC) ccdigest_state(_di_, ((ccdigest_ctx_t)(HC))) -#endif #define cchmac_istate8(_di_, HC) ccdigest_u8(cchmac_istate(_di_, HC)) #define cchmac_istate32(_di_, HC) ccdigest_u32(cchmac_istate(_di_, HC)) #define cchmac_istate64(_di_, HC) ccdigest_u64(cchmac_istate(_di_, HC)) #define cchmac_istateccn(_di_, HC) ccdigest_ccn(cchmac_istate(_di_, HC)) -#if CORECRYPTO_USE_TRANSPARENT_UNION -#define cchmac_data(_di_, HC) ccdigest_data(_di_, ((cchmac_ctx_t)(HC)).digest) -#define cchmac_num(_di_, HC) ccdigest_num(_di_, ((cchmac_ctx_t)(HC)).digest) -#define cchmac_nbits(_di_, HC) ccdigest_nbits(_di_, ((cchmac_ctx_t)(HC)).digest) -#else #define cchmac_data(_di_, HC) ccdigest_data(_di_, ((ccdigest_ctx_t)(HC))) #define cchmac_num(_di_, HC) ccdigest_num(_di_, ((ccdigest_ctx_t)(HC))) #define cchmac_nbits(_di_, HC) ccdigest_nbits(_di_, ((ccdigest_ctx_t)(HC))) -#endif void cchmac_init(const struct ccdigest_info *di, cchmac_ctx_t ctx, size_t key_len, const void *key); @@ -88,20 +63,4 @@ void cchmac(const struct ccdigest_info *di, size_t key_len, const void *key, size_t 
data_len, const void *data, unsigned char *mac); -/* Test functions */ - -struct cchmac_test_input { - const struct ccdigest_info *di; - size_t key_len; - const void *key; - size_t data_len; - const void *data; - size_t mac_len; - const void *expected_mac; -}; - -int cchmac_test(const struct cchmac_test_input *input); -int cchmac_test_chunks(const struct cchmac_test_input *input, size_t chunk_size); - - #endif /* _CORECRYPTO_CCHMAC_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/cckprng.h b/EXTERNAL_HEADERS/corecrypto/cckprng.h new file mode 100644 index 000000000..5e5bfcacd --- /dev/null +++ b/EXTERNAL_HEADERS/corecrypto/cckprng.h @@ -0,0 +1,83 @@ +/* + * cckprng.h + * corecrypto + * + * Created on 12/7/2017 + * + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + */ + +#ifndef _CORECRYPTO_CCKPRNG_H_ +#define _CORECRYPTO_CCKPRNG_H_ + +#include + +typedef struct PRNG *PrngRef; +typedef struct cckprng_ctx *cckprng_ctx_t; + +struct cckprng_ctx { + PrngRef prng; + uint64_t bytes_since_entropy; + uint64_t bytes_generated; +}; + +#define CCKPRNG_ENTROPY_INTERVAL (1 << 14) +#define CCKPRNG_RESEED_NTICKS 50 + +/* + @function cckprng_init + @abstract Initialize a kernel PRNG context. + + @param ctx Context for this instance + @param nbytes Length of the seed in bytes + @param seed Pointer to a high-entropy seed + + @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT. +*/ +int cckprng_init(cckprng_ctx_t ctx, size_t nbytes, const void *seed); + +/* + @function cckprng_reseed + @abstract Reseed a kernel PRNG context immediately. + + @param ctx Context for this instance + @param nbytes Length of the seed in bytes + @param seed Pointer to a high-entropy seed + + @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT. +*/ +int cckprng_reseed(cckprng_ctx_t ctx, size_t nbytes, const void *seed); + +/* + @function cckprng_addentropy + @abstract Add entropy to a kernel PRNG context. 
+ + @param ctx Context for this instance + @param nbytes Length of the input entropy in bytes + @param entropy Pointer to input entropy + + @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT. + + @discussion Input entropy is stored internally and consumed at the + opportune moment. This will not necessarily be before the next call + to @p cckprng_generate. To force an immediate reseed, call @p + cckprng_reseed. +*/ +int cckprng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void *entropy); + +/* + @function cckprng_generate + @abstract Generate random values for use in applications. + + @param ctx Context for this instance + @param nbytes Length of the desired output in bytes + @param out Pointer to the output buffer + + @result @p CCKPRNG_OK iff successful. Panic on @p + CCKPRNG_ABORT. Provide input to @p cckprng_addentropy on @p + CCKPRNG_NEED_ENTROPY. +*/ +int cckprng_generate(cckprng_ctx_t ctx, size_t nbytes, void *out); + +#endif /* _CORECRYPTO_CCKPRNG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccmd5.h b/EXTERNAL_HEADERS/corecrypto/ccmd5.h index 602fb0868..7e97a76f2 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmd5.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmd5.h @@ -17,8 +17,6 @@ #define CCMD5_OUTPUT_SIZE 16 #define CCMD5_STATE_SIZE 16 -extern const uint32_t ccmd5_initial_state[4]; - /* Selector */ const struct ccdigest_info *ccmd5_di(void); diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h index 668ea9d59..a9498d1f7 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_factory.h @@ -14,7 +14,7 @@ #include /* TODO: Remove dependency on this header. */ #include -/* Function and macros defined in this file are only to be used +/* Function and macros defined in this file are only to be used within corecrypto files. 
*/ @@ -83,68 +83,6 @@ const struct ccmode_xts *cc##_cipher_##_xts_##_dir_##_mode(void) return &xts##_cipher_##_##_dir_; \ } -#if 0 - -/* example of how to make the selection function thread safe */ - -struct ccmode_cbc cc3des_cbc_mode_encrypt; -dispatch_once_t cc3des_mode_encrypt_init_once; - -void cc3des_mode_encrypt_init(void *ctx) { - struct ccmode_ecb *ecb = cc3des_ecb_encrypt_mode(); - ccmode_factory_cbc_encrypt(&cc3des_mode_encrypt, ecb); -} - -const struct ccmode_cbc *cc3des_cbc_encrypt_mode(void) { - dispatch_once_f(&cc3des_mode_encrypt_init_once, NULL, cc3des_mode_encrypt_init); - return &cc3des_mode_encrypt; -} - -struct ccmode_cbc cc3des_cbc_mode_encrypt = { - .n = CC3DES_LTC_ECB_ENCRYPT_N, - .init = ccmode_cbc_init, - .cbc = ccmode_cbc_encrypt, - .custom = &cc3des_ltc_ecb_encrypt -}; - -const struct ccmode_cbc *cc3des_cbc_encrypt_mode(void) { - return &cc3des_mode_encrypt; -} - -#endif - - - -int ccmode_cbc_init(const struct ccmode_cbc *cbc, cccbc_ctx *ctx, - size_t rawkey_len, const void *rawkey); -int ccmode_cbc_decrypt(const cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks, - const void *in, void *out); -int ccmode_cbc_encrypt(const cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks, - const void *in, void *out); - -struct _ccmode_cbc_key { - const struct ccmode_ecb *ecb; - cc_unit u[]; -}; - -/* Use this to statically initialize a ccmode_cbc object for decryption. */ -#define CCMODE_FACTORY_CBC_DECRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_cbc_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ -.block_size = (ECB)->block_size, \ -.init = ccmode_cbc_init, \ -.cbc = ccmode_cbc_decrypt, \ -.custom = (ECB) \ -} - -/* Use this to statically initialize a ccmode_cbc object for encryption. 
*/ -#define CCMODE_FACTORY_CBC_ENCRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_cbc_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ -.block_size = (ECB)->block_size, \ -.init = ccmode_cbc_init, \ -.cbc = ccmode_cbc_encrypt, \ -.custom = (ECB) \ -} - /* Use these function to runtime initialize a ccmode_cbc decrypt object (for example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb @@ -160,37 +98,6 @@ void ccmode_factory_cbc_encrypt(struct ccmode_cbc *cbc, const struct ccmode_ecb *ecb); -int ccmode_cfb_init(const struct ccmode_cfb *cfb, cccfb_ctx *ctx, - size_t rawkey_len, const void *rawkey, - const void *iv); -int ccmode_cfb_decrypt(cccfb_ctx *ctx, size_t nbytes, - const void *in, void *out); -int ccmode_cfb_encrypt(cccfb_ctx *ctx, size_t nbytes, - const void *in, void *out); -struct _ccmode_cfb_key { - const struct ccmode_ecb *ecb; - size_t pad_len; - cc_unit u[]; -}; - -/* Use this to statically initialize a ccmode_cfb object for decryption. */ -#define CCMODE_FACTORY_CFB_DECRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ -.block_size = 1, \ -.init = ccmode_cfb_init, \ -.cfb = ccmode_cfb_decrypt, \ -.custom = (ECB) \ -} - -/* Use this to statically initialize a ccmode_cfb object for encryption. */ -#define CCMODE_FACTORY_CFB_ENCRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ -.block_size = 1, \ -.init = ccmode_cfb_init, \ -.cfb = ccmode_cfb_encrypt, \ -.custom = (ECB) \ -} - /* Use these function to runtime initialize a ccmode_cfb decrypt object (for example if it's part of a larger structure). 
Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb @@ -205,36 +112,6 @@ void ccmode_factory_cfb_decrypt(struct ccmode_cfb *cfb, void ccmode_factory_cfb_encrypt(struct ccmode_cfb *cfb, const struct ccmode_ecb *ecb); -int ccmode_cfb8_init(const struct ccmode_cfb8 *cfb8, cccfb8_ctx *ctx, - size_t rawkey_len, const void *rawkey, const void *iv); -int ccmode_cfb8_decrypt(cccfb8_ctx *ctx, size_t nbytes, - const void *in, void *out); -int ccmode_cfb8_encrypt(cccfb8_ctx *ctx, size_t nbytes, - const void *in, void *out); - -struct _ccmode_cfb8_key { - const struct ccmode_ecb *ecb; - cc_unit u[]; -}; - -/* Use this to statically initialize a ccmode_cfb8 object for decryption. */ -#define CCMODE_FACTORY_CFB8_DECRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb8_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ -.block_size = 1, \ -.init = ccmode_cfb8_init, \ -.cfb8 = ccmode_cfb8_decrypt, \ -.custom = (ECB) \ -} - -/* Use this to statically initialize a ccmode_cfb8 object for encryption. */ -#define CCMODE_FACTORY_CFB8_ENCRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb8_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ -.block_size = 1, \ -.init = ccmode_cfb8_init, \ -.cfb8 = ccmode_cfb8_encrypt, \ -.custom = (ECB) \ -} - /* Use these function to runtime initialize a ccmode_cfb8 decrypt object (for example if it's part of a larger structure). 
Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb @@ -249,29 +126,6 @@ void ccmode_factory_cfb8_decrypt(struct ccmode_cfb8 *cfb8, void ccmode_factory_cfb8_encrypt(struct ccmode_cfb8 *cfb8, const struct ccmode_ecb *ecb); -int ccmode_ctr_init(const struct ccmode_ctr *ctr, ccctr_ctx *ctx, - size_t rawkey_len, const void *rawkey, const void *iv); -int ccmode_ctr_setctr(const struct ccmode_ctr *mode, ccctr_ctx *ctx, const void *ctr); -int ccmode_ctr_crypt(ccctr_ctx *ctx, size_t nbytes, - const void *in, void *out); - -struct _ccmode_ctr_key { - const struct ccmode_ecb *ecb; - size_t pad_offset; - cc_unit u[]; -}; - -/* Use this to statically initialize a ccmode_ctr object for decryption. */ -#define CCMODE_FACTORY_CTR_CRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_ctr_key)) + 2 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.block_size = 1, \ -.ecb_block_size = (ECB_ENCRYPT)->block_size, \ -.init = ccmode_ctr_init, \ -.setctr = ccmode_ctr_setctr, \ -.ctr = ccmode_ctr_crypt, \ -.custom = (ECB_ENCRYPT) \ -} - /* Use these function to runtime initialize a ccmode_ctr decrypt object (for example if it's part of a larger structure). Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb @@ -279,68 +133,6 @@ struct _ccmode_ctr_key { void ccmode_factory_ctr_crypt(struct ccmode_ctr *ctr, const struct ccmode_ecb *ecb); - -/* Create a gcm key from a gcm mode object. - key must point to at least sizeof(CCMODE_GCM_KEY(ecb)) bytes of free - storage. 
*/ -int ccmode_gcm_init(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx, - size_t rawkey_len, const void *rawkey); -int ccmode_gcm_set_iv(ccgcm_ctx *ctx, size_t iv_nbytes, const void *iv); -int ccmode_gcm_aad(ccgcm_ctx *ctx, size_t nbytes, const void *in); -int ccmode_gcm_decrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in, - void *out); -int ccmode_gcm_encrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in, - void *out); - -/*! - @function ccmode_gcm_finalize() finalizes AES-GCM call sequence - @param key encryption or decryption key - @param tag_nbytes length of tag in bytes - @param tag authentication tag - @result 0=success or non zero= error - @discussion For decryption, the tag parameter must be the expected-tag. A secure compare is performed between the provided expected-tag and the computed-tag. If they are the same, 0 is returned. Otherwise, non zero is returned. For encryption, tag is output and provides the authentication tag. - - */ -int ccmode_gcm_finalize(ccgcm_ctx *key, size_t tag_nbytes, void *tag); -int ccmode_gcm_reset(ccgcm_ctx *key); - -#define CCGCM_FLAGS_INIT_WITH_IV 1 - -// Here is what the structure looks like in memory -// [ temp space | length | *ecb | *ecb_key | table | ecb_key ] -// size of table depends on the implementation (VNG vs factory) -// currently, VNG and factory share the same "header" described here -// VNG may add additional data after the header -struct _ccmode_gcm_key { - // 5 blocks of temp space. 
- unsigned char H[16]; /* multiplier */ - unsigned char X[16]; /* accumulator */ - unsigned char Y[16]; /* counter */ - unsigned char Y_0[16]; /* initial counter */ - unsigned char buf[16]; /* buffer for stuff */ - - // State and length - uint16_t state; /* state the GCM code is in */ - uint16_t flags; /* flags (persistent across reset) */ - uint32_t buf_nbytes; /* length of data in buf */ - - uint64_t aad_nbytes; /* 64-bit counter used for IV and AAD */ - uint64_t text_nbytes; /* 64-bit counter for the plaintext PT */ - - // ECB - const struct ccmode_ecb *ecb; // ecb mode - // Pointer to the ECB key in the buffer - void *ecb_key; // address of the ecb_key in u, set in init function - int encdec; //is it an encrypt or decrypt object - - // Buffer with ECB key and H table if applicable - CC_ALIGNED(16) unsigned char u[]; // ecb key + tables -}; - -#define GCM_ECB_KEY_SIZE(ECB_ENCRYPT) \ - ((5 * ccn_sizeof_size((ECB_ENCRYPT)->block_size)) \ - + ccn_sizeof_size((ECB_ENCRYPT)->size)) - /* Use these function to runtime initialize a ccmode_gcm decrypt object (for example if it's part of a larger structure). 
For GCM you always pass a ecb encrypt mode implementation of some underlying algorithm as the ecb @@ -355,72 +147,6 @@ void ccmode_factory_gcm_decrypt(struct ccmode_gcm *gcm, void ccmode_factory_gcm_encrypt(struct ccmode_gcm *gcm, const struct ccmode_ecb *ecb_encrypt); - -/* CCM (only NIST approved with AES) */ -int ccmode_ccm_init(const struct ccmode_ccm *ccm, ccccm_ctx *ctx, - size_t rawkey_len, const void *rawkey); -int ccmode_ccm_set_iv(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nonce_len, const void *nonce, - size_t mac_size, size_t auth_len, size_t data_len); -/* internal function */ -void ccmode_ccm_macdata(ccccm_ctx *key, ccccm_nonce *nonce_ctx, unsigned new_block, size_t nbytes, const void *in); -/* api function - disallows only mac'd data after data to encrypt was sent */ -int ccmode_ccm_cbcmac(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in); -/* internal function */ -void ccmode_ccm_crypt(ccccm_ctx *key, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out); -int ccmode_ccm_decrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, - void *out); -int ccmode_ccm_encrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, - void *out); -int ccmode_ccm_finalize(ccccm_ctx *key, ccccm_nonce *nonce_ctx, void *mac); -int ccmode_ccm_reset(ccccm_ctx *key, ccccm_nonce *nonce_ctx); - -struct _ccmode_ccm_key { - const struct ccmode_ecb *ecb; - cc_unit u[]; -}; - -struct _ccmode_ccm_nonce { - unsigned char A_i[16]; /* crypto block iv */ - unsigned char B_i[16]; /* mac block iv */ - unsigned char MAC[16]; /* crypted mac */ - unsigned char buf[16]; /* crypt buffer */ - - uint32_t mode; /* mode: IV -> AD -> DATA */ - uint32_t buflen; /* length of data in buf */ - uint32_t b_i_len; /* length of cbcmac data in B_i */ - - size_t nonce_size; - size_t mac_size; -}; - -/* Use this to statically initialize a ccmode_ccm object for decryption. 
*/ -#define CCMODE_FACTORY_CCM_DECRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_key)) + ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.nonce_size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_nonce)), \ -.block_size = 1, \ -.init = ccmode_ccm_init, \ -.set_iv = ccmode_ccm_set_iv, \ -.cbcmac = ccmode_ccm_cbcmac, \ -.ccm = ccmode_ccm_decrypt, \ -.finalize = ccmode_ccm_finalize, \ -.reset = ccmode_ccm_reset, \ -.custom = (ECB_ENCRYPT) \ -} - -/* Use this to statically initialize a ccmode_ccm object for encryption. */ -#define CCMODE_FACTORY_CCM_ENCRYPT(ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_key)) + ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \ -.nonce_size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_nonce)), \ -.block_size = 1, \ -.init = ccmode_ccm_init, \ -.set_iv = ccmode_ccm_set_iv, \ -.cbcmac = ccmode_ccm_cbcmac, \ -.ccm = ccmode_ccm_encrypt, \ -.finalize = ccmode_ccm_finalize, \ -.reset = ccmode_ccm_reset, \ -.custom = (ECB_ENCRYPT) \ -} - /* Use these function to runtime initialize a ccmode_ccm decrypt object (for example if it's part of a larger structure). For CCM you always pass a ecb encrypt mode implementation of some underlying algorithm as the ecb @@ -436,28 +162,6 @@ void ccmode_factory_ccm_decrypt(struct ccmode_ccm *ccm, void ccmode_factory_ccm_encrypt(struct ccmode_ccm *ccm, const struct ccmode_ecb *ecb_encrypt); - -int ccmode_ofb_init(const struct ccmode_ofb *ofb, ccofb_ctx *ctx, - size_t rawkey_len, const void *rawkey, - const void *iv); -int ccmode_ofb_crypt(ccofb_ctx *ctx, size_t nbytes, - const void *in, void *out); - -struct _ccmode_ofb_key { - const struct ccmode_ecb *ecb; - size_t pad_len; - cc_unit u[]; -}; - -/* Use this to statically initialize a ccmode_ofb object. 
*/ -#define CCMODE_FACTORY_OFB_CRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_ofb_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \ -.block_size = 1, \ -.init = ccmode_ofb_init, \ -.ofb = ccmode_ofb_crypt, \ -.custom = (ECB) \ -} - /* Use these function to runtime initialize a ccmode_ofb encrypt object (for example if it's part of a larger structure). Normally you would pass a ecb encrypt mode implementation of some underlying algorithm as the ecb @@ -465,44 +169,6 @@ struct _ccmode_ofb_key { void ccmode_factory_ofb_crypt(struct ccmode_ofb *ofb, const struct ccmode_ecb *ecb); -int ccmode_omac_decrypt(ccomac_ctx *ctx, size_t nblocks, - const void *tweak, const void *in, void *out); -int ccmode_omac_encrypt(ccomac_ctx *ctx, size_t nblocks, - const void *tweak, const void *in, void *out); - -/* Create a omac key from a omac mode object. The tweak_len here - determines how long the tweak is in bytes, for each subsequent call to - ccmode_omac->omac(). - key must point to at least sizeof(CCMODE_OMAC_KEY(ecb)) bytes of free - storage. */ -int ccmode_omac_init(const struct ccmode_omac *omac, ccomac_ctx *ctx, - size_t tweak_len, size_t rawkey_len, - const void *rawkey); - -struct _ccmode_omac_key { - const struct ccmode_ecb *ecb; - size_t tweak_len; - cc_unit u[]; -}; - -/* Use this to statically initialize a ccmode_omac object for decryption. */ -#define CCMODE_FACTORY_OMAC_DECRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_omac_key)) + 2 * ccn_sizeof_size((ECB)->size), \ -.block_size = (ECB)->block_size, \ -.init = ccmode_omac_init, \ -.omac = ccmode_omac_decrypt, \ -.custom = (ECB) \ -} - -/* Use this to statically initialize a ccmode_omac object for encryption. 
*/ -#define CCMODE_FACTORY_OMAC_ENCRYPT(ECB) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_omac_key)) + 2 * ccn_sizeof_size((ECB)->size), \ -.block_size = (ECB)->block_size, \ -.init = ccmode_omac_init, \ -.omac = ccmode_omac_encrypt, \ -.custom = (ECB) \ -} - /* Use these function to runtime initialize a ccmode_omac decrypt object (for example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb @@ -517,62 +183,6 @@ void ccmode_factory_omac_decrypt(struct ccmode_omac *omac, void ccmode_factory_omac_encrypt(struct ccmode_omac *omac, const struct ccmode_ecb *ecb); - -/* Function prototypes used by the macros below, do not call directly. */ -int ccmode_xts_init(const struct ccmode_xts *xts, ccxts_ctx *ctx, - size_t key_nbytes, const void *data_key, - const void *tweak_key); -void ccmode_xts_key_sched(const struct ccmode_xts *xts, ccxts_ctx *ctx, - size_t key_nbytes, const void *data_key, - const void *tweak_key); -void *ccmode_xts_crypt(const ccxts_ctx *ctx, ccxts_tweak *tweak, - size_t nblocks, const void *in, void *out); -int ccmode_xts_set_tweak(const ccxts_ctx *ctx, ccxts_tweak *tweak, - const void *iv); - - -struct _ccmode_xts_key { - const struct ccmode_ecb *ecb; - const struct ccmode_ecb *ecb_encrypt; - cc_unit u[]; -}; - -struct _ccmode_xts_tweak { - // FIPS requires that for XTS that no more that 2^20 AES blocks may be processed for any given - // Key, Tweak Key, and tweak combination - // the bytes_processed field in the context will accumuate the number of blocks processed and - // will fail the encrypt/decrypt if the size is violated. This counter will be reset to 0 - // when set_tweak is called. - size_t blocks_processed; - cc_unit u[]; -}; - -/* Use this to statically initialize a ccmode_xts object for decryption. 
*/ -#define CCMODE_FACTORY_XTS_DECRYPT(ECB, ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size), \ -.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(ecb->block_size), \ -.block_size = ecb->block_size, \ -.init = ccmode_xts_init, \ -.key_sched = ccmode_xts_key_sched, \ -.set_tweak = ccmode_xts_set_tweak, \ -.xts = ccmode_xts_crypt, \ -.custom = (ECB), \ -.custom1 = (ECB_ENCRYPT) \ -} - -/* Use this to statically initialize a ccmode_xts object for encryption. */ -#define CCMODE_FACTORY_XTS_ENCRYPT(ECB, ECB_ENCRYPT) { \ -.size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size), \ -.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(ecb->block_size), \ -.block_size = ecb->block_size, \ -.init = ccmode_xts_init, \ -.key_sched = ccmode_xts_key_sched, \ -.set_tweak = ccmode_xts_set_tweak, \ -.xts = ccmode_xts_crypt, \ -.custom = (ECB), \ -.custom1 = (ECB_ENCRYPT) \ -} - /* Use these function to runtime initialize a ccmode_xts decrypt object (for example if it's part of a larger structure). Normally you would pass a ecb decrypt mode implementation of some underlying algorithm as the ecb diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h b/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h index 795054161..a0c6e24bc 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_impl.h @@ -35,7 +35,7 @@ struct ccmode_ecb { * 1- ccmod_xxx_init() * 2- ccmod_xxx_decrypt() * 3- ccmod_xxx_encrypt() - * + * * stateful modes CCM and GCM: They provide 7 interface functions that return error codes if a function is called out of state * 1- ccmod_xxx_init() * 2- ccmod_xxx_setiv() @@ -131,7 +131,7 @@ struct ccmode_xts { size_t tweak_size; /* first argument to ccxts_tweak_decl(). Size of the tweak structure, not the expected tweak size */ size_t block_size; - /* Create a xts key from a xts mode object. 
+ /* Create a xts key from a xts mode object. key must point to at least 'size' bytes of free storage. tweak_key must point to at least 'tweak_size' bytes of free storage. key and tweak_key must differ. @@ -139,7 +139,7 @@ struct ccmode_xts { */ int (*init)(const struct ccmode_xts *xts, ccxts_ctx *ctx, size_t key_nbytes, const void *data_key, const void *tweak_key); - + void (*key_sched)(const struct ccmode_xts *xts, ccxts_ctx *ctx, size_t key_nbytes, const void *data_key, const void *tweak_key); @@ -174,7 +174,7 @@ struct ccmode_gcm { const void *custom; }; -//8- GCM mode, statful +//8- CCM mode, stateful cc_aligned_struct(16) ccccm_ctx; cc_aligned_struct(16) ccccm_nonce; @@ -193,6 +193,20 @@ struct ccmode_ccm { const void *custom; }; +/* We need to expose this (currently) to keep CommonCrypto happy. */ +struct _ccmode_ccm_nonce { + unsigned char A_i[16]; /* crypto block iv */ + unsigned char B_i[16]; /* mac block iv */ + unsigned char MAC[16]; /* crypted mac */ + unsigned char buf[16]; /* crypt buffer */ + + uint32_t mode; /* mode: IV -> AD -> DATA */ + uint32_t buflen; /* length of data in buf */ + uint32_t b_i_len; /* length of cbcmac data in B_i */ + + size_t nonce_size; + size_t mac_size; +}; /* OMAC mode. 
*/ cc_aligned_struct(16) ccomac_ctx; diff --git a/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h b/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h index 5186e1227..99322ad2d 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h +++ b/EXTERNAL_HEADERS/corecrypto/ccmode_siv.h @@ -126,13 +126,4 @@ CC_INLINE int ccsiv_one_shot(const struct ccmode_siv *mode, return rc; } -void ccmode_factory_siv_encrypt(struct ccmode_siv *siv, - const struct ccmode_cbc *cbc, - const struct ccmode_ctr *ctr); - -void ccmode_factory_siv_decrypt(struct ccmode_siv *siv, - const struct ccmode_cbc *cbc, - const struct ccmode_ctr *ctr); - - #endif /* _CORECRYPTO_CCMODE_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccn.h b/EXTERNAL_HEADERS/corecrypto/ccn.h index afaed41ae..2d3e847c9 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccn.h +++ b/EXTERNAL_HEADERS/corecrypto/ccn.h @@ -94,6 +94,8 @@ typedef struct { /* Returns the count (n) of a ccn vector that can represent _size_ bytes. */ #define ccn_nof_size(_size_) (((_size_) + CCN_UNIT_SIZE - 1) / CCN_UNIT_SIZE) +#define ccn_nof_sizeof(_expr_) ccn_nof_size(sizeof (_expr_)) + /* Return the max number of bits a ccn vector of _n_ units can hold. */ #define ccn_bitsof_n(_n_) ((_n_) * CCN_UNIT_BITS) @@ -283,7 +285,7 @@ typedef struct { #define CCN521_N ccn_nof(521) /* Return the number of used units after stripping leading 0 units. */ -CC_PURE CC_NONNULL2 +CC_PURE CC_NONNULL((2)) cc_size ccn_n(cc_size n, const cc_unit *s); /* s >> k -> r return bits shifted out of least significant word in bits [0, n> @@ -292,29 +294,13 @@ cc_size ccn_n(cc_size n, const cc_unit *s); word shifts. 
*/ CC_NONNULL((2, 3)) cc_unit ccn_shift_right(cc_size n, cc_unit *r, const cc_unit *s, size_t k); -CC_NONNULL((2, 3)) -void ccn_shift_right_multi(cc_size n, cc_unit *r,const cc_unit *s, size_t k); - -/* s << k -> r return bits shifted out of most significant word in bits [0, n> - { N bit, scalar -> N bit } N = n * sizeof(cc_unit) * 8 - the _multi version doesn't return the shifted bits, but does support multiple - word shifts */ -CC_NONNULL((2, 3)) -cc_unit ccn_shift_left(cc_size n, cc_unit *r, const cc_unit *s, size_t k); -CC_NONNULL((2, 3)) -void ccn_shift_left_multi(cc_size n, cc_unit *r, const cc_unit *s, size_t k); /* s == 0 -> return 0 | s > 0 -> return index (starting at 1) of most significant bit that is 1. { N bit } N = n * sizeof(cc_unit) * 8 */ -CC_NONNULL2 +CC_NONNULL((2)) size_t ccn_bitlen(cc_size n, const cc_unit *s); -/* Returns the number of bits which are zero before the first one bit - counting from least to most significant bit. */ -CC_NONNULL2 -size_t ccn_trailing_zeros(cc_size n, const cc_unit *s); - /* s == 0 -> return true | s != 0 -> return false { N bit } N = n * sizeof(cc_unit) * 8 */ #define ccn_is_zero(_n_, _s_) (!ccn_n(_n_, _s_)) @@ -348,9 +334,6 @@ int ccn_cmpn(cc_size ns, const cc_unit *s, CC_NONNULL((2, 3, 4)) cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); -/* |s - t| -> r return 1 iff t > s, 0 otherwise */ -cc_unit ccn_abs(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); - /* s - v -> r return 1 iff v > s return 0 otherwise. 
{ N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */ CC_NONNULL((2, 3)) @@ -388,23 +371,12 @@ cc_unit ccn_addn(cc_size n, cc_unit *r, const cc_unit *s, } -CC_NONNULL((2, 3, 4)) -void ccn_lcm(cc_size n, cc_unit *r2n, const cc_unit *s, const cc_unit *t); - - /* s * t -> r_2n r_2n must not overlap with s nor t { n bit, n bit -> 2 * n bit } n = count * sizeof(cc_unit) * 8 { N bit, N bit -> 2N bit } N = ccn_bitsof(n) */ CC_NONNULL((2, 3, 4)) void ccn_mul(cc_size n, cc_unit *r_2n, const cc_unit *s, const cc_unit *t); -/* s * t -> r_2n r_2n must not overlap with s nor t - { n bit, n bit -> 2 * n bit } n = count * sizeof(cc_unit) * 8 - { N bit, N bit -> 2N bit } N = ccn_bitsof(n) - Provide a workspace for potential speedup */ -CC_NONNULL((1, 3, 4, 5)) -void ccn_mul_ws(cc_ws_t ws, cc_size count, cc_unit *r, const cc_unit *s, const cc_unit *t); - /* s[0..n) * v -> r[0..n)+return value { N bit, sizeof(cc_unit) * 8 bit -> N + sizeof(cc_unit) * 8 bit } N = n * sizeof(cc_unit) * 8 */ CC_NONNULL((2, 3)) @@ -422,28 +394,18 @@ CC_NONNULL((2, 3, 4)) void ccn_mod(cc_size n, cc_unit *r, const cc_unit *a_2n, const cc_unit *d); #endif -/* r = gcd(s, t). - N bit, N bit -> N bit */ -CC_NONNULL((2, 3, 4)) -void ccn_gcd(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t); - -/* r = gcd(s, t). - N bit, N bit -> O bit */ -CC_NONNULL((2, 4, 6)) -void ccn_gcdn(cc_size rn, cc_unit *r, cc_size sn, const cc_unit *s, cc_size tn, const cc_unit *t); - /* r = (data, len) treated as a big endian byte array, return -1 if data doesn't fit in r, return 0 otherwise. */ CC_NONNULL((2, 4)) int ccn_read_uint(cc_size n, cc_unit *r, size_t data_size, const uint8_t *data); /* r = (data, len) treated as a big endian byte array, return -1 if data - doesn't fit in r, return 0 otherwise. + doesn't fit in r, return 0 otherwise. ccn_read_uint strips leading zeroes and doesn't care about sign. 
*/ #define ccn_read_int(n, r, data_size, data) ccn_read_uint(n, r, data_size, data) /* Return actual size in bytes needed to serialize s. */ -CC_PURE CC_NONNULL2 +CC_PURE CC_NONNULL((2)) size_t ccn_write_uint_size(cc_size n, const cc_unit *s); /* Serialize s, to out. @@ -473,9 +435,9 @@ cc_size ccn_write_uint_padded(cc_size n, const cc_unit* s, size_t out_size, uint } -/* Return actual size in bytes needed to serialize s as int +/* Return actual size in bytes needed to serialize s as int (adding leading zero if high bit is set). */ -CC_PURE CC_NONNULL2 +CC_PURE CC_NONNULL((2)) size_t ccn_write_int_size(cc_size n, const cc_unit *s); /* Serialize s, to out. @@ -491,55 +453,25 @@ size_t ccn_write_int_size(cc_size n, const cc_unit *s); CC_NONNULL((2, 4)) void ccn_write_int(cc_size n, const cc_unit *s, size_t out_size, void *out); -#if CCN_DEDICATED_SQR - -/* s^2 -> r - { n bit -> 2 * n bit } */ -CC_NONNULL((2, 3)) -void ccn_sqr(cc_size n, cc_unit *r, const cc_unit *s); - -/* s^2 -> r - { n bit -> 2 * n bit } */ -CC_NONNULL((1, 3, 4)) -void ccn_sqr_ws(cc_ws_t ws, cc_size n, cc_unit *r, const cc_unit *s); - -#else - -/* s^2 -> r - { n bit -> 2 * n bit } */ -CC_INLINE CC_NONNULL((2, 3)) -void ccn_sqr(cc_size n, cc_unit *r, const cc_unit *s) { - ccn_mul(n, r, s, s); -} - -/* s^2 -> r - { n bit -> 2 * n bit } */ -CC_INLINE CC_NONNULL((2, 3, 4)) -void ccn_sqr_ws(cc_ws_t ws, cc_size n, cc_unit *r, const cc_unit *s) { - ccn_mul_ws(ws, n, r, s, s); -} - -#endif - /* s -> r { n bit -> n bit } */ CC_NONNULL((2, 3)) void ccn_set(cc_size n, cc_unit *r, const cc_unit *s); -CC_INLINE CC_NONNULL2 +CC_INLINE CC_NONNULL((2)) void ccn_zero(cc_size n, cc_unit *r) { cc_zero(ccn_sizeof_n(n),r); } -CC_INLINE CC_NONNULL2 +CC_INLINE CC_NONNULL((2)) void ccn_clear(cc_size n, cc_unit *r) { cc_clear(ccn_sizeof_n(n),r); } -CC_NONNULL2 +CC_NONNULL((2)) void ccn_zero_multi(cc_size n, cc_unit *r, ...); -CC_INLINE CC_NONNULL2 +CC_INLINE CC_NONNULL((2)) void ccn_seti(cc_size n, cc_unit *r, cc_unit v) { 
/* assert(n > 0); */ r[0] = v; @@ -589,7 +521,7 @@ void ccn_setn(cc_size n, cc_unit *r, const cc_size s_size, const cc_unit *s) { #endif /* Swap units in r in place from cc_unit vector byte order to big endian byte order (or back). */ -CC_INLINE CC_NONNULL2 +CC_INLINE CC_NONNULL((2)) void ccn_swap(cc_size n, cc_unit *r) { cc_unit *e; for (e = r + n - 1; r < e; ++r, --e) { @@ -609,9 +541,9 @@ void ccn_xor(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t) { } /* Debugging */ -CC_NONNULL2 +CC_NONNULL((2)) void ccn_print(cc_size n, const cc_unit *s); -CC_NONNULL3 +CC_NONNULL((3)) void ccn_lprint(cc_size n, const char *label, const cc_unit *s); /* Forward declaration so we don't depend on ccrng.h. */ @@ -631,16 +563,6 @@ int ccn_random(cc_size n, cc_unit *r, struct ccrng_state *rng) { CC_NONNULL((2, 3)) int ccn_random_bits(cc_size nbits, cc_unit *r, struct ccrng_state *rng); -/*! - @brief ccn_make_recip(cc_size nd, cc_unit *recip, const cc_unit *d) computes the reciprocal of d: recip = 2^2b/d where b=bitlen(d) - - @param nd length of array d - @param recip returned reciprocal of size nd+1 - @param d input number d -*/ -CC_NONNULL((2, 3)) -int ccn_make_recip(cc_size nd, cc_unit *recip, const cc_unit *d); - CC_NONNULL((6, 8)) int ccn_div_euclid(cc_size nq, cc_unit *q, cc_size nr, cc_unit *r, cc_size na, const cc_unit *a, cc_size nd, const cc_unit *d); diff --git a/EXTERNAL_HEADERS/corecrypto/ccrc4.h b/EXTERNAL_HEADERS/corecrypto/ccrc4.h index 3b50710a3..eaf644d1d 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrc4.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrc4.h @@ -26,19 +26,8 @@ struct ccrc4_info { void (*crypt)(ccrc4_ctx *ctx, size_t nbytes, const void *in, void *out); }; - const struct ccrc4_info *ccrc4(void); extern const struct ccrc4_info ccrc4_eay; -struct ccrc4_vector { - size_t keylen; - const void *key; - size_t datalen; - const void *pt; - const void *ct; -}; - -int ccrc4_test(const struct ccrc4_info *rc4, const struct ccrc4_vector *v); - #endif /* 
_CORECRYPTO_CCRC4_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccrng.h b/EXTERNAL_HEADERS/corecrypto/ccrng.h index 698f412ca..c6bc18a90 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrng.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrng.h @@ -13,18 +13,6 @@ #include -#define CCERR_DEVICE -100 -#define CCERR_INTERUPTS -101 -#define CCERR_CRYPTO_CONFIG -102 -#define CCERR_PERMS -103 -#define CCERR_PARAMETER -104 -#define CCERR_MEMORY -105 -#define CCERR_FILEDESC -106 -#define CCERR_OUT_OF_ENTROPY -107 -#define CCERR_INTERNAL -108 -#define CCERR_ATFORK -109 -#define CCERR_OVERFLOW -110 - #define CCRNG_STATE_COMMON \ int (*generate)(struct ccrng_state *rng, size_t outlen, void *out); @@ -55,6 +43,6 @@ struct ccrng_state { struct ccrng_state *ccrng(int *error); //call this macro with the rng argument set to output of the call to the ccrng() function -#define ccrng_generate(rng, outlen, out) ((rng)->generate((rng), (outlen), (out))) +#define ccrng_generate(rng, outlen, out) ((rng)->generate((struct ccrng_state *)(rng), (outlen), (out))) #endif /* _CORECRYPTO_CCRNG_H_ */ diff --git a/EXTERNAL_HEADERS/corecrypto/ccrsa.h b/EXTERNAL_HEADERS/corecrypto/ccrsa.h index c821efc40..0f70c3740 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccrsa.h +++ b/EXTERNAL_HEADERS/corecrypto/ccrsa.h @@ -21,17 +21,6 @@ // This limit is relaxed to accommodate potential third-party consumers #define CCRSA_KEYGEN_MAX_NBITS 8192 -// Program error: buffer too small or encrypted message is too small -#define CCRSA_INVALID_INPUT -1 -// Invalid crypto configuration: Hash length versus RSA key size -#define CCRSA_INVALID_CONFIG -2 -// The data is invalid (we won't say more for security -#define CCRSA_DECRYPTION_ERROR -3 - -#define CCRSA_ENCODING_ERROR -4 -#define CCRSA_DECODING_ERROR -5 -#define CCRSA_SIGNATURE_GEN_ERROR -6 - struct ccrsa_full_ctx { __CCZP_ELEMENTS_DEFINITIONS(pb_) } CC_ALIGNED(CCN_UNIT_SIZE); @@ -44,32 +33,9 @@ struct ccrsa_priv_ctx { __CCZP_ELEMENTS_DEFINITIONS(pv_) } CC_ALIGNED(CCN_UNIT_SIZE); - 
-#if CORECRYPTO_USE_TRANSPARENT_UNION - typedef union { - cczp_t zp; - struct ccrsa_pub_ctx* pub; - struct ccrsa_full_ctx *full; - } ccrsa_full_ctx_t __attribute__((transparent_union)); - typedef struct ccrsa_full_ctx ccrsa_full_ctx; - typedef struct ccrsa_priv_ctx ccrsa_priv_ctx; - - typedef union { - cczp_t zp; - ccrsa_priv_ctx *priv; - } ccrsa_priv_ctx_t __attribute__((transparent_union)); - - -typedef ccrsa_full_ctx_t ccrsa_pub_ctx_t; -typedef struct ccrsa_pub_ctx ccrsa_pub_ctx; - -#else - typedef struct ccrsa_full_ctx* ccrsa_full_ctx_t; - typedef struct ccrsa_pub_ctx* ccrsa_pub_ctx_t; - typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t; -#endif - - +typedef struct ccrsa_full_ctx* ccrsa_full_ctx_t; +typedef struct ccrsa_pub_ctx* ccrsa_pub_ctx_t; +typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t; /* public key cczp d=e^-1 mod phi(m) priv key cczp priv key cczq dp, dq, qinv @@ -90,7 +56,7 @@ typedef struct ccrsa_pub_ctx ccrsa_pub_ctx; /* Declare a fully scheduled rsa key. Size is the size in bytes each ccn in the key. For example to declare (on the stack or in a struct) a 1021 bit - rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo). + rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo). */ #define ccrsa_full_ctx_decl(_size_, _name_) cc_ctx_decl(struct ccrsa_full_ctx, ccrsa_full_ctx_size(_size_), _name_) #define ccrsa_full_ctx_clear(_size_, _name_) cc_clear(ccrsa_full_ctx_size(_size_), _name_) @@ -101,19 +67,9 @@ typedef struct ccrsa_pub_ctx ccrsa_pub_ctx; // The offsets are computed using pb_ccn. If any object other than ccrsa_full_ctx_t // or ccrsa_pub_ctx_t is passed to the macros, compiler error is generated. 
- - -#if CORECRYPTO_USE_TRANSPARENT_UNION -//#define ccrsa_ctx_zm(_ctx_) (((ccrsa_pub_ctx_t)(_ctx_)).zp) - - CC_CONST CC_INLINE cczp_t ccrsa_ctx_zm(ccrsa_full_ctx_t _ctx_) { return ((cczp_t)(struct cczp *)((_ctx_).full)); } - CC_CONST CC_INLINE cc_unit *ccrsa_ctx_m(ccrsa_full_ctx_t _ctx_){ return ((_ctx_).full->pb_ccn);} - #define ccrsa_ctx_n(_ctx_) (ccrsa_ctx_zm(_ctx_).zp->n) -#else - #define ccrsa_ctx_zm(_ctx_) ((cczp_t)(_ctx_)) - #define ccrsa_ctx_n(_ctx_) (ccrsa_ctx_zm(_ctx_)->n) - #define ccrsa_ctx_m(_ctx_) ((_ctx_)->pb_ccn) -#endif +#define ccrsa_ctx_zm(_ctx_) ((cczp_t)(_ctx_)) +#define ccrsa_ctx_n(_ctx_) (ccrsa_ctx_zm(_ctx_)->n) +#define ccrsa_ctx_m(_ctx_) ((_ctx_)->pb_ccn) #define ccrsa_ctx_e(_ctx_) (ccrsa_ctx_m(_ctx_) + 2 * ccrsa_ctx_n(_ctx_) + 1) #define ccrsa_ctx_d(_ctx_) (ccrsa_ctx_m(_ctx_) + 3 * ccrsa_ctx_n(_ctx_) + 1) @@ -121,36 +77,13 @@ typedef struct ccrsa_pub_ctx ccrsa_pub_ctx; // accessors to ccrsa private key fields // The offsets are computed using pv_ccn. If any object other than ccrsa_priv_ctx_t // is passed to the macros, compiler error is generated. -#if CORECRYPTO_USE_TRANSPARENT_UNION - -/* rvalue accessors to ccec_key fields. 
*/ -CC_CONST CC_INLINE -ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) { - cc_unit *p = (cc_unit *)fk.full; - cc_size p_size = ccrsa_ctx_n(fk); - p += ccn_nof_size(ccrsa_pub_ctx_size(ccn_sizeof_n(p_size))) + p_size; - ccrsa_priv_ctx *priv = (ccrsa_priv_ctx *)p; - return (ccrsa_priv_ctx_t)priv; -} - -CC_CONST CC_INLINE -ccrsa_pub_ctx_t ccrsa_ctx_public(ccrsa_full_ctx_t fk) { - return (ccrsa_pub_ctx_t) fk.full; -} - -#define ccrsa_ctx_private_zp(FK) ((ccrsa_get_private_ctx_ptr(FK)).zp) -#define ccrsa_ctx_private_zq(FK) ((cczp_t)((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 2 * ccrsa_ctx_private_zp(FK).zp->n + 1)) -#define ccrsa_ctx_private_dp(FK) ((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 4 * ccrsa_ctx_private_zp(FK).zp->n + 2 + ccn_nof_size(sizeof(struct cczp))) -#define ccrsa_ctx_private_dq(FK) ((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 5 * ccrsa_ctx_private_zp(FK).zp->n + 2 + ccn_nof_size(sizeof(struct cczp))) -#define ccrsa_ctx_private_qinv(FK) ((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 6 * ccrsa_ctx_private_zp(FK).zp->n + 2 + ccn_nof_size(sizeof(struct cczp))) - -#else #define ccrsa_ctx_private_zp(FK) ((cczp_t)ccrsa_get_private_ctx_ptr(FK)) #define ccrsa_ctx_private_zq(FK) ((cczp_t)((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 2 * ccrsa_ctx_private_zp(FK)->n + 1)) #define ccrsa_ctx_private_dp(FK) ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 4 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp))) #define ccrsa_ctx_private_dq(FK) ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 5 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp))) #define ccrsa_ctx_private_qinv(FK) ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 6 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp))) +/* rvalue accessors to ccec_key fields. 
*/ CC_CONST CC_INLINE ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) { ccrsa_priv_ctx_t priv = (ccrsa_priv_ctx_t)(ccrsa_ctx_d(fk)+ccrsa_ctx_n(fk)); @@ -168,8 +101,6 @@ ccrsa_pub_ctx_t ccrsa_ctx_public(ccrsa_full_ctx_t fk) { return (ccrsa_pub_ctx_t) fk; } -#endif - /* Return exact key bit size */ static inline size_t ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk) { @@ -181,13 +112,13 @@ ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk) { #define CCRSA_PKCS1_PAD_ENCRYPT 2 /* Initialize key based on modulus and e as cc_unit. key->zp.n must already be set. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) +CC_NONNULL((1, 2, 3)) int ccrsa_init_pub(ccrsa_pub_ctx_t key, const cc_unit *modulus, const cc_unit *e); /* Initialize key based on modulus and e as big endian byte array key->zp.n must already be set. */ -CC_NONNULL_TU((1)) CC_NONNULL((3 ,5)) +CC_NONNULL((1, 3, 5)) int ccrsa_make_pub(ccrsa_pub_ctx_t pubk, size_t exp_nbytes, const uint8_t *exp, size_t mod_nbytes, const uint8_t *mod); @@ -196,7 +127,7 @@ int ccrsa_make_pub(ccrsa_pub_ctx_t pubk, the result in out. Both in and out should be cc_unit aligned and ccrsa_key_n(key) units long. Clients should use ccn_read_uint() to convert bytes to a cc_unit to use for this API.*/ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) +CC_NONNULL((1, 2, 3)) int ccrsa_pub_crypt(ccrsa_pub_ctx_t key, cc_unit *out, const cc_unit *in); /* Generate an nbit rsa key pair in key, which should be allocated using @@ -204,19 +135,19 @@ int ccrsa_pub_crypt(ccrsa_pub_ctx_t key, cc_unit *out, const cc_unit *in); byte array exponent e of length e_size is used as the exponent. It's an error to call this function with an exponent larger than nbits. rng must be a pointer to an initialized struct ccrng_state. 
*/ -CC_NONNULL_TU((2)) CC_NONNULL((4, 5)) +CC_NONNULL((2, 4, 5)) int ccrsa_generate_key(size_t nbits, ccrsa_full_ctx_t rsa_ctx, size_t e_size, const void *e, struct ccrng_state *rng) CC_WARN_RESULT; /* Generate RSA key in conformance with FIPS186-4 standard */ -CC_NONNULL_TU((2)) CC_NONNULL((4, 5, 6)) +CC_NONNULL((2, 4, 5, 6)) int ccrsa_generate_fips186_key(size_t nbits, ccrsa_full_ctx_t fk, size_t e_size, const void *eBytes, struct ccrng_state *rng1, struct ccrng_state *rng2) CC_WARN_RESULT; /* Construct RSA key from fix input in conformance with FIPS186-4 standard */ -CC_NONNULL_TU((16)) CC_NONNULL((3, 5, 7, 9, 11, 13, 15)) +CC_NONNULL((3, 5, 7, 9, 11, 13, 15, 16)) int ccrsa_make_fips186_key(size_t nbits, const cc_size e_n, const cc_unit *e, @@ -262,14 +193,14 @@ ccrsa_make_fips186_key(size_t nbits, * @param sigSize The length of generated signature in bytes, which equals the size of the RSA modulus. * @return 0:ok, non-zero:error */ -CC_NONNULL((2,3,5,7,8,9)) +CC_NONNULL((2, 3, 5, 7, 8, 9)) int ccrsa_sign_pss(ccrsa_full_ctx_t key, const struct ccdigest_info* hashAlgorithm, const struct ccdigest_info* MgfHashAlgorithm, size_t saltSize, struct ccrng_state *rng, size_t hSize, const uint8_t *mHash, size_t *sigSize, uint8_t *sig); -CC_NONNULL((2,3,5,7,9)) +CC_NONNULL((2, 3, 5, 7, 9)) int ccrsa_verify_pss(ccrsa_pub_ctx_t key, const struct ccdigest_info* di, const struct ccdigest_info* MgfDi, size_t digestSize, const uint8_t *digest, @@ -290,37 +221,38 @@ int ccrsa_verify_pss(ccrsa_pub_ctx_t key, for the output signature @result 0 iff successful. - + @discussion Null OID is a special case, required to support RFC 4346 where the padding is based on SHA1+MD5. In general it is not recommended to use a NULL OID, except when strictly required for interoperability */ -CC_NONNULL_TU((1)) CC_NONNULL((4, 5, 6)) +CC_NONNULL((1, 4, 5, 6)) int ccrsa_sign_pkcs1v15(ccrsa_full_ctx_t key, const uint8_t *oid, size_t digest_len, const uint8_t *digest, size_t *sig_len, uint8_t *sig); /*! 
- @function ccrsa_sign_pkcs1v15 - @abstract RSA signature with PKCS#1 v1.5 format per PKCS#1 v2.2 - - @param key Public key - @param oid OID describing the type of digest passed in - @param digest_len Byte length of the digest - @param digest Byte array of digest_len bytes containing the digest - @param sig_len Number of byte of the signature sig. - @param sig Pointer to the signature buffer of sig_len - @param valid Output boolean, true if the signature is valid. - - @result 0 iff successful. - - @discussion Null OID is a special case, required to support RFC 4346 where the padding - is based on SHA1+MD5. In general it is not recommended to use a NULL OID, - except when strictly required for interoperability - */ -CC_NONNULL_TU((1)) CC_NONNULL((4, 6, 7)) + @function ccrsa_verify_pkcs1v15 + @abstract RSA signature with PKCS#1 v1.5 format per PKCS#1 v2.2 + + @param key Public key + @param oid OID describing the type of digest passed in + @param digest_len Byte length of the digest + @param digest Byte array of digest_len bytes containing the digest + @param sig_len Number of byte of the signature sig. + @param sig Pointer to the signature buffer of sig_len + @param valid Output boolean, true if the signature is valid. + + @result 0 iff successful. + + @discussion Null OID is a special case, required to support RFC 4346 + where the padding is based on SHA1+MD5. In general it is not + recommended to use a NULL OID, except when strictly required for + interoperability. +*/ +CC_NONNULL((1, 4, 6, 7)) int ccrsa_verify_pkcs1v15(ccrsa_pub_ctx_t key, const uint8_t *oid, size_t digest_len, const uint8_t *digest, size_t sig_len, const uint8_t *sig, @@ -329,80 +261,80 @@ int ccrsa_verify_pkcs1v15(ccrsa_pub_ctx_t key, const uint8_t *oid, /*! @function ccder_encode_rsa_pub_size @abstract Calculate size of public key export format data package. - + @param key Public key - + @result Returns size required for encoding. 
*/ -CC_NONNULL_TU((1)) +CC_NONNULL((1)) size_t ccder_encode_rsa_pub_size(const ccrsa_pub_ctx_t key); /*! @function ccrsa_export_priv_pkcs1 @abstract Export a public key. - + @param key Public key @param der Beginning of output DER buffer @param der_end End of output DER buffer */ -CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3)) +CC_NONNULL((1, 2, 3)) uint8_t *ccder_encode_rsa_pub(const ccrsa_pub_ctx_t key, uint8_t *der, uint8_t *der_end); /*! @function ccder_encode_rsa_priv_size @abstract Calculate size of full key exported in PKCS#1 format. - + @param key Full key - + @result Returns size required for encoding. */ -CC_NONNULL_TU((1)) +CC_NONNULL((1)) size_t ccder_encode_rsa_priv_size(const ccrsa_full_ctx_t key); /*! @function ccder_encode_rsa_priv @abstract Export a full key in PKCS#1 format. - + @param key Full key @param der Beginning of output DER buffer @param der_end End of output DER buffer */ -CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3)) +CC_NONNULL((1, 2, 3)) uint8_t *ccder_encode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t *der, uint8_t *der_end); /*! @function ccder_decode_rsa_pub_n @abstract Calculate "n" for a public key imported from a data package. PKCS #1 format - + @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ -CC_NONNULL((1)) CC_NONNULL((2)) +CC_NONNULL((1, 2)) cc_size ccder_decode_rsa_pub_n(const uint8_t *der, const uint8_t *der_end); /*! @function ccder_decode_rsa_pub @abstract Import a public RSA key from a package in public key format. PKCS #1 format - + @param key Public key (n must be set) @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result Key is initialized using the data in the public key message. 
*/ -CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3)) +CC_NONNULL((1, 2, 3)) const uint8_t *ccder_decode_rsa_pub(const ccrsa_pub_ctx_t key, const uint8_t *der, const uint8_t *der_end); /*! @@ -416,7 +348,7 @@ const uint8_t *ccder_decode_rsa_pub(const ccrsa_pub_ctx_t key, const uint8_t *de to declare the key itself. */ -CC_NONNULL((1)) CC_NONNULL((2)) +CC_NONNULL((1, 2)) cc_size ccder_decode_rsa_pub_x509_n(const uint8_t *der, const uint8_t *der_end); /*! @@ -430,48 +362,48 @@ cc_size ccder_decode_rsa_pub_x509_n(const uint8_t *der, const uint8_t *der_end); @result Key is initialized using the data in the public key message. */ -CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3)) +CC_NONNULL((1, 2, 3)) const uint8_t *ccder_decode_rsa_pub_x509(const ccrsa_pub_ctx_t key, const uint8_t *der, const uint8_t *der_end); /*! @function ccder_decode_rsa_priv_n @abstract Calculate "n" for a private key imported from a data package. - + @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ -CC_NONNULL((1)) CC_NONNULL((2)) +CC_NONNULL((1, 2)) cc_size ccder_decode_rsa_priv_n(const uint8_t *der, const uint8_t *der_end); /*! @function ccder_decode_rsa_priv @abstract Import a private RSA key from a package in PKCS#1 format. - + @param key Full key (n must be set) @param der Beginning of input DER buffer @param der_end End of input DER buffer - + @result Key is initialized using the data in the public key message. */ -CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3)) +CC_NONNULL((1, 2, 3)) const uint8_t *ccder_decode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t *der, const uint8_t *der_end); /*! @function ccrsa_export_pub_size @abstract Calculate size of public key exported data package. - + @param key Public key - + @result Returns size required for encoding. 
*/ -CC_CONST CC_INLINE CC_NONNULL_TU((1)) +CC_CONST CC_INLINE CC_NONNULL((1)) size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) { return ccder_encode_rsa_pub_size(key); } @@ -479,21 +411,21 @@ size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) { /*! @function ccrsa_export_pub @abstract Export a public key in public key format. - + @param key Public key @param out_len Allocated size @param out Output buffer */ -CC_NONNULL_TU((1)) CC_NONNULL((3)) +CC_NONNULL((1, 3)) int ccrsa_export_pub(const ccrsa_pub_ctx_t key, size_t out_len, uint8_t *out); /*! @function ccrsa_import_pub_n @abstract Calculate "n" for a public key imported from a data package. - + @param inlen Length of public key package data @param der pointer to public key package data - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ @@ -510,27 +442,27 @@ cc_size ccrsa_import_pub_n(size_t inlen, const uint8_t *der) { /*! @function ccrsa_import_pub @abstract Import a public RSA key from a package in public key format. - + @param key Public key (n must be set) @param inlen Length of public key package data @param der pointer to public key package data - + @result Key is initialized using the data in the public key message. */ -CC_NONNULL_TU((1)) CC_NONNULL((3)) +CC_NONNULL((1, 3)) int ccrsa_import_pub(ccrsa_pub_ctx_t key, size_t inlen, const uint8_t *der); /*! @function ccrsa_export_priv_size @abstract Calculate size of full key exported in PKCS#1 format. - + @param key Full key - + @result Returns size required for encoding. */ -CC_CONST CC_INLINE CC_NONNULL_TU((1)) +CC_CONST CC_INLINE CC_NONNULL((1)) size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) { return ccder_encode_rsa_priv_size(key); } @@ -538,13 +470,13 @@ size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) { /*! @function ccrsa_export_priv @abstract Export a full key in PKCS#1 format. 
- + @param key Full key @param out_len Allocated size @param out Output buffer */ -CC_CONST CC_INLINE CC_NONNULL_TU((1)) CC_NONNULL((3)) +CC_CONST CC_INLINE CC_NONNULL((1, 3)) int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out) { return (ccder_encode_rsa_priv(key, out, out+out_len) != out); } @@ -552,10 +484,10 @@ int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out) /*! @function ccrsa_import_priv_n @abstract Calculate size of full key exported in PKCS#1 format. - + @param inlen Length of PKCS#1 package data @param der pointer to PKCS#1 package data - + @result the "n" of the RSA key that would result from the import. This can be used to declare the key itself. */ @@ -568,24 +500,24 @@ cc_size ccrsa_import_priv_n(size_t inlen, const uint8_t *der) { /*! @function ccrsa_import_priv @abstract Import a full RSA key from a package in PKCS#1 format. - + @param key Full key (n must be set) @param inlen Length of PKCS#1 package data @param der pointer to PKCS#1 package data - + @result Key is initialized using the data in the PKCS#1 message. 
*/ -CC_CONST CC_INLINE CC_NONNULL_TU((1)) CC_NONNULL((3)) +CC_CONST CC_INLINE CC_NONNULL((1, 3)) int ccrsa_import_priv(ccrsa_full_ctx_t key, size_t inlen, const uint8_t *der) { return (ccder_decode_rsa_priv(key, der, der+inlen) == NULL); } -CC_NONNULL_TU((1)) CC_NONNULL2 +CC_NONNULL((1, 2)) int ccrsa_get_pubkey_components(const ccrsa_pub_ctx_t pubkey, uint8_t *modulus, size_t *modulusLength, uint8_t *exponent, size_t *exponentLength); -CC_NONNULL_TU((1)) CC_NONNULL2 +CC_NONNULL((1, 2)) int ccrsa_get_fullkey_components(const ccrsa_full_ctx_t key, uint8_t *modulus, size_t *modulusLength, uint8_t *exponent, size_t *exponentLength, uint8_t *p, size_t *pLength, uint8_t *q, size_t *qLength); diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha1.h b/EXTERNAL_HEADERS/corecrypto/ccsha1.h index 3372324b9..3f343401e 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccsha1.h +++ b/EXTERNAL_HEADERS/corecrypto/ccsha1.h @@ -21,23 +21,11 @@ /* sha1 selector */ const struct ccdigest_info *ccsha1_di(void); -extern const uint32_t ccsha1_initial_state[5]; - -/* shared between several implementations */ -void ccsha1_final(const struct ccdigest_info *di, ccdigest_ctx_t, - unsigned char *digest); - - /* Implementations */ extern const struct ccdigest_info ccsha1_ltc_di; extern const struct ccdigest_info ccsha1_eay_di; #if CCSHA1_VNG_INTEL -//extern const struct ccdigest_info ccsha1_vng_intel_di; -#if defined(__x86_64__) -extern const struct ccdigest_info ccsha1_vng_intel_AVX2_di; -extern const struct ccdigest_info ccsha1_vng_intel_AVX1_di; -#endif extern const struct ccdigest_info ccsha1_vng_intel_SupplementalSSE3_di; #endif diff --git a/EXTERNAL_HEADERS/corecrypto/ccsha2.h b/EXTERNAL_HEADERS/corecrypto/ccsha2.h index 37a646ec6..995ef7e26 100644 --- a/EXTERNAL_HEADERS/corecrypto/ccsha2.h +++ b/EXTERNAL_HEADERS/corecrypto/ccsha2.h @@ -38,33 +38,14 @@ const struct ccdigest_info *ccsha512_di(void); #define CCSHA256_OUTPUT_SIZE 32 #define CCSHA256_STATE_SIZE 32 extern const struct ccdigest_info 
ccsha256_ltc_di; -extern const struct ccdigest_info ccsha256_v6m_di; #if CCSHA2_VNG_INTEL -#if defined __x86_64__ -extern const struct ccdigest_info ccsha224_vng_intel_AVX2_di; -extern const struct ccdigest_info ccsha224_vng_intel_AVX1_di; -extern const struct ccdigest_info ccsha256_vng_intel_AVX2_di; -extern const struct ccdigest_info ccsha256_vng_intel_AVX1_di; -extern const struct ccdigest_info ccsha384_vng_intel_AVX2_di; -extern const struct ccdigest_info ccsha384_vng_intel_AVX1_di; -extern const struct ccdigest_info ccsha384_vng_intel_SupplementalSSE3_di; -extern const struct ccdigest_info ccsha512_vng_intel_AVX2_di; -extern const struct ccdigest_info ccsha512_vng_intel_AVX1_di; -extern const struct ccdigest_info ccsha512_vng_intel_SupplementalSSE3_di; -#endif extern const struct ccdigest_info ccsha224_vng_intel_SupplementalSSE3_di; extern const struct ccdigest_info ccsha256_vng_intel_SupplementalSSE3_di; #endif #if CCSHA2_VNG_ARMV7NEON extern const struct ccdigest_info ccsha224_vng_armv7neon_di; extern const struct ccdigest_info ccsha256_vng_armv7neon_di; -extern const struct ccdigest_info ccsha384_vng_arm64_di; -extern const struct ccdigest_info ccsha384_vng_armv7neon_di; -extern const struct ccdigest_info ccsha512_vng_arm64_di; -extern const struct ccdigest_info ccsha512_vng_armv7neon_di; #endif -extern const uint32_t ccsha256_K[64]; -extern const uint64_t ccsha512_K[80]; /* SHA224 */ #define CCSHA224_OUTPUT_SIZE 28 diff --git a/EXTERNAL_HEADERS/corecrypto/cczp.h b/EXTERNAL_HEADERS/corecrypto/cczp.h index f06b96a9d..d392432dc 100644 --- a/EXTERNAL_HEADERS/corecrypto/cczp.h +++ b/EXTERNAL_HEADERS/corecrypto/cczp.h @@ -14,58 +14,44 @@ #include #include -/* - Don't use cczp_hd struct directly, except in static tables such as eliptic curve parameter definitions. - - Declare cczp objects using cczp_decl_n(). It allocates cc_unit arrays of the length returned by either cczp_nof_n() or cczp_short_nof_n(). 
+/* + Don't use cczp_hd struct directly, except in static tables such as eliptic curve parameter + definitions. + + Declare cczp objects using cczp_decl_n(). It allocates cc_unit arrays of the length returned by + either cczp_nof_n() or cczp_short_nof_n(). */ struct cczp; -#if CORECRYPTO_USE_TRANSPARENT_UNION - -typedef union { - cc_unit *u; - struct cczp *zp; - //cczp_const_t czp; //for automatic type cast - //struct cczp_prime *prime; -} cczp_t __attribute__((transparent_union)); - -typedef union { - const cc_unit *u; - const struct cczp *zp; - //const struct cczp_prime *prime; - cczp_t _nczp; -} cczp_const_t __attribute__((transparent_union)); - -#else - typedef struct cczp* cczp_t; - typedef const struct cczp* cczp_const_t; -#endif + +typedef struct cczp *cczp_t; +typedef const struct cczp *cczp_const_t; + typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s); // keep cczp_hd and cczp structures consistent // cczp_hd is typecasted to cczp to read EC curve params // options field is to specify Montgomery arithmetic, bit field, etc -// make sure n is the first element see ccrsa_ctx_n macro +// make sure n is the first element see ccrsa_ctx_n macro #define __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \ -cc_size pre ## n;\ -cc_unit pre ## options;\ -ccmod_func_t pre ## mod_prime; + cc_size pre##n; \ + cc_unit pre##options; \ + ccmod_func_t pre##mod_prime; -#define __CCZP_ELEMENTS_DEFINITIONS(pre) \ -__CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \ -cc_unit pre ## ccn[]; +#define __CCZP_ELEMENTS_DEFINITIONS(pre) \ + __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \ + cc_unit pre##ccn[]; -//cczp_hd must be defined separetly without variable length array ccn[], because it is used in sructures such as ccdh_gp_decl_n -struct cczp_hd{ +// cczp_hd must be defined separetly without variable length array ccn[], because it is used in +// sructures such as ccdh_gp_decl_n +struct cczp_hd { __CCZP_HEADER_ELEMENTS_DEFINITIONS() -} CC_ALIGNED(CCN_UNIT_SIZE); +} 
CC_ALIGNED(CCN_UNIT_SIZE); struct cczp { __CCZP_ELEMENTS_DEFINITIONS() } CC_ALIGNED(CCN_UNIT_SIZE); - /* Return the size of an cczp where each ccn is _size_ bytes. */ #define cczp_size(_size_) (sizeof(struct cczp) + ccn_sizeof_n(1) + 2 * (_size_)) @@ -79,95 +65,56 @@ struct cczp { with cczp_add, cczp_sub, cczp_div2, cczp_mod_inv. */ #define cczp_short_nof_n(_n_) (ccn_nof_size(sizeof(struct cczp)) + (_n_)) -#define cczp_decl_n(_n_, _name_) cc_ctx_decl(struct cczp, ccn_sizeof_n(cczp_nof_n(_n_)), _name_) -#define cczp_short_decl_n(_n_, _name_) cc_ctx_decl(struct cczp_short, ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_) - -#define cczp_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_nof_n(_n_)), _name_) -#define cczp_short_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_) - -#if CORECRYPTO_USE_TRANSPARENT_UNION - #define CCZP_N(ZP) (((cczp_t)(ZP)).zp->n) - #define CCZP_MOD(ZP) (((cczp_t)(ZP)).zp->mod_prime) - #define CCZP_PRIME(ZP) (((cczp_t)(ZP)).zp->ccn) - #define CCZP_RECIP(ZP) (((cczp_t)(ZP)).zp->ccn + cczp_n(ZP)) - #define CCZP_OPS(ZP) ((ZP).zp->options) - #define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP) - -CC_CONST CC_NONNULL_TU((1)) -static inline cc_size cczp_n(cczp_const_t zp) { - return zp.zp->n; -} - -CC_CONST CC_NONNULL_TU((1)) -static inline cc_unit cczp_options(cczp_const_t zp) { - return zp.zp->options; -} - -CC_CONST CC_NONNULL_TU((1)) -static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp) { - return zp.zp->mod_prime; -} - -CC_CONST CC_NONNULL_TU((1)) -static inline const cc_unit *cczp_prime(cczp_const_t zp) { - return zp.zp->ccn; -} - -/* Return a pointer to the Reciprocal or Montgomery constant of zp, which is - allocated cczp_n(zp) + 1 units long. 
*/ -CC_CONST CC_NONNULL_TU((1)) - -static inline const cc_unit *cczp_recip(cczp_const_t zp) { - return zp.zp->ccn + zp.zp->n; -} - -#else - #define CCZP_N(ZP) ((ZP)->n) - #define CCZP_MOD(ZP) ((ZP)->mod_prime) - #define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP) - #define CCZP_PRIME(ZP) ((ZP)->ccn) - #define CCZP_RECIP(ZP) ((ZP)->ccn + CCZP_N(ZP)) - #define CCZP_OPS(ZP) ((ZP)->options) -CC_CONST CC_NONNULL_TU((1)) -static inline cc_size cczp_n(cczp_const_t zp) { +#define cczp_decl_n(_n_, _name_) cc_ctx_decl(struct cczp, ccn_sizeof_n(cczp_nof_n(_n_)), _name_) +#define cczp_short_decl_n(_n_, _name_) \ + cc_ctx_decl(struct cczp_short, ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_) + +#define cczp_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_nof_n(_n_)), _name_) +#define cczp_short_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_) + +#define CCZP_N(ZP) ((ZP)->n) +#define CCZP_MOD(ZP) ((ZP)->mod_prime) +#define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP) +#define CCZP_PRIME(ZP) ((ZP)->ccn) +#define CCZP_RECIP(ZP) ((ZP)->ccn + CCZP_N(ZP)) +#define CCZP_OPS(ZP) ((ZP)->options) +CC_CONST CC_NONNULL((1)) static inline cc_size cczp_n(cczp_const_t zp) +{ return zp->n; } -CC_CONST CC_NONNULL_TU((1)) -static inline cc_unit cczp_options(cczp_const_t zp) { +CC_CONST CC_NONNULL((1)) static inline cc_unit cczp_options(cczp_const_t zp) +{ return zp->options; } -CC_CONST CC_NONNULL_TU((1)) -static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp) { +CC_CONST CC_NONNULL((1)) static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp) +{ return zp->mod_prime; } -CC_CONST CC_NONNULL_TU((1)) -static inline const cc_unit *cczp_prime(cczp_const_t zp) { +CC_CONST CC_NONNULL((1)) static inline const cc_unit *cczp_prime(cczp_const_t zp) +{ return zp->ccn; } /* Return a pointer to the Reciprocal or Montgomery constant of zp, which is allocated cczp_n(zp) + 1 units long. 
*/ -CC_CONST CC_NONNULL_TU((1)) +CC_CONST CC_NONNULL((1)) -static inline const cc_unit *cczp_recip(cczp_const_t zp) { + static inline const cc_unit *cczp_recip(cczp_const_t zp) +{ return zp->ccn + zp->n; } -#endif - - -CC_CONST CC_NONNULL_TU((1)) -CC_INLINE size_t cczp_bitlen(cczp_const_t zp) { +CC_CONST CC_NONNULL((1)) CC_INLINE size_t cczp_bitlen(cczp_const_t zp) +{ return ccn_bitlen(cczp_n(zp), cczp_prime(zp)); } - /* Ensure both cczp_mod_prime(zp) and cczp_recip(zp) are valid. cczp_n and cczp_prime must have been previously initialized. */ -CC_NONNULL_TU((1)) +CC_NONNULL((1)) int cczp_init(cczp_t zp); /* Compute r = s2n mod cczp_prime(zp). Will write cczp_n(zp) @@ -175,16 +122,14 @@ int cczp_init(cczp_t zp); identical they must not overlap. Before calling this function either cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) -void cczp_mod(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s2n); +CC_NONNULL((1, 2, 3)) void cczp_mod(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s2n); /* Compute r = sn mod cczp_prime(zp), Will write cczp_n(zp) units to r and reads sn units units from s. If r and s are not identical they must not overlap. Before calling this function either cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 4)) -int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s); +CC_NONNULL((1, 2, 4)) int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s); /* Compute r = x * y mod cczp_prime(zp). Will write cczp_n(zp) units to r and reads cczp_n(zp) units units from both x and y. 
If r and x are not @@ -192,44 +137,20 @@ int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s); calling this function either cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4)) +CC_NONNULL((1, 2, 3, 4)) void cczp_mul(cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y); -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4, 5)) -void cczp_mul_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y); - -/* Compute r = x * x mod cczp_prime(zp). Will write cczp_n(zp) units to r - and reads cczp_n(zp) units from x. If r and x are not identical they must - not overlap. Before calling this function either cczp_init(zp) must have - been called or both CCZP_MOD_PRIME((cc_unit *)zp) and - CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) -void cczp_sqr(cczp_const_t zp, cc_unit *r, const cc_unit *x); - -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4)) -void cczp_sqr_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *x); - -/* Compute r = x^(1/2) mod cczp_prime(zp). Will write cczp_n(zp) units to r - and reads cczp_n(zp) units from x. If r and x are not identical they must - not overlap. Before calling this function either cczp_init(zp) must have - been called or both CCZP_MOD_PRIME((cc_unit *)zp) and - CCZP_RECIP((cc_unit *)zp) must be initialized some other way. - Only support prime = 3 mod 4 */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) -int cczp_sqrt(cczp_const_t zp, cc_unit *r, const cc_unit *x); - /* Compute r = m ^ e mod cczp_prime(zp), using Montgomery ladder. - writes cczp_n(zp) units to r - reads cczp_n(zp) units units from m and e - - if r and m are not identical they must not overlap. + - if r and m are not identical they must not overlap. - r and e must not overlap nor be identical. 
- before calling this function either cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4)) -int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m, - const cc_unit *e); +CC_NONNULL((1, 2, 3, 4)) +int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e); /* Compute r = m ^ e mod cczp_prime(zp), using Square Square Multiply Always. - writes cczp_n(zp) units to r @@ -238,101 +159,55 @@ int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m, - r and e must not overlap nor be identical. - before calling this function either cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must - be initialized some other way. - + be initialized some other way. + Important: This function is intented to be constant time but is more likely to leak information due to memory cache. Only used with randomized input */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4)) -int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m, - const cc_unit *e); - -int cczp_power_ssma_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s, const cc_unit *e); - -/* Compute r = m ^ e mod cczp_prime(zp). Will write cczp_n(zp) units to r and - reads cczp_n(zp) units units from m. Reads ebitlen bits from e. - m must be <= to cczp_prime(zp). If r and m are not identical they must not - overlap. r and e must not overlap nor be identical. - Before calling this function either cczp_init(zp) must have been called - or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must - be initialized some other way. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 5)) -int cczp_powern(cczp_const_t zp, cc_unit *r, const cc_unit *s, - size_t ebitlen, const cc_unit *e); - -/* Compute r = x + y mod cczp_prime(zp). Will write cczp_n(zp) units to r and - reads cczp_n(zp) units units from x and y. 
If r and x are not identical - they must not overlap. Only cczp_n(zp) and cczp_prime(zp) need to be valid. - Can be used with cczp_short_nof_n sized cc_unit array zp. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4)) -void cczp_add(cczp_const_t zp, cc_unit *r, const cc_unit *x, - const cc_unit *y); - -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4, 5)) -void cczp_add_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *x, - const cc_unit *y); - -/* Compute r = x - y mod cczp_prime(zp). Will write cczp_n(zp) units to r and - reads cczp_n(zp) units units from x and y. If r and x are not identical - they must not overlap. Only cczp_n(zp) and cczp_prime(zp) need to be valid. - Can be used with cczp_short_nof_n sized cc_unit array zp. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4)) -void cczp_sub(cczp_const_t zp, cc_unit *r, const cc_unit *x, const cc_unit *y); - -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4, 5)) -void cczp_sub_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *x, - const cc_unit *y); - -/* Compute r = x / 2 mod cczp_prime(zp). Will write cczp_n(zp) units to r and - reads cczp_n(zp) units units from x. If r and x are not identical - they must not overlap. Only cczp_n(zp) and cczp_prime(zp) need to be valid. - Can be used with cczp_short_nof_n sized cc_unit array zp. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) -void cczp_div2(cczp_const_t zp, cc_unit *r, const cc_unit *x); - -/* Compute q = a_2n / cczp_prime(zd) (mod cczp_prime(zd)) . Will write cczp_n(zd) - units to q and r. Will read 2 * cczp_n(zd) units units from a. If r and a - are not identical they must not overlap. Before calling this function - either cczp_init(zp) must have been called or both - CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be - initialized some other way. 
*/ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4)) -void cczp_div(cczp_const_t zd, cc_unit *q, cc_unit *r, const cc_unit *a_2n); - +CC_NONNULL((1, 2, 3, 4)) +int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e); /*! @brief cczp_inv(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp). - @discussion It is a general function and works for any p. It validates the inputs. r and x can overlap. It writes n =cczp_n(zp) units to r, and read n units units from x and p. The output r is overwriten only if the inverse is correctly computed. This function is not constant time in absolute sense, but it does not have data dependent 'if' statements in the code. - @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to be called before invoking cczp_inv(). + @discussion It is a general function and works for any p. It validates the inputs. r and x can + overlap. It writes n =cczp_n(zp) units to r, and read n units units from x and p. The output r is + overwriten only if the inverse is correctly computed. This function is not constant time in + absolute sense, but it does not have data dependent 'if' statements in the code. + @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to + be called before invoking cczp_inv(). @param x input big integer @param r output big integer @return 0 if inverse exists and correctly computed. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) - +CC_NONNULL((1, 2, 3)) int cczp_inv(cczp_const_t zp, cc_unit *r, const cc_unit *x); /*! @brief cczp_inv_odd(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is an odd number. @discussion r and x can overlap. - @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to be called before invoking. + @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to + be called before invoking. 
@param x input big integer @param r output big integer @return 0 if successful */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) -int cczp_inv_odd(cczp_const_t zp, cc_unit *r, const cc_unit *x); +CC_NONNULL((1, 2, 3)) int cczp_inv_odd(cczp_const_t zp, cc_unit *r, const cc_unit *x); /*! - @brief cczp_inv_field(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is a prime number number. - @discussion r and x must NOT overlap. The excution time of the function is independent to the value of the input x. It works only if p is a field. That is, when p is a prime. It supports Montgomery and non-Montgomery form of zp. It leaks the value of the prime and should only be used be used for public (not secret) primes (ex. Elliptic Curves) - - @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to be called before invoking cczp_inv_field(). + @brief cczp_inv_field(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is a prime + number number. + @discussion r and x must NOT overlap. The excution time of the function is independent to the value + of the input x. It works only if p is a field. That is, when p is a prime. It supports Montgomery + and non-Montgomery form of zp. It leaks the value of the prime and should only be used be used for + public (not secret) primes (ex. Elliptic Curves) + + @param zp The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to + be called before invoking cczp_inv_field(). @param x input big unteger @param r output big integer @return 0 if inverse exists and correctly computed. */ -CC_NONNULL_TU((1)) CC_NONNULL((2, 3)) +CC_NONNULL((1, 2, 3)) int cczp_inv_field(cczp_const_t zp, cc_unit *r, const cc_unit *x); #endif /* _CORECRYPTO_CCZP_H_ */ diff --git a/EXTERNAL_HEADERS/img4/api.h b/EXTERNAL_HEADERS/img4/api.h new file mode 100644 index 000000000..56b875bc2 --- /dev/null +++ b/EXTERNAL_HEADERS/img4/api.h @@ -0,0 +1,56 @@ +/*! + * @header + * API definitions. 
+ */ +#ifndef __IMG4_API_H +#define __IMG4_API_H + +#ifndef __IMG4_INDIRECT +#error "Please #include instead of this file directly" +#endif // __IMG4_INDIRECT + +#ifndef KERNEL +#include +#endif + +#if !XNU_KERNEL_PRIVATE +#include +#endif + +/*! + * @const IMG4_API_VERSION + * The API version of the library. This version will be changed in accordance + * with new API introductions so that callers may submit code to the build that + * adopts those new APIs before the APIs land by using the following pattern: + * + * #if IMG4_API_VERSION >= 20180424 + * img4_new_api(); + * #endif + * + * In this example, the library maintainer and API adopter agree on an API + * version of 20180424 ahead of time for the introduction of + * img4_new_api(). When a libdarwin with that API version is submitted, the + * project is rebuilt, and the new API becomes active. + * + * Breaking API changes will be both covered under this mechanism as well as + * individual preprocessor macros in this header that declare new behavior as + * required. + */ +#define IMG4_API_VERSION (20180112u) + +#if !defined(KERNEL) && !IMG4_PROJECT_BUILD +#define IMG4_API_AVAILABLE_20180112 \ + __API_UNAVAILABLE(macos) \ + API_AVAILABLE(ios(12.0), tvos(12.0), watchos(5.0)) +#else +#define IMG4_API_AVAILABLE_20180112 +#endif + +/*! + * @typedef img4_struct_version_t + * A type describing the version of a structure in the library. + */ +IMG4_API_AVAILABLE_20180112 +typedef uint16_t img4_struct_version_t; + +#endif // __IMG4_API_H diff --git a/EXTERNAL_HEADERS/img4/environment.h b/EXTERNAL_HEADERS/img4/environment.h new file mode 100644 index 000000000..d5c4f4902 --- /dev/null +++ b/EXTERNAL_HEADERS/img4/environment.h @@ -0,0 +1,314 @@ +/*! + * @header + * Image4 environment interfaces. + */ +#ifndef __IMG4_ENVIRONMENT_H +#define __IMG4_ENVIRONMENT_H + +#ifndef __IMG4_INDIRECT +#error "Please #include instead of this file directly" +#endif // __IMG4_INDIRECT + +/*! 
+ * @const IMG4_ENVIRONMENT_VERSION + * The version of the {@link img4_environment_t} structure supported by the + * implementation. See {@link _img4_environment} for complete definition. + */ +#define IMG4_ENVIRONMENT_VERSION ((img4_struct_version_t)0) + +/*! + * @typedef img4_crypto_selector_t + * A CoreCrypto selector routine. + */ +IMG4_API_AVAILABLE_20180112 +typedef const struct ccdigest_info *(*img4_crypto_selector_t)(void); + +/*! + * @typedef img4_crypto_t + * A structure describing a crypto algorithm used by Image4. + * + * @property i4c_name + * The human-readable string for the crypto algorithm (e.g. "sha1"). + * + * @property i4c_select + * The CoreCrypto selector routine for the algorithm + * + * @property i4c_hash_len + * The length of the hash computed by the algorithm. + * + * @property i4c_truncated_hash_len + * The truncated length of the hash computed by the algorithm. + * + * @property __opaque + * Reserved for the implementation. + */ +IMG4_API_AVAILABLE_20180112 +typedef struct _img4_crypto { + const char *i4c_name; + img4_crypto_selector_t i4c_select; + uint32_t i4c_hash_len; + uint32_t i4c_truncated_hash_len; + const void *__opaque; +} img4_crypto_t; + +/*! + * @const IMG4_CRYPTO_SHA1 + * The Image4 SHA1 implementation. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT +const img4_crypto_t _img4_crypto_sha1; +#define IMG4_CRYPTO_SHA1 (&_img4_crypto_sha1) + +/*! + * @const IMG4_CRYPTO_SHA384 + * The Image4 SHA-384 implementation. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT +const img4_crypto_t _img4_crypto_sha384; +#define IMG4_CRYPTO_SHA384 (&_img4_crypto_sha384) + +/*! + * @typedef img4_environment_t + * A type describing an Image4 environment. + */ +IMG4_API_AVAILABLE_20180112 +typedef struct _img4_environment img4_environment_t; + +/*! + * @typedef img4_environment_get_crypto_t + * A function which obtains a crypto descriptor for the host environment. + * + * @param i4e + * The environment descriptor. 
+ * + * @param crypto + * A pointer to the storage in which the pointer to the host's crypto descriptor + * will be written. + * + * @param ctx + * The context pointer supplied to {@link img4_init}. + * + * @result + * Upon successfully fetching the property value, zero should be returned. + * Otherwise, the following error codes should be returned: + * + * [ENOENT] The property does not exist in the environment + */ +IMG4_API_AVAILABLE_20180112 +typedef errno_t (*img4_environment_get_crypto_t)( + const img4_environment_t *i4e, + const img4_crypto_t **crypto, + const void *ctx); + +/*! + * @typedef img4_environment_get_bool_t + * A function which obtains a Boolean property from the host environment. + * + * @param val + * A pointer to storage in which the value will be written. + * + * @param ctx + * The context pointer supplied to {@link img4_init}. + * + * @result + * Upon successfully fetching the property value, zero should be returned. + * Otherwise, the following error codes should be returned: + * + * [ENOENT] The property does not exist in the environment + * [EFTYPE] The property is not expressible as a Boolean + */ +IMG4_API_AVAILABLE_20180112 +typedef errno_t (*img4_environment_get_bool_t)( + const img4_environment_t *i4e, + bool *val, + const void *ctx); + +/*! + * @typedef img4_environment_get_uint32_t + * A function which obtains an unsigned 32-bit integer property from the host + * environment. + * + * @param val + * A pointer to storage in which the value will be written. + * + * @param ctx + * The context pointer supplied to {@link img4_init}. + * + * @result + * Upon successfully fetching the property value, zero should be returned. 
+ * Otherwise, the following error codes should be returned: + * + * [ENOENT] The property does not exist in the environment + * [EFTYPE] The property is not expressible as an unsigned 32-bit integer + */ +IMG4_API_AVAILABLE_20180112 +typedef errno_t (*img4_environment_get_uint32_t)( + const img4_environment_t *i4e, + uint32_t *val, + const void *ctx); + +/*! + * @typedef img4_environment_get_uint64_t + * A function which obtains an unsigned 64-bit integer property from the host + * environment. + * + * @param val + * A pointer to storage in which the value will be written. + * + * @param ctx + * The context pointer supplied to {@link img4_init}. + * + * @result + * Upon successfully fetching the property value, zero should be returned. + * Otherwise, the following error codes should be returned: + * + * [ENOENT] The property does not exist in the environment + * [EFTYPE] The property is not expressible as an unsigned 64-bit + * integer + */ +IMG4_API_AVAILABLE_20180112 +typedef errno_t (*img4_environment_get_uint64_t)( + const img4_environment_t *i4e, + uint64_t *val, + const void *ctx); + +/*! + * @typedef img4_environment_get_data_t + * A function which obtains a property which is a raw sequence of bytes from the + * host environment. + * + * @param bytes + * A pointer to storage in which the value will be written. + * + * @param len + * A pointer to the length of the buffer referred to be {@link val}. Upon + * successful return, this storage should contain the number of bytes written. + * + * @param ctx + * The context pointer supplied to {@link img4_init}. + * + * @result + * Upon successfully fetching the property value, zero should be returned. 
+ * Otherwise, the following error codes should be returned: + * + * [ENOENT] The property does not exist in the environment + * [EFTYPE] The property is not expressible as a raw sequence of bytes + * [ERANGE] The buffer was not large enough to hold the property + */ +IMG4_API_AVAILABLE_20180112 +typedef errno_t (*img4_environment_get_data_t)( + const img4_environment_t *i4e, + uint8_t *bytes, + uint32_t *len, + const void *ctx); + +/*! + * @struct _img4_environment + * A type describing a host environment. + * + * @property i4e_version + * The version of the environment structure. Pass + * {@link IMG4_ENVIRONMENT_VERSION}. + * + * @property i4e_name + * A human-readable description of the environment. + * + * @property i4e_crypto + * A pointer to a function which returns the crypto implementation for the + * environment. + * + * @property i4e_cert_epoch + * A pointer to a function which returns the certificate epoch for the + * environment. + * + * @property i4e_board_id + * A pointer to a function which returns the board identifier for the + * environment. + * + * @property i4e_chip_id + * A pointer to a function which returns the chip design identifier for the + * environment. + * + * @property i4e_ecid + * A pointer to a function which returns the unique chip identifier for the + * environment. + * + * @property i4e_security_domain + * A pointer to a function which returns the security domain for the + * environment. + * + * @property i4e_cert_prod + * A pointer to a function which returns the certificate production status for + * the environment. This indicates whether the environment's leaf certificate + * must be production or development. + * + * - true the environment's leaf certificate must be production + * - false the environment's leaf certificate may be development + * + * @property i4e_cert_security + * A pointer to a function which returns the certificate security mode for the + * environment. 
This indicates Whether the leaf certificate must be secure. + * + * @property i4e_ap_nonce_hash + * A pointer to a function which returns the hash of the AP nonce for the + * environment. + * + * @property i4e_prevent_mixnmatch + * A pointer to a function which returns whether the environment prevents mix- + * n-match. + * + * - true the environment disallows mix-n-match + * - false the environment allows mix-n-match + * + * @property i4e_boot_manifest_hash + * A pointer to a function which returns the hash of the manifest from which + * mix-n-match policy derives. + * + * @property i4e_eff_security + * A pointer to a function which returns the effective security mode for the + * environment. + * + * @property i4e_eff_prod + * A pointer to a function which returns the effective production status for the + * environment. + * + * @property i4e_ap_nonce_trust + * A pointer to a function which returns whether the AP nonce must be + * exclusively fetched from main memory. + * + * - true the AP nonce hash must be fetched from main memory exclusively; + * persistent storage is not trustworthy + * - false the AP nonce hash may be fetched from persistent storage + */ +struct _img4_environment { + img4_struct_version_t i4e_version; + const char *i4e_name; + img4_environment_get_crypto_t i4e_crypto; + img4_environment_get_uint32_t i4e_cert_epoch; + img4_environment_get_uint32_t i4e_board_id; + img4_environment_get_uint32_t i4e_chip_id; + img4_environment_get_uint64_t i4e_ecid; + img4_environment_get_uint32_t i4e_security_domain; + img4_environment_get_bool_t i4e_cert_prod; + img4_environment_get_bool_t i4e_cert_security; + img4_environment_get_data_t i4e_ap_nonce_hash; + img4_environment_get_bool_t i4e_prevent_mixnmatch; + img4_environment_get_data_t i4e_boot_manifest_hash; + img4_environment_get_bool_t i4e_eff_prod; + img4_environment_get_bool_t i4e_eff_security; + img4_environment_get_bool_t i4e_ap_nonce_trust; +} IMG4_API_AVAILABLE_20180112; + +/*! 
+ * @const IMG4_ENVIRONMENT_PLATFORM + * The environment for the host that uses the default platform implementation to + * resolve the environment. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT +const struct _img4_environment _img4_environment_platform; +#define IMG4_ENVIRONMENT_PLATFORM (&_img4_environment_platform) + +#endif // __IMG4_ENVIRONMENT_H diff --git a/EXTERNAL_HEADERS/img4/img4.h b/EXTERNAL_HEADERS/img4/img4.h new file mode 100644 index 000000000..13b053fc9 --- /dev/null +++ b/EXTERNAL_HEADERS/img4/img4.h @@ -0,0 +1,543 @@ +/*! + * @header + * Image4 interfaces. These interfaces encapsulate the basic concepts required + * for authenticating and validating Image4 manifests as being authoritative. + * These concepts are: + * + * Environment + * An environment is a description of a host comprised of hardware identifiers + * and policy configurations. For example, the environment of an iPhone may + * include the following hardware identifiers (among others): + * + * ChipID + * A number identifying the chip design. + * + * BoardID + * A number identifying the board. + * + * UniqueChipID / ECID + * A number uniquely identifying a specific instance of a chip. + * + * The environment also includes policy information derived by previous stages + * of secure boot. Examples of such policy are: + * + * Mix-n-Match Prevention + * Whether firmware payloads from multiple, valid secure boot manifests + * should be prevented from being executed on the host environment. The + * default is true. + * + * Manifest + * An Image4 manifest is a set of constraints that describe a host environment. + * For example, a manifest may have been signed such that it is only valid for a + * single host environment. In this case, the manifest may include specific + * values for ChipID, BoardID, UniqueChipID, etc. Such a manifest is said to be + * personalized for that environment. 
+ * + * If an environment meets the constraints in a manifest, that manifest is said + * to be authoritative over the environment. + * + * The manifest also includes one or more objects which may be executed in the + * environment. + * + * Object + * An object is a description of a payload. An object can describe any payload, + * not just the payload that is in the Image4. An object describes a payload by + * means of its digest. Examples of objects present in a secure boot manifest + * are the kernelcache and the static trust cache. + * + * If an authoritative manifest accurately describes an object, then that object + * may be executed in the host environment. The mechanics of execution typically + * involve mapping its payload into a privileged memory region. For example, + * when the kernelcache is executed, its payload bytes are mapped into the range + * of memory associated with supervisor mode. + * + * Payload + * A payload is the raw sequence of bytes that is described by an object. When + * described via an Image4 object, payloads are first wrapped in Image4 encoding + * to associate a tag with them. The resulting series of bytes is what is + * contained in a .im4p file. + * + * An Image4 file may only contain a single payload (even though a manifest may + * describe multiple payloads through multiple objects). + * + * Tag + * A tag is a FourCC which can identify any of the following: + * + * - an object property (e.g. the 'DGST' property) + * - a manifest property (e.g. the 'BORD' property) + * - a certificate property + * - a type of object (e.g. 'krnl') + * + * Tags comprised of all-caps are reserved for the Image4 specification. + */ + + +#ifndef __IMG4_H +#define __IMG4_H + +#include +#include +#include +#include + +#define __IMG4_INDIRECT 1 + +/* + * This header is used in the pmap layer in xnu, which is in osfmk, which does + * not have access to most of the BSD headers. (But for some reason it does have + * access to sys/cdefs.h.) 
The only thing we need from that header is the + * errno_t typedef though, so if we can't get to it, then just typeded it + * ourselves. + */ +#if MACH_KERNEL_PRIVATE +typedef int errno_t; +#else +#include +#endif + +#if !IMG4_PROJECT_BUILD +#include +#endif + +__BEGIN_DECLS; + +/*! + * @typedef img4_tag_t + * A type describing an Image4 tag. + */ +IMG4_API_AVAILABLE_20180112 +typedef uint32_t img4_tag_t; + +/*! + * @typedef img4_section_t + * A type describing the sections of an Image4 object. + * + * @const IMG4_SECTION_MANIFEST + * The manifest section. + * + * @const IMG4_SECTION_OBJECT + * The object section. + * + * @const IMG4_SECTION_RESTOREINFO + * The restore info section. + */ +OS_ENUM(img4_section, uint8_t, + IMG4_SECTION_MANIFEST, + IMG4_SECTION_OBJECT, + IMG4_SECTION_RESTOREINFO, +) IMG4_API_AVAILABLE_20180112; + +/*! + * @typedef img4_custom_tag_handler_t + * A handler for a tag unrecognized by the implementation. + * + * @param tag + * The FourCC tag. + * + * @param ctx + * The user-provided context pointer given to either + * {@link img4_get_trusted_payload} or + * {@link img4_get_trusted_external_payload}. + */ +IMG4_API_AVAILABLE_20180112 +typedef errno_t (*img4_custom_tag_handler_t)( + img4_tag_t tag, + img4_section_t section, + void *ctx); + +/*! + * @typedef img4_custom_tag_t + * A type describing a custom tag and its handler. + * + * @property i4ct_tag + * The FourCC tag. + * + * @property i4ct_section + * The section in which the tag is expected. If {@link IMG4_SECTION_OBJECT} is + * given, the object corresponding to the tag given to + * {@link img4_get_trusted_payload} or {@link img4_get_trusted_external_payload} + * will be consulted for the tag. + * + * @property i4ct_handler + * The handler for the tag. + */ +IMG4_API_AVAILABLE_20180112 +typedef struct _img4_custom_tag { + img4_tag_t i4ct_tag; + img4_section_t i4ct_section; + img4_custom_tag_handler_t i4ct_handler; +} img4_custom_tag_t; + +/*! 
+ * @typedef img4_destructor_t + * A type describing a destructor routine for an Image4 object. + * + * @param ptr + * A pointer to the buffer to dispose of. + * + * @param len + * The length of the buffer. + */ +IMG4_API_AVAILABLE_20180112 +typedef void (*img4_destructor_t)( + void *ptr, + size_t len); + +/*! + * @typedef img4_flags_t + * A flagset modifying the behavior of an {@link img4_t}. + * + * @const I4F_INIT + * No flags set. This value is suitable for initialization purposes. + * + * @const I4F_TRUST_MANIFEST + * Causes the implementation to bypass trust evaluation for the manifest, i.e. + * it will not verify that a manifest has been signed by Apple before trusting + * it. + * + * This option is for testing purposes only and is not respected on the RELEASE + * variant of the implementation. + * + * @const I4F_FORCE_MIXNMATCH + * Causes the implementation to bypass mix-n-match policy evaluation and always + * allow mix-n-match, irrespective of the previous boot stage's conclusion or + * manifest policy. + * + * This option is for testing purposes only and is not respected on the RELEASE + * variant of the implementation. 
+ */ +OS_ENUM(img4_flags, uint64_t, + I4F_INIT = 0, + I4F_TRUST_MANIFEST = (1 << 0), + I4F_FORCE_MIXNMATCH = (1 << 1), +) IMG4_API_AVAILABLE_20180112; + +#if TARGET_OS_OSX || defined(PLATFORM_MacOSX) +typedef char _img4_opaque_data_64[656]; +typedef char _img4_opaque_data_32[476]; +#elif TARGET_OS_IOS || defined(PLATFORM_iPhoneOS) +typedef char _img4_opaque_data_64[656]; +typedef char _img4_opaque_data_32[476]; +#elif TARGET_OS_WATCH || defined(PLATFORM_WatchOS) +typedef char _img4_opaque_data_64[656]; +typedef char _img4_opaque_data_32[488]; +#elif TARGET_OS_TV || defined(PLATFORM_tvOS) || defined(PLATFORM_AppleTVOS) +typedef char _img4_opaque_data_64[656]; +typedef char _img4_opaque_data_32[476]; +#elif TARGET_OS_BRIDGE || defined(PLATFORM_BridgeOS) +typedef char _img4_opaque_data_64[656]; +typedef char _img4_opaque_data_32[476]; +#else +#error "Unsupported platform" +#endif + +/*! + * @typedef img4_t + * An opaque structure representing Image4 data. The Image4 data must contain a + * manifest and may optionally contain a payload. Neither this type nor the APIs + * APIs which manipulate it are thread-safe. 
+ */ +IMG4_API_AVAILABLE_20180112 +typedef struct _img4 { +#if __ILP64__ || __LP64__ + _img4_opaque_data_64 __opaque; +#else + _img4_opaque_data_32 __opaque; +#endif +} img4_t; + +#if TARGET_OS_OSX || defined(PLATFORM_MacOSX) +typedef char _img4_payload_opaque_data_64[488]; +typedef char _img4_payload_opaque_data_32[316]; +#elif TARGET_OS_IOS || defined(PLATFORM_iPhoneOS) +typedef char _img4_payload_opaque_data_64[488]; +typedef char _img4_payload_opaque_data_32[316]; +#elif TARGET_OS_WATCH || defined(PLATFORM_WatchOS) +typedef char _img4_payload_opaque_data_64[488]; +typedef char _img4_payload_opaque_data_32[316]; +#elif TARGET_OS_TV || defined(PLATFORM_tvOS) || defined(PLATFORM_AppleTVOS) +typedef char _img4_payload_opaque_data_64[488]; +typedef char _img4_payload_opaque_data_32[316]; +#elif TARGET_OS_BRIDGE || defined(PLATFORM_BridgeOS) +typedef char _img4_payload_opaque_data_64[488]; +typedef char _img4_payload_opaque_data_32[316]; +#else +#error "Unsupported platform" +#endif + +/*! + * @typedef img4_payload_t + * An opaque structure describing Image4 payload data. Neither this type nor the + * APIs which manipulate it are thread-safe. + */ +IMG4_API_AVAILABLE_20180112 +typedef struct _img4_payload { +#if __ILP64__ || __LP64__ + _img4_payload_opaque_data_64 __opaque; +#else + _img4_payload_opaque_data_32 __opaque; +#endif +} img4_payload_t; + +#if !IMG4_PROJECT_BUILD +#include +#include +#endif + +/*! + * @function img4_init + * Initializes an Image4. + * + * @param i4 + * A pointer to the storage to initialize. + * + * @param flags + * Flags to modify initialization. + * + * @param bytes + * The Image4 data from which to initialize. If a destructor is provided, + * control of this buffer transfers to the Image4. + * + * @param len + * The length of the Image4 data. + * + * @param destructor + * A destructor for the Image4 data. May be NULL if the buffer does not require + * explicit deallocation (e.g. because the buffer is stack data). 
+ * + * @result + * Upon success, zero is returned. The implementation may also return one of the + * following error codes directly: + * + * [EILSEQ] The data is not valid Image4 data + * [EFTYPE] The data does not contain an Image4 manifest + * + * @discussion + * The bytes given to this routine must represent an Image4 manifest. They may + * optionally also represent an Image4 payload. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 +errno_t +img4_init(img4_t *i4, img4_flags_t flags, const uint8_t *bytes, size_t len, + img4_destructor_t destructor); + +/*! + * @function img4_set_custom_tag_handler + * Sets custom tag handlers for an Image4. These handlers are invoked during + * trust evaluation of the Image4. + * + * @param i4 + * The Image4 to modify. + * + * @param tags + * An array of custom tag structures which specify the custom tags expected. + * This must be constant storage. Passing heap or stack storage will result in + * undefined behavior. + * + * @param tags_cnt + * The number of items in the {@link tags} array. + * + * @discussion + * Invocations of custom tag handlers occur during trust evaluation. You should + * not assume that the Image4 is trusted within the scope of a custom tag + * handler. Trustworthiness can only be determined by consulting the return + * value of {@link img4_get_trusted_payload} or + * {@link img4_get_trusted_external_payload}. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_NONNULL1 OS_NONNULL2 +void +img4_set_custom_tag_handler(img4_t *i4, + const img4_custom_tag_t *tags, size_t tags_cnt); + +/*! + * @function img4_get_trusted_payload + * Obtains the trusted payload bytes from the Image4. + * + * @param i4 + * The Image4 to query. + * + * @param tag + * The tag for the payload to obtain. + * + * @param env + * The environment against which to validate the Image4. 
+ * + * @param ctx + * The context pointer to pass to the routines defined in the environment (if + * a custom environment was passed) and to any custom tag handlers. + * + * @param bytes + * A pointer to the storage where the pointer to the payload buffer will be + * written on success. + * + * @param len + * A pointer to the storage where the length of the payload buffer will be + * written on success. + * + * @result + * Upon success, zero is returned. The implementation may also return one of the + * following error codes directly: + * + * [ENOENT] The Image4 does not contain a payload for the specified tag + * [EAUTH] The Image4 manifest was not authentic + * [EACCES] The environment given does not satisfy the manifest + * constraints + * [EACCES] The environment and manifest do not agree on a digest + * algorithm + * [EILSEQ] The payload for the given tag does not match its description + * in the manifest + * [EIO] The payload could not be fetched + * + * Additionally, errors from the routines specified in the + * {@link img4_environment_t} may be returned. + * + * @discussion + * This routine will perform the following validation: + * + * 1. Validate that the Image4 manifest is authentic (i.e. was signed by + * Apple) + * 2. Validate that the given environment satisfies the constraints in the + * manifest + * 3. Validate that the measurement of the payload for the given tag matches + * the measurement in the manifest + * + * If any one of these validation checks fails, the payload is considered + * untrustworthy and is not returned. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 OS_NONNULL5 OS_NONNULL6 +errno_t +img4_get_trusted_payload(img4_t *i4, img4_tag_t tag, + const img4_environment_t *env, void *ctx, + const uint8_t **bytes, size_t *len); + +/*! 
+ * @function img4_get_trusted_external_payload + * Obtains the trusted payload bytes from the external Image4 payload after + * validating them against the object description in the Image4's manifest. + * + * @param i4 + * The Image4 to query. + * + * @param payload + * The payload to validate. + * + * @param env + * The environment against which to validate the Image4. + * + * @param ctx + * The context pointer to pass to the routines defined in the environment and to + * any custom tag handlers. + * + * @param bytes + * A pointer to the storage where the pointer to the payload buffer will be + * written on success. + * + * @param len + * A pointer to the storage where the length of the payload buffer will be + * written on success. + * + * @result + * Upon success, zero is returned. The implementation may also return one of the + * following error codes directly: + * + * [ENOENT] The Image4 does not contain an object describing the given + * payload + * [EAUTH] The Image4 manifest was not authentic + * [EACCES] The environment given does not satisfy the manifest + * constraints + * [EACCES] The environment and manifest do not agree on a digest + * algorithm + * [EILSEQ] The payload for the given tag does not match its description + * in the manifest + * [EIO] The payload could not be fetched + * + * Otherwise, an error from the underlying Image4 implementation will be + * returned. + * + * @discussion + * This routine performs the same validation steps as + * {@link img4_get_trusted_payload}. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2 +errno_t +img4_get_trusted_external_payload(img4_t *i4, img4_payload_t *payload, + const img4_environment_t *env, void *ctx, + const uint8_t **bytes, size_t *len); + +/*! + * @function img4_get_entitlement_bool + * Queries the Image4 manifest for a Boolean entitlement value. + * + * @param i4 + * The Image4 to query. + * + * @param entitlement + * The tag for the entitlement to query. 
+ * + * @result + * The Boolean value of the entitlement. If the entitlement was not present, + * false is returned. If the entitlement was present but did not have a Boolean + * value, false is returned. + * + * @discussion + * This routine does not trigger validation of the Image4. Therefore the + * result of this routine cannot be used to confer trust without also having + * obtained a valid payload. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 +bool +img4_get_entitlement_bool(img4_t *i4, img4_tag_t entitlement); + +/*! + * @function img4_get_object_entitlement_bool + * Queries the specified object in the Image4 manifest for a Boolean entitlement + * value. + * + * @param i4 + * The Image4 to query. + * + * @param object + * The tag for the object to query. + * + * @param entitlement + * The tag for the entitlement to query. + * + * @result + * The Boolean value of the entitlement. If the entitlement was not present, + * false is returned. If the entitlement was present but did not have a Boolean + * value, false is returned. If the object specified was not present, false is + * returned. + * + * @discussion + * See discussion for {@link img4_get_entitlement_bool}. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 +bool +img4_get_object_entitlement_bool(img4_t *i4, img4_tag_t object, + img4_tag_t entitlement); + +/*! + * @function img4_destroy + * Destroys an Image4 and disposes of associated resources. + * + * @param i4 + * The Image4 to destroy. + * + * @discussion + * The destructor passed to {@link img4_init} is called as a result of this + * routine, if any was set. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_NONNULL1 +void +img4_destroy(img4_t *i4); + +__END_DECLS; + +#endif // __IMG4_H diff --git a/EXTERNAL_HEADERS/img4/payload.h b/EXTERNAL_HEADERS/img4/payload.h new file mode 100644 index 000000000..5a3ba810d --- /dev/null +++ b/EXTERNAL_HEADERS/img4/payload.h @@ -0,0 +1,70 @@ +/*! 
+ * @header + * Image4 payload interfaces. These interfaces provide a lightweight type for + * working with an Image4 payload that is described by a separate manifest (e.g. + * a .im4p file whose contents are described by an object in a manifest from a + * .im4m file). + * + * No direct access is provided to the raw payload bytes encapsulated by the + * Image4 payload by design. The intent is that in order to access the raw + * bytes, the payload object must be validated against a manifest object using + * the {@link img4_get_trusted_external_payload} interface. + */ +#ifndef __IMG4_PAYLOAD_H +#define __IMG4_PAYLOAD_H + +#ifndef __IMG4_INDIRECT +#error "Please #include instead of this file directly" +#endif // __IMG4_INDIRECT + +/*! + * @function img4_payload_init + * + * @param i4p + * A pointer to the payload object to initialize. + * + * @param tag + * The expected tag for the payload. + * + * @param bytes + * The buffer containing the Image4 payload. + * + * @param len + * The length of the buffer. + * + * @param destructor + * A pointer to a routine to dispose of the buffer. May be NULL if the buffer + * does not require explicit disposal (e.g. the buffer is stack memory). + * + * @result + * Upon success, zero is returned. Otherwise, one of the following error codes: + * + * [EILSEQ] The data is not valid Image4 data + * [EFTYPE] The data does not contain an Image4 payload + * [ENOENT] The bytes do not contain a payload for the specified tag + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 OS_NONNULL5 +errno_t +img4_payload_init(img4_payload_t *i4p, img4_tag_t tag, + const uint8_t *bytes, size_t len, img4_destructor_t destructor); + +/*! + * @function img4_payload_destroy + * Disposes of the resources associated with the payload object. + * + * @param i4p + * The payload object of which to dispose. 
+ * + * @discussion + * This routine does not deallocate the storage for the payload object itself, + * only the associated resources. This routine will cause the destructor given + * in {@link img4_payload_init} to be called, if any. + */ +IMG4_API_AVAILABLE_20180112 +OS_EXPORT OS_NONNULL1 +void +img4_payload_destroy(img4_payload_t *i4p); + +#endif // __IMG4_PAYLOAD_H + diff --git a/EXTERNAL_HEADERS/ptrauth.h b/EXTERNAL_HEADERS/ptrauth.h new file mode 100644 index 000000000..b6db0fb14 --- /dev/null +++ b/EXTERNAL_HEADERS/ptrauth.h @@ -0,0 +1,338 @@ +/*===---- ptrauth.h - Pointer authentication -------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __PTRAUTH_H +#define __PTRAUTH_H + +#include + +typedef enum { + ptrauth_key_asia = 0, + ptrauth_key_asib = 1, + ptrauth_key_asda = 2, + ptrauth_key_asdb = 3, + + /* A process-independent key which can be used to sign code pointers. + Signing and authenticating with this key is a no-op in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_independent_code = ptrauth_key_asia, + + /* A process-specific key which can be used to sign code pointers. + Signing and authenticating with this key is enforced even in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_dependent_code = ptrauth_key_asib, + + /* A process-independent key which can be used to sign data pointers. + Signing and authenticating with this key is a no-op in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_independent_data = ptrauth_key_asda, + + /* A process-specific key which can be used to sign data pointers. + Signing and authenticating with this key is a no-op in processes + which disable ABI pointer authentication. */ + ptrauth_key_process_dependent_data = ptrauth_key_asdb, + + /* The key used to sign C function pointers. + The extra data is always 0. */ + ptrauth_key_function_pointer = ptrauth_key_process_independent_code, + + /* The key used to sign return addresses on the stack. + The extra data is based on the storage address of the return address. + On ARM64, that is always the storage address of the return address plus 8 + (or, in other words, the value of the stack pointer on function entry) */ + ptrauth_key_return_address = ptrauth_key_process_dependent_code, + + /* The key used to sign frame pointers on the stack. + The extra data is based on the storage address of the frame pointer. 
+ On ARM64, that is always the storage address of the frame pointer plus 16 + (or, in other words, the value of the stack pointer on function entry) */ + ptrauth_key_frame_pointer = ptrauth_key_process_dependent_data, + + /* The key used to sign block function pointers, including: + invocation functions, + block object copy functions, + block object destroy functions, + __block variable copy functions, and + __block variable destroy functions. + The extra data is always the address at which the function pointer + is stored. + + Note that block object pointers themselves (i.e. the direct + representations of values of block-pointer type) are not signed. */ + ptrauth_key_block_function = ptrauth_key_asia, + + /* The key used to sign C++ v-table pointers. + The extra data is always 0. */ + ptrauth_key_cxx_vtable_pointer = ptrauth_key_asda, + + /* Other pointers signed under the ABI use private ABI rules. */ + +} ptrauth_key; + +/* An integer type of the appropriate size for an extra-data argument. */ +typedef uintptr_t ptrauth_extra_data_t; + +/* An integer type of the appropriate size for a generic signature. */ +typedef uintptr_t ptrauth_generic_signature_t; + +/* A signed pointer value embeds the original pointer together with + a signature that attests to the validity of that pointer. Because + this signature must use only "spare" bits of the pointer, a + signature's validity is probabilistic in practice: it is unlikely + but still plausible that an invalidly-derived signature will + somehow equal the correct signature and therefore successfully + authenticate. Nonetheless, this scheme provides a strong degree + of protection against certain kinds of attacks. */ + +/* Authenticating a pointer that was not signed with the given key + and extra-data value will (likely) fail. However, an + authentication failure will not lead immediately to a trap. + Instead, it will yield a value which is guaranteed to trap + if actually dereferenced. 
*/ + +/* The null function pointer is always the all-zero bit pattern. + Signing an all-zero bit pattern will embed a (likely) non-zero + signature in the result, and so the result will not seem to be + a null function pointer. Authenticating this value will yield + a null function pointer back. However, authenticating an + all-zero bit pattern will probably fail, because the + authentication will expect a (likely) non-zero signature to be + embedded in the value. + + Because of this, if a pointer may validly be null, you should + check for null before attempting to authenticate it. */ + +#ifdef __PTRAUTH_INTRINSICS__ + +/* Strip the signature from a value without authenticating it. + + If the value is a function pointer, the result will not be a + legal function pointer because of the missing signature, and + attempting to call it will result in an authentication failure. + + The value must be an expression of pointer type. + The key must be a constant expression of type ptrauth_key. + The result will have the same type as the original value. */ +#define ptrauth_strip(__value, __key) \ + __builtin_ptrauth_strip(__value, __key) + +/* Blend a pointer and a small integer to form a new extra-data + discriminator. Not all bits of the inputs are guaranteed to + contribute to the result. + + On ARM64, only the low 16 bits of the integer will be considered. + + For the purposes of ptrauth_sign_constant, the result of calling + this function is considered a constant expression if the arguments + are constant. Some restrictions may be imposed on the pointer. + + The first argument must be an expression of pointer type. + The second argument must be an expression of integer type. + The result will have type uintptr_t. */ +#define ptrauth_blend_discriminator(__pointer, __integer) \ + __builtin_ptrauth_blend_discriminator(__pointer, __integer) + +/* Add a signature to the given pointer value using a specific key, + using the given extra data as a salt to the signing process. 
+ + The value must be a constant expression of pointer type. + The key must be a constant expression of type ptrauth_key. + The extra data must be a constant expression of pointer or integer type; + if an integer, it will be coerced to ptrauth_extra_data_t. + The result will have the same type as the original value. + + This is a constant expression if the extra data is an integer or + null pointer constant. */ +#define ptrauth_sign_constant(__value, __key, __data) \ + __builtin_ptrauth_sign_constant(__value, __key, __data) + +/* Add a signature to the given pointer value using a specific key, + using the given extra data as a salt to the signing process. + + This operation does not authenticate the original value and is + therefore potentially insecure if an attacker could possibly + control that value. + + The value must be an expression of pointer type. + The key must be a constant expression of type ptrauth_key. + The extra data must be an expression of pointer or integer type; + if an integer, it will be coerced to ptrauth_extra_data_t. + The result will have the same type as the original value. */ +#define ptrauth_sign_unauthenticated(__value, __key, __data) \ + __builtin_ptrauth_sign_unauthenticated(__value, __key, __data) + +/* Authenticate a pointer using one scheme and resign it using another. + + If the result is subsequently authenticated using the new scheme, that + authentication is guaranteed to fail if and only if the initial + authentication failed. + + The value must be an expression of pointer type. + The key must be a constant expression of type ptrauth_key. + The extra data must be an expression of pointer or integer type; + if an integer, it will be coerced to ptrauth_extra_data_t. + The result will have the same type as the original value. + + This operation is guaranteed to not leave the intermediate value + available for attack before it is re-signed. + + Do not pass a null pointer to this function. 
A null pointer + will not successfully authenticate. */ +#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data) \ + __builtin_ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data) + +/* Authenticate a pointer using one scheme and resign it as a C + function pointer. + + If the result is subsequently authenticated using the new scheme, that + authentication is guaranteed to fail if and only if the initial + authentication failed. + + The value must be an expression of function pointer type. + The key must be a constant expression of type ptrauth_key. + The extra data must be an expression of pointer or integer type; + if an integer, it will be coerced to ptrauth_extra_data_t. + The result will have the same type as the original value. + + This operation is guaranteed to not leave the intermediate value + available for attack before it is re-signed. Additionally, if this + expression is used syntactically as the function expression in a + call, only a single authentication will be performed. */ +#define ptrauth_auth_function(__value, __old_key, __old_data) \ + ptrauth_auth_and_resign(__value, __old_key, __old_data, ptrauth_key_function_pointer, 0) + +/* Authenticate a data pointer. + + The value must be an expression of non-function pointer type. + The key must be a constant expression of type ptrauth_key. + The extra data must be an expression of pointer or integer type; + if an integer, it will be coerced to ptrauth_extra_data_t. + The result will have the same type as the original value. + + If the authentication fails, dereferencing the resulting pointer + will fail. */ +#define ptrauth_auth_data(__value, __old_key, __old_data) \ + __builtin_ptrauth_auth(__value, __old_key, __old_data) + +/* Return an extra-discriminator value which can validly be used + as the second argument to ptrauth_blend_discriminator or the + third argument to the __ptrauth qualifier. + + The argument must be a string literal. 
+ A call to this function is an integer constant expression. */ +#define ptrauth_string_discriminator(__string) \ + __builtin_ptrauth_string_discriminator(__string) + +/* Compute a full pointer-width generic signature for the given + value, using the given data as a salt. + + This generic signature is process-independent, but may not be + consistent across reboots. + + This can be used to validate the integrity of arbitrary data + by storing a signature for that data together with it. Because + the signature is pointer-sized, if the stored signature matches + the result of re-signing the current data, a match provides very + strong evidence that the data has not been corrupted. + + The value must be an expression of pointer or integer type; if + an integer, it will be coerced to uintptr_t. + The extra data must be an expression of pointer or integer type; + if an integer, it will be coerced to ptrauth_extra_data_t. + The result will have type ptrauth_generic_signature_t. + + This operation will compute a meaningful signature even in processes + which disable ABI pointer authentication. */ +#define ptrauth_sign_generic_data(__value, __data) \ + __builtin_ptrauth_sign_generic_data(__value, __data) + + +/* Define some standard __ptrauth qualifiers used in the ABI. 
*/ +#define __ptrauth_function_pointer \ + __ptrauth(ptrauth_key_function_pointer,0,0) +#define __ptrauth_return_address \ + __ptrauth(ptrauth_key_return_address,1,0) +#define __ptrauth_block_invocation_pointer \ + __ptrauth(ptrauth_key_function_pointer,1,0) +#define __ptrauth_block_copy_helper \ + __ptrauth(ptrauth_key_function_pointer,1,0) +#define __ptrauth_block_destroy_helper \ + __ptrauth(ptrauth_key_function_pointer,1,0) +#define __ptrauth_block_byref_copy_helper \ + __ptrauth(ptrauth_key_function_pointer,1,0) +#define __ptrauth_block_byref_destroy_helper \ + __ptrauth(ptrauth_key_function_pointer,1,0) +#define __ptrauth_objc_method_list_imp \ + __ptrauth(ptrauth_key_function_pointer,1,0) +#define __ptrauth_cxx_vtable_pointer \ + __ptrauth(ptrauth_key_cxx_vtable_pointer,0,0) +#define __ptrauth_cxx_vtt_vtable_pointer \ + __ptrauth(ptrauth_key_cxx_vtable_pointer,0,0) +#define __ptrauth_swift_heap_object_destructor \ + __ptrauth(ptrauth_key_function_pointer,1,0xbbbf) + +/* Some situations in the C++ and Swift ABIs use declaration-specific + or type-specific extra discriminators. 
*/ +#define __ptrauth_cxx_virtual_function_pointer(__declkey) \ + __ptrauth(ptrauth_key_function_pointer,1,__declkey) +#define __ptrauth_swift_function_pointer(__typekey) \ + __ptrauth(ptrauth_key_function_pointer,0,__typekey) +#define __ptrauth_swift_class_method_pointer(__declkey) \ + __ptrauth(ptrauth_key_function_pointer,1,__declkey) +#define __ptrauth_swift_protocol_witness_function_pointer(__declkey) \ + __ptrauth(ptrauth_key_function_pointer,1,__declkey) +#define __ptrauth_swift_value_witness_function_pointer(__key) \ + __ptrauth(ptrauth_key_function_pointer,1,__key) + +#else + +#define ptrauth_strip(__value, __key) __value +#define ptrauth_blend_discriminator(__pointer, __integer) ((uintptr_t)0) +#define ptrauth_sign_constant(__value, __key, __data) __value +#define ptrauth_sign_unauthenticated(__value, __key, __data) __value +#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data) __value +#define ptrauth_auth_function(__value, __old_key, __old_data) __value +#define ptrauth_auth_data(__value, __old_key, __old_data) __value +#define ptrauth_string_discriminator(__string) ((int)0) +#define ptrauth_sign_generic_data(__value, __data) ((ptrauth_generic_signature_t)0) + +#define __ptrauth_function_pointer +#define __ptrauth_return_address +#define __ptrauth_block_invocation_pointer +#define __ptrauth_block_copy_helper +#define __ptrauth_block_destroy_helper +#define __ptrauth_block_byref_copy_helper +#define __ptrauth_block_byref_destroy_helper +#define __ptrauth_objc_method_list_imp +#define __ptrauth_cxx_vtable_pointer +#define __ptrauth_cxx_vtt_vtable_pointer +#define __ptrauth_swift_heap_object_destructor +#define __ptrauth_cxx_virtual_function_pointer(__declkey) +#define __ptrauth_swift_function_pointer(__typekey) +#define __ptrauth_swift_class_method_pointer(__declkey) +#define __ptrauth_swift_protocol_witness_function_pointer(__declkey) +#define __ptrauth_swift_value_witness_function_pointer(__key) + +#endif /* 
__PTRAUTH_INTRINSICS__ */ + +#endif /* __PTRAUTH_H */ diff --git a/Makefile b/Makefile index 1660223f5..31de51ae8 100644 --- a/Makefile +++ b/Makefile @@ -220,7 +220,7 @@ EXPINC_SUBDIRS_X86_64H = $(EXPINC_SUBDIRS) EXPINC_SUBDIRS_ARM = $(EXPINC_SUBDIRS) EXPINC_SUBDIRS_ARM64 = $(EXPINC_SUBDIRS) -SETUP_SUBDIRS = SETUP san +SETUP_SUBDIRS = SETUP osfmk san COMP_SUBDIRS_X86_64 = $(ALL_SUBDIRS) COMP_SUBDIRS_X86_64H = $(ALL_SUBDIRS) @@ -240,7 +240,17 @@ endif # all other RC_ProjectName installapi_libkdd installhdrs_libkdd install_libkdd: cd libkdd; \ - xcodebuild -target libkdd $(subst _libkdd,,$@) \ + xcodebuild -target Default $(subst _libkdd,,$@) \ + "SRCROOT=$(SRCROOT)/libkdd" \ + "OBJROOT=$(OBJROOT)" \ + "SYMROOT=$(SYMROOT)" \ + "DSTROOT=$(DSTROOT)" \ + "SDKROOT=$(SDKROOT)" + + +installapi_libkdd_tests installhdrs_libkdd_tests install_libkdd_tests: + cd libkdd; \ + xcodebuild -target tests $(subst _libkdd_tests,,$@) \ "SRCROOT=$(SRCROOT)/libkdd" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ @@ -250,7 +260,7 @@ installapi_libkdd installhdrs_libkdd install_libkdd: installapi_libkdd_host installhdrs_libkdd_host install_libkdd_host: cd libkdd; \ - xcodebuild -target kdd.framework $(subst _libkdd_host,,$@) \ + xcodebuild -configuration ReleaseHost -target kdd.framework $(subst _libkdd_host,,$@) \ "SRCROOT=$(SRCROOT)/libkdd" \ "OBJROOT=$(OBJROOT)" \ "SYMROOT=$(SYMROOT)" \ @@ -265,3 +275,5 @@ installapi_libkdd_host installhdrs_libkdd_host install_libkdd_host: xnu_tests: $(MAKE) -C $(SRCROOT)/tools/tests $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \ SRCROOT=$(SRCROOT)/tools/tests + $(MAKE) -C $(SRCROOT)/tests $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \ + SRCROOT=$(SRCROOT)/tests diff --git a/README.md b/README.md index dc1bbbae6..0e9d6b708 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ What is XNU? =========== XNU kernel is part of the Darwin operating system for use in macOS and iOS operating systems. XNU is an acronym for X is Not Unix. 
-XNU is a hybrid kernel combining the Mach kernel developed at Carnegie Mellon University with components from FreeBSD and C++ API for writing drivers called IOKit. +XNU is a hybrid kernel combining the Mach kernel developed at Carnegie Mellon University with components from FreeBSD and a C++ API for writing drivers called IOKit. XNU runs on x86_64 for both single processor and multi-processor configurations. XNU Source Tree @@ -190,8 +190,8 @@ The header files in framework's `PrivateHeaders` are only available for ** Apple The directory containing the header file should have a Makefile that creates the list of files that should be installed at different locations. -If you are adding first header file in a directory, you will need to -create Makefile similar to xnu/bsd/sys/Makefile. +If you are adding the first header file in a directory, you will need to +create Makefile similar to `xnu/bsd/sys/Makefile`. Add your header file to the correct file list depending on where you want to install it. The default locations where the header files are installed @@ -213,7 +213,13 @@ from each file list are - `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders` The Makefile combines the file lists mentioned above into different -install lists which are used by build system to install the header files. +install lists which are used by build system to install the header files. There +are two types of install lists: machine-dependent and machine-independent. +These lists are indicated by the presence of `MD` and `MI` in the build +setting, respectively. If your header is architecture-specific, then you should +use a machine-dependent install list (e.g. `INSTALL_MD_LIST`). If your header +should be installed for all architectures, then you should use a +machine-independent install list (e.g. `INSTALL_MI_LIST`). If the install list that you are interested does not exist, create it by adding the appropriate file lists. 
The default install lists, its @@ -270,28 +276,53 @@ want to export a function only to kernel level but not user level. Some pre-defined macros and their descriptions are - - a. `PRIVATE` : If true, code is available to all of the xnu kernel and is - not available in kernel extensions and user level header files. The - header files installed in all the paths described above in (1) will not - have code enclosed within this macro. - - b. `KERNEL_PRIVATE` : If true, code is available to all of the xnu kernel and Apple - internal kernel extensions. - - c. `BSD_KERNEL_PRIVATE` : If true, code is available to the xnu/bsd part of - the kernel and is not available to rest of the kernel, kernel extensions - and user level header files. The header files installed in all the - paths described above in (1) will not have code enclosed within this macro. - - d. `KERNEL` : If true, code is available only in kernel and kernel - extensions and is not available in user level header files. Only the + a. `PRIVATE` : If defined, enclosed definitions are considered System + Private Interfaces. These are visible within xnu and + exposed in user/kernel headers installed within the AppleInternal + "PrivateHeaders" sections of the System and Kernel frameworks. + b. `KERNEL_PRIVATE` : If defined, enclosed code is available to all of xnu + kernel and Apple internal kernel extensions and omitted from user + headers. + c. `BSD_KERNEL_PRIVATE` : If defined, enclosed code is visible exclusively + within the xnu/bsd module. + d. `MACH_KERNEL_PRIVATE`: If defined, enclosed code is visible exclusively + within the xnu/osfmk module. + e. `XNU_KERNEL_PRIVATE`: If defined, enclosed code is visible exclusively + within xnu. + f. `KERNEL` : If defined, enclosed code is available within xnu and kernel + extensions and is not visible in user level header files. 
Only the header files installed in following paths will have the code - $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders - you should check [Testing the kernel][] for details. +Conditional compilation +======================= +`xnu` offers the following mechanisms for conditionally compiling code: + + a. *CPU Characteristics* If the code you are guarding has specific + characteristics that will vary only based on the CPU architecture being + targeted, use this option. Prefer checking for features of the + architecture (e.g. `__LP64__`, `__LITTLE_ENDIAN__`, etc.). + b. *New Features* If the code you are guarding, when taken together, + implements a feature, you should define a new feature in `config/MASTER` + and use the resulting `CONFIG` preprocessor token (e.g. for a feature + named `config_virtual_memory`, check for `#if CONFIG_VIRTUAL_MEMORY`). + This practice ensures that existing features may be brought to other + platforms by simply changing a feature switch. + c. *Existing Features* You can use existing features if your code is + strongly tied to them (e.g. use `SECURE_KERNEL` if your code implements + new functionality that is exclusively relevant to the trusted kernel and + updates the definition/understanding of what being a trusted kernel means). + +It is recommended that you avoid compiling based on the target platform. `xnu` +does not define the platform macros from `TargetConditionals.h` +(`TARGET_OS_OSX`, `TARGET_OS_IOS`, etc.). + + +There is a `TARGET_OS_EMBEDDED` macro, but this should be avoided as it is in +general too broad a definition for most functionality. 
How to add a new syscall ======================== diff --git a/SETUP/kextsymboltool/kextsymboltool.c b/SETUP/kextsymboltool/kextsymboltool.c index edb6dfaea..7e0d49a2d 100644 --- a/SETUP/kextsymboltool/kextsymboltool.c +++ b/SETUP/kextsymboltool/kextsymboltool.c @@ -474,6 +474,10 @@ lookup_arch(const char *archstring) static const NXArchInfo archlist[] = { { "x86_64", 0x01000007 /* CPU_TYPE_X86_64 */, 3 /* CPU_SUBTYPE_X86_64_ALL */, NX_LittleEndian, NULL }, { "x86_64h", 0x01000007 /* CPU_TYPE_X86_64 */, 8 /* CPU_SUBTYPE_X86_64_H */, NX_LittleEndian, NULL }, + { "armv7", 12 /* CPU_TYPE_ARM */, 9 /* CPU_SUBTYPE_ARM_V7 */, NX_LittleEndian, NULL }, + { "armv7s", 12 /* CPU_TYPE_ARM */, 11 /* CPU_SUBTYPE_ARM_V7S */, NX_LittleEndian, NULL }, + { "armv7k", 12 /* CPU_TYPE_ARM */, 12 /* CPU_SUBTYPE_ARM_V7K */, NX_LittleEndian, NULL }, + { "arm64", 0x0100000c /* CPU_TYPE_ARM64 */, 0 /* CPU_SUBTYPE_ARM64_ALL */, NX_LittleEndian, NULL }, }; unsigned long i; diff --git a/bsd/Makefile b/bsd/Makefile index c0cdd42fd..f79dc7046 100644 --- a/bsd/Makefile +++ b/bsd/Makefile @@ -19,6 +19,7 @@ INSTINC_SUBDIRS = \ netkey \ nfs \ security \ + pthread \ sys \ uuid \ vfs @@ -49,6 +50,7 @@ EXPINC_SUBDIRS = \ netinet6 \ netkey \ security \ + pthread \ sys \ uuid \ vfs \ diff --git a/bsd/arm/_mcontext.h b/bsd/arm/_mcontext.h index 5a2e735cf..7d03ebe75 100644 --- a/bsd/arm/_mcontext.h +++ b/bsd/arm/_mcontext.h @@ -79,7 +79,7 @@ _STRUCT_MCONTEXT64 #ifndef _MCONTEXT_T #define _MCONTEXT_T -#if defined(__LP64__) +#if defined(__arm64__) typedef _STRUCT_MCONTEXT64 *mcontext_t; #define _STRUCT_MCONTEXT _STRUCT_MCONTEXT64 #else diff --git a/bsd/arm/fasttrap_isa.h b/bsd/arm/fasttrap_isa.h index eb577a43f..e72118ebc 100644 --- a/bsd/arm/fasttrap_isa.h +++ b/bsd/arm/fasttrap_isa.h @@ -107,6 +107,7 @@ typedef struct fasttrap_machtp { #define FASTTRAP_T_ARM64_ADR 36 #define FASTTRAP_T_ARM64_PRFM 37 #define FASTTRAP_T_ARM64_EXCLUSIVE_MEM 38 +#define FASTTRAP_T_ARM64_RETAB 39 #endif #if defined (__arm__) @@ 
-130,6 +131,8 @@ typedef struct fasttrap_machtp { #define FASTTRAP_FN_ARM 1 #define FASTTRAP_FN_THUMB 2 #define FASTTRAP_FN_USDT 3 +#define FASTTRAP_FN_ARM64 4 +#define FASTTRAP_FN_ARM64_32 5 #define ARM_RM(x) ((x) & 0xF) #define ARM_RS(x) (((x) >> 8) & 0xF) @@ -221,6 +224,9 @@ typedef struct fasttrap_machtp { #define FASTTRAP_ARM64_OP_MASK_EXCL_MEM 0x3f000000 /* Bits to check for exclusive memory operation */ #define FASTTRAP_ARM64_OP_VALUE_EXCL_MEM 0x08000000 /* Value to find */ + +#define FASTTRAP_ARM64_OP_MASK_RETAB 0xfffffc1f /* Bits to check for retab Rt */ +#define FASTTRAP_ARM64_OP_VALUE_RETAB 0xd65f0c1f /* Value to find */ #endif /* defined(__arm64__) */ #ifdef __cplusplus diff --git a/bsd/arm/types.h b/bsd/arm/types.h index 18906141c..e84405b12 100644 --- a/bsd/arm/types.h +++ b/bsd/arm/types.h @@ -128,10 +128,17 @@ typedef __int32_t user32_ssize_t; typedef __int32_t user32_long_t; typedef __uint32_t user32_ulong_t; typedef __int32_t user32_time_t; -#if __arm__ && (__BIGGEST_ALIGNMENT__ > 4) -typedef __int64_t user32_off_t; + +/* + * This alignment is required to ensure symmetry between userspace and kernelspace + * when the kernel is 64-bit and the user application is 32-bit. All currently + * supported ARM slices (arm64/armv7k/arm64_32) contain the same type alignment + * ABI so this alignment isn't needed for ARM. + */ +#if defined(__x86_64__) +typedef __int64_t user32_off_t __attribute__((aligned(4))); #else -typedef __int64_t user32_off_t __attribute__((aligned(4))); +typedef __int64_t user32_off_t; #endif #endif /* KERNEL */ diff --git a/bsd/bsm/audit.h b/bsd/bsm/audit.h index 525363788..1f6b2476e 100644 --- a/bsd/bsm/audit.h +++ b/bsd/bsm/audit.h @@ -79,9 +79,9 @@ /* * IPC types. */ -#define AT_IPC_MSG ((u_char)1) /* Message IPC id. */ -#define AT_IPC_SEM ((u_char)2) /* Semaphore IPC id. */ -#define AT_IPC_SHM ((u_char)3) /* Shared mem IPC id. */ +#define AT_IPC_MSG ((unsigned char)1) /* Message IPC id. 
*/ +#define AT_IPC_SEM ((unsigned char)2) /* Semaphore IPC id. */ +#define AT_IPC_SHM ((unsigned char)3) /* Shared mem IPC id. */ /* * Audit conditions. @@ -127,6 +127,10 @@ #define A_SETCOND 38 #define A_GETSFLAGS 39 #define A_SETSFLAGS 40 +#define A_GETCTLMODE 41 +#define A_SETCTLMODE 42 +#define A_GETEXPAFTER 43 +#define A_SETEXPAFTER 44 /* * Audit policy controls. @@ -167,6 +171,24 @@ #define AU_IPv4 4 #define AU_IPv6 16 +/* + * Reserved audit class mask indicating which classes are unable to have + * events added or removed by unentitled processes. + */ +#define AU_CLASS_MASK_RESERVED 0x10000000 + +/* + * Audit control modes + */ +#define AUDIT_CTLMODE_NORMAL ((unsigned char)1) +#define AUDIT_CTLMODE_EXTERNAL ((unsigned char)2) + +/* + * Audit file expire_after op modes + */ +#define AUDIT_EXPIRE_OP_AND ((unsigned char)0) +#define AUDIT_EXPIRE_OP_OR ((unsigned char)1) + __BEGIN_DECLS typedef uid_t au_id_t; @@ -175,6 +197,7 @@ typedef u_int16_t au_event_t; typedef u_int16_t au_emod_t; typedef u_int32_t au_class_t; typedef u_int64_t au_asflgs_t __attribute__ ((aligned (8))); +typedef unsigned char au_ctlmode_t; struct au_tid { dev_t port; @@ -237,6 +260,13 @@ struct au_session { }; typedef struct au_session au_session_t; +struct au_expire_after { + time_t age; /* Age after which trail files should be expired */ + size_t size; /* Aggregate trail size when files should be expired */ + unsigned char op_type; /* Operator used with the above values to determine when files should be expired */ +}; +typedef struct au_expire_after au_expire_after_t; + /* * Contents of token_t are opaque outside of libbsm. 
*/ diff --git a/bsd/bsm/audit_internal.h b/bsd/bsm/audit_internal.h index 71a51307a..c2103f32c 100644 --- a/bsd/bsm/audit_internal.h +++ b/bsd/bsm/audit_internal.h @@ -75,6 +75,7 @@ typedef struct au_record au_record_t; #define AUDIT_HEADER_SIZE 18 #define MAX_AUDIT_HEADER_SIZE (5*sizeof(u_int32_t)+18) #define AUDIT_TRAILER_SIZE 7 +#define MAX_AUDIT_IDENTITY_SIZE 179 /* * BSM token streams store fields in big endian byte order, so as to be diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h index 391425f91..31e6353d7 100644 --- a/bsd/bsm/audit_kevents.h +++ b/bsd/bsm/audit_kevents.h @@ -814,6 +814,7 @@ #define AUE_WATCHEVENT AUE_NULL #define AUE_WORKQOPEN AUE_NULL #define AUE_WORKQOPS AUE_NULL +#define AUE_WORKLOOPCTL AUE_NULL #define AUE_PERSONA AUE_NULL #define AUE_USRCTL AUE_NULL #define AUE_NEXUS AUE_NULL diff --git a/bsd/bsm/audit_record.h b/bsd/bsm/audit_record.h index 2b6ae891a..bedcb800a 100644 --- a/bsd/bsm/audit_record.h +++ b/bsd/bsm/audit_record.h @@ -126,6 +126,11 @@ #define AUT_SOCKINET128 0x81 /* XXX */ #define AUT_SOCKUNIX 0x82 /* XXX */ +/* Apple specific tokens*/ +#define AUT_IDENTITY 0xed +#define AUT_KRB5_PRINCIPAL 0xee +#define AUT_CERT_HASH 0xef + /* print values for the arbitrary token */ #define AUP_BINARY 0 #define AUP_OCTAL 1 @@ -272,14 +277,21 @@ token_t *au_to_subject64_ex(au_id_t auid, uid_t euid, gid_t egid, uid_t ruid, #if defined(_KERNEL) || defined(KERNEL) token_t *au_to_exec_args(char *args, int argc); token_t *au_to_exec_env(char *envs, int envc); +token_t *au_to_certificate_hash(char *hash, int hashc); +token_t *au_to_krb5_principal(char *principal, int princ); #else token_t *au_to_exec_args(char **argv); token_t *au_to_exec_env(char **envp); +token_t *au_to_certificate_hash(char **hash); +token_t *au_to_krb5_principal(char **principal); #endif token_t *au_to_text(const char *text); token_t *au_to_kevent(struct kevent *kev); token_t *au_to_trailer(int rec_size); token_t *au_to_zonename(const char *zonename); +token_t 
*au_to_identity(uint32_t signer_type, const char* signing_id, + u_char signing_id_trunc, const char* team_id, u_char team_id_trunc, + uint8_t* cdhash, uint16_t cdhash_len); /* * BSM library routines for converting between local and BSM constant spaces. diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index afe23cf34..c38c2ffb6 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -160,6 +160,7 @@ OBJS_NO_CAST_ALIGN = \ dtrace.o \ fasttrap.o \ fasttrap_isa.o \ + fbt.o \ fbt_arm.o \ fbt_x86.o \ if_bond.o \ @@ -228,6 +229,7 @@ $(foreach file,$(OBJS_NO_CAST_ALIGN),$(eval $(call add_perfile_cflags,$(file),-W OBJS_NO_PACKED_ADDRESS = \ ah_core.o \ ah_input.o \ + dlil.o \ esp_input.o \ esp_output.o \ frag6.o \ @@ -242,6 +244,7 @@ OBJS_NO_PACKED_ADDRESS = \ ipsec.o \ mld6.o \ mptcp_opt.o \ + nat464_utils.o \ nd6.o \ nd6_nbr.o \ nd6_prproxy.o \ diff --git a/bsd/conf/files b/bsd/conf/files index 4bf42f392..65e1393f6 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -117,7 +117,6 @@ bsd/dev/dtrace/lockstat.c optional config_dtrace bsd/dev/dtrace/dtrace_ptss.c optional config_dtrace bsd/dev/dtrace/dtrace_subr.c optional config_dtrace bsd/dev/dtrace/dtrace_glue.c standard -bsd/dev/dtrace/dtrace_alloc.c optional config_dtrace bsd/dev/dtrace/blist.c optional config_dtrace bsd/dev/dtrace/fbt.c optional config_dtrace bsd/dev/dtrace/sdt.c optional config_dtrace @@ -218,6 +217,7 @@ bsd/net/net_perf.c optional networking bsd/net/if_gif.c optional gif bsd/net/if_stf.c optional stf bsd/net/if_ports_used.c optional networking +bsd/net/if_low_power_mode.c optional networking bsd/net/kpi_interface.c optional networking bsd/net/kpi_protocol.c optional networking bsd/net/kpi_interfacefilter.c optional networking @@ -228,6 +228,7 @@ bsd/net/necp.c optional necp bsd/net/necp_client.c optional necp bsd/net/network_agent.c optional networking bsd/net/if_pflog.c optional pflog +bsd/net/nat464_utils.c optional networking bsd/net/pf.c optional pf 
bsd/net/pf_if.c optional pf bsd/net/pf_ioctl.c optional pf @@ -380,6 +381,10 @@ bsd/security/audit/audit_session.c standard bsd/security/audit/audit_syscalls.c standard bsd/security/audit/audit_worker.c optional config_audit +bsd/pthread/pthread_shims.c standard +bsd/pthread/pthread_priority.c standard +bsd/pthread/pthread_workqueue.c standard + bsd/kern/bsd_init.c standard ./init_sysent.c standard bsd/kern/kdebug.c standard @@ -469,7 +474,6 @@ bsd/kern/posix_shm.c standard bsd/kern/qsort.c standard bsd/kern/kpi_socket.c optional sockets bsd/kern/kpi_socketfilter.c optional sockets -bsd/kern/pthread_shims.c standard bsd/kern/proc_info.c standard bsd/kern/process_policy.c standard bsd/kern/kern_overrides.c standard @@ -503,4 +507,6 @@ bsd/miscfs/nullfs/null_subr.c optional nullfs bsd/miscfs/nullfs/null_vfsops.c optional nullfs bsd/miscfs/nullfs/null_vnops.c optional nullfs +bsd/tests/bsd_tests.c optional config_xnupost +bsd/tests/pmap_test_sysctl.c optional config_xnupost diff --git a/bsd/conf/files.arm64 b/bsd/conf/files.arm64 index 64009971c..7761c03ac 100644 --- a/bsd/conf/files.arm64 +++ b/bsd/conf/files.arm64 @@ -9,6 +9,7 @@ bsd/dev/arm/unix_signal.c standard bsd/dev/arm64/cpu_in_cksum.s standard + bsd/dev/arm64/dtrace_isa.c optional config_dtrace bsd/dev/arm64/dtrace_subr_arm.c optional config_dtrace bsd/dev/arm64/fbt_arm.c optional config_dtrace diff --git a/bsd/dev/arm/dtrace_isa.c b/bsd/dev/arm/dtrace_isa.c index d38831ba3..07397b4b8 100644 --- a/bsd/dev/arm/dtrace_isa.c +++ b/bsd/dev/arm/dtrace_isa.c @@ -28,6 +28,7 @@ #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from * mach/ppc/thread_status.h */ +#include #include #include @@ -175,12 +176,16 @@ uint64_t dtrace_getreg(struct regs * savearea, uint_t reg) { struct arm_saved_state *regs = (struct arm_saved_state *) savearea; - + if (regs == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } /* beyond register limit? 
*/ if (reg > ARM_SAVED_STATE32_COUNT - 1) { DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); return (0); } + return (uint64_t) ((unsigned int *) (&(regs->r)))[reg]; } @@ -629,3 +634,12 @@ dtrace_arm_condition_true(int cond, int cpsr) return taken; } + +void dtrace_flush_caches(void) +{ + /* TODO There were some problems with flushing just the cache line that had been modified. + * For now, we'll flush the entire cache, until we figure out how to flush just the patched block. + */ + FlushPoU_Dcache(); + InvalidatePoU_Icache(); +} diff --git a/bsd/dev/arm/fasttrap_isa.c b/bsd/dev/arm/fasttrap_isa.c index d48b48a71..07d41a228 100644 --- a/bsd/dev/arm/fasttrap_isa.c +++ b/bsd/dev/arm/fasttrap_isa.c @@ -113,16 +113,6 @@ extern int dtrace_decode_thumb(uint32_t instr); extern int dtrace_arm_condition_true(int cond, int cpsr); -static -void flush_caches(void) -{ - /* TODO There were some problems with flushing just the cache line that had been modified. - * For now, we'll flush the entire cache, until we figure out how to flush just the patched block. - */ - FlushPoU_Dcache(); - InvalidatePoU_Icache(); -} - int fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, user_addr_t pc, fasttrap_probe_type_t type) @@ -202,90 +192,6 @@ fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, return (0); } -// These are not exported from vm_map.h. -extern kern_return_t vm_map_write_user(vm_map_t map, void *src_p, vm_map_address_t dst_addr, vm_size_t size); - -/* Patches the instructions. Almost like uwrite, but need special instructions on ARM to flush the caches. */ -static -int patchInst(proc_t *p, void *buf, user_size_t len, user_addr_t a) -{ - kern_return_t ret; - - ASSERT(p != NULL); - ASSERT(p->task != NULL); - - task_t task = p->task; - - /* - * Grab a reference to the task vm_map_t to make sure - * the map isn't pulled out from under us. 
- * - * Because the proc_lock is not held at all times on all code - * paths leading here, it is possible for the proc to have - * exited. If the map is null, fail. - */ - vm_map_t map = get_task_map_reference(task); - if (map) { - /* Find the memory permissions. */ - uint32_t nestingDepth=999999; - vm_region_submap_short_info_data_64_t info; - mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; - mach_vm_address_t address = (mach_vm_address_t)a; - mach_vm_size_t sizeOfRegion = (mach_vm_size_t)len; - - ret = mach_vm_region_recurse(map, &address, &sizeOfRegion, &nestingDepth, (vm_region_recurse_info_t)&info, &count); - if (ret != KERN_SUCCESS) - goto done; - - vm_prot_t reprotect; - - if (!(info.protection & VM_PROT_WRITE)) { - /* Save the original protection values for restoration later */ - reprotect = info.protection; - if (info.max_protection & VM_PROT_WRITE) { - /* The memory is not currently writable, but can be made writable. */ - /* Making it both writable and executable at the same time causes warning on embedded */ - ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, (reprotect & ~VM_PROT_EXECUTE) | VM_PROT_WRITE); - } else { - /* - * The memory is not currently writable, and cannot be made writable. We need to COW this memory. - * - * Strange, we can't just say "reprotect | VM_PROT_COPY", that fails. - */ - ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, VM_PROT_COPY | VM_PROT_READ | VM_PROT_WRITE); - } - - if (ret != KERN_SUCCESS) - goto done; - - } else { - /* The memory was already writable. 
*/ - reprotect = VM_PROT_NONE; - } - - ret = vm_map_write_user( map, - buf, - (vm_map_address_t)a, - (vm_size_t)len); - - flush_caches(); - - if (ret != KERN_SUCCESS) - goto done; - - if (reprotect != VM_PROT_NONE) { - ASSERT(reprotect & VM_PROT_EXECUTE); - ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, reprotect); - } - -done: - vm_map_deallocate(map); - } else - ret = KERN_TERMINATED; - - return (int)ret; -} - int fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) { @@ -299,7 +205,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) instr = FASTTRAP_ARM_INSTR; } - if (patchInst(p, &instr, size, tp->ftt_pc) != 0) + if (uwrite(p, &instr, size, tp->ftt_pc) != 0) return (-1); tp->ftt_installed = 1; @@ -327,7 +233,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) if (instr != FASTTRAP_ARM_INSTR) goto end; } - if (patchInst(p, &tp->ftt_instr, size, tp->ftt_pc) != 0) + if (uwrite(p, &tp->ftt_instr, size, tp->ftt_pc) != 0) return (-1); end: @@ -1154,7 +1060,7 @@ fasttrap_pid_probe(arm_saved_state_t *regs) SET32(scratch+i, FASTTRAP_ARM_RET_INSTR); i += 4; } - if (patchInst(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) { + if (uwrite(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) { fasttrap_sigtrap(p, uthread, pc); new_pc = pc; break; diff --git a/bsd/dev/arm/fbt_arm.c b/bsd/dev/arm/fbt_arm.c index c594f9c92..9205cfb21 100644 --- a/bsd/dev/arm/fbt_arm.c +++ b/bsd/dev/arm/fbt_arm.c @@ -99,9 +99,6 @@ extern int fbt_probetab_mask; kern_return_t fbt_perfCallback(int, struct arm_saved_state *, __unused int, __unused int); -static int fbt_uninstrumented_arm = 0; -static const int fbt_log_uninstrumented = 0; - extern int dtrace_arm_condition_true(int cond, int cpsr); @@ -212,7 +209,7 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval) } CPU->cpu_dtrace_invop_underway = 0; } - + /* On other architectures, we return a DTRACE constant to let the 
callback function know what was replaced. On the ARM, since the function prologue/epilogue machine code @@ -256,7 +253,7 @@ fbt_perfCallback( ); emul = dtrace_invop(regs->pc, (uintptr_t*) regs, regs->r[0]); - + __asm__ volatile( "Ldtrace_invop_callsite_post_label:\n" ".data\n" @@ -335,7 +332,7 @@ fbt_perfCallback( } void -fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart) +fbt_provide_probe(struct modctl *ctl, const char *modname, const char* symbolName, machine_inst_t* symbolStart, machine_inst_t *instrHigh) { unsigned int j; int doenable = 0; @@ -344,11 +341,11 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c fbt_probe_t *newfbt, *retfbt, *entryfbt; machine_inst_t *instr, *pushinstr = NULL, *limit, theInstr; int foundPushLR, savedRegs; - + /* * Guard against null symbols */ - if (!symbolStart || !instrLow || !instrHigh) { + if (!symbolStart || !instrHigh || instrHigh < symbolStart) { kprintf("dtrace: %s has an invalid address\n", symbolName); return; } @@ -360,7 +357,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c savedRegs = -1; limit = (machine_inst_t *)instrHigh; for (j = 0, instr = symbolStart, theInstr = 0; - (j < 8) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr)); j++, instr++) + (j < 8) && instr < instrHigh; j++, instr++) { theInstr = *instr; if (FBT_IS_THUMB_PUSH_LR(theInstr)) { @@ -390,7 +387,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP); newfbt->fbtp_next = NULL; strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); - + if (thisid != 0) { /* * The dtrace_probe previously existed, so we have to hook @@ -432,7 +429,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c newfbt->fbtp_currentval = 0; newfbt->fbtp_hashnext = 
fbt_probetab[FBT_ADDR2NDX(instr)]; fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt; - + if (doenable) fbt_enable(NULL, newfbt->fbtp_id, newfbt); @@ -446,7 +443,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c doenable=0; thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN); - + if (thisid != 0) { /* The dtrace_probe previously existed, so we have to * find the end of the existing fbt chain. If we find @@ -501,7 +498,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c * OK, it's an instruction. */ theInstr = *instr; - + /* Walked onto the start of the next routine? If so, bail out from this function */ if (FBT_IS_THUMB_PUSH_LR(theInstr)) { if (!retfbt) @@ -560,7 +557,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c */ newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP); - newfbt->fbtp_next = NULL; + newfbt->fbtp_next = NULL; strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); if (retfbt == NULL) { @@ -593,89 +590,3 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c goto again; } -void -fbt_provide_module_kernel_syms(struct modctl *ctl) -{ - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - kernel_nlist_t *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i; - - mh = (kernel_mach_header_t *)(ctl->mod_address); - modname = ctl->mod_modname; - - /* - * Employees of dtrace and their families are ineligible. Void - * where prohibited. 
- */ - - if (mh->magic != MH_MAGIC_KERNEL) - return; - - cmd = (struct load_command *) & mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT_KERNEL) { - kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single - * unnamed segment */ - } else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; - - cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - /* Find extent of the TEXT section */ - instrLow = (uintptr_t) orig_ts->vmaddr; - instrHigh = (uintptr_t) (orig_ts->vmaddr + orig_ts->vmsize); - - for (i = 0; i < orig_st->nsyms; i++) { - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; - - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; - - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. */ - if (*name == '_') - name += 1; - - - if (sym[i].n_sect == 1 && !(sym[i].n_desc & N_ARM_THUMB_DEF)) { - /* A function but not a Thumb function */ - fbt_uninstrumented_arm++; - if (fbt_log_uninstrumented) - kprintf("dtrace: fbt: Skipping ARM mode function %s at %08x\n",name,(unsigned)sym[i].n_value); - - continue; - } - - /* - * We're only blacklisting functions in the kernel for now. 
- */ - if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) - continue; - - fbt_provide_probe(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value); - } -} diff --git a/bsd/dev/arm/kern_machdep.c b/bsd/dev/arm/kern_machdep.c index 312952ac9..2c27afaf2 100644 --- a/bsd/dev/arm/kern_machdep.c +++ b/bsd/dev/arm/kern_machdep.c @@ -14,6 +14,7 @@ #include #include #include +#include #if __arm64__ extern int bootarg_no64exec; /* bsd_init.c */ @@ -49,7 +50,8 @@ int grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) { #if __arm64__ - cpu_subtype_t hostsubtype = (exectype & CPU_ARCH_ABI64) ? cpu_subtype() : cpu_subtype32(); + cpu_subtype_t hostsubtype = + (exectype & CPU_ARCH_ABI64) ? cpu_subtype() : cpu_subtype32(); #else cpu_subtype_t hostsubtype = cpu_subtype(); #endif /* __arm64__ */ @@ -63,14 +65,14 @@ grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) case CPU_SUBTYPE_ARM64_V8: switch (execsubtype) { case CPU_SUBTYPE_ARM64_V8: - return 9; + return 10; case CPU_SUBTYPE_ARM64_ALL: - return 8; + return 9; } break; + } /* switch (hostsubtype) */ - break; #else /* __arm64__ */ case CPU_TYPE_ARM: diff --git a/bsd/dev/arm/sysctl.c b/bsd/dev/arm/sysctl.c index a1ee66f16..d97e80e21 100644 --- a/bsd/dev/arm/sysctl.c +++ b/bsd/dev/arm/sysctl.c @@ -7,6 +7,11 @@ #include +#include +#include +#include +#include + extern int trap_on_alignment_fault; extern uint64_t wake_abstime; @@ -58,3 +63,121 @@ SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime, 0, 0, sysctl_wake_conttime, "I", "Continuous Time at the last wakeup"); +/* + * For source compatibility, here's some machdep.cpu mibs that + * use host_info() to simulate reasonable answers. 
+ */ + +SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "CPU info"); + +static int +arm_host_info SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + + host_basic_info_data_t hinfo; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; +#define BSD_HOST 1 + kern_return_t kret = host_info((host_t)BSD_HOST, + HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + if (KERN_SUCCESS != kret) + return (EINVAL); + + if (sizeof (uint32_t) != arg2) + panic("size mismatch"); + + uintptr_t woffset = (uintptr_t)arg1 / sizeof (uint32_t); + uint32_t datum = *(uint32_t *)(((uint32_t *)&hinfo) + woffset); + return (SYSCTL_OUT(req, &datum, sizeof (datum))); +} + +/* + * machdep.cpu.cores_per_package + * + * x86: derived from CPUID data. + * ARM: how many physical cores we have in the AP; aka hw.physicalcpu_max + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, cores_per_package, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, physical_cpu_max), + sizeof (integer_t), + arm_host_info, "I", "CPU cores per package"); + +/* + * machdep.cpu.core_count + * + * x86: derived from CPUID data. + * ARM: # active physical cores in the AP; aka hw.physicalcpu + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, core_count, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, physical_cpu), + sizeof (integer_t), + arm_host_info, "I", "Number of enabled cores per package"); + +/* + * machdep.cpu.logical_per_package + * + * x86: derived from CPUID data. Returns ENOENT if HTT bit not set, but + * most x64 CPUs have that, so assume it's available. 
+ * ARM: total # logical cores in the AP; aka hw.logicalcpu_max + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, logical_per_package, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, logical_cpu_max), + sizeof (integer_t), + arm_host_info, "I", "CPU logical cpus per package"); + +/* + * machdep.cpu.thread_count + * + * x86: derived from CPUID data. + * ARM: # active logical cores in the AP; aka hw.logicalcpu + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, logical_cpu), + sizeof (integer_t), + arm_host_info, "I", "Number of enabled threads per package"); + +/* + * machdep.cpu.brand_string + * + * x86: derived from CPUID data. + * ARM: cons something up from the CPUID register. Could include cpufamily + * here and map it to a "marketing" name, but there's no obvious need; + * the value is already exported via the commpage. So keep it simple. + */ +static int +make_brand_string SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; + + const char *impl; + + switch (cpuid_info()->arm_info.arm_implementor) { + case CPU_VID_APPLE: + impl = "Apple"; + break; + case CPU_VID_ARM: + impl = "ARM"; + break; + default: + impl = "ARM architecture"; + break; + } + char buf[80]; + snprintf(buf, sizeof (buf), "%s processor", impl); + return (SYSCTL_OUT(req, buf, strlen(buf) + 1)); +} + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, make_brand_string, "A", "CPU brand string"); diff --git a/bsd/dev/arm/systemcalls.c b/bsd/dev/arm/systemcalls.c index df9f22d09..2fa6a6580 100644 --- a/bsd/dev/arm/systemcalls.c +++ b/bsd/dev/arm/systemcalls.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,7 @@ unix_syscall(struct arm_saved_state * regs, thread_t 
thread_act, static int arm_get_syscall_args(uthread_t, struct arm_saved_state *, struct sysent *); static int arm_get_u32_syscall_args(uthread_t, arm_saved_state32_t *, struct sysent *); -static void arm_prepare_u32_syscall_return(struct sysent *, arm_saved_state32_t *, uthread_t, int); +static void arm_prepare_u32_syscall_return(struct sysent *, arm_saved_state_t *, uthread_t, int); static void arm_prepare_syscall_return(struct sysent *, struct arm_saved_state *, uthread_t, int); static int arm_get_syscall_number(struct arm_saved_state *); static void arm_trace_unix_syscall(int, struct arm_saved_state *); @@ -274,16 +275,20 @@ unix_syscall_return(int error) } static void -arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state32_t *regs, uthread_t uthread, int error) +arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state_t *regs, uthread_t uthread, int error) { + assert(is_saved_state32(regs)); + + arm_saved_state32_t *ss32 = saved_state32(regs); + if (error == ERESTART) { - regs->pc -= 4; + ss32->pc -= 4; } else if (error != EJUSTRETURN) { if (error) { - regs->save_r0 = error; - regs->save_r1 = 0; + ss32->save_r0 = error; + ss32->save_r1 = 0; /* set the carry bit to execute cerror routine */ - regs->cpsr |= PSR_CF; + ss32->cpsr |= PSR_CF; unix_syscall_return_kprintf("error: setting carry to trigger cerror call\n"); } else { /* (not error) */ switch (callp->sy_return_type) { @@ -294,12 +299,12 @@ arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state32_t *regs, case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: case _SYSCALL_RET_UINT64_T: - regs->save_r0 = uthread->uu_rval[0]; - regs->save_r1 = uthread->uu_rval[1]; + ss32->save_r0 = uthread->uu_rval[0]; + ss32->save_r1 = uthread->uu_rval[1]; break; case _SYSCALL_RET_NONE: - regs->save_r0 = 0; - regs->save_r1 = 0; + ss32->save_r0 = 0; + ss32->save_r1 = 0; break; default: panic("unix_syscall: unknown return type"); @@ -436,7 +441,7 @@ arm_clear_syscall_error(struct 
arm_saved_state * state) } #elif defined(__arm64__) -static void arm_prepare_u64_syscall_return(struct sysent *, arm_saved_state64_t *, uthread_t, int); +static void arm_prepare_u64_syscall_return(struct sysent *, arm_saved_state_t *, uthread_t, int); static int arm_get_u64_syscall_args(uthread_t, arm_saved_state64_t *, struct sysent *); static int @@ -460,6 +465,10 @@ arm_get_u64_syscall_args(uthread_t uthread, arm_saved_state64_t *regs, struct sy { int indirect_offset, regparams; +#if CONFIG_REQUIRES_U32_MUNGING + sy_munge_t *mungerp; +#endif + indirect_offset = (regs->x[ARM64_SYSCALL_CODE_REG_NUM] == 0) ? 1 : 0; regparams = 9 - indirect_offset; @@ -472,6 +481,30 @@ arm_get_u64_syscall_args(uthread_t uthread, arm_saved_state64_t *regs, struct sy } memcpy(&uthread->uu_arg[0], ®s->x[indirect_offset], callp->sy_narg * sizeof(uint64_t)); + +#if CONFIG_REQUIRES_U32_MUNGING + /* + * The indirect system call interface is vararg based. For armv7k, arm64_32, + * and arm64, this means we simply lay the values down on the stack, padded to + * a width multiple (4 bytes for armv7k and arm64_32, 8 bytes for arm64). + * The arm64(_32) stub for syscall will load this data into the registers and + * then trap. This gives us register state that corresponds to what we would + * expect from a armv7 task, so in this particular case we need to munge the + * arguments. + * + * TODO: Is there a cleaner way to do this check? What we're actually + * interested in is whether the task is arm64_32. We don't appear to guarantee + * that uu_proc is populated here, which is why this currently uses the + * thread_t. 
+ */ + mungerp = callp->sy_arg_munge32; + assert(uthread->uu_thread); + + if (indirect_offset && !ml_thread_is64bit(uthread->uu_thread)) { + (*mungerp)(&uthread->uu_arg[0]); + } +#endif + return 0; } /* @@ -550,45 +583,49 @@ static void arm_prepare_syscall_return(struct sysent *callp, struct arm_saved_state *state, uthread_t uthread, int error) { if (is_saved_state32(state)) { - arm_prepare_u32_syscall_return(callp, saved_state32(state), uthread, error); + arm_prepare_u32_syscall_return(callp, state, uthread, error); } else { - arm_prepare_u64_syscall_return(callp, saved_state64(state), uthread, error); + arm_prepare_u64_syscall_return(callp, state, uthread, error); } } static void -arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state64_t *regs, uthread_t uthread, int error) +arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state_t *regs, uthread_t uthread, int error) { + assert(is_saved_state64(regs)); + + arm_saved_state64_t *ss64 = saved_state64(regs); + if (error == ERESTART) { - regs->pc -= 4; + ss64->pc -= 4; } else if (error != EJUSTRETURN) { if (error) { - regs->x[0] = error; - regs->x[1] = 0; + ss64->x[0] = error; + ss64->x[1] = 0; /* * Set the carry bit to execute cerror routine. * ARM64_TODO: should we have a separate definition? * The bits are the same. 
*/ - regs->cpsr |= PSR_CF; + ss64->cpsr |= PSR_CF; unix_syscall_return_kprintf("error: setting carry to trigger cerror call\n"); } else { /* (not error) */ switch (callp->sy_return_type) { case _SYSCALL_RET_INT_T: - regs->x[0] = uthread->uu_rval[0]; - regs->x[1] = uthread->uu_rval[1]; + ss64->x[0] = uthread->uu_rval[0]; + ss64->x[1] = uthread->uu_rval[1]; break; case _SYSCALL_RET_UINT_T: - regs->x[0] = (u_int)uthread->uu_rval[0]; - regs->x[1] = (u_int)uthread->uu_rval[1]; + ss64->x[0] = (u_int)uthread->uu_rval[0]; + ss64->x[1] = (u_int)uthread->uu_rval[1]; break; case _SYSCALL_RET_OFF_T: case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: case _SYSCALL_RET_UINT64_T: - regs->x[0] = *((uint64_t *)(&uthread->uu_rval[0])); - regs->x[1] = 0; + ss64->x[0] = *((uint64_t *)(&uthread->uu_rval[0])); + ss64->x[1] = 0; break; case _SYSCALL_RET_NONE: break; diff --git a/bsd/dev/arm/unix_signal.c b/bsd/dev/arm/unix_signal.c index 51c4d7e48..0bc010816 100644 --- a/bsd/dev/arm/unix_signal.c +++ b/bsd/dev/arm/unix_signal.c @@ -30,8 +30,14 @@ extern struct arm_saved_state *get_user_regs(thread_t); extern user_addr_t thread_get_cthread_self(void); extern kern_return_t thread_getstatus(thread_t act, int flavor, thread_state_t tstate, mach_msg_type_number_t *count); +extern kern_return_t thread_getstatus_to_user(thread_t act, int flavor, + thread_state_t tstate, mach_msg_type_number_t *count); +extern kern_return_t machine_thread_state_convert_to_user(thread_t act, int flavor, + thread_state_t tstate, mach_msg_type_number_t *count); extern kern_return_t thread_setstatus(thread_t thread, int flavor, thread_state_t tstate, mach_msg_type_number_t count); +extern kern_return_t thread_setstatus_from_user(thread_t thread, int flavor, + thread_state_t tstate, mach_msg_type_number_t count); /* XXX Put these someplace smarter... 
*/ typedef struct mcontext32 mcontext32_t; typedef struct mcontext64 mcontext64_t; @@ -50,18 +56,24 @@ typedef struct mcontext64 mcontext64_t; #endif static int -sendsig_get_state32(thread_t th_act, mcontext32_t *mcp) +sendsig_get_state32(thread_t th_act, arm_thread_state_t *ts, mcontext32_t *mcp) { void *tstate; mach_msg_type_number_t state_count; - assert(!proc_is64bit(current_proc())); + assert(!proc_is64bit_data(current_proc())); - tstate = (void *) &mcp->ss; + tstate = (void *) ts; state_count = ARM_THREAD_STATE_COUNT; if (thread_getstatus(th_act, ARM_THREAD_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) return EINVAL; + mcp->ss = *ts; + tstate = (void *) &mcp->ss; + state_count = ARM_THREAD_STATE_COUNT; + if (machine_thread_state_convert_to_user(th_act, ARM_THREAD_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) + return EINVAL; + tstate = (void *) &mcp->es; state_count = ARM_EXCEPTION_STATE_COUNT; if (thread_getstatus(th_act, ARM_EXCEPTION_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) @@ -69,7 +81,7 @@ sendsig_get_state32(thread_t th_act, mcontext32_t *mcp) tstate = (void *) &mcp->fs; state_count = ARM_VFP_STATE_COUNT; - if (thread_getstatus(th_act, ARM_VFP_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) + if (thread_getstatus_to_user(th_act, ARM_VFP_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) return EINVAL; return 0; @@ -77,25 +89,31 @@ sendsig_get_state32(thread_t th_act, mcontext32_t *mcp) #if defined(__arm64__) struct user_sigframe64 { - /* We can pass the last arg in a register for ARM64 */ + /* We can pass the last two args in registers for ARM64 */ user64_siginfo_t sinfo; struct user_ucontext64 uctx; mcontext64_t mctx; }; static int -sendsig_get_state64(thread_t th_act, mcontext64_t *mcp) +sendsig_get_state64(thread_t th_act, arm_thread_state64_t *ts, mcontext64_t *mcp) { void *tstate; mach_msg_type_number_t state_count; - assert(proc_is64bit(current_proc())); + 
assert(proc_is64bit_data(current_proc())); - tstate = (void *) &mcp->ss; + tstate = (void *) ts; state_count = ARM_THREAD_STATE64_COUNT; if (thread_getstatus(th_act, ARM_THREAD_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) return EINVAL; + mcp->ss = *ts; + tstate = (void *) &mcp->ss; + state_count = ARM_THREAD_STATE64_COUNT; + if (machine_thread_state_convert_to_user(th_act, ARM_THREAD_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) + return EINVAL; + tstate = (void *) &mcp->es; state_count = ARM_EXCEPTION_STATE64_COUNT; if (thread_getstatus(th_act, ARM_EXCEPTION_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) @@ -103,7 +121,7 @@ sendsig_get_state64(thread_t th_act, mcontext64_t *mcp) tstate = (void *) &mcp->ns; state_count = ARM_NEON_STATE64_COUNT; - if (thread_getstatus(th_act, ARM_NEON_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) + if (thread_getstatus_to_user(th_act, ARM_NEON_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS) return EINVAL; return 0; @@ -127,15 +145,16 @@ sendsig_fill_uctx64(user_ucontext64_t *uctx, int oonstack, int mask, user64_addr static kern_return_t sendsig_set_thread_state64(arm_thread_state64_t *regs, user64_addr_t catcher, int infostyle, int sig, user64_addr_t p_sinfo, - user64_addr_t p_uctx, user64_addr_t trampact, user64_addr_t sp, thread_t th_act) + user64_addr_t p_uctx, user64_addr_t token, user64_addr_t trampact, user64_addr_t sp, thread_t th_act) { - assert(proc_is64bit(current_proc())); + assert(proc_is64bit_data(current_proc())); regs->x[0] = catcher; regs->x[1] = infostyle; regs->x[2] = sig; regs->x[3] = p_sinfo; regs->x[4] = p_uctx; + regs->x[5] = token; regs->pc = trampact; regs->cpsr = PSR64_USER64_DEFAULT; regs->sp = sp; @@ -165,7 +184,7 @@ sendsig_set_thread_state32(arm_thread_state_t *regs, user32_addr_t trampact, user32_addr_t sp, thread_t th_act) { - assert(!proc_is64bit(current_proc())); + assert(!proc_is64bit_data(current_proc())); 
regs->r[0] = catcher; regs->r[1] = infostyle; @@ -220,6 +239,7 @@ sendsig_do_dtrace(uthread_t ut, user_siginfo_t *sinfo, int sig, user_addr_t catc struct user_sigframe32 { user32_addr_t puctx; + user32_addr_t token; user32_siginfo_t sinfo; struct user_ucontext32 uctx; mcontext32_t mctx; @@ -238,6 +258,16 @@ sendsig( __unused uint32_t code ) { + union { + struct ts32 { + arm_thread_state_t ss; + } ts32; +#if defined(__arm64__) + struct ts64 { + arm_thread_state64_t ss; + } ts64; +#endif + } ts; union { struct user_sigframe32 uf32; #if defined(__arm64__) @@ -252,10 +282,13 @@ sendsig( thread_t th_act; struct uthread *ut; user_size_t stack_size = 0; + user_addr_t p_uctx, token_uctx; + kern_return_t kr; th_act = current_thread(); ut = get_bsdthread_info(th_act); + bzero(&ts, sizeof(ts)); bzero(&user_frame, sizeof(user_frame)); if (p->p_sigacts->ps_siginfo & sigmask(sig)) @@ -269,16 +302,16 @@ sendsig( /* * Get sundry thread state. */ - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #ifdef __arm64__ - if (sendsig_get_state64(th_act, &user_frame.uf64.mctx) != 0) { + if (sendsig_get_state64(th_act, &ts.ts64.ss, &user_frame.uf64.mctx) != 0) { goto bad2; } #else panic("Shouldn't have 64-bit thread states on a 32-bit kernel."); #endif } else { - if (sendsig_get_state32(th_act, &user_frame.uf32.mctx) != 0) { + if (sendsig_get_state32(th_act, &ts.ts32.ss, &user_frame.uf32.mctx) != 0) { goto bad2; } } @@ -297,15 +330,15 @@ sendsig( * Get stack pointer, and allocate enough space * for signal handler data. 
*/ - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #if defined(__arm64__) - sp = CAST_USER_ADDR_T(user_frame.uf64.mctx.ss.sp); + sp = CAST_USER_ADDR_T(ts.ts64.ss.sp); sp = (sp - sizeof(user_frame.uf64) - C_64_REDZONE_LEN) & ~0xf; /* Make sure to align to 16 bytes and respect red zone */ #else panic("Shouldn't have 64-bit thread states on a 32-bit kernel."); #endif } else { - sp = CAST_USER_ADDR_T(user_frame.uf32.mctx.ss.sp); + sp = CAST_USER_ADDR_T(ts.ts32.ss.sp); sp -= sizeof(user_frame.uf32); #if defined(__arm__) && (__BIGGEST_ALIGNMENT__ > 4) sp &= ~0xf; /* Make sure to align to 16 bytes for armv7k */ @@ -318,7 +351,7 @@ sendsig( /* * Fill in ucontext (points to mcontext, i.e. thread states). */ - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #if defined(__arm64__) sendsig_fill_uctx64(&user_frame.uf64.uctx, oonstack, mask, sp, (user64_size_t)stack_size, (user64_addr_t)&((struct user_sigframe64*)sp)->mctx); @@ -336,16 +369,16 @@ sendsig( bzero((caddr_t) & sinfo, sizeof(sinfo)); sinfo.si_signo = sig; - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #if defined(__arm64__) - sinfo.si_addr = user_frame.uf64.mctx.ss.pc; - sinfo.pad[0] = user_frame.uf64.mctx.ss.sp; + sinfo.si_addr = ts.ts64.ss.pc; + sinfo.pad[0] = ts.ts64.ss.sp; #else panic("Shouldn't have 64-bit thread states on a 32-bit kernel."); #endif } else { - sinfo.si_addr = user_frame.uf32.mctx.ss.pc; - sinfo.pad[0] = user_frame.uf32.mctx.ss.sp; + sinfo.si_addr = ts.ts32.ss.pc; + sinfo.pad[0] = ts.ts32.ss.sp; } switch (sig) { @@ -368,7 +401,7 @@ sendsig( break; case SIGBUS: - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #if defined(__arm64__) sinfo.si_addr = user_frame.uf64.mctx.es.far; #else @@ -382,7 +415,7 @@ sendsig( break; case SIGSEGV: - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #if defined(__arm64__) sinfo.si_addr = user_frame.uf64.mctx.es.far; #else @@ -460,40 +493,64 @@ sendsig( /* * Copy signal-handling frame out to user space, set thread state. 
*/ - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #if defined(__arm64__) + user64_addr_t token; + /* * mctx filled in when we get state. uctx filled in by * sendsig_fill_uctx64(). We fill in the sinfo now. */ siginfo_user_to_user64(&sinfo, &user_frame.uf64.sinfo); + p_uctx = (user_addr_t)&((struct user_sigframe64*)sp)->uctx; + /* + * Generate the validation token for sigreturn + */ + token_uctx = p_uctx; + kr = machine_thread_siguctx_pointer_convert_to_user(th_act, &token_uctx); + assert(kr == KERN_SUCCESS); + token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token; + if (copyout(&user_frame.uf64, sp, sizeof(user_frame.uf64)) != 0) { goto bad; } - if (sendsig_set_thread_state64(&user_frame.uf64.mctx.ss, + if (sendsig_set_thread_state64(&ts.ts64.ss, catcher, infostyle, sig, (user64_addr_t)&((struct user_sigframe64*)sp)->sinfo, - (user64_addr_t)&((struct user_sigframe64*)sp)->uctx, trampact, sp, th_act) != KERN_SUCCESS) + (user64_addr_t)p_uctx, token, trampact, sp, th_act) != KERN_SUCCESS) goto bad; #else panic("Shouldn't have 64-bit thread states on a 32-bit kernel."); #endif } else { + user32_addr_t token; + /* * mctx filled in when we get state. uctx filled in by - * sendsig_fill_uctx32(). We fill in the sinfo and *pointer* - * to uctx now. + * sendsig_fill_uctx32(). We fill in the sinfo, *pointer* + * to uctx and token now. 
*/ siginfo_user_to_user32(&sinfo, &user_frame.uf32.sinfo); - user_frame.uf32.puctx = (user32_addr_t) &((struct user_sigframe32*)sp)->uctx; + + p_uctx = (user_addr_t)&((struct user_sigframe32*)sp)->uctx; + /* + * Generate the validation token for sigreturn + */ + token_uctx = (user_addr_t)p_uctx; + kr = machine_thread_siguctx_pointer_convert_to_user(th_act, &token_uctx); + assert(kr == KERN_SUCCESS); + token = (user32_addr_t)token_uctx ^ (user32_addr_t)ps->ps_sigreturn_token; + + user_frame.uf32.puctx = (user32_addr_t)p_uctx; + user_frame.uf32.token = token; if (copyout(&user_frame.uf32, sp, sizeof(user_frame.uf32)) != 0) { goto bad; } - if (sendsig_set_thread_state32(&user_frame.uf32.mctx.ss, + if (sendsig_set_thread_state32(&ts.ts32.ss, CAST_DOWN_EXPLICIT(user32_addr_t, catcher), infostyle, sig, (user32_addr_t)&((struct user_sigframe32*)sp)->sinfo, CAST_DOWN_EXPLICIT(user32_addr_t, trampact), CAST_DOWN_EXPLICIT(user32_addr_t, sp), th_act) != KERN_SUCCESS) goto bad; @@ -530,7 +587,7 @@ sigreturn_copyin_ctx32(struct user_ucontext32 *uctx, mcontext32_t *mctx, user_ad { int error; - assert(!proc_is64bit(current_proc())); + assert(!proc_is64bit_data(current_proc())); error = copyin(uctx_addr, uctx, sizeof(*uctx)); if (error) { @@ -557,7 +614,7 @@ sigreturn_copyin_ctx32(struct user_ucontext32 *uctx, mcontext32_t *mctx, user_ad static int sigreturn_set_state32(thread_t th_act, mcontext32_t *mctx) { - assert(!proc_is64bit(current_proc())); + assert(!proc_is64bit_data(current_proc())); /* validate the thread state, set/reset appropriate mode bits in cpsr */ #if defined(__arm__) @@ -568,10 +625,10 @@ sigreturn_set_state32(thread_t th_act, mcontext32_t *mctx) #error Unknown architecture. 
#endif - if (thread_setstatus(th_act, ARM_THREAD_STATE, (void *)&mctx->ss, ARM_THREAD_STATE_COUNT) != KERN_SUCCESS) { + if (thread_setstatus_from_user(th_act, ARM_THREAD_STATE, (void *)&mctx->ss, ARM_THREAD_STATE_COUNT) != KERN_SUCCESS) { return (EINVAL); } - if (thread_setstatus(th_act, ARM_VFP_STATE, (void *)&mctx->fs, ARM_VFP_STATE_COUNT) != KERN_SUCCESS) { + if (thread_setstatus_from_user(th_act, ARM_VFP_STATE, (void *)&mctx->fs, ARM_VFP_STATE_COUNT) != KERN_SUCCESS) { return (EINVAL); } @@ -584,7 +641,7 @@ sigreturn_copyin_ctx64(struct user_ucontext64 *uctx, mcontext64_t *mctx, user_ad { int error; - assert(proc_is64bit(current_proc())); + assert(proc_is64bit_data(current_proc())); error = copyin(uctx_addr, uctx, sizeof(*uctx)); if (error) { @@ -611,15 +668,15 @@ sigreturn_copyin_ctx64(struct user_ucontext64 *uctx, mcontext64_t *mctx, user_ad static int sigreturn_set_state64(thread_t th_act, mcontext64_t *mctx) { - assert(proc_is64bit(current_proc())); + assert(proc_is64bit_data(current_proc())); /* validate the thread state, set/reset appropriate mode bits in cpsr */ mctx->ss.cpsr = (mctx->ss.cpsr & ~PSR64_MODE_MASK) | PSR64_USER64_DEFAULT; - if (thread_setstatus(th_act, ARM_THREAD_STATE64, (void *)&mctx->ss, ARM_THREAD_STATE64_COUNT) != KERN_SUCCESS) { + if (thread_setstatus_from_user(th_act, ARM_THREAD_STATE64, (void *)&mctx->ss, ARM_THREAD_STATE64_COUNT) != KERN_SUCCESS) { return (EINVAL); } - if (thread_setstatus(th_act, ARM_NEON_STATE64, (void *)&mctx->ns, ARM_NEON_STATE64_COUNT) != KERN_SUCCESS) { + if (thread_setstatus_from_user(th_act, ARM_NEON_STATE64, (void *)&mctx->ns, ARM_NEON_STATE64_COUNT) != KERN_SUCCESS) { return (EINVAL); } @@ -648,14 +705,18 @@ sigreturn( #endif } mctx; + struct sigacts *ps = p->p_sigacts; int error, sigmask = 0, onstack = 0; thread_t th_act; struct uthread *ut; + uint32_t sigreturn_validation; + user_addr_t token_uctx; + kern_return_t kr; th_act = current_thread(); ut = (struct uthread *) get_bsdthread_info(th_act); - if 
(proc_is64bit(p)) { + if (proc_is64bit_data(p)) { #if defined(__arm64__) error = sigreturn_copyin_ctx64(&uctx.uc64, &mctx.mc64, uap->uctx); if (error != 0) { @@ -686,18 +747,54 @@ sigreturn( if (ut->uu_siglist & ~ut->uu_sigmask) signal_setast(current_thread()); - if (proc_is64bit(p)) { + sigreturn_validation = atomic_load_explicit( + &ps->ps_sigreturn_validation, memory_order_relaxed); + token_uctx = uap->uctx; + kr = machine_thread_siguctx_pointer_convert_to_user(th_act, &token_uctx); + assert(kr == KERN_SUCCESS); + + if (proc_is64bit_data(p)) { #if defined(__arm64__) + user64_addr_t token; + token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token; + if ((user64_addr_t)uap->token != token) { +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn token mismatch: received 0x%llx expected 0x%llx\n", + p->p_comm, p->p_pid, (user64_addr_t)uap->token, token); +#endif /* DEVELOPMENT || DEBUG */ + if (sigreturn_validation != PS_SIGRETURN_VALIDATION_DISABLED) { + return EINVAL; + } + } error = sigreturn_set_state64(th_act, &mctx.mc64); if (error != 0) { +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn set_state64 error %d\n", + p->p_comm, p->p_pid, error); +#endif /* DEVELOPMENT || DEBUG */ return error; } #else panic("Shouldn't have 64-bit thread states on a 32-bit kernel."); #endif } else { + user32_addr_t token; + token = (user32_addr_t)token_uctx ^ (user32_addr_t)ps->ps_sigreturn_token; + if ((user32_addr_t)uap->token != token) { +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn token mismatch: received 0x%x expected 0x%x\n", + p->p_comm, p->p_pid, (user32_addr_t)uap->token, token); +#endif /* DEVELOPMENT || DEBUG */ + if (sigreturn_validation != PS_SIGRETURN_VALIDATION_DISABLED) { + return EINVAL; + } + } error = sigreturn_set_state32(th_act, &mctx.mc32); if (error != 0) { +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn sigreturn_set_state32 error %d\n", + p->p_comm, p->p_pid, error); +#endif /* DEVELOPMENT 
|| DEBUG */ return error; } } @@ -706,32 +803,22 @@ sigreturn( } /* - * machine_exception() performs MD translation - * of a mach exception to a unix signal and code. + * machine_exception() performs machine-dependent translation + * of a mach exception to a unix signal. */ - -boolean_t -machine_exception( - int exception, - mach_exception_subcode_t code, - __unused mach_exception_subcode_t subcode, - int *unix_signal, - mach_exception_subcode_t * unix_code -) +int +machine_exception(int exception, + __unused mach_exception_code_t code, + __unused mach_exception_subcode_t subcode) { switch (exception) { - case EXC_BAD_INSTRUCTION: - *unix_signal = SIGILL; - *unix_code = code; - break; - - case EXC_ARITHMETIC: - *unix_signal = SIGFPE; - *unix_code = code; - break; + case EXC_BAD_INSTRUCTION: + return SIGILL; - default: - return (FALSE); + case EXC_ARITHMETIC: + return SIGFPE; } - return (TRUE); + + return 0; } + diff --git a/bsd/dev/arm64/cpu_in_cksum.s b/bsd/dev/arm64/cpu_in_cksum.s index 00a00c667..86d892aa3 100644 --- a/bsd/dev/arm64/cpu_in_cksum.s +++ b/bsd/dev/arm64/cpu_in_cksum.s @@ -51,9 +51,15 @@ * routine expects "mbuf-like" argument, and it does not expect the mbuf to be * authentic; it only cares about 3 fields. 
*/ +#if defined(__LP64__) #define M_NEXT 0 #define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary #define M_LEN 24 +#else +#define M_NEXT 0 +#define M_DATA 8 +#define M_LEN 12 +#endif .globl _os_cpu_in_cksum_mbuf .text @@ -98,6 +104,14 @@ _os_cpu_in_cksum_mbuf: #define Wmlen w6 #define t x7 #define data x8 +#if defined(__LP64__) + #define ptr_m x0 + #define ptr_data x8 +#else + #define ptr_m w0 + #define ptr_data w8 +#endif + mov needs_swap, #0 // needs_swap = FALSE; mov started_on_odd, #0 // started_on_odd = FALSE; @@ -128,7 +142,7 @@ _os_cpu_in_cksum_mbuf: ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; cmp mlen, off b.le 1f - ldr data, [m, #M_DATA] // mtod(m, uint8_t *) + ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) sub mlen, mlen, off // mlen -= off; add data, data, off // data = mtod(m, uint8_t *) + off; b L_post_initial_offset @@ -138,7 +152,7 @@ _os_cpu_in_cksum_mbuf: mov x0, x3 ret lr 2: - ldr m, [m, #M_NEXT] + ldr ptr_m, [m, #M_NEXT] b 0b L_loop: // for (; len > 0; m = m->m_next) { @@ -152,7 +166,7 @@ L_loop: // for (; len > 0; m = m->m_next) { */ cbz m, Lin_cksum_whoops // if (m == NULL) return -1; ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; - ldr data, [m, #M_DATA] // mtod(m, uint8_t *) + ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) L_post_initial_offset: /* @@ -374,7 +388,7 @@ L0_bytes: L_continue: cmp len, #0 - ldr m, [m, #M_NEXT] // m = m->m_next + ldr ptr_m, [m, #M_NEXT] // m = m->m_next b.gt L_loop /* diff --git a/bsd/dev/arm64/disassembler.c b/bsd/dev/arm64/disassembler.c index a00f8d0eb..7195d0d72 100644 --- a/bsd/dev/arm64/disassembler.c +++ b/bsd/dev/arm64/disassembler.c @@ -1124,7 +1124,8 @@ struct arm64_decode_entry arm64_decode_table[] = { { .mask = FASTTRAP_ARM64_OP_MASK_ADRP, .value = FASTTRAP_ARM64_OP_VALUE_ADRP, .type = FASTTRAP_T_ARM64_ADRP }, { .mask = FASTTRAP_ARM64_OP_MASK_ADR, .value = FASTTRAP_ARM64_OP_VALUE_ADR, .type = FASTTRAP_T_ARM64_ADR }, { .mask = FASTTRAP_ARM64_OP_MASK_PRFM, .value = 
FASTTRAP_ARM64_OP_VALUE_PRFM, .type = FASTTRAP_T_ARM64_PRFM }, - { .mask = FASTTRAP_ARM64_OP_MASK_EXCL_MEM, .value = FASTTRAP_ARM64_OP_VALUE_EXCL_MEM, .type = FASTTRAP_T_ARM64_EXCLUSIVE_MEM }}; + { .mask = FASTTRAP_ARM64_OP_MASK_EXCL_MEM, .value = FASTTRAP_ARM64_OP_VALUE_EXCL_MEM, .type = FASTTRAP_T_ARM64_EXCLUSIVE_MEM }, + { .mask = FASTTRAP_ARM64_OP_MASK_RETAB, .value = FASTTRAP_ARM64_OP_VALUE_RETAB, .type = FASTTRAP_T_ARM64_RETAB }}; #define NUM_DECODE_ENTRIES (sizeof(arm64_decode_table) / sizeof(struct arm64_decode_entry)) diff --git a/bsd/dev/arm64/dtrace_isa.c b/bsd/dev/arm64/dtrace_isa.c index 3f81cb706..bd2716b95 100644 --- a/bsd/dev/arm64/dtrace_isa.c +++ b/bsd/dev/arm64/dtrace_isa.c @@ -28,11 +28,15 @@ #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from * mach/ppc/thread_status.h */ +#include #include #include #include +#if __has_include() +#include +#endif #include #include #include @@ -194,6 +198,11 @@ dtrace_getreg(struct regs * savearea, uint_t reg) { struct arm_saved_state *regs = (struct arm_saved_state *) savearea; + if (regs == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } + if (is_saved_state32(regs)) { // Fix special registers if user is 32 bits switch (reg) { @@ -231,7 +240,7 @@ dtrace_getustack_common(uint64_t * pcstack, int pcstack_limit, user_addr_t pc, user_addr_t sp) { int ret = 0; - boolean_t is64bit = proc_is64bit(current_proc()); + boolean_t is64bit = proc_is64bit_data(current_proc()); ASSERT(pcstack == NULL || pcstack_limit > 0); @@ -359,7 +368,7 @@ void dtrace_getufpstack(uint64_t * pcstack, uint64_t * fpstack, int pcstack_limit) { thread_t thread = current_thread(); - boolean_t is64bit = proc_is64bit(current_proc()); + boolean_t is64bit = proc_is64bit_data(current_proc()); savearea_t *regs; user_addr_t pc, sp; volatile uint16_t *flags = (volatile uint16_t *) & cpu_core[CPU->cpu_id].cpuc_dtrace_flags; @@ -608,7 +617,11 @@ dtrace_getarg(int arg, int aframes, dtrace_mstate_t *mstate, 
dtrace_vstate_t *vs for (i = 1; i <= aframes; ++i) { fp = fp->backchain; +#if __has_feature(ptrauth_returns) + pc = (uintptr_t)ptrauth_strip((void*)fp->retaddr, ptrauth_key_return_address); +#else pc = fp->retaddr; +#endif if (dtrace_invop_callsite_pre != NULL && pc > (uintptr_t) dtrace_invop_callsite_pre @@ -628,7 +641,7 @@ dtrace_getarg(int arg, int aframes, dtrace_mstate_t *mstate, dtrace_vstate_t *vs } else { /* the argument will be found in the stack */ fp = (struct frame*) saved_state->sp; - stack = (uintptr_t*) &fp[1]; + stack = (uintptr_t*) &fp[1]; arg -= (inreg + 1); } @@ -694,3 +707,12 @@ dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) func(VM_MAX_KERNEL_ADDRESS + 1, ~(uintptr_t)0); } +void dtrace_flush_caches(void) +{ + /* TODO There were some problems with flushing just the cache line that had been modified. + * For now, we'll flush the entire cache, until we figure out how to flush just the patched block. + */ + FlushPoU_Dcache(); + InvalidatePoU_Icache(); +} + diff --git a/bsd/dev/arm64/fasttrap_isa.c b/bsd/dev/arm64/fasttrap_isa.c index c0af6a9e2..8643cbd92 100644 --- a/bsd/dev/arm64/fasttrap_isa.c +++ b/bsd/dev/arm64/fasttrap_isa.c @@ -36,7 +36,6 @@ #define _KERNEL /* Solaris vs. Darwin */ #endif #endif - #include #include #include @@ -54,6 +53,11 @@ #include +#if __has_include() +#include +#endif + + extern dtrace_id_t dtrace_probeid_error; /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */ @@ -117,17 +121,6 @@ extern int dtrace_decode_thumb(uint32_t instr); #define ARM_LDR_UF (1 << 23) #define ARM_LDR_BF (1 << 22) -static void -flush_caches(void) -{ - /* TODO There were some problems with flushing just the cache line that had been modified. - * For now, we'll flush the entire cache, until we figure out how to flush just the patched block. 
- */ - FlushPoU_Dcache(); - InvalidatePoU_Icache(); -} - - static int fasttrap_tracepoint_init32 (proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t); static int fasttrap_tracepoint_init64 (proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t); @@ -135,7 +128,7 @@ int fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, user_addr_t pc, fasttrap_probe_type_t type) { - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { return fasttrap_tracepoint_init64(p, tp, pc, type); } else { return fasttrap_tracepoint_init32(p, tp, pc, type); @@ -250,6 +243,8 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp, if (tp->ftt_fntype != FASTTRAP_FN_DONE_INIT) { switch(tp->ftt_fntype) { case FASTTRAP_FN_UNKNOWN: + case FASTTRAP_FN_ARM64: + case FASTTRAP_FN_ARM64_32: /* * On arm64 there is no distinction between * arm vs. thumb mode instruction types. @@ -299,90 +294,6 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp, return (0); } -// These are not exported from vm_map.h. -extern kern_return_t vm_map_write_user(vm_map_t map, void *src_p, vm_map_address_t dst_addr, vm_size_t size); - -/* Patches the instructions. Almost like uwrite, but need special instructions on ARM to flush the caches. */ -static -int patchInst(proc_t *p, void *buf, user_size_t len, user_addr_t a) -{ - kern_return_t ret; - - ASSERT(p != NULL); - ASSERT(p->task != NULL); - - task_t task = p->task; - - /* - * Grab a reference to the task vm_map_t to make sure - * the map isn't pulled out from under us. - * - * Because the proc_lock is not held at all times on all code - * paths leading here, it is possible for the proc to have - * exited. If the map is null, fail. - */ - vm_map_t map = get_task_map_reference(task); - if (map) { - /* Find the memory permissions. 
*/ - uint32_t nestingDepth=999999; - vm_region_submap_short_info_data_64_t info; - mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; - mach_vm_address_t address = (mach_vm_address_t)a; - mach_vm_size_t sizeOfRegion = (mach_vm_size_t)len; - - ret = mach_vm_region_recurse(map, &address, &sizeOfRegion, &nestingDepth, (vm_region_recurse_info_t)&info, &count); - if (ret != KERN_SUCCESS) - goto done; - - vm_prot_t reprotect; - - if (!(info.protection & VM_PROT_WRITE)) { - /* Save the original protection values for restoration later */ - reprotect = info.protection; - if (info.max_protection & VM_PROT_WRITE) { - /* The memory is not currently writable, but can be made writable. */ - /* Making it both writable and executable at the same time causes warning on embedded */ - ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, (reprotect & ~VM_PROT_EXECUTE) | VM_PROT_WRITE); - } else { - /* - * The memory is not currently writable, and cannot be made writable. We need to COW this memory. - * - * Strange, we can't just say "reprotect | VM_PROT_COPY", that fails. - */ - ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, VM_PROT_COPY | VM_PROT_READ | VM_PROT_WRITE); - } - - if (ret != KERN_SUCCESS) - goto done; - - } else { - /* The memory was already writable. 
*/ - reprotect = VM_PROT_NONE; - } - - ret = vm_map_write_user( map, - buf, - (vm_map_address_t)a, - (vm_size_t)len); - - flush_caches(); - - if (ret != KERN_SUCCESS) - goto done; - - if (reprotect != VM_PROT_NONE) { - ASSERT(reprotect & VM_PROT_EXECUTE); - ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, reprotect); - } - -done: - vm_map_deallocate(map); - } else - ret = KERN_TERMINATED; - - return (int)ret; -} - int fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) { @@ -390,7 +301,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) uint32_t instr; int size; - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { size = 4; instr = FASTTRAP_ARM64_INSTR; } @@ -403,7 +314,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) } } - if (patchInst(p, &instr, size, tp->ftt_pc) != 0) + if (uwrite(p, &instr, size, tp->ftt_pc) != 0) return (-1); tp->ftt_installed = 1; @@ -418,7 +329,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) uint32_t instr; int size; - if (proc_is64bit(p)) { + if (proc_is64bit_data(p)) { /* * Distinguish between read or write failures and a changed * instruction. 
@@ -447,7 +358,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) } } - if (patchInst(p, &tp->ftt_instr, size, tp->ftt_pc) != 0) + if (uwrite(p, &tp->ftt_instr, size, tp->ftt_pc) != 0) return (-1); end: @@ -501,7 +412,7 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_ } else { /* ARM64_TODO - check for FASTTRAP_T_RET */ - if ((tp->ftt_type != FASTTRAP_T_ARM64_RET) && + /* skip only tracepoints that are neither RET nor RETAB (|| here would always be true) */ if ((tp->ftt_type != FASTTRAP_T_ARM64_RET && tp->ftt_type != FASTTRAP_T_ARM64_RETAB) && new_pc - probe->ftp_faddr < probe->ftp_fsize) continue; } @@ -1214,7 +1125,7 @@ fasttrap_pid_probe_handle_patched_instr32(arm_saved_state_t *state, fasttrap_tra SET32(scratch+i, FASTTRAP_ARM32_RET_INSTR); i += 4; } - if (patchInst(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) { + if (uwrite(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) { fasttrap_sigtrap(p, uthread, pc); new_pc = pc; break; @@ -1280,7 +1191,7 @@ fasttrap_pid_probe_thunk_instr64(arm_saved_state_t *state, fasttrap_tracepoint_t return; } - if (patchInst(p, local_scratch, (num_instrs + 1) * sizeof(uint32_t), user_scratch_area) != KERN_SUCCESS) { + if (uwrite(p, local_scratch, (num_instrs + 1) * sizeof(uint32_t), user_scratch_area) != KERN_SUCCESS) { fasttrap_sigtrap(p, uthread, pc); *pc_out = pc; return; @@ -1292,6 +1203,7 @@ fasttrap_pid_probe_thunk_instr64(arm_saved_state_t *state, fasttrap_tracepoint_t /* We may or may not be about to run a return probe (but we wouldn't thunk ret lr)*/ uthread->t_dtrace_ret = (tp->ftt_retids != NULL); assert(tp->ftt_type != FASTTRAP_T_ARM64_RET); + assert(tp->ftt_type != FASTTRAP_T_ARM64_RETAB); /* Set address of instruction we've patched */ uthread->t_dtrace_pc = pc; @@ -1729,10 +1641,22 @@ fasttrap_pid_probe_handle_patched_instr64(arm_saved_state_t *state, fasttrap_tra /* Set PC to register value (xzr, not sp) */ new_pc = get_saved_state64_regno(regs64, regno, 1); + *was_simulated = 1; break; } + case
FASTTRAP_T_ARM64_RETAB: + { + /* Set PC to register value (xzr, not sp) */ + new_pc = get_saved_state64_regno(regs64, 30, 1); +#if __has_feature(ptrauth_calls) + new_pc = (user_addr_t) ptrauth_strip((void *)new_pc, ptrauth_key_return_address); +#endif + *was_simulated = 1; + break; + + } /* * End branches. */ diff --git a/bsd/dev/arm64/fbt_arm.c b/bsd/dev/arm64/fbt_arm.c index c2f348f9a..3364a066e 100644 --- a/bsd/dev/arm64/fbt_arm.c +++ b/bsd/dev/arm64/fbt_arm.c @@ -61,6 +61,10 @@ #include +#if __has_include() +#include +#endif + #define DTRACE_INVOP_PUSH_FRAME 11 #define DTRACE_INVOP_NOP_SKIP 4 @@ -90,7 +94,7 @@ (((x) & 0xffc07fff) == 0xa9407bfd || ((x) & 0xffc07fff) == 0xa8c07bfd) #define FBT_IS_ARM64_ADD_FP_SP(x) (((x) & 0xffc003ff) == 0x910003fd) /* add fp, sp, #val (add fp, sp, #0 == mov fp, sp) */ -#define FBT_IS_ARM64_RET(x) ((x) == 0xd65f03c0) /* ret */ +#define FBT_IS_ARM64_RET(x) (((x) == 0xd65f03c0) || ((x) == 0xd65f0fff)) /* ret, retab */ #define FBT_B_MASK 0xff000000 @@ -128,19 +132,19 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval) if (fbt->fbtp_roffset == 0) { /* * Stack looks like this: - * + * * [Higher addresses] - * + * * Frame of caller * Extra args for callee - * ------------------------ + * ------------------------ * Frame from traced function: * ------------------------ * arm_context_t * ------------------------ * Frame from trap handler: * The traced function never got to mov fp, sp, - * so there is no frame in the backtrace pointing + * so there is no frame in the backtrace pointing * to the frame on the stack containing the LR in the * caller. * ------------------------ @@ -155,29 +159,29 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval) arm_saved_state_t *regs = (arm_saved_state_t *)(&((arm_context_t *)stack)->ss); - /* - * cpu_dtrace_caller compensates for fact that the traced function never got to update its fp. 
- * When walking the stack, when we reach the frame where we extract a PC in the patched + /* + * cpu_dtrace_caller compensates for fact that the traced function never got to update its fp. + * When walking the stack, when we reach the frame where we extract a PC in the patched * function, we put the cpu_dtrace_caller in the backtrace instead. The next frame we extract - * will be in the caller's caller, so we output a backtrace starting at the caller and going + * will be in the caller's caller, so we output a backtrace starting at the caller and going * sequentially up the stack. */ - CPU->cpu_dtrace_caller = get_saved_state_lr(regs); + CPU->cpu_dtrace_caller = get_saved_state_lr(regs); dtrace_probe(fbt->fbtp_id, get_saved_state_reg(regs, 0), get_saved_state_reg(regs, 1), get_saved_state_reg(regs, 2), get_saved_state_reg(regs, 3),get_saved_state_reg(regs, 4)); CPU->cpu_dtrace_caller = 0; } else { /* * When fbtp_roffset is non-zero, we know we are handling a return probe point. - * + * * * Stack looks like this, as we've already popped the frame in the traced callee, and * we trap with lr set to the return address in the caller. * [Higher addresses] - * + * * Frame of caller * Extra args for callee - * ------------------------ + * ------------------------ * arm_context_t * ------------------------ * Frame from trap handler: @@ -198,7 +202,7 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval) } CPU->cpu_dtrace_invop_underway = 0; } - + /* On other architectures, we return a DTRACE constant to let the callback function know what was replaced. 
On the ARM, since the function prologue/epilogue machine code @@ -280,8 +284,11 @@ fbt_perfCallback( retval = KERN_SUCCESS; } else if (FBT_IS_ARM64_RET(emul)) { lr = get_saved_state_lr(regs); +#if __has_feature(ptrauth_calls) + lr = (user_addr_t) ptrauth_strip((void *)lr, ptrauth_key_return_address); +#endif set_saved_state_pc(regs, lr); - retval = KERN_SUCCESS; + retval = KERN_SUCCESS; } else if (FBT_IS_ARM64_B_INSTR(emul)) { pc = get_saved_state_pc(regs); imm = FBT_GET_ARM64_B_IMM(emul); @@ -301,20 +308,19 @@ fbt_perfCallback( } void -fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart) +fbt_provide_probe(struct modctl *ctl, const char *modname, const char* symbolName, machine_inst_t* symbolStart, machine_inst_t *instrHigh) { - unsigned int j; int doenable = 0; dtrace_id_t thisid; fbt_probe_t *newfbt, *retfbt, *entryfbt; machine_inst_t *instr, *pushinstr = NULL, *limit, theInstr; int foundPushLR, savedRegs; - + /* - * Guard against null symbols + * Guard against null and invalid symbols */ - if (!symbolStart || !instrLow || !instrHigh) { + if (!symbolStart || !instrHigh || instrHigh < symbolStart) { kprintf("dtrace: %s has an invalid address\n", symbolName); return; } @@ -322,15 +328,13 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c /* * Assume the compiler doesn't schedule instructions in the prologue. 
*/ - foundPushLR = 0; savedRegs = -1; limit = (machine_inst_t *)instrHigh; assert(sizeof(*instr) == 4); - for (j = 0, instr = symbolStart, theInstr = 0; - (j < 8) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr)); j++, instr++) + for (instr = symbolStart, theInstr = 0; instr < instrHigh; instr++) { /* * Count the number of time we pushed something onto the stack @@ -361,7 +365,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP); newfbt->fbtp_next = NULL; strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); - + if (thisid != 0) { /* * The dtrace_probe previously existed, so we have to hook @@ -417,7 +421,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c doenable=0; thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN); - + if (thisid != 0) { /* The dtrace_probe previously existed, so we have to * find the end of the existing fbt chain. If we find @@ -455,7 +459,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c * OK, it's an instruction. */ theInstr = *instr; - + /* Walked onto the start of the next routine? 
If so, bail out from this function */ if (FBT_IS_ARM64_FRAME_PUSH(theInstr)) { if (!retfbt) @@ -498,7 +502,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c return; newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP); - newfbt->fbtp_next = NULL; + newfbt->fbtp_next = NULL; strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); if (retfbt == NULL) { @@ -529,80 +533,3 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c instr++; goto again; } - -void -fbt_provide_module_kernel_syms(struct modctl *ctl) -{ - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - kernel_nlist_t *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i; - - mh = (kernel_mach_header_t *)(ctl->mod_address); - modname = ctl->mod_modname; - - /* - * Employees of dtrace and their families are ineligible. Void - * where prohibited. 
- */ - - if (mh->magic != MH_MAGIC_KERNEL) - return; - - cmd = (struct load_command *) & mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT_KERNEL) { - kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single - * unnamed segment */ - } else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; - - cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - /* Find extent of the TEXT section */ - instrLow = (uintptr_t) orig_ts->vmaddr; - instrHigh = (uintptr_t) (orig_ts->vmaddr + orig_ts->vmsize); - - for (i = 0; i < orig_st->nsyms; i++) { - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; - - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; - - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. */ - if (*name == '_') - name += 1; - - /* - * We're only blacklisting functions in the kernel for now. 
- */ - if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) - continue; - - fbt_provide_probe(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value); - } -} diff --git a/bsd/dev/arm64/sysctl.c b/bsd/dev/arm64/sysctl.c index 22dcc12d7..deb952d44 100644 --- a/bsd/dev/arm64/sysctl.c +++ b/bsd/dev/arm64/sysctl.c @@ -7,6 +7,11 @@ #include +#include +#include +#include +#include + extern uint64_t wake_abstime; static @@ -53,3 +58,121 @@ SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime, "Continuous Time at the last wakeup"); +/* + * For source compatibility, here's some machdep.cpu mibs that + * use host_info() to simulate reasonable answers. + */ + +SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "CPU info"); + +static int +arm_host_info SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + + host_basic_info_data_t hinfo; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; +#define BSD_HOST 1 + kern_return_t kret = host_info((host_t)BSD_HOST, + HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + if (KERN_SUCCESS != kret) + return (EINVAL); + + if (sizeof (uint32_t) != arg2) + panic("size mismatch"); + + uintptr_t woffset = (uintptr_t)arg1 / sizeof (uint32_t); + uint32_t datum = *(uint32_t *)(((uint32_t *)&hinfo) + woffset); + return (SYSCTL_OUT(req, &datum, sizeof (datum))); +} + +/* + * machdep.cpu.cores_per_package + * + * x86: derived from CPUID data. + * ARM: how many physical cores we have in the AP; aka hw.physicalcpu_max + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, cores_per_package, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, physical_cpu_max), + sizeof (integer_t), + arm_host_info, "I", "CPU cores per package"); + +/* + * machdep.cpu.core_count + * + * x86: derived from CPUID data. 
+ * ARM: # active physical cores in the AP; aka hw.physicalcpu + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, core_count, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, physical_cpu), + sizeof (integer_t), + arm_host_info, "I", "Number of enabled cores per package"); + +/* + * machdep.cpu.logical_per_package + * + * x86: derived from CPUID data. Returns ENOENT if HTT bit not set, but + * most x64 CPUs have that, so assume it's available. + * ARM: total # logical cores in the AP; aka hw.logicalcpu_max + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, logical_per_package, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, logical_cpu_max), + sizeof (integer_t), + arm_host_info, "I", "CPU logical cpus per package"); + +/* + * machdep.cpu.thread_count + * + * x86: derived from CPUID data. + * ARM: # active logical cores in the AP; aka hw.logicalcpu + */ +static +SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(host_basic_info_data_t, logical_cpu), + sizeof (integer_t), + arm_host_info, "I", "Number of enabled threads per package"); + +/* + * machdep.cpu.brand_string + * + * x86: derived from CPUID data. + * ARM: cons something up from the CPUID register. Could include cpufamily + * here and map it to a "marketing" name, but there's no obvious need; + * the value is already exported via the commpage. So keep it simple. 
+ */ +static int +make_brand_string SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; + + const char *impl; + + switch (cpuid_info()->arm_info.arm_implementor) { + case CPU_VID_APPLE: + impl = "Apple"; + break; + case CPU_VID_ARM: + impl = "ARM"; + break; + default: + impl = "ARM architecture"; + break; + } + char buf[80]; + snprintf(buf, sizeof (buf), "%s processor", impl); + return (SYSCTL_OUT(req, buf, strlen(buf) + 1)); +} + +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, make_brand_string, "A", "CPU brand string"); diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 587a8b438..a83adc712 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -120,6 +120,8 @@ extern kmod_info_t g_kernel_kmod_info; extern void dtrace_suspend(void); extern void dtrace_resume(void); +extern void dtrace_early_init(void); +extern int dtrace_keep_kernel_symbols(void); extern void dtrace_init(void); extern void helper_init(void); extern void fasttrap_init(void); @@ -131,6 +133,7 @@ extern void dtrace_postinit(void); extern void dtrace_proc_fork(proc_t*, proc_t*, int); extern void dtrace_proc_exec(proc_t*); extern void dtrace_proc_exit(proc_t*); + /* * DTrace Tunable Variables * @@ -205,13 +208,14 @@ unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */ */ static dev_info_t *dtrace_devi; /* device info */ static vmem_t *dtrace_arena; /* probe ID arena */ -static taskq_t *dtrace_taskq; /* task queue */ static dtrace_probe_t **dtrace_probes; /* array of all probes */ static int dtrace_nprobes; /* number of probes */ static dtrace_provider_t *dtrace_provider; /* provider list */ static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ static int dtrace_opens; /* number of opens */ static int dtrace_helpers; /* number of helpers */ +static dtrace_hash_t *dtrace_strings; +static 
dtrace_hash_t *dtrace_byprov; /* probes hashed by provider */ static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ static dtrace_hash_t *dtrace_byname; /* probes hashed by name */ @@ -237,7 +241,7 @@ static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's */ int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */ static uint32_t dtrace_wake_clients; - +static uint8_t dtrace_kerneluuid[16]; /* the 128-bit uuid */ /* * To save memory, some common memory allocations are given a @@ -328,17 +332,17 @@ dtrace_enable_nullop(void) return (0); } -static dtrace_pops_t dtrace_provider_ops = { - (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, - (void (*)(void *, struct modctl *))dtrace_nullop, - (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, - NULL, - NULL, - NULL, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop +static dtrace_pops_t dtrace_provider_ops = { + .dtps_provide = (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, + .dtps_provide_module = (void (*)(void *, struct modctl *))dtrace_nullop, + .dtps_enable = (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, /* int-returning stub; dtrace_unregister() compares dtps_enable against it */ + .dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + .dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + .dtps_resume = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + .dtps_getargdesc = NULL, + .dtps_getargval = NULL, + .dtps_usermode = NULL, + .dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, }; static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */ @@ -393,18 +397,22 @@ static lck_mtx_t dtrace_errlock; * outside of the implementation. 
There is no real structure to this cpp * mishmash -- but is there ever? */ -#define DTRACE_HASHSTR(hash, probe) \ - dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs))) -#define DTRACE_HASHNEXT(hash, probe) \ - (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs) +#define DTRACE_GETSTR(hash, elm) \ + (hash->dth_getstr(elm, hash->dth_stroffs)) + +#define DTRACE_HASHSTR(hash, elm) \ + dtrace_hash_str(DTRACE_GETSTR(hash, elm)) + +#define DTRACE_HASHNEXT(hash, elm) \ + (void**)((uintptr_t)(elm) + (hash)->dth_nextoffs) -#define DTRACE_HASHPREV(hash, probe) \ - (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs) +#define DTRACE_HASHPREV(hash, elm) \ + (void**)((uintptr_t)(elm) + (hash)->dth_prevoffs) #define DTRACE_HASHEQ(hash, lhs, rhs) \ - (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \ - *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0) + (strcmp(DTRACE_GETSTR(hash, lhs), \ + DTRACE_GETSTR(hash, rhs)) == 0) #define DTRACE_AGGHASHSIZE_SLEW 17 @@ -756,6 +764,9 @@ sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS if (value <= 0) return (ERANGE); + if (value >= dtrace_copy_maxsize()) + return (ERANGE); + lck_mtx_lock(&dtrace_lock); dtrace_dof_maxsize = value; lck_mtx_unlock(&dtrace_lock); @@ -851,6 +862,15 @@ SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes, &dtrace_provide_private_probes, 0, sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes"); +/* + * kern.dtrace.dof_mode + * + * Returns the current DOF mode. + * This value is read-only. 
+ */ +SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED, + &dtrace_dof_mode, 0, "dtrace dof mode"); + /* * DTrace Probe Context Functions * @@ -7012,12 +7032,33 @@ dtrace_hash_str(const char *p) return (hval); } +static const char* +dtrace_strkey_probe_provider(void *elm, uintptr_t offs) +{ +#pragma unused(offs) + dtrace_probe_t *probe = (dtrace_probe_t*)elm; + return probe->dtpr_provider->dtpv_name; +} + +static const char* +dtrace_strkey_offset(void *elm, uintptr_t offs) +{ + return ((char *)((uintptr_t)(elm) + offs)); +} + +static const char* +dtrace_strkey_deref_offset(void *elm, uintptr_t offs) +{ + return *((char **)((uintptr_t)(elm) + offs)); +} + static dtrace_hash_t * -dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs) +dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs) { dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP); - hash->dth_stroffs = stroffs; + hash->dth_getstr = func; + hash->dth_stroffs = arg; hash->dth_nextoffs = nextoffs; hash->dth_prevoffs = prevoffs; @@ -7066,10 +7107,10 @@ dtrace_hash_resize(dtrace_hash_t *hash) for (i = 0; i < size; i++) { for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) { - dtrace_probe_t *probe = bucket->dthb_chain; + void *elm = bucket->dthb_chain; - ASSERT(probe != NULL); - ndx = DTRACE_HASHSTR(hash, probe) & new_mask; + ASSERT(elm != NULL); + ndx = DTRACE_HASHSTR(hash, elm) & new_mask; next = bucket->dthb_next; bucket->dthb_next = new_tab[ndx]; @@ -7084,12 +7125,12 @@ dtrace_hash_resize(dtrace_hash_t *hash) } static void -dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new) +dtrace_hash_add(dtrace_hash_t *hash, void *new) { int hashval = DTRACE_HASHSTR(hash, new); int ndx = hashval & hash->dth_mask; dtrace_hashbucket_t *bucket = hash->dth_tab[ndx]; - dtrace_probe_t **nextp, **prevp; + void **nextp, **prevp; for (; bucket != NULL; bucket = bucket->dthb_next) { if (DTRACE_HASHEQ(hash, 
bucket->dthb_chain, new)) @@ -7122,23 +7163,29 @@ dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new) bucket->dthb_len++; } -static dtrace_probe_t * -dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template) +static void * +dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str) { - int hashval = DTRACE_HASHSTR(hash, template); + int hashval = dtrace_hash_str(str); int ndx = hashval & hash->dth_mask; dtrace_hashbucket_t *bucket = hash->dth_tab[ndx]; for (; bucket != NULL; bucket = bucket->dthb_next) { - if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template)) + if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0) return (bucket->dthb_chain); } return (NULL); } +static dtrace_probe_t * +dtrace_hash_lookup(dtrace_hash_t *hash, void *template) +{ + return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template)); +} + static int -dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template) +dtrace_hash_collisions(dtrace_hash_t *hash, void *template) { int hashval = DTRACE_HASHSTR(hash, template); int ndx = hashval & hash->dth_mask; @@ -7153,19 +7200,19 @@ dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template) } static void -dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe) +dtrace_hash_remove(dtrace_hash_t *hash, void *elm) { - int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask; + int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask; dtrace_hashbucket_t *bucket = hash->dth_tab[ndx]; - dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe); - dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe); + void **prevp = DTRACE_HASHPREV(hash, elm); + void **nextp = DTRACE_HASHNEXT(hash, elm); /* - * Find the bucket that we're removing this probe from. + * Find the bucket that we're removing this elm from. 
*/ for (; bucket != NULL; bucket = bucket->dthb_next) { - if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe)) + if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm)) break; } @@ -7174,12 +7221,12 @@ dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe) if (*prevp == NULL) { if (*nextp == NULL) { /* - * The removed probe was the only probe on this + * The removed element was the only element on this * bucket; we need to remove the bucket. */ dtrace_hashbucket_t *b = hash->dth_tab[ndx]; - ASSERT(bucket->dthb_chain == probe); + ASSERT(bucket->dthb_chain == elm); ASSERT(b != NULL); if (b == bucket) { @@ -7219,20 +7266,63 @@ dtrace_badattr(const dtrace_attribute_t *a) } /* - * Return a duplicate copy of a string. If the specified string is NULL, - * this function returns a zero-length string. - * APPLE NOTE: Darwin employs size bounded string operation. + * Returns a dtrace-managed copy of a string, and will + * deduplicate copies of the same string. + * If the specified string is NULL, returns an empty string */ static char * -dtrace_strdup(const char *str) +dtrace_strref(const char *str) { + dtrace_string_t *s = NULL; size_t bufsize = (str != NULL ? 
strlen(str) : 0) + 1; - char *new = kmem_zalloc(bufsize, KM_SLEEP); - if (str != NULL) - (void) strlcpy(new, str, bufsize); + LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); - return (new); + if (str == NULL) + str = ""; + + for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL; + s = *(DTRACE_HASHNEXT(dtrace_strings, s))) { + if (strncmp(str, s->dtst_str, bufsize) != 0) { + continue; + } + ASSERT(s->dtst_refcount != UINT32_MAX); + s->dtst_refcount++; + return s->dtst_str; + } + + s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP); + s->dtst_refcount = 1; + (void) strlcpy(s->dtst_str, str, bufsize); + + dtrace_hash_add(dtrace_strings, s); + + return s->dtst_str; +} + +static void +dtrace_strunref(const char *str) +{ + ASSERT(str != NULL); + dtrace_string_t *s = NULL; + size_t bufsize = strlen(str) + 1; + + LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); + + for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL; + s = *(DTRACE_HASHNEXT(dtrace_strings, s))) { + if (strncmp(str, s->dtst_str, bufsize) != 0) { + continue; + } + ASSERT(s->dtst_refcount != 0); + s->dtst_refcount--; + if (s->dtst_refcount == 0) { + dtrace_hash_remove(dtrace_strings, s); + kmem_free(s, sizeof(dtrace_string_t) + bufsize); + } + return; + } + panic("attempt to unref non-existent string %s", str); } #define DTRACE_ISALPHA(c) \ @@ -7529,9 +7619,27 @@ static int dtrace_match_string(const char *s, const char *p, int depth) { #pragma unused(depth) /* __APPLE__ */ + return (s != NULL && s == p); +} - /* APPLE NOTE: Darwin employs size bounded string operation. */ - return (s != NULL && strncmp(s, p, strlen(s) + 1) == 0); +/*ARGSUSED*/ +static int +dtrace_match_module(const char *s, const char *p, int depth) +{ +#pragma unused(depth) /* __APPLE__ */ + size_t len; + if (s == NULL || p == NULL) + return (0); + + len = strlen(p); + + if (strncmp(p, s, len) != 0) + return (0); + + if (s[len] == '.' 
|| s[len] == '\0') + return (1); + + return (0); } /*ARGSUSED*/ @@ -7554,7 +7662,18 @@ static int dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2) { - dtrace_probe_t template, *probe; + dtrace_probe_t *probe; + dtrace_provider_t prov_template = { + .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov + }; + + dtrace_probe_t template = { + .dtpr_provider = &prov_template, + .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod, + .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func, + .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name + }; + dtrace_hash_t *hash = NULL; int len, rc, best = INT_MAX, nmatched = 0; dtrace_id_t i; @@ -7575,16 +7694,19 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, return (nmatched); } - template.dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod; - template.dtpr_func = (char *)(uintptr_t)pkp->dtpk_func; - template.dtpr_name = (char *)(uintptr_t)pkp->dtpk_name; - /* - * We want to find the most distinct of the module name, function - * name, and name. So for each one that is not a glob pattern or - * empty string, we perform a lookup in the corresponding hash and - * use the hash table with the fewest collisions to do our search. + * We want to find the most distinct of the provider name, module name, + * function name, and name. So for each one that is not a glob + * pattern or empty string, we perform a lookup in the corresponding + * hash and use the hash table with the fewest collisions to do our + * search. 
*/ + if (pkp->dtpk_pmatch == &dtrace_match_string && + (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) { + best = len; + hash = dtrace_byprov; + } + if (pkp->dtpk_mmatch == &dtrace_match_string && (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) { best = len; @@ -7671,6 +7793,24 @@ dtrace_probekey_func(const char *p) return (&dtrace_match_string); } +static dtrace_probekey_f * +dtrace_probekey_module_func(const char *p) +{ + LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); + + dtrace_probekey_f *f = dtrace_probekey_func(p); + if (f == &dtrace_match_string) { + dtrace_probe_t template = { + .dtpr_mod = (char *)(uintptr_t)p, + }; + if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) { + return (&dtrace_match_module); + } + return (&dtrace_match_string); + } + return f; +} + /* * Build a probe comparison key for use with dtrace_match_probe() from the * given probe description. By convention, a null key only matches anchored @@ -7680,16 +7820,17 @@ dtrace_probekey_func(const char *p) static void dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp) { - pkp->dtpk_prov = pdp->dtpd_provider; + + pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider); pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider); - pkp->dtpk_mod = pdp->dtpd_mod; - pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod); + pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod); + pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod); - pkp->dtpk_func = pdp->dtpd_func; + pkp->dtpk_func = dtrace_strref(pdp->dtpd_func); pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func); - pkp->dtpk_name = pdp->dtpd_name; + pkp->dtpk_name = dtrace_strref(pdp->dtpd_name); pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name); pkp->dtpk_id = pdp->dtpd_id; @@ -7702,6 +7843,15 @@ dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp) pkp->dtpk_fmatch = &dtrace_match_nonzero; } +static void +dtrace_probekey_release(dtrace_probekey_t *pkp) +{ + 
dtrace_strunref(pkp->dtpk_prov); + dtrace_strunref(pkp->dtpk_mod); + dtrace_strunref(pkp->dtpk_func); + dtrace_strunref(pkp->dtpk_name); +} + static int dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data) { @@ -7779,13 +7929,6 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP); - /* APPLE NOTE: Darwin employs size bounded string operation. */ - { - size_t bufsize = strlen(name) + 1; - provider->dtpv_name = kmem_alloc(bufsize, KM_SLEEP); - (void) strlcpy(provider->dtpv_name, name, bufsize); - } - provider->dtpv_attr = *pap; provider->dtpv_priv.dtpp_flags = priv; if (cr != NULL) { @@ -7820,6 +7963,9 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, if (pops == &dtrace_provider_ops) { LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED); LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); + + provider->dtpv_name = dtrace_strref(name); + ASSERT(dtrace_anon.dta_enabling == NULL); /* @@ -7834,6 +7980,8 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, lck_mtx_lock(&dtrace_provider_lock); lck_mtx_lock(&dtrace_lock); + provider->dtpv_name = dtrace_strref(name); + /* * If there is at least one provider registered, we'll add this * provider after the first provider. @@ -7878,8 +8026,11 @@ dtrace_unregister(dtrace_provider_id_t id) { dtrace_provider_t *old = (dtrace_provider_t *)id; dtrace_provider_t *prev = NULL; - int i, self = 0; - dtrace_probe_t *probe, *first = NULL; + int self = 0; + dtrace_probe_t *probe, *first = NULL, *next = NULL; + dtrace_probe_t template = { + .dtpr_provider = old + }; if (old->dtpv_pops.dtps_enable == (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) { @@ -7940,14 +8091,12 @@ dtrace_unregister(dtrace_provider_id_t id) * All of the probes for this provider are disabled; we can safely * remove all of them from their hash chains and from the probe array. 
*/ - for (i = 0; i < dtrace_nprobes && old->dtpv_probe_count!=0; i++) { - if ((probe = dtrace_probes[i]) == NULL) - continue; - + for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL; + probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) { if (probe->dtpr_provider != old) continue; - dtrace_probes[i] = NULL; + dtrace_probes[probe->dtpr_id - 1] = NULL; old->dtpv_probe_count--; dtrace_hash_remove(dtrace_bymod, probe); @@ -7958,11 +8107,19 @@ dtrace_unregister(dtrace_provider_id_t id) first = probe; probe->dtpr_nextmod = NULL; } else { + /* + * Use nextmod as the chain of probes to remove + */ probe->dtpr_nextmod = first; first = probe; } } + for (probe = first; probe != NULL; probe = next) { + next = probe->dtpr_nextmod; + dtrace_hash_remove(dtrace_byprov, probe); + } + /* * The provider's probes have been removed from the hash chains and * from the probe array. Now issue a dtrace_sync() to be sure that @@ -7970,14 +8127,14 @@ dtrace_unregister(dtrace_provider_id_t id) */ dtrace_sync(); - for (probe = first; probe != NULL; probe = first) { - first = probe->dtpr_nextmod; + for (probe = first; probe != NULL; probe = next) { + next = probe->dtpr_nextmod; old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id, probe->dtpr_arg); - kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); - kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); - kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); + dtrace_strunref(probe->dtpr_mod); + dtrace_strunref(probe->dtpr_func); + dtrace_strunref(probe->dtpr_name); vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1); zfree(dtrace_probe_t_zone, probe); } @@ -7998,13 +8155,14 @@ dtrace_unregister(dtrace_provider_id_t id) prev->dtpv_next = old->dtpv_next; } + dtrace_strunref(old->dtpv_name); + if (!self) { lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&mod_lock); lck_mtx_unlock(&dtrace_provider_lock); } - kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1); kmem_free(old, sizeof 
(dtrace_provider_t)); return (0); @@ -8054,8 +8212,10 @@ int dtrace_condense(dtrace_provider_id_t id) { dtrace_provider_t *prov = (dtrace_provider_t *)id; - int i; - dtrace_probe_t *probe; + dtrace_probe_t *probe, *first = NULL; + dtrace_probe_t template = { + .dtpr_provider = prov + }; /* * Make sure this isn't the dtrace provider itself. @@ -8069,9 +8229,8 @@ dtrace_condense(dtrace_provider_id_t id) /* * Attempt to destroy the probes associated with this provider. */ - for (i = 0; i < dtrace_nprobes; i++) { - if ((probe = dtrace_probes[i]) == NULL) - continue; + for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL; + probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) { if (probe->dtpr_provider != prov) continue; @@ -8079,20 +8238,35 @@ dtrace_condense(dtrace_provider_id_t id) if (probe->dtpr_ecb != NULL) continue; - dtrace_probes[i] = NULL; + dtrace_probes[probe->dtpr_id - 1] = NULL; prov->dtpv_probe_count--; dtrace_hash_remove(dtrace_bymod, probe); dtrace_hash_remove(dtrace_byfunc, probe); dtrace_hash_remove(dtrace_byname, probe); - prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1, + prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id, probe->dtpr_arg); - kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); - kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); - kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); + dtrace_strunref(probe->dtpr_mod); + dtrace_strunref(probe->dtpr_func); + dtrace_strunref(probe->dtpr_name); + if (first == NULL) { + first = probe; + probe->dtpr_nextmod = NULL; + } else { + /* + * Use nextmod as the chain of probes to remove + */ + probe->dtpr_nextmod = first; + first = probe; + } + } + + for (probe = first; probe != NULL; probe = first) { + first = probe->dtpr_nextmod; + dtrace_hash_remove(dtrace_byprov, probe); + vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1); zfree(dtrace_probe_t_zone, probe); - vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1); } 
lck_mtx_unlock(&dtrace_lock); @@ -8136,13 +8310,14 @@ dtrace_probe_create(dtrace_provider_id_t prov, const char *mod, probe->dtpr_id = id; probe->dtpr_gen = dtrace_probegen++; - probe->dtpr_mod = dtrace_strdup(mod); - probe->dtpr_func = dtrace_strdup(func); - probe->dtpr_name = dtrace_strdup(name); + probe->dtpr_mod = dtrace_strref(mod); + probe->dtpr_func = dtrace_strref(func); + probe->dtpr_name = dtrace_strref(name); probe->dtpr_arg = arg; probe->dtpr_aframes = aframes; probe->dtpr_provider = provider; + dtrace_hash_add(dtrace_byprov, probe); dtrace_hash_add(dtrace_bymod, probe); dtrace_hash_add(dtrace_byfunc, probe); dtrace_hash_add(dtrace_byname, probe); @@ -8225,19 +8400,23 @@ dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod, dtrace_id_t id; int match; - pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name; + lck_mtx_lock(&dtrace_lock); + + pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name); pkey.dtpk_pmatch = &dtrace_match_string; - pkey.dtpk_mod = mod; + pkey.dtpk_mod = dtrace_strref(mod); pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul; - pkey.dtpk_func = func; + pkey.dtpk_func = dtrace_strref(func); pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul; - pkey.dtpk_name = name; + pkey.dtpk_name = dtrace_strref(name); pkey.dtpk_nmatch = name ? 
&dtrace_match_string : &dtrace_match_nul; pkey.dtpk_id = DTRACE_IDNONE; - lck_mtx_lock(&dtrace_lock); match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0, dtrace_probe_lookup_match, &id, NULL); + + dtrace_probekey_release(&pkey); + lck_mtx_unlock(&dtrace_lock); ASSERT(match == 1 || match == 0); @@ -8382,6 +8561,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtr uint32_t priv; uid_t uid; zoneid_t zoneid; + int err; LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -8400,8 +8580,11 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtr dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, &priv, &uid, &zoneid); - return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, - enab, ep)); + err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep); + + dtrace_probekey_release(&pkey); + + return err; } /* @@ -8637,14 +8820,6 @@ dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg, meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP); meta->dtm_mops = *mops; - - /* APPLE NOTE: Darwin employs size bounded string operation. 
*/ - { - size_t bufsize = strlen(name) + 1; - meta->dtm_name = kmem_alloc(bufsize, KM_SLEEP); - (void) strlcpy(meta->dtm_name, name, bufsize); - } - meta->dtm_arg = arg; lck_mtx_lock(&dtrace_meta_lock); @@ -8655,11 +8830,12 @@ dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg, lck_mtx_unlock(&dtrace_meta_lock); cmn_err(CE_WARN, "failed to register meta-register %s: " "user-land meta-provider exists", name); - kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1); kmem_free(meta, sizeof (dtrace_meta_t)); return (EINVAL); } + meta->dtm_name = dtrace_strref(name); + dtrace_meta_pid = meta; *idp = (dtrace_meta_provider_id_t)meta; @@ -8718,10 +8894,11 @@ dtrace_meta_unregister(dtrace_meta_provider_id_t id) *pp = NULL; + dtrace_strunref(old->dtm_name); + lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&dtrace_meta_lock); - kmem_free(old->dtm_name, strlen(old->dtm_name) + 1); kmem_free(old, sizeof (dtrace_meta_t)); return (0); @@ -12024,7 +12201,7 @@ dtrace_dof_create(dtrace_state_t *state) LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); - dof = dt_kmem_zalloc_aligned(len, 8, KM_SLEEP); + dof = kmem_zalloc_aligned(len, 8, KM_SLEEP); dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0; dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1; dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2; @@ -12102,11 +12279,11 @@ dtrace_dof_copyin(user_addr_t uarg, int *errp) return (NULL); } - dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP); + dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP); if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 || dof->dofh_loadsz != hdr.dofh_loadsz) { - dt_kmem_free_aligned(dof, hdr.dofh_loadsz); + kmem_free_aligned(dof, hdr.dofh_loadsz); *errp = EFAULT; return (NULL); } @@ -12146,10 +12323,10 @@ dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp) return (NULL); } - dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP); + dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP); if (uread(p, dof, hdr.dofh_loadsz, uarg) != 
KERN_SUCCESS) { - dt_kmem_free_aligned(dof, hdr.dofh_loadsz); + kmem_free_aligned(dof, hdr.dofh_loadsz); *errp = EFAULT; return (NULL); } @@ -12160,13 +12337,13 @@ dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp) static void dtrace_dof_destroy(dof_hdr_t *dof) { - dt_kmem_free_aligned(dof, dof->dofh_loadsz); + kmem_free_aligned(dof, dof->dofh_loadsz); } static dof_hdr_t * dtrace_dof_property(const char *name) { - unsigned int len; + unsigned int len = 0; dof_hdr_t *dof; if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) { @@ -12177,7 +12354,7 @@ dtrace_dof_property(const char *name) return NULL; } - dof = dt_kmem_alloc_aligned(len, 8, KM_SLEEP); + dof = kmem_alloc_aligned(len, 8, KM_SLEEP); if (!PEReadNVRAMProperty(name, dof, &len)) { dtrace_dof_destroy(dof); @@ -12789,8 +12966,8 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr, return (-1); } - if (dof->dofh_secsize == 0) { - dtrace_dof_error(dof, "zero section header size"); + if (dof->dofh_secsize < sizeof(dof_sec_t)) { + dtrace_dof_error(dof, "invalid section header size"); return (-1); } @@ -13183,7 +13360,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) major = ddi_driver_major(dtrace_devi); } - state->dts_dev = makedevice(major, minor); + state->dts_dev = makedev(major, minor); if (devp != NULL) *devp = state->dts_dev; @@ -13231,6 +13408,10 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state) * the normal checks are bypassed. */ #if defined(__APPLE__) + if (cr != NULL) { + kauth_cred_ref(cr); + state->dts_cred.dcr_cred = cr; + } if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) { if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) { /* @@ -13927,7 +14108,7 @@ dtrace_state_destroy(dtrace_state_t *state) * Release the credential hold we took in dtrace_state_create(). 
*/ if (state->dts_cred.dcr_cred != NULL) - crfree(state->dts_cred.dcr_cred); + kauth_cred_unref(&state->dts_cred.dcr_cred); /* * Now we can safely disable and destroy any enabled probes. Because @@ -14006,6 +14187,20 @@ dtrace_state_destroy(dtrace_state_t *state) /* * DTrace Anonymous Enabling Functions */ + +int +dtrace_keep_kernel_symbols(void) +{ + if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) { + return 0; + } + + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) + return 1; + + return 0; +} + static dtrace_state_t * dtrace_anon_grab(void) { @@ -14049,6 +14244,7 @@ dtrace_anon_property(void) break; } +#ifdef illumos /* * We want to create anonymous state, so we need to transition * the kernel debugger to indicate that DTrace is active. If @@ -14061,6 +14257,7 @@ dtrace_anon_property(void) dtrace_dof_destroy(dof); break; } +#endif /* * If we haven't allocated an anonymous state, we'll do so now. @@ -14308,6 +14505,7 @@ dtrace_helper_destroygen(proc_t* p, int gen) dtrace_vstate_t *vstate; uint_t i; + LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED); LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); if (help == NULL || gen > help->dthps_generation) @@ -14373,13 +14571,11 @@ dtrace_helper_destroygen(proc_t* p, int gen) /* * If we have a meta provider, remove this helper provider. 
*/ - lck_mtx_lock(&dtrace_meta_lock); if (dtrace_meta_pid != NULL) { ASSERT(dtrace_deferred_pid == NULL); dtrace_helper_provider_remove(&prov->dthp_prov, p); } - lck_mtx_unlock(&dtrace_meta_lock); dtrace_helper_provider_destroy(prov); @@ -14485,9 +14681,9 @@ static void dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help, dof_helper_t *dofhp) { + LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED); LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(&dtrace_meta_lock); lck_mtx_lock(&dtrace_lock); if (!dtrace_attached() || dtrace_meta_pid == NULL) { @@ -14536,8 +14732,6 @@ dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help, p); } } - - lck_mtx_unlock(&dtrace_meta_lock); } static int @@ -14843,6 +15037,7 @@ dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp) int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1; uintptr_t daddr = (uintptr_t)dof; + LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED); LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED); if ((help = p->p_dtrace_helpers) == NULL) @@ -15008,7 +15203,7 @@ dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claim * Any existing helpers force non-lazy behavior. */ if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) { - lck_mtx_lock(&p->p_dtrace_sprlock); + dtrace_sprlock(p); dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs; unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0; @@ -15071,7 +15266,7 @@ dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claim #endif /* DEBUG */ unlock: - lck_mtx_unlock(&p->p_dtrace_sprlock); + dtrace_sprunlock(p); } else { rval = EACCES; } @@ -15101,7 +15296,7 @@ dtrace_lazy_dofs_remove(proc_t *p, int generation) * Any existing helpers force non-lazy behavior. 
*/ if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) { - lck_mtx_lock(&p->p_dtrace_sprlock); + dtrace_sprlock(p); dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs; @@ -15158,9 +15353,8 @@ dtrace_lazy_dofs_remove(proc_t *p, int generation) #endif } - - lck_mtx_unlock(&p->p_dtrace_sprlock); - } else { + dtrace_sprunlock(p); + } else { rval = EACCES; } @@ -15173,14 +15367,14 @@ void dtrace_lazy_dofs_destroy(proc_t *p) { lck_rw_lock_shared(&dtrace_dof_mode_lock); - lck_mtx_lock(&p->p_dtrace_sprlock); + dtrace_sprlock(p); ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL); dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs; p->p_dtrace_lazy_dofs = NULL; - lck_mtx_unlock(&p->p_dtrace_sprlock); + dtrace_sprunlock(p); lck_rw_unlock_shared(&dtrace_dof_mode_lock); if (lazy_dofs) { @@ -15205,7 +15399,7 @@ dtrace_lazy_dofs_process(proc_t *p) { * fault in the dof. We could fix this by holding locks longer, * but the errors are benign. */ - lck_mtx_lock(&p->p_dtrace_sprlock); + dtrace_sprlock(p); ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL); @@ -15214,8 +15408,8 @@ dtrace_lazy_dofs_process(proc_t *p) { dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs; p->p_dtrace_lazy_dofs = NULL; - lck_mtx_unlock(&p->p_dtrace_sprlock); - + dtrace_sprunlock(p); + lck_mtx_lock(&dtrace_meta_lock); /* * Process each dof_helper_t */ @@ -15270,8 +15464,10 @@ dtrace_lazy_dofs_process(proc_t *p) { lck_mtx_unlock(&dtrace_lock); } } - + lck_mtx_unlock(&dtrace_meta_lock); kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count)); + } else { + lck_mtx_unlock(&dtrace_meta_lock); } } @@ -15295,7 +15491,7 @@ dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child) LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED); lck_rw_lock_shared(&dtrace_dof_mode_lock); - lck_mtx_lock(&parent->p_dtrace_sprlock); + dtrace_sprlock(parent); /* * We need to make sure that the transition to lazy dofs -> helpers @@ 
-15315,12 +15511,12 @@ dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child) bcopy(parent_dofs, child_dofs, parent_dofs_size); } - lck_mtx_unlock(&parent->p_dtrace_sprlock); + dtrace_sprunlock(parent); if (child_dofs) { - lck_mtx_lock(&child->p_dtrace_sprlock); + dtrace_sprlock(child); child->p_dtrace_lazy_dofs = child_dofs; - lck_mtx_unlock(&child->p_dtrace_sprlock); + dtrace_sprunlock(child); /** * We process the DOF at this point if the mode is set to * LAZY_OFF. This can happen if DTrace is still processing the @@ -15365,6 +15561,7 @@ dtrace_helpers_destroy(proc_t* p) dtrace_vstate_t *vstate; uint_t i; + lck_mtx_lock(&dtrace_meta_lock); lck_mtx_lock(&dtrace_lock); ASSERT(p->p_dtrace_helpers != NULL); @@ -15398,7 +15595,6 @@ dtrace_helpers_destroy(proc_t* p) * Destroy the helper providers. */ if (help->dthps_maxprovs > 0) { - lck_mtx_lock(&dtrace_meta_lock); if (dtrace_meta_pid != NULL) { ASSERT(dtrace_deferred_pid == NULL); @@ -15428,7 +15624,6 @@ dtrace_helpers_destroy(proc_t* p) lck_mtx_unlock(&dtrace_lock); } - lck_mtx_unlock(&dtrace_meta_lock); for (i = 0; i < help->dthps_nprovs; i++) { dtrace_helper_provider_destroy(help->dthps_provs[i]); @@ -15447,6 +15642,7 @@ dtrace_helpers_destroy(proc_t* p) --dtrace_helpers; lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&dtrace_meta_lock); } static void @@ -15459,6 +15655,7 @@ dtrace_helpers_duplicate(proc_t *from, proc_t *to) uint_t i; int j, sz, hasprovs = 0; + lck_mtx_lock(&dtrace_meta_lock); lck_mtx_lock(&dtrace_lock); ASSERT(from->p_dtrace_helpers != NULL); ASSERT(dtrace_helpers > 0); @@ -15530,6 +15727,8 @@ dtrace_helpers_duplicate(proc_t *from, proc_t *to) if (hasprovs) dtrace_helper_provider_register(to, newhelp, NULL); + + lck_mtx_unlock(&dtrace_meta_lock); } /** @@ -15550,7 +15749,7 @@ dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn) * the p_dtrace_sprlock lock. A full sprlock would * task_suspend the parent. 
*/ - lck_mtx_lock(&parent_proc->p_dtrace_sprlock); + dtrace_sprlock(parent_proc); /* * Remove all DTrace tracepoints from the child process. We @@ -15561,7 +15760,7 @@ dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn) dtrace_fasttrap_fork(parent_proc, child_proc); } - lck_mtx_unlock(&parent_proc->p_dtrace_sprlock); + dtrace_sprunlock(parent_proc); /* * Duplicate any lazy dof(s). This must be done while NOT @@ -15851,7 +16050,7 @@ dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag) ctl->mod_loaded = 1; ctl->mod_flags = 0; ctl->mod_user_symbols = NULL; - + /* * Find the UUID for this module, if it has one */ @@ -15870,6 +16069,15 @@ dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag) if (ctl->mod_address == g_kernel_kmod_info.address) { ctl->mod_flags |= MODCTL_IS_MACH_KERNEL; + memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid)); + } + /* + * Static kexts have a UUID that is not used for symbolication, as all their + * symbols are in kernel + */ + else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) { + memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid)); + ctl->mod_flags |= MODCTL_IS_STATIC_KEXT; } } dtrace_modctl_add(ctl); @@ -16078,6 +16286,7 @@ dtrace_module_unloaded(struct kmod_info *kmod) probe->dtpr_provider->dtpv_probe_count--; next = probe->dtpr_nextmod; + dtrace_hash_remove(dtrace_byprov, probe); dtrace_hash_remove(dtrace_bymod, probe); dtrace_hash_remove(dtrace_byfunc, probe); dtrace_hash_remove(dtrace_byname, probe); @@ -16103,9 +16312,9 @@ dtrace_module_unloaded(struct kmod_info *kmod) prov = probe->dtpr_provider; prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id, probe->dtpr_arg); - kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); - kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); - kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); + dtrace_strunref(probe->dtpr_mod); + dtrace_strunref(probe->dtpr_func); + 
dtrace_strunref(probe->dtpr_name); vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1); zfree(dtrace_probe_t_zone, probe); @@ -16242,9 +16451,8 @@ dtrace_toxrange_add(uintptr_t base, uintptr_t limit) */ /*ARGSUSED*/ static int -dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +dtrace_attach(dev_info_t *devi) { -#pragma unused(cmd) /* __APPLE__ */ dtrace_provider_id_t id; dtrace_state_t *state = NULL; dtrace_enabling_t *enab; @@ -16254,8 +16462,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) lck_mtx_lock(&dtrace_lock); /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */ - - ddi_report_dev(devi); dtrace_devi = devi; dtrace_modload = dtrace_module_loaded; @@ -16274,8 +16480,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1, NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); - dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri, - 1, INT_MAX, 0); dtrace_state_cache = kmem_cache_create("dtrace_state_cache", sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN, @@ -16283,15 +16487,23 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED); - dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod), + dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider, + 0, /* unused */ + offsetof(dtrace_probe_t, dtpr_nextprov), + offsetof(dtrace_probe_t, dtpr_prevprov)); + + dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset, + offsetof(dtrace_probe_t, dtpr_mod), offsetof(dtrace_probe_t, dtpr_nextmod), offsetof(dtrace_probe_t, dtpr_prevmod)); - dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func), + dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset, + offsetof(dtrace_probe_t, dtpr_func), offsetof(dtrace_probe_t, dtpr_nextfunc), offsetof(dtrace_probe_t, dtpr_prevfunc)); - dtrace_byname = 
dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name), + dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset, + offsetof(dtrace_probe_t, dtpr_name), offsetof(dtrace_probe_t, dtpr_nextname), offsetof(dtrace_probe_t, dtpr_prevname)); @@ -16443,6 +16655,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) dtrace_opens++; dtrace_membar_producer(); +#ifdef illumos /* * If the kernel debugger is active (that is, if the kernel debugger * modified text in some way), we won't allow the open. @@ -16453,13 +16666,17 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) lck_mtx_unlock(&cpu_lock); return (EBUSY); } +#endif rv = dtrace_state_create(devp, cred_p, &state); lck_mtx_unlock(&cpu_lock); if (rv != 0 || state == NULL) { - if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) { +#ifdef illumos (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); +#endif + } lck_mtx_unlock(&dtrace_lock); /* propagate EAGAIN or ERESTART */ return (rv); @@ -16557,9 +16774,12 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) * Only relinquish control of the kernel debugger interface when there * are no consumers and no anonymous enablings. 
*/ - if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) { +#ifdef illumos (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); - +#endif + } + lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&cpu_lock); @@ -16700,6 +16920,7 @@ dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv) dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval); if (dof != NULL) { + lck_mtx_lock(&dtrace_meta_lock); lck_mtx_lock(&dtrace_lock); /* @@ -16711,6 +16932,7 @@ dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv) } lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&dtrace_meta_lock); } } while (++i < multi_dof->dofiod_count && rval == 0); } @@ -16751,9 +16973,11 @@ dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv) * EACCES means non-lazy */ if (rval == EACCES) { + lck_mtx_lock(&dtrace_meta_lock); lck_mtx_lock(&dtrace_lock); rval = dtrace_helper_destroygen(p, generation); lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&dtrace_meta_lock); } return (rval); @@ -17106,17 +17330,15 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv desc.dtpd_id++; } - if (cmd == DTRACEIOC_PROBEMATCH) { - dtrace_probekey(&desc, &pkey); - pkey.dtpk_id = DTRACE_IDNONE; - } - dtrace_cred2priv(cr, &priv, &uid, &zoneid); lck_mtx_lock(&dtrace_lock); - if (cmd == DTRACEIOC_PROBEMATCH) { - /* Quiet compiler warning */ + if (cmd == DTRACEIOC_PROBEMATCH) { + dtrace_probekey(&desc, &pkey); + pkey.dtpk_id = DTRACE_IDNONE; + + /* Quiet compiler warning */ for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) { if ((probe = dtrace_probes[i - 1]) != NULL && (m = dtrace_match_probe(probe, &pkey, @@ -17128,6 +17350,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv lck_mtx_unlock(&dtrace_lock); return (EINVAL); } + dtrace_probekey_release(&pkey); } else { /* Quiet compiler warning */ @@ -17639,7 +17862,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t 
*cr, int *rv ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES; ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); - if (!MOD_SYMBOLS_DONE(ctl)) { + if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) { dtmul_count++; rval = EINVAL; } @@ -17695,7 +17918,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv * are available, add user syms if the module might use them. */ ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); - if (!MOD_SYMBOLS_DONE(ctl)) { + if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) { UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count]; if (dtmul_count++ < uuids_list->dtmul_count) { memcpy(uuid, ctl->mod_uuid, sizeof(UUID)); @@ -17811,32 +18034,24 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES; ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); - if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) { - if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) { - /* BINGO! */ - ctl->mod_user_symbols = module_symbols; - break; - } + if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) { + dtrace_provider_t *prv; + ctl->mod_user_symbols = module_symbols; + + /* + * We're going to call each providers per-module provide operation + * specifying only this module. + */ + for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + /* + * We gave every provider a chance to provide with the user syms, go ahead and clear them + */ + ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */ } ctl = ctl->mod_next; } - if (ctl) { - dtrace_provider_t *prv; - - /* - * We're going to call each providers per-module provide operation - * specifying only this module. 
- */ - for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); - - /* - * We gave every provider a chance to provide with the user syms, go ahead and clear them - */ - ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */ - } - lck_mtx_unlock(&mod_lock); lck_mtx_unlock(&dtrace_provider_lock); @@ -17972,9 +18187,13 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) dtrace_probes = NULL; dtrace_nprobes = 0; + dtrace_hash_destroy(dtrace_strings); + dtrace_hash_destroy(dtrace_byprov); dtrace_hash_destroy(dtrace_bymod); dtrace_hash_destroy(dtrace_byfunc); dtrace_hash_destroy(dtrace_byname); + dtrace_strings = NULL; + dtrace_byprov = NULL; dtrace_bymod = NULL; dtrace_byfunc = NULL; dtrace_byname = NULL; @@ -18002,6 +18221,7 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&dtrace_provider_lock); +#ifdef illumos /* * We don't destroy the task queue until after we have dropped our * locks (taskq_destroy() may block on running tasks). To prevent @@ -18012,6 +18232,7 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) */ taskq_destroy(dtrace_taskq); dtrace_taskq = NULL; +#endif return (DDI_SUCCESS); } @@ -18223,6 +18444,19 @@ lck_grp_t* dtrace_lck_grp; static int gMajDevNo; +void dtrace_early_init (void) +{ + dtrace_restriction_policy_load(); + + /* + * See dtrace_impl.h for a description of kernel symbol modes. + * The default is to wait for symbols from userspace (lazy symbols). + */ + if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE; + } +} + void dtrace_init( void ) { @@ -18274,13 +18508,6 @@ dtrace_init( void ) return; } -#if defined(DTRACE_MEMORY_ZONES) - /* - * Initialize the dtrace kalloc-emulation zones. 
- */ - dtrace_alloc_init(); -#endif /* DTRACE_MEMORY_ZONES */ - /* * Allocate the dtrace_probe_t zone */ @@ -18347,6 +18574,11 @@ dtrace_init( void ) (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */ + dtrace_strings = dtrace_hash_create(dtrace_strkey_offset, + offsetof(dtrace_string_t, dtst_str), + offsetof(dtrace_string_t, dtst_next), + offsetof(dtrace_string_t, dtst_prev)); + dtrace_isa_init(); /* * See dtrace_impl.h for a description of dof modes. @@ -18386,16 +18618,6 @@ dtrace_init( void ) break; } - /* - * See dtrace_impl.h for a description of kernel symbol modes. - * The default is to wait for symbols from userspace (lazy symbols). - */ - if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) { - dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE; - } - - dtrace_restriction_policy_load(); - gDTraceInited = 1; } else @@ -18410,7 +18632,7 @@ dtrace_postinit(void) * run. That way, anonymous DOF enabled under dtrace_attach() is safe * to go. */ - dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */ + dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */ /* * Add the mach_kernel to the module list for lazy processing diff --git a/bsd/dev/dtrace/dtrace_alloc.c b/bsd/dev/dtrace/dtrace_alloc.c deleted file mode 100644 index e43ca8ce5..000000000 --- a/bsd/dev/dtrace/dtrace_alloc.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2005-2007 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * DTrace kalloc emulation. - * - * This is a subset of kalloc functionality, to allow dtrace - * specific allocation to be accounted for separately from the - * general kalloc pool. - * - * Note that allocations greater than dalloc_max still go into - * the kalloc.large bucket, as it seems impossible to emulate - * that functionality in the bsd kern. 
- */ - -#include -#include -#include -#include -#include - -#if defined(DTRACE_MEMORY_ZONES) - -#define DTRACE_ALLOC_MINSIZE 16 - -vm_size_t dtrace_alloc_max; -vm_size_t dtrace_alloc_max_prerounded; -int first_d_zone = -1; -struct zone *d_zone[16]; -static const char *d_zone_name[16] = { - "dtrace.1", "dtrace.2", - "dtrace.4", "dtrace.8", - "dtrace.16", "dtrace.32", - "dtrace.64", "dtrace.128", - "dtrace.256", "dtrace.512", - "dtrace.1024", "dtrace.2048", - "dtrace.4096", "dtrace.8192", - "dtrace.16384", "dtrace.32768" -}; - -unsigned long d_zone_max[16] = { - 1024, /* 1 Byte */ - 1024, /* 2 Byte */ - 1024, /* 4 Byte */ - 1024, /* 8 Byte */ - 1024, /* 16 Byte */ - 4096, /* 32 Byte */ - 4096, /* 64 Byte */ - 4096, /* 128 Byte */ - 4096, /* 256 Byte */ - 1024, /* 512 Byte */ - 1024, /* 1024 Byte */ - 1024, /* 2048 Byte */ - 1024, /* 4096 Byte */ - 4096, /* 8192 Byte */ - 64, /* 16384 Byte */ - 64, /* 32768 Byte */ -}; - -void dtrace_alloc_init(void) -{ - vm_size_t size; - int i; - - if (PAGE_SIZE < 16*1024) - dtrace_alloc_max = 16*1024; - else - dtrace_alloc_max = PAGE_SIZE; - dtrace_alloc_max_prerounded = dtrace_alloc_max / 2 + 1; - - /* - * Allocate a zone for each size we are going to handle. - * We specify non-paged memory. - */ - for (i = 0, size = 1; size < dtrace_alloc_max; i++, size <<= 1) { - if (size < DTRACE_ALLOC_MINSIZE) { - d_zone[i] = NULL; - continue; - } - if (size == DTRACE_ALLOC_MINSIZE) { - first_d_zone = i; - } - d_zone[i] = zinit(size, d_zone_max[i] * size, size, d_zone_name[i]); - } -} - -void *dtrace_alloc(vm_size_t size) -{ - int zindex; - vm_size_t allocsize; - - /* - * If size is too large for a zone, then use kmem_alloc. - * (We use kmem_alloc instead of kmem_alloc_kobject so that - * krealloc can use kmem_realloc.) 
- */ - - if (size >= dtrace_alloc_max_prerounded) { - return _MALLOC(size, M_TEMP, M_WAITOK); - } - - /* compute the size of the block that we will actually allocate */ - allocsize = DTRACE_ALLOC_MINSIZE; - zindex = first_d_zone; - while (allocsize < size) { - allocsize <<= 1; - zindex++; - } - - return(zalloc_canblock(d_zone[zindex], TRUE)); -} - -void dtrace_free(void *data, vm_size_t size) -{ - int zindex; - vm_size_t freesize; - - if (size >= dtrace_alloc_max_prerounded) { - _FREE(data, M_TEMP); - return; - } - - /* compute the size of the block that we actually allocated from */ - freesize = DTRACE_ALLOC_MINSIZE; - zindex = first_d_zone; - while (freesize < size) { - freesize <<= 1; - zindex++; - } - - /* free to the appropriate zone */ - zfree(d_zone[zindex], data); -} - -#endif /* DTRACE_MEMORY_ZONES */ diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index d47102fe6..bfda934bc 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -72,8 +73,22 @@ /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. 
*/ #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */ +void +dtrace_sprlock(proc_t *p) +{ + lck_mtx_assert(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(&p->p_dtrace_sprlock); +} + +void +dtrace_sprunlock(proc_t *p) +{ + lck_mtx_unlock(&p->p_dtrace_sprlock); + +} + /* Not called from probe context */ -proc_t * +proc_t * sprlock(pid_t pid) { proc_t* p; @@ -84,9 +99,9 @@ sprlock(pid_t pid) task_suspend_internal(p->task); - proc_lock(p); + dtrace_sprlock(p); - lck_mtx_lock(&p->p_dtrace_sprlock); + proc_lock(p); return p; } @@ -96,10 +111,10 @@ void sprunlock(proc_t *p) { if (p != PROC_NULL) { - lck_mtx_unlock(&p->p_dtrace_sprlock); - proc_unlock(p); + dtrace_sprunlock(p); + task_resume_internal(p->task); proc_rele(p); @@ -184,7 +199,7 @@ uwrite(proc_t *p, void *buf, user_size_t len, user_addr_t a) if (info.max_protection & VM_PROT_WRITE) { /* The memory is not currently writable, but can be made writable. */ - ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, reprotect | VM_PROT_WRITE); + ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, (reprotect & ~VM_PROT_EXECUTE) | VM_PROT_WRITE); } else { /* * The memory is not currently writable, and cannot be made writable. We need to COW this memory. 
@@ -207,6 +222,8 @@ uwrite(proc_t *p, void *buf, user_size_t len, user_addr_t a) (vm_map_address_t)a, (vm_size_t)len); + dtrace_flush_caches(); + if (ret != KERN_SUCCESS) goto done; @@ -270,10 +287,6 @@ PRIV_POLICY_ONLY(void *cr, int priv, int boolean) return kauth_cred_issuser(cr); /* XXX TODO: HAS_PRIVILEGE(cr, priv); */ } -/* XXX Get around const poisoning using structure assigns */ -gid_t -crgetgid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getgid(©_cr); } - uid_t crgetuid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getuid(©_cr); } @@ -577,15 +590,6 @@ cyclic_remove(cyclic_id_t cyclic) } } -/* - * ddi - */ -void -ddi_report_dev(dev_info_t *devi) -{ -#pragma unused(devi) -} - kern_return_t _dtrace_register_anon_DOF(char *, uchar_t *, uint_t); kern_return_t @@ -630,29 +634,6 @@ getminor ( dev_t d ) return (minor_t) minor(d); } -dev_t -makedevice(major_t major, minor_t minor) -{ - return makedev( major, minor ); -} - -int ddi_getprop(dev_t dev, dev_info_t *dip, int flags, const char *name, int defvalue) -{ -#pragma unused(dev, dip, flags, name) - - return defvalue; -} - -/* - * Kernel Debug Interface - */ -int -kdi_dtrace_set(kdi_dtrace_set_t ignore) -{ -#pragma unused(ignore) - return 0; /* Success */ -} - extern void Debugger(const char*); void @@ -663,7 +644,7 @@ debug_enter(char *c) { Debugger(c); } */ void * -dt_kmem_alloc(size_t size, int kmflag) +dt_kmem_alloc_site(size_t size, int kmflag, vm_allocation_site_t *site) { #pragma unused(kmflag) @@ -671,15 +652,12 @@ dt_kmem_alloc(size_t size, int kmflag) * We ignore the M_NOWAIT bit in kmflag (all of kmflag, in fact). * Requests larger than 8K with M_NOWAIT fail in kalloc_canblock. 
*/ -#if defined(DTRACE_MEMORY_ZONES) - return dtrace_alloc(size); -#else - return kalloc(size); -#endif + vm_size_t vsize = size; + return kalloc_canblock(&vsize, TRUE, site); } void * -dt_kmem_zalloc(size_t size, int kmflag) +dt_kmem_zalloc_site(size_t size, int kmflag, vm_allocation_site_t *site) { #pragma unused(kmflag) @@ -687,11 +665,8 @@ dt_kmem_zalloc(size_t size, int kmflag) * We ignore the M_NOWAIT bit in kmflag (all of kmflag, in fact). * Requests larger than 8K with M_NOWAIT fail in kalloc_canblock. */ -#if defined(DTRACE_MEMORY_ZONES) - void* buf = dtrace_alloc(size); -#else - void* buf = kalloc(size); -#endif + vm_size_t vsize = size; + void* buf = kalloc_canblock(&vsize, TRUE, site); if(!buf) return NULL; @@ -713,21 +688,18 @@ dt_kmem_free(void *buf, size_t size) ASSERT(size > 0); -#if defined(DTRACE_MEMORY_ZONES) - dtrace_free(buf, size); -#else kfree(buf, size); -#endif } /* - * aligned kmem allocator + * aligned dt_kmem allocator * align should be a power of two */ -void* dt_kmem_alloc_aligned(size_t size, size_t align, int kmflag) +void* +dt_kmem_alloc_aligned_site(size_t size, size_t align, int kmflag, vm_allocation_site_t *site) { void *mem, **addr_to_free; intptr_t mem_aligned; @@ -742,7 +714,7 @@ void* dt_kmem_alloc_aligned(size_t size, size_t align, int kmflag) * the address to free and the total size of the buffer. 
*/ hdr_size = sizeof(size_t) + sizeof(void*); - mem = dt_kmem_alloc(size + align + hdr_size, kmflag); + mem = dt_kmem_alloc_site(size + align + hdr_size, kmflag, site); if (mem == NULL) return NULL; @@ -759,11 +731,12 @@ void* dt_kmem_alloc_aligned(size_t size, size_t align, int kmflag) return (void*) mem_aligned; } -void* dt_kmem_zalloc_aligned(size_t size, size_t align, int kmflag) +void* +dt_kmem_zalloc_aligned_site(size_t size, size_t align, int kmflag, vm_allocation_site_t *s) { void* buf; - buf = dt_kmem_alloc_aligned(size, align, kmflag); + buf = dt_kmem_alloc_aligned_site(size, align, kmflag, s); if(!buf) return NULL; @@ -773,7 +746,8 @@ void* dt_kmem_zalloc_aligned(size_t size, size_t align, int kmflag) return buf; } -void dt_kmem_free_aligned(void* buf, size_t size) +void +dt_kmem_free_aligned(void* buf, size_t size) { #pragma unused(size) intptr_t ptr = (intptr_t) buf; @@ -829,44 +803,6 @@ kmem_cache_destroy(kmem_cache_t *cp) #pragma unused(cp) } -/* - * taskq - */ -extern void thread_call_setup(thread_call_t, thread_call_func_t, thread_call_param_t); /* XXX MACH_KERNEL_PRIVATE */ - -static void -_taskq_apply( task_func_t func, thread_call_param_t arg ) -{ - func( (void *)arg ); -} - -taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, int minalloc, - int maxalloc, uint_t flags) -{ -#pragma unused(name,nthreads,pri,minalloc,maxalloc,flags) - - return (taskq_t *)thread_call_allocate( (thread_call_func_t)_taskq_apply, NULL ); -} - -taskqid_t -taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) -{ -#pragma unused(flags) - thread_call_setup( (thread_call_t) tq, (thread_call_func_t)_taskq_apply, (thread_call_param_t)func ); - thread_call_enter1( (thread_call_t) tq, (thread_call_param_t)arg ); - return (taskqid_t) tq /* for lack of anything better */; -} - -void -taskq_destroy(taskq_t *tq) -{ - thread_call_cancel( (thread_call_t) tq ); - thread_call_free( (thread_call_t) tq ); -} - -pri_t maxclsyspri; - /* * vmem (Solaris 
"slab" allocator) used by DTrace solely to hand out resource ids */ @@ -1182,19 +1118,26 @@ dtrace_copyoutstr(uintptr_t src, user_addr_t dst, size_t len, volatile uint16_t extern const int copysize_limit_panic; +int dtrace_copy_maxsize(void) +{ + return copysize_limit_panic; +} + + int dtrace_buffer_copyout(const void *kaddr, user_addr_t uaddr, vm_size_t nbytes) { + int maxsize = dtrace_copy_maxsize(); /* * Partition the copyout in copysize_limit_panic-sized chunks */ - while (nbytes >= (vm_size_t)copysize_limit_panic) { - if (copyout(kaddr, uaddr, copysize_limit_panic) != 0) + while (nbytes >= (vm_size_t)maxsize) { + if (copyout(kaddr, uaddr, maxsize) != 0) return (EFAULT); - nbytes -= copysize_limit_panic; - uaddr += copysize_limit_panic; - kaddr += copysize_limit_panic; + nbytes -= maxsize; + uaddr += maxsize; + kaddr += maxsize; } if (nbytes > 0) { if (copyout(kaddr, uaddr, nbytes) != 0) @@ -1321,22 +1264,6 @@ fuword64(user_addr_t uaddr, uint64_t *value) return 0; } -void -fuword8_noerr(user_addr_t uaddr, uint8_t *value) -{ - if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint8_t))) { - *value = 0; - } -} - -void -fuword16_noerr(user_addr_t uaddr, uint16_t *value) -{ - if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint16_t))) { - *value = 0; - } -} - void fuword32_noerr(user_addr_t uaddr, uint32_t *value) { @@ -1373,27 +1300,6 @@ suword32(user_addr_t addr, uint32_t value) return 0; } -int -suword16(user_addr_t addr, uint16_t value) -{ - if (copyout((const void *)&value, addr, sizeof(value)) != 0) { - return -1; - } - - return 0; -} - -int -suword8(user_addr_t addr, uint8_t value) -{ - if (copyout((const void *)&value, addr, sizeof(value)) != 0) { - return -1; - } - - return 0; -} - - /* * Miscellaneous */ @@ -1537,6 +1443,12 @@ dtrace_getstackdepth(int aframes) return (depth - aframes); } +int +dtrace_addr_in_module(void* addr, struct modctl *ctl) +{ + return OSKextKextForAddress(addr) == (void*)ctl->mod_address; +} + /* * Unconsidered 
*/ diff --git a/bsd/dev/dtrace/dtrace_ptss.c b/bsd/dev/dtrace/dtrace_ptss.c index 1ce9c28d8..6741f5563 100644 --- a/bsd/dev/dtrace/dtrace_ptss.c +++ b/bsd/dev/dtrace/dtrace_ptss.c @@ -167,7 +167,6 @@ dtrace_ptss_allocate_page(struct proc* p) mach_vm_size_t size = PAGE_MAX_SIZE; mach_vm_offset_t addr = 0; -#if CONFIG_EMBEDDED mach_vm_offset_t write_addr = 0; /* * The embedded OS has extra permissions for writable and executable pages. @@ -175,16 +174,11 @@ dtrace_ptss_allocate_page(struct proc* p) */ vm_prot_t cur_protection = VM_PROT_READ|VM_PROT_EXECUTE; vm_prot_t max_protection = VM_PROT_READ|VM_PROT_EXECUTE|VM_PROT_WRITE; -#else - vm_prot_t cur_protection = VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE; - vm_prot_t max_protection = VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE; -#endif /* CONFIG_EMBEDDED */ - kern_return_t kr = mach_vm_map_kernel(map, &addr, size, 0, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE, IPC_PORT_NULL, 0, FALSE, cur_protection, max_protection, VM_INHERIT_DEFAULT); + kern_return_t kr = mach_vm_map_kernel(map, &addr, size, 0, VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE, IPC_PORT_NULL, 0, FALSE, cur_protection, max_protection, VM_INHERIT_DEFAULT); if (kr != KERN_SUCCESS) { goto err; } -#if CONFIG_EMBEDDED /* * If on embedded, remap the scratch space as writable at another * virtual address @@ -196,14 +190,12 @@ dtrace_ptss_allocate_page(struct proc* p) kr = mach_vm_protect (map, (mach_vm_offset_t)write_addr, (mach_vm_size_t)size, 0, VM_PROT_READ | VM_PROT_WRITE); if (kr != KERN_SUCCESS) goto err; -#endif + // Chain the page entries. 
int i; for (i=0; ientries[i].addr = addr + (i * DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD); -#if CONFIG_EMBEDDED ptss_page->entries[i].write_addr = write_addr + (i * DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD); -#endif ptss_page->entries[i].next = &ptss_page->entries[i+1]; } @@ -243,10 +235,8 @@ dtrace_ptss_free_page(struct proc* p, struct dtrace_ptss_page* ptss_page) // Silent failures, no point in checking return code. mach_vm_deallocate(map, addr, size); -#ifdef CONFIG_EMBEDDED mach_vm_address_t write_addr = ptss_page->entries[0].write_addr; mach_vm_deallocate(map, write_addr, size); -#endif vm_map_deallocate(map); } diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c index 8425b98af..9ce7ccc9c 100644 --- a/bsd/dev/dtrace/fasttrap.c +++ b/bsd/dev/dtrace/fasttrap.c @@ -46,6 +46,8 @@ #include #include +#include + #include #include #include @@ -143,7 +145,6 @@ qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *)); * never hold the provider lock and creation lock simultaneously */ -static dev_info_t *fasttrap_devi; static dtrace_meta_provider_id_t fasttrap_meta_id; static thread_t fasttrap_cleanup_thread; @@ -401,7 +402,6 @@ fasttrap_pid_cleanup_providers(void) return later; } -#ifdef FASTTRAP_ASYNC_REMOVE typedef struct fasttrap_tracepoint_spec { pid_t fttps_pid; user_addr_t fttps_pc; @@ -473,13 +473,13 @@ fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp) s->fttps_pc = tp->ftt_pc; if (fasttrap_cur_retired == fasttrap_retired_size) { - fasttrap_retired_size *= 2; fasttrap_tracepoint_spec_t *new_retired = kmem_zalloc( - fasttrap_retired_size * - sizeof(fasttrap_tracepoint_t*), + fasttrap_retired_size * 2 * + sizeof(*fasttrap_retired_spec), KM_SLEEP); - memcpy(new_retired, fasttrap_retired_spec, sizeof(fasttrap_tracepoint_t*) * fasttrap_retired_size); - kmem_free(fasttrap_retired_spec, sizeof(fasttrap_tracepoint_t*) * (fasttrap_retired_size / 2)); + memcpy(new_retired, fasttrap_retired_spec, sizeof(*fasttrap_retired_spec) * 
fasttrap_retired_size); + kmem_free(fasttrap_retired_spec, sizeof(*fasttrap_retired_spec) * fasttrap_retired_size); + fasttrap_retired_size *= 2; fasttrap_retired_spec = new_retired; } @@ -489,15 +489,6 @@ fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp) fasttrap_pid_cleanup(FASTTRAP_CLEANUP_TRACEPOINT); } -#else -void fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp) -{ - if (tp->ftt_retired) - return; - - fasttrap_tracepoint_remove(p, tp); -} -#endif static void fasttrap_pid_cleanup_compute_priority(void) @@ -533,11 +524,9 @@ fasttrap_pid_cleanup_cb(void) if (work & FASTTRAP_CLEANUP_PROVIDER) { later = fasttrap_pid_cleanup_providers(); } -#ifdef FASTTRAP_ASYNC_REMOVE if (work & FASTTRAP_CLEANUP_TRACEPOINT) { fasttrap_tracepoint_cleanup(); } -#endif lck_mtx_lock(&fasttrap_cleanup_mtx); fasttrap_pid_cleanup_compute_priority(); @@ -1162,6 +1151,25 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) return(0); } + if ((p->p_csflags & (CS_KILL|CS_HARD))) { + proc_unlock(p); + for (i = 0; i < DTRACE_NCLIENTS; i++) { + dtrace_state_t *state = dtrace_state_get(i); + if (state == NULL) + continue; + if (state->dts_cred.dcr_cred == NULL) + continue; + mac_proc_check_get_task(state->dts_cred.dcr_cred, p); + } + rc = cs_allow_invalid(p); + proc_lock(p); + if (rc == 0) { + sprunlock(p); + cmn_err(CE_WARN, "process doesn't allow invalid code pages, failing to install fasttrap probe\n"); + return (0); + } + } + /* * APPLE NOTE: We do not have an equivalent thread structure to Solaris. * Solaris uses its ulwp_t struct for scratch space to support the pid provider. 
@@ -1380,29 +1388,29 @@ static const dtrace_pattr_t pid_attr = { }; static dtrace_pops_t pid_pops = { - fasttrap_pid_provide, - NULL, - fasttrap_pid_enable, - fasttrap_pid_disable, - NULL, - NULL, - fasttrap_pid_getargdesc, - fasttrap_pid_getarg, - NULL, - fasttrap_pid_destroy + .dtps_provide = fasttrap_pid_provide, + .dtps_provide_module = NULL, + .dtps_enable = fasttrap_pid_enable, + .dtps_disable = fasttrap_pid_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = fasttrap_pid_getargdesc, + .dtps_getargval = fasttrap_pid_getarg, + .dtps_usermode = NULL, + .dtps_destroy = fasttrap_pid_destroy }; static dtrace_pops_t usdt_pops = { - fasttrap_pid_provide, - NULL, - fasttrap_pid_enable, - fasttrap_pid_disable, - NULL, - NULL, - fasttrap_pid_getargdesc, - fasttrap_usdt_getarg, - NULL, - fasttrap_pid_destroy + .dtps_provide = fasttrap_pid_provide, + .dtps_provide_module = NULL, + .dtps_enable = fasttrap_pid_enable, + .dtps_disable = fasttrap_pid_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = fasttrap_pid_getargdesc, + .dtps_getargval = fasttrap_usdt_getarg, + .dtps_usermode = NULL, + .dtps_destroy = fasttrap_pid_destroy }; static fasttrap_proc_t * @@ -1593,10 +1601,7 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons * APPLE NOTE: We have no equivalent to crhold, * even though there is a cr_ref filed in ucred. 
*/ - // lck_mtx_lock(&p->p_crlock; - crhold(p->p_ucred); - cred = p->p_ucred; - // lck_mtx_unlock(&p->p_crlock); + cred = kauth_cred_proc_ref(p); proc_unlock(p); new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP); @@ -1625,7 +1630,7 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons lck_mtx_lock(&fp->ftp_mtx); lck_mtx_unlock(&bucket->ftb_mtx); fasttrap_provider_free(new_fp); - crfree(cred); + kauth_cred_unref(&cred); return (fp); } } @@ -1647,7 +1652,7 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons &new_fp->ftp_provid) != 0) { lck_mtx_unlock(&bucket->ftb_mtx); fasttrap_provider_free(new_fp); - crfree(cred); + kauth_cred_unref(&cred); return (NULL); } @@ -1657,7 +1662,8 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons lck_mtx_lock(&new_fp->ftp_mtx); lck_mtx_unlock(&bucket->ftb_mtx); - crfree(cred); + kauth_cred_unref(&cred); + return (new_fp); } @@ -1850,16 +1856,6 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata) if (p == PROC_NULL) return (ESRCH); - /* - * Set that the process is allowed to run modified code and - * bail if it is not allowed to - */ -#if CONFIG_EMBEDDED - if ((p->p_csflags & (CS_KILL|CS_HARD)) && !cs_allow_invalid(p)) { - proc_rele(p); - return (EPERM); - } -#endif if ((provider = fasttrap_provider_lookup(p, pdata->ftps_provider_type, provider_name, &pid_attr)) == NULL) { proc_rele(p); @@ -2339,10 +2335,10 @@ fasttrap_meta_provider_name(void *arg) } static dtrace_mops_t fasttrap_mops = { - fasttrap_meta_create_probe, - fasttrap_meta_provide, - fasttrap_meta_remove, - fasttrap_meta_provider_name + .dtms_create_probe = fasttrap_meta_create_probe, + .dtms_provide_proc = fasttrap_meta_provide, + .dtms_remove_proc = fasttrap_meta_remove, + .dtms_provider_name = fasttrap_meta_provider_name }; /* @@ -2522,22 +2518,11 @@ fasttrap_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int * return (EINVAL); } -static int 
-fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +static void +fasttrap_attach(void) { ulong_t nent; - - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - return (DDI_SUCCESS); - default: - return (DDI_FAILURE); - } - - ddi_report_dev(devi); - fasttrap_devi = devi; + unsigned int i; /* * Install our hooks into fork(2), exec(2), and exit(2). @@ -2553,17 +2538,6 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) */ fasttrap_max = (sane_size >> 28) * 100000; -#if CONFIG_EMBEDDED -#if defined(__LP64__) - /* - * On embedded, the zone map does not grow with the memory size over 1GB - * (see osfmk/vm/vm_init.c) - */ - if (fasttrap_max > 400000) { - fasttrap_max = 400000; - } -#endif -#endif if (fasttrap_max == 0) fasttrap_max = 50000; @@ -2573,8 +2547,12 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) /* * Conjure up the tracepoints hashtable... */ +#ifdef illumos nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE); +#else + nent = FASTTRAP_TPOINTS_DEFAULT_SIZE; +#endif if (nent <= 0 || nent > 0x1000000) nent = FASTTRAP_TPOINTS_DEFAULT_SIZE; @@ -2589,11 +2567,7 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) sizeof (fasttrap_bucket_t), KM_SLEEP); ASSERT(fasttrap_tpoints.fth_table != NULL); - /* - * APPLE NOTE: explicitly initialize all locks... 
- */ - unsigned int i; - for (i=0; i> 4) & fbt_probetab_mask) #define FBT_PROBETAB_SIZE 0x8000 /* 32k entries -- 128K total */ -static dev_info_t *fbt_devi; static int fbt_probetab_size; dtrace_provider_id_t fbt_id; fbt_probe_t **fbt_probetab; @@ -91,12 +94,23 @@ void fbt_init( void ); static const char * critical_blacklist[] = { "Call_DebuggerC", + "DebuggerCall", + "DebuggerTrapWithState", + "DebuggerXCallEnter", + "IOCPURunPlatformPanicActions", + "PEARMDebugPanicHook", + "PEHaltRestart", + "SavePanicInfo", "SysChoked", "_ZN9IOService14newTemperatureElPS_", /* IOService::newTemperature */ "_ZN9IOService26temperatureCriticalForZoneEPS_", /* IOService::temperatureCriticalForZone */ "_ZNK6OSData14getBytesNoCopyEv", /* Data::getBytesNoCopy, IOHibernateSystemWake path */ + "__ZN16IOPlatformExpert11haltRestartEj", + "__ZN18IODTPlatformExpert11haltRestartEj", + "__ZN9IODTNVRAM13savePanicInfoEPhy" "_disable_preemption", "_enable_preemption", + "alternate_debugger_enter", "bcopy_phys", "console_cpu_alloc", "console_cpu_free", @@ -136,12 +150,18 @@ static const char * critical_blacklist[] = "enter_lohandler", "fbt_invop", "fbt_perfCallback", + "get_preemption_level" "get_threadtask", "handle_pending_TLB_flushes", "hw_compare_and_store", "interrupt", + "is_saved_state32", + "kernel_preempt_check", "kernel_trap", "kprintf", + "ks_dispatch_kernel", + "ks_dispatch_user", + "ks_kernel_trap", "lo_alltraps", "lock_debugger", "machine_idle_cstate", @@ -153,6 +173,9 @@ static const char * critical_blacklist[] = "nanotime_to_absolutetime", "packA", "panic", + "phystokv", + "phystokv_range", + "pltrace", "pmKextRegister", "pmMarkAllCPUsOff", "pmSafeMode", @@ -167,18 +190,28 @@ static const char * critical_blacklist[] = "power_management_init", "preemption_underflow_panic", "register_cpu_setup_func", + "ret64_iret" + "ret_to_user" + "return_to_kernel", + "return_to_user", + "saved_state64", "sdt_invop", "sprlock", "sprunlock", + "strlen", + "strncmp", "t_invop", "tmrCvt", - "uread", 
- "uwrite", + "trap_from_kernel", + "uart_putc", "unlock_debugger", "unpackA", "unregister_cpu_setup_func", + "uread", + "uwrite", "vstart" }; + #define CRITICAL_BLACKLIST_COUNT (sizeof(critical_blacklist)/sizeof(critical_blacklist[0])) /* @@ -192,6 +225,7 @@ static const char * probe_ctx_closure[] = "IS_64BIT_PROCESS", "OSCompareAndSwap", "SetIdlePop", + "__dtrace_probe", "absolutetime_to_microtime", "act_set_astbsd", "arm_init_idle_cpu", @@ -287,7 +321,7 @@ fbt_module_excluded(struct modctl* ctl) if (ctl->mod_address == 0 || ctl->mod_size == 0) { return TRUE; } - + if (ctl->mod_loaded == 0) { return TRUE; } @@ -434,9 +468,12 @@ fbt_excluded(const char* name) return TRUE; #endif - #ifdef __x86_64__ if (LIT_STRNSTART(name, "machine_") || + LIT_STRNSTART(name, "idt64") || + LIT_STRNSTART(name, "ks_") || + LIT_STRNSTART(name, "hndl_") || + LIT_STRNSTART(name, "_intr_") || LIT_STRNSTART(name, "mapping_") || LIT_STRNSTART(name, "tsc_") || LIT_STRNSTART(name, "pmCPU") || @@ -532,7 +569,7 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) for (; fbt != NULL; fbt = fbt->fbtp_next) { ctl = fbt->fbtp_ctl; - + if (!ctl->mod_loaded) { if (fbt_verbose) { cmn_err(CE_NOTE, "fbt is failing for probe %s " @@ -556,7 +593,7 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) } continue; - } + } dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { @@ -576,7 +613,7 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) kasan_fakestack_suspend(); #endif - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_patchval)); /* * Make the patched instruction visible via a data + instruction @@ -590,9 +627,9 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) } } - + dtrace_membar_consumer(); - + return (0); } @@ -606,12 +643,12 @@ fbt_disable(void *arg, dtrace_id_t id, void 
*parg) for (; fbt != NULL; fbt = fbt->fbtp_next) { ctl = fbt->fbtp_ctl; - + if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) continue; if (fbt->fbtp_currentval != fbt->fbtp_savedval) { - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); /* * Make the patched instruction visible via a data + instruction @@ -647,19 +684,19 @@ fbt_suspend(void *arg, dtrace_id_t id, void *parg) if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) continue; - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); - + /* * Make the patched instruction visible via a data + instruction * cache flush for the platforms that need it */ flush_dcache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_savedval), 0); invalidate_icache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_savedval), 0); - + fbt->fbtp_currentval = fbt->fbtp_savedval; } - + dtrace_membar_consumer(); } @@ -677,7 +714,7 @@ fbt_resume(void *arg, dtrace_id_t id, void *parg) ASSERT(ctl->mod_nenabled > 0); if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) continue; - + dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { if (fbt_verbose) { @@ -687,123 +724,21 @@ fbt_resume(void *arg, dtrace_id_t id, void *parg) } return; } - - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, + + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_patchval)); -#if CONFIG_EMBEDDED /* * Make the patched instruction visible via a data + instruction cache flush. 
*/ flush_dcache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0); invalidate_icache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0); -#endif - - fbt->fbtp_currentval = fbt->fbtp_patchval; - } - - dtrace_membar_consumer(); -} -/* - * APPLE NOTE: fbt_getargdesc not implemented - */ -#if !defined(__APPLE__) -/*ARGSUSED*/ -static void -fbt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc) -{ - fbt_probe_t *fbt = parg; - struct modctl *ctl = fbt->fbtp_ctl; - struct module *mp = ctl->mod_mp; - ctf_file_t *fp = NULL, *pfp; - ctf_funcinfo_t f; - int error; - ctf_id_t argv[32], type; - int argc = sizeof (argv) / sizeof (ctf_id_t); - const char *parent; - - if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) - goto err; - - if (fbt->fbtp_roffset != 0 && desc->dtargd_ndx == 0) { - (void) strlcpy(desc->dtargd_native, "int", - sizeof(desc->dtargd_native)); - return; - } - - if ((fp = ctf_modopen(mp, &error)) == NULL) { - /* - * We have no CTF information for this module -- and therefore - * no args[] information. - */ - goto err; - } - - /* - * If we have a parent container, we must manually import it. - */ - if ((parent = ctf_parent_name(fp)) != NULL) { - struct modctl *mp = &modules; - struct modctl *mod = NULL; - - /* - * We must iterate over all modules to find the module that - * is our parent. 
- */ - do { - if (strcmp(mp->mod_modname, parent) == 0) { - mod = mp; - break; - } - } while ((mp = mp->mod_next) != &modules); - - if (mod == NULL) - goto err; - - if ((pfp = ctf_modopen(mod->mod_mp, &error)) == NULL) { - goto err; - } - - if (ctf_import(fp, pfp) != 0) { - ctf_close(pfp); - goto err; - } - - ctf_close(pfp); - } - - if (ctf_func_info(fp, fbt->fbtp_symndx, &f) == CTF_ERR) - goto err; - - if (fbt->fbtp_roffset != 0) { - if (desc->dtargd_ndx > 1) - goto err; - - ASSERT(desc->dtargd_ndx == 1); - type = f.ctc_return; - } else { - if (desc->dtargd_ndx + 1 > f.ctc_argc) - goto err; - - if (ctf_func_args(fp, fbt->fbtp_symndx, argc, argv) == CTF_ERR) - goto err; - - type = argv[desc->dtargd_ndx]; - } - - if (ctf_type_name(fp, type, desc->dtargd_native, - DTRACE_ARGTYPELEN) != NULL) { - ctf_close(fp); - return; + fbt->fbtp_currentval = fbt->fbtp_patchval; } -err: - if (fp != NULL) - ctf_close(fp); - desc->dtargd_ndx = DTRACE_ARGNONE; + dtrace_membar_consumer(); } -#endif /* __APPLE__ */ static void fbt_provide_module_user_syms(struct modctl *ctl) @@ -827,11 +762,8 @@ fbt_provide_module_user_syms(struct modctl *ctl) if (*name == '_') name += 1; - /* - * We're only blacklisting functions in the kernel for now. 
- */ - if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) - continue; + if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) + continue; /* * Ignore symbols with a null address @@ -839,11 +771,139 @@ fbt_provide_module_user_syms(struct modctl *ctl) if (!symbol->dtsym_addr) continue; - fbt_provide_probe(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr); + /* + * Ignore symbols not part of this module + */ + if (!dtrace_addr_in_module((void*)symbol->dtsym_addr, ctl)) + continue; + + fbt_provide_probe(ctl, modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr, (machine_inst_t*)(uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size)); } } } +static void +fbt_provide_kernel_section(struct modctl *ctl, kernel_section_t *sect, kernel_nlist_t *sym, uint32_t nsyms, const char *strings) +{ + uintptr_t sect_start = (uintptr_t)sect->addr; + uintptr_t sect_end = (uintptr_t)sect->size + sect->addr; + unsigned int i; + + if ((sect->flags & S_ATTR_PURE_INSTRUCTIONS) != S_ATTR_PURE_INSTRUCTIONS) { + return; + } + + for (i = 0; i < nsyms; i++) { + uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); + const char *name = strings + sym[i].n_un.n_strx; + uint64_t limit; + + if (sym[i].n_value < sect_start || sym[i].n_value > sect_end) + continue; + + /* Check that the symbol is a global and that it has a name. */ + if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) + continue; + + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ + continue; + + /* Lop off omnipresent leading underscore. 
*/ + if (*name == '_') + name += 1; + +#if defined(__arm__) + // Skip non-thumb functions on arm32 + if (sym[i].n_sect == 1 && !(sym[i].n_desc & N_ARM_THUMB_DEF)) { + continue; + } +#endif /* defined(__arm__) */ + + if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) + continue; + + /* + * Find the function boundary by looking at either the + * end of the section or the beginning of the next symbol + */ + if (i == nsyms - 1) { + limit = sect_end; + } + else { + limit = sym[i + 1].n_value; + } + + fbt_provide_probe(ctl, ctl->mod_modname, name, (machine_inst_t*)sym[i].n_value, (machine_inst_t*)limit); + } + +} + +static int +fbt_sym_cmp(const void *ap, const void *bp) +{ + return (int)(((const kernel_nlist_t*)ap)->n_value - ((const kernel_nlist_t*)bp)->n_value); +} + +static void +fbt_provide_module_kernel_syms(struct modctl *ctl) +{ + kernel_mach_header_t *mh = (kernel_mach_header_t *)(ctl->mod_address); + kernel_segment_command_t *seg; + struct load_command *cmd; + kernel_segment_command_t *linkedit = NULL; + struct symtab_command *symtab = NULL; + kernel_nlist_t *syms = NULL, *sorted_syms = NULL; + const char *strings; + unsigned int i; + size_t symlen; + + if (mh->magic != MH_MAGIC_KERNEL) + return; + + cmd = (struct load_command *) &mh[1]; + for (i = 0; i < mh->ncmds; i++) { + if (cmd->cmd == LC_SEGMENT_KERNEL) { + kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; + if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) + linkedit = orig_sg; + } else if (cmd->cmd == LC_SYMTAB) { + symtab = (struct symtab_command *) cmd; + } + if (symtab && linkedit) { + break; + } + cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); + } + if ((symtab == NULL) || (linkedit == NULL)) { + return; + } + + syms = (kernel_nlist_t *)(linkedit->vmaddr + symtab->symoff - linkedit->fileoff); + strings = (const char *)(linkedit->vmaddr + symtab->stroff - linkedit->fileoff); + + /* + * Make a copy of the symbol table and sort it to not cross into the next function + 
* when disassembling the function + */ + symlen = sizeof(kernel_nlist_t) * symtab->nsyms; + sorted_syms = kmem_alloc(symlen, KM_SLEEP); + bcopy(syms, sorted_syms, symlen); + qsort(sorted_syms, symtab->nsyms, sizeof(kernel_nlist_t), fbt_sym_cmp); + + for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { + kernel_section_t *sect = firstsect(seg); + + if (strcmp(seg->segname, "__KLD") == 0) { + continue; + } + + for (sect = firstsect(seg); sect != NULL; sect = nextsect(seg, sect)) { + fbt_provide_kernel_section(ctl, sect, sorted_syms, symtab->nsyms, strings); + } + } + + kmem_free(sorted_syms, symlen); +} void fbt_provide_module(void *arg, struct modctl *ctl) @@ -893,16 +953,16 @@ static dtrace_pattr_t fbt_attr = { }; static dtrace_pops_t fbt_pops = { - NULL, - fbt_provide_module, - fbt_enable, - fbt_disable, - fbt_suspend, - fbt_resume, - NULL, /* APPLE NOTE: fbt_getargdesc not implemented */ - NULL, - NULL, - fbt_destroy + .dtps_provide = NULL, + .dtps_provide_module = fbt_provide_module, + .dtps_enable = fbt_enable, + .dtps_disable = fbt_disable, + .dtps_suspend = fbt_suspend, + .dtps_resume = fbt_resume, + .dtps_getargdesc = NULL, /* APPLE NOTE: fbt_getargdesc implemented in userspace */ + .dtps_getargval = NULL, + .dtps_usermode = NULL, + .dtps_destroy = fbt_destroy }; static void @@ -916,17 +976,8 @@ fbt_cleanup(dev_info_t *devi) } static int -fbt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +fbt_attach(dev_info_t *devi) { - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - return (DDI_SUCCESS); - default: - return (DDI_FAILURE); - } - if (fbt_probetab_size == 0) fbt_probetab_size = FBT_PROBETAB_SIZE; @@ -944,9 +995,6 @@ fbt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) return (DDI_FAILURE); } - ddi_report_dev(devi); - fbt_devi = devi; - return (DDI_SUCCESS); } @@ -1024,8 +1072,6 @@ static struct cdevsw fbt_cdevsw = 0 /* type */ }; -static int fbt_inited = 0; - #undef kmem_alloc /* from its binding to dt_kmem_alloc 
glue */ #undef kmem_free /* from its binding to dt_kmem_free glue */ #include @@ -1033,22 +1079,15 @@ static int fbt_inited = 0; void fbt_init( void ) { - if (0 == fbt_inited) - { - int majdevno = cdevsw_add(FBT_MAJOR, &fbt_cdevsw); - - if (majdevno < 0) { - printf("fbt_init: failed to allocate a major number!\n"); - return; - } - - PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof (ignore_fbt_blacklist)); + int majdevno = cdevsw_add(FBT_MAJOR, &fbt_cdevsw); - fbt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - - fbt_inited = 1; /* Ensure this initialization occurs just one time. */ + if (majdevno < 0) { + printf("fbt_init: failed to allocate a major number!\n"); + return; } - else - panic("fbt_init: called twice!\n"); + + PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof (ignore_fbt_blacklist)); + + fbt_attach((dev_info_t*)(uintptr_t)majdevno); } #undef FBT_MAJOR diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index 2aebd0a1e..3d06c46f4 100644 --- a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -176,25 +176,6 @@ lockstat_probe_t lockstat_probes[] = dtrace_id_t lockstat_probemap[LS_NPROBES]; -#if CONFIG_DTRACE -#if defined(__x86_64__) -extern void lck_mtx_lock_lockstat_patch_point(void); -extern void lck_mtx_try_lock_lockstat_patch_point(void); -extern void lck_mtx_try_lock_spin_lockstat_patch_point(void); -extern void lck_mtx_unlock_lockstat_patch_point(void); -extern void lck_mtx_lock_ext_lockstat_patch_point(void); -extern void lck_mtx_ext_unlock_lockstat_patch_point(void); -extern void lck_mtx_lock_spin_lockstat_patch_point(void); -#endif -#if defined (__arm__) - -#endif - -#if defined (__arm64__) - -#endif -#endif /* CONFIG_DTRACE */ - typedef struct lockstat_assembly_probe { int lsap_probe; vm_offset_t * lsap_patch_point; @@ -203,26 +184,8 @@ typedef struct lockstat_assembly_probe { lockstat_assembly_probe_t assembly_probes[] = { -#if CONFIG_DTRACE -#if defined(__x86_64__) 
- /* - * On x86 these points are better done via hot patches, which ensure - * there is zero overhead when not in use. On x86 these patch points - * are swapped between the return instruction and a no-op, with the - * Dtrace call following the return. - */ - { LS_LCK_MTX_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_lock_lockstat_patch_point }, - { LS_LCK_MTX_TRY_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_try_lock_lockstat_patch_point }, - { LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_try_lock_spin_lockstat_patch_point }, - { LS_LCK_MTX_UNLOCK_RELEASE, (vm_offset_t *) lck_mtx_unlock_lockstat_patch_point }, - { LS_LCK_MTX_EXT_LOCK_ACQUIRE, (vm_offset_t *) lck_mtx_lock_ext_lockstat_patch_point }, - { LS_LCK_MTX_EXT_UNLOCK_RELEASE, (vm_offset_t *) lck_mtx_ext_unlock_lockstat_patch_point }, - { LS_LCK_MTX_LOCK_SPIN_ACQUIRE, (vm_offset_t *) lck_mtx_lock_spin_lockstat_patch_point }, -#endif - /* No assembly patch points for ARM */ -#endif /* CONFIG_DTRACE */ { LS_LCK_INVALID, NULL } -}; + }; /* @@ -290,7 +253,6 @@ lockstat_probe_wrapper(int probe, uintptr_t lp, int rwflag) } } -static dev_info_t *lockstat_devi; /* saved in xxattach() for xxinfo() */ static dtrace_provider_id_t lockstat_id; /*ARGSUSED*/ @@ -410,30 +372,21 @@ static dtrace_pattr_t lockstat_attr = { }; static dtrace_pops_t lockstat_pops = { - lockstat_provide, - NULL, - lockstat_enable, - lockstat_disable, - NULL, - NULL, - lockstat_getargdesc, - NULL, - NULL, - lockstat_destroy + .dtps_provide = lockstat_provide, + .dtps_provide_module = NULL, + .dtps_enable = lockstat_enable, + .dtps_disable = lockstat_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = lockstat_getargdesc, + .dtps_getargval = NULL, + .dtps_usermode = NULL, + .dtps_destroy = lockstat_destroy }; static int -lockstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +lockstat_attach(dev_info_t *devi) { - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - return (DDI_SUCCESS); - default: - return 
(DDI_FAILURE); - } - if (ddi_create_minor_node(devi, "lockstat", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE || dtrace_register("lockstat", &lockstat_attr, DTRACE_PRIV_KERNEL, @@ -445,8 +398,6 @@ lockstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) lockstat_probe = dtrace_probe; membar_producer(); - ddi_report_dev(devi); - lockstat_devi = devi; return (DDI_SUCCESS); } @@ -482,25 +433,17 @@ static struct cdevsw lockstat_cdevsw = 0 /* type */ }; -static int gLockstatInited = 0; - void lockstat_init( void ); void lockstat_init( void ) { - if (0 == gLockstatInited) - { - int majdevno = cdevsw_add(LOCKSTAT_MAJOR, &lockstat_cdevsw); - - if (majdevno < 0) { - printf("lockstat_init: failed to allocate a major number!\n"); - gLockstatInited = 0; - return; - } + int majdevno = cdevsw_add(LOCKSTAT_MAJOR, &lockstat_cdevsw); + + if (majdevno < 0) { + printf("lockstat_init: failed to allocate a major number!\n"); + return; + } - lockstat_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - gLockstatInited = 1; - } else - panic("lockstat_init: called twice!\n"); + lockstat_attach((dev_info_t*)(uintptr_t)majdevno); } #undef LOCKSTAT_MAJOR diff --git a/bsd/dev/dtrace/profile_prvd.c b/bsd/dev/dtrace/profile_prvd.c index 259fab8bc..a76f901c4 100644 --- a/bsd/dev/dtrace/profile_prvd.c +++ b/bsd/dev/dtrace/profile_prvd.c @@ -65,7 +65,6 @@ extern struct arm_saved_state *find_kern_regs(thread_t); extern void profile_init(void); -static dev_info_t *profile_devi; static dtrace_provider_id_t profile_id; /* @@ -645,30 +644,21 @@ static dtrace_pattr_t profile_attr = { }; static dtrace_pops_t profile_pops = { - profile_provide, - NULL, - profile_enable, - profile_disable, - NULL, - NULL, - profile_getargdesc, - profile_getarg, - profile_usermode, - profile_destroy + .dtps_provide = profile_provide, + .dtps_provide_module = NULL, + .dtps_enable = profile_enable, + .dtps_disable = profile_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = profile_getargdesc, + 
.dtps_getargval = profile_getarg, + .dtps_usermode = profile_usermode, + .dtps_destroy = profile_destroy }; static int -profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +profile_attach(dev_info_t *devi) { - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - return (DDI_SUCCESS); - default: - return (DDI_FAILURE); - } - if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE || dtrace_register("profile", &profile_attr, @@ -680,8 +670,6 @@ profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) profile_max = PROFILE_MAX_DEFAULT; - ddi_report_dev(devi); - profile_devi = devi; return (DDI_SUCCESS); } @@ -741,24 +729,15 @@ static struct cdevsw profile_cdevsw = 0 /* type */ }; -static int gProfileInited = 0; - void profile_init( void ) { - if (0 == gProfileInited) - { - int majdevno = cdevsw_add(PROFILE_MAJOR, &profile_cdevsw); - - if (majdevno < 0) { - printf("profile_init: failed to allocate a major number!\n"); - gProfileInited = 0; - return; - } + int majdevno = cdevsw_add(PROFILE_MAJOR, &profile_cdevsw); - profile_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); + if (majdevno < 0) { + printf("profile_init: failed to allocate a major number!\n"); + return; + } - gProfileInited = 1; - } else - panic("profile_init: called twice!\n"); + profile_attach( (dev_info_t*)(uintptr_t)majdevno); } #undef PROFILE_MAJOR diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c index 35937cdf3..1abf6f14c 100644 --- a/bsd/dev/dtrace/sdt.c +++ b/bsd/dev/dtrace/sdt.c @@ -81,7 +81,6 @@ extern kern_return_t fbt_perfCallback(int, struct savearea_t *, uintptr_t *, int #define DTRACE_PROBE_PREFIX "_dtrace_probe$" -static dev_info_t *sdt_devi; static int sdt_verbose = 0; sdt_probe_t **sdt_probetab; int sdt_probetab_size; @@ -328,23 +327,22 @@ sdt_disable(void *arg, dtrace_id_t id, void *parg) } static dtrace_pops_t sdt_pops = { - NULL, - sdt_provide_module, - sdt_enable, - sdt_disable, - NULL, - NULL, - sdt_getargdesc, - sdt_getarg, - 
NULL, - sdt_destroy + .dtps_provide = NULL, + .dtps_provide_module = sdt_provide_module, + .dtps_enable = sdt_enable, + .dtps_disable = sdt_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = sdt_getargdesc, + .dtps_getargval = sdt_getarg, + .dtps_usermode = NULL, + .dtps_destroy = sdt_destroy, }; /*ARGSUSED*/ static int -sdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +sdt_attach(dev_info_t *devi) { -#pragma unused(cmd) sdt_provider_t *prov; if (ddi_create_minor_node(devi, "sdt", S_IFCHR, @@ -354,9 +352,6 @@ sdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) return (DDI_FAILURE); } - ddi_report_dev(devi); - sdt_devi = devi; - if (sdt_probetab_size == 0) sdt_probetab_size = SDT_PROBETAB_SIZE; @@ -446,164 +441,162 @@ static struct cdevsw sdt_cdevsw = 0 /* type */ }; -static int gSDTInited = 0; static struct modctl g_sdt_kernctl; static struct module g_sdt_mach_module; #include #include -void sdt_init( void ) +void sdt_early_init( void ) { - if (0 == gSDTInited) - { - int majdevno = cdevsw_add(SDT_MAJOR, &sdt_cdevsw); + if (dtrace_sdt_probes_restricted()) { + return; + } + if (MH_MAGIC_KERNEL != _mh_execute_header.magic) { + g_sdt_kernctl.mod_address = (vm_address_t)NULL; + g_sdt_kernctl.mod_size = 0; + } else { + kernel_mach_header_t *mh; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + kernel_nlist_t *sym = NULL; + char *strings; + unsigned int i; - if (majdevno < 0) { - printf("sdt_init: failed to allocate a major number!\n"); - gSDTInited = 0; - return; + g_sdt_mach_module.sdt_nprobes = 0; + g_sdt_mach_module.sdt_probes = NULL; + + g_sdt_kernctl.mod_address = (vm_address_t)&g_sdt_mach_module; + g_sdt_kernctl.mod_size = 0; + strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); + + g_sdt_kernctl.mod_next = NULL; + g_sdt_kernctl.mod_stale = NULL; + g_sdt_kernctl.mod_id = 0; + g_sdt_kernctl.mod_loadcnt = 1; + g_sdt_kernctl.mod_loaded 
= 1; + g_sdt_kernctl.mod_flags = 0; + g_sdt_kernctl.mod_nenabled = 0; + + mh = &_mh_execute_header; + cmd = (struct load_command*) &mh[1]; + for (i = 0; i < mh->ncmds; i++) { + if (cmd->cmd == LC_SEGMENT_KERNEL) { + kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; + + if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) + orig_ts = orig_sg; + else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) + orig_le = orig_sg; + else if (LIT_STRNEQL(orig_sg->segname, "")) + orig_ts = orig_sg; /* kexts have a single unnamed segment */ + } + else if (cmd->cmd == LC_SYMTAB) + orig_st = (struct symtab_command *) cmd; + + cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize); } - - if (dtrace_sdt_probes_restricted()) { + + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) return; - } - - if (MH_MAGIC_KERNEL != _mh_execute_header.magic) { - g_sdt_kernctl.mod_address = (vm_address_t)NULL; - g_sdt_kernctl.mod_size = 0; - } else { - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - kernel_nlist_t *sym = NULL; - char *strings; - unsigned int i; + + sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); + strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); + + for (i = 0; i < orig_st->nsyms; i++) { + uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); + char *name = strings + sym[i].n_un.n_strx; + const char *prev_name; + unsigned long best; + unsigned int j; - g_sdt_mach_module.sdt_nprobes = 0; - g_sdt_mach_module.sdt_probes = NULL; + /* Check that the symbol is a global and that it has a name. */ + if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) + continue; - g_sdt_kernctl.mod_address = (vm_address_t)&g_sdt_mach_module; - g_sdt_kernctl.mod_size = 0; - strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. 
*/ + continue; - g_sdt_kernctl.mod_next = NULL; - g_sdt_kernctl.mod_stale = NULL; - g_sdt_kernctl.mod_id = 0; - g_sdt_kernctl.mod_loadcnt = 1; - g_sdt_kernctl.mod_loaded = 1; - g_sdt_kernctl.mod_flags = 0; - g_sdt_kernctl.mod_nenabled = 0; + /* Lop off omnipresent leading underscore. */ + if (*name == '_') + name += 1; - mh = &_mh_execute_header; - cmd = (struct load_command*) &mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT_KERNEL) { - kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single unnamed segment */ - } - else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; + if (strncmp(name, DTRACE_PROBE_PREFIX, sizeof(DTRACE_PROBE_PREFIX) - 1) == 0) { + sdt_probedesc_t *sdpd = kmem_alloc(sizeof(sdt_probedesc_t), KM_SLEEP); + int len = strlen(name) + 1; - cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - for (i = 0; i < orig_st->nsyms; i++) { - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; - const char *prev_name; - unsigned long best; - unsigned int j; + sdpd->sdpd_name = kmem_alloc(len, KM_SLEEP); + strncpy(sdpd->sdpd_name, name, len); /* NUL termination is ensured. */ - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; + prev_name = ""; + best = 0; - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. 
*/ - if (*name == '_') - name += 1; - - if (strncmp(name, DTRACE_PROBE_PREFIX, sizeof(DTRACE_PROBE_PREFIX) - 1) == 0) { - sdt_probedesc_t *sdpd = kmem_alloc(sizeof(sdt_probedesc_t), KM_SLEEP); - int len = strlen(name) + 1; - - sdpd->sdpd_name = kmem_alloc(len, KM_SLEEP); - strncpy(sdpd->sdpd_name, name, len); /* NUL termination is ensured. */ - - prev_name = ""; - best = 0; - - /* - * Find the symbol immediately preceding the sdt probe site just discovered, - * that symbol names the function containing the sdt probe. - */ - for (j = 0; j < orig_st->nsyms; j++) { - uint8_t jn_type = sym[j].n_type & N_TYPE; - char *jname = strings + sym[j].n_un.n_strx; - - if ((N_SECT != jn_type && N_ABS != jn_type)) - continue; - - if (0 == sym[j].n_un.n_strx) /* iff a null, "", name. */ - continue; - - if (*jname == '_') - jname += 1; - - if (*(unsigned long *)sym[i].n_value <= (unsigned long)sym[j].n_value) - continue; - - if ((unsigned long)sym[j].n_value > best) { - best = (unsigned long)sym[j].n_value; - prev_name = jname; - } + /* + * Find the symbol immediately preceding the sdt probe site just discovered, + * that symbol names the function containing the sdt probe. + */ + for (j = 0; j < orig_st->nsyms; j++) { + uint8_t jn_type = sym[j].n_type & N_TYPE; + char *jname = strings + sym[j].n_un.n_strx; + + if ((N_SECT != jn_type && N_ABS != jn_type)) + continue; + + if (0 == sym[j].n_un.n_strx) /* iff a null, "", name. */ + continue; + + if (*jname == '_') + jname += 1; + + if (*(unsigned long *)sym[i].n_value <= (unsigned long)sym[j].n_value) + continue; + + if ((unsigned long)sym[j].n_value > best) { + best = (unsigned long)sym[j].n_value; + prev_name = jname; } - - sdpd->sdpd_func = kmem_alloc((len = strlen(prev_name) + 1), KM_SLEEP); - strncpy(sdpd->sdpd_func, prev_name, len); /* NUL termination is ensured. 
*/ - - sdpd->sdpd_offset = *(unsigned long *)sym[i].n_value; + } + + sdpd->sdpd_func = kmem_alloc((len = strlen(prev_name) + 1), KM_SLEEP); + strncpy(sdpd->sdpd_func, prev_name, len); /* NUL termination is ensured. */ + + sdpd->sdpd_offset = *(unsigned long *)sym[i].n_value; #if defined(__arm__) - /* PR8353094 - mask off thumb-bit */ - sdpd->sdpd_offset &= ~0x1U; + /* PR8353094 - mask off thumb-bit */ + sdpd->sdpd_offset &= ~0x1U; #elif defined(__arm64__) - sdpd->sdpd_offset &= ~0x1LU; + sdpd->sdpd_offset &= ~0x1LU; #endif /* __arm__ */ #if 0 - printf("sdt_init: sdpd_offset=0x%lx, n_value=0x%lx, name=%s\n", - sdpd->sdpd_offset, *(unsigned long *)sym[i].n_value, name); + printf("sdt_init: sdpd_offset=0x%lx, n_value=0x%lx, name=%s\n", + sdpd->sdpd_offset, *(unsigned long *)sym[i].n_value, name); #endif - sdpd->sdpd_next = g_sdt_mach_module.sdt_probes; - g_sdt_mach_module.sdt_probes = sdpd; - } else { - prev_name = name; - } + sdpd->sdpd_next = g_sdt_mach_module.sdt_probes; + g_sdt_mach_module.sdt_probes = sdpd; + } else { + prev_name = name; } } + } +} + +void sdt_init( void ) +{ + int majdevno = cdevsw_add(SDT_MAJOR, &sdt_cdevsw); - sdt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - - gSDTInited = 1; - } else - panic("sdt_init: called twice!\n"); + if (majdevno < 0) { + printf("sdt_init: failed to allocate a major number!\n"); + return; + } + + if (dtrace_sdt_probes_restricted()) { + return; + } + + sdt_attach((dev_info_t*)(uintptr_t)majdevno); } #undef SDT_MAJOR diff --git a/bsd/dev/dtrace/sdt_subr.c b/bsd/dev/dtrace/sdt_subr.c index 3fc2b9aa0..03174ee08 100644 --- a/bsd/dev/dtrace/sdt_subr.c +++ b/bsd/dev/dtrace/sdt_subr.c @@ -136,6 +136,7 @@ sdt_argdesc_t sdt_args[] = { { "proc", "exec-failure", 0, 0, "int", NULL }, /* proc:::exec-success has no arguments */ { "proc", "exit", 0, 0, "int", NULL }, + { "proc", "exited", 0, 0, "struct proc *", "psinfo_t *"}, { "proc", "fault", 0, 0, "int", NULL }, { "proc", "fault", 1, 1, "siginfo_t *", NULL }, { "proc", 
"lwp-create", 0, 0, "struct thread *", "lwpsinfo_t *" }, diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index 9c0c34f63..10ba83433 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -376,7 +376,6 @@ dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv) #error 1 << SYSTRACE_SHIFT must exceed number of system calls #endif -static dev_info_t *systrace_devi; static dtrace_provider_id_t systrace_id; /* @@ -532,31 +531,22 @@ static dtrace_pattr_t systrace_attr = { }; static dtrace_pops_t systrace_pops = { - systrace_provide, - NULL, - systrace_enable, - systrace_disable, - NULL, - NULL, - systrace_getargdesc, - systrace_getargval, - NULL, - systrace_destroy + .dtps_provide = systrace_provide, + .dtps_provide_module = NULL, + .dtps_enable = systrace_enable, + .dtps_disable = systrace_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = systrace_getargdesc, + .dtps_getargval = systrace_getargval, + .dtps_usermode = NULL, + .dtps_destroy = systrace_destroy }; static int -systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +systrace_attach(dev_info_t *devi) { - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - return (DDI_SUCCESS); - default: - return (DDI_FAILURE); - } - - systrace_probe = (void(*))&dtrace_probe; + systrace_probe = (void*)&dtrace_probe; membar_enter(); if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0, @@ -568,9 +558,6 @@ systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) return (DDI_FAILURE); } - ddi_report_dev(devi); - systrace_devi = devi; - return (DDI_SUCCESS); } @@ -657,7 +644,6 @@ void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t, static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int); -static dev_info_t *machtrace_devi; static dtrace_provider_id_t machtrace_id; static kern_return_t @@ -897,30 +883,21 @@ static dtrace_pattr_t machtrace_attr = { }; static dtrace_pops_t machtrace_pops = { - machtrace_provide, - NULL, - 
machtrace_enable, - machtrace_disable, - NULL, - NULL, - NULL, - machtrace_getarg, - NULL, - machtrace_destroy + .dtps_provide = machtrace_provide, + .dtps_provide_module = NULL, + .dtps_enable = machtrace_enable, + .dtps_disable = machtrace_disable, + .dtps_suspend = NULL, + .dtps_resume = NULL, + .dtps_getargdesc = NULL, + .dtps_getargval = machtrace_getarg, + .dtps_usermode = NULL, + .dtps_destroy = machtrace_destroy }; static int -machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +machtrace_attach(dev_info_t *devi) { - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - return (DDI_SUCCESS); - default: - return (DDI_FAILURE); - } - machtrace_probe = dtrace_probe; membar_enter(); @@ -928,14 +905,11 @@ machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) DDI_PSEUDO, 0) == DDI_FAILURE || dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL, &machtrace_pops, NULL, &machtrace_id) != 0) { - machtrace_probe = (void (*))&systrace_stub; + machtrace_probe = (void*)&systrace_stub; ddi_remove_minor_node(devi, NULL); return (DDI_FAILURE); } - ddi_report_dev(devi); - machtrace_devi = devi; - return (DDI_SUCCESS); } @@ -971,31 +945,23 @@ static struct cdevsw systrace_cdevsw = 0 /* type */ }; -static int gSysTraceInited = 0; - void systrace_init( void ); void systrace_init( void ) { - if (0 == gSysTraceInited) { - if (dtrace_sdt_probes_restricted()) { - return; - } - - int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw); + if (dtrace_sdt_probes_restricted()) { + return; + } - if (majdevno < 0) { - printf("systrace_init: failed to allocate a major number!\n"); - gSysTraceInited = 0; - return; - } + int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw); - systrace_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - machtrace_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); + if (majdevno < 0) { + printf("systrace_init: failed to allocate a major number!\n"); + return; + } - gSysTraceInited = 1; - } else - 
panic("systrace_init: called twice!\n"); + systrace_attach((dev_info_t*)(uintptr_t)majdevno); + machtrace_attach((dev_info_t*)(uintptr_t)majdevno); } #undef SYSTRACE_MAJOR @@ -1012,7 +978,7 @@ systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes uargs = uthread->t_dtrace_syscall_args; if (!uargs) return(0); - if (argno < 0 || argno > SYSTRACE_NARGS) + if (argno < 0 || argno >= SYSTRACE_NARGS) return(0); DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); diff --git a/bsd/dev/i386/dtrace_isa.c b/bsd/dev/i386/dtrace_isa.c index cb6795580..458fc15b3 100644 --- a/bsd/dev/i386/dtrace_isa.c +++ b/bsd/dev/i386/dtrace_isa.c @@ -183,6 +183,11 @@ dtrace_getreg(struct regs *savearea, uint_t reg) boolean_t is64Bit = proc_is64bit(current_proc()); x86_saved_state_t *regs = (x86_saved_state_t *)savearea; + if (regs == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } + if (is64Bit) { if (reg <= SS) { reg = regmap[reg]; diff --git a/bsd/dev/i386/dtrace_subr_x86.c b/bsd/dev/i386/dtrace_subr_x86.c index c4ab38a31..a5064d688 100644 --- a/bsd/dev/i386/dtrace_subr_x86.c +++ b/bsd/dev/i386/dtrace_subr_x86.c @@ -307,3 +307,9 @@ dtrace_safe_defer_signal(void) return 0; } + +void +dtrace_flush_caches(void) +{ + +} diff --git a/bsd/dev/i386/fasttrap_isa.c b/bsd/dev/i386/fasttrap_isa.c index a70039322..0e9e97849 100644 --- a/bsd/dev/i386/fasttrap_isa.c +++ b/bsd/dev/i386/fasttrap_isa.c @@ -1384,7 +1384,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) case FASTTRAP_T_COMMON: { - user_addr_t addr; + user_addr_t addr, write_addr; uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7]; uint_t i = 0; @@ -1428,8 +1428,9 @@ fasttrap_pid_probe32(x86_saved_state_t *regs) */ addr = uthread->t_dtrace_scratch->addr; + write_addr = uthread->t_dtrace_scratch->write_addr; - if (addr == 0LL) { + if (addr == 0LL || write_addr == 0LL) { fasttrap_sigtrap(p, uthread, pc); // Should be killing target proc new_pc = pc; break; @@ -1458,7 +1459,7 @@ fasttrap_pid_probe32(x86_saved_state_t 
*regs) ASSERT(i <= sizeof (scratch)); - if (fasttrap_copyout(scratch, addr, i)) { + if (fasttrap_copyout(scratch, write_addr, i)) { fasttrap_sigtrap(p, uthread, pc); new_pc = pc; break; @@ -1938,7 +1939,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) case FASTTRAP_T_COMMON: { - user_addr_t addr; + user_addr_t addr, write_addr; uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22]; uint_t i = 0; @@ -2026,8 +2027,9 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) */ addr = uthread->t_dtrace_scratch->addr; + write_addr = uthread->t_dtrace_scratch->write_addr; - if (addr == 0LL) { + if (addr == 0LL || write_addr == 0LL) { fasttrap_sigtrap(p, uthread, pc); // Should be killing target proc new_pc = pc; break; @@ -2117,7 +2119,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs) ASSERT(i <= sizeof (scratch)); - if (fasttrap_copyout(scratch, addr, i)) { + if (fasttrap_copyout(scratch, write_addr, i)) { fasttrap_sigtrap(p, uthread, pc); new_pc = pc; break; diff --git a/bsd/dev/i386/fbt_x86.c b/bsd/dev/i386/fbt_x86.c index 2ff70daee..63d1a8430 100644 --- a/bsd/dev/i386/fbt_x86.c +++ b/bsd/dev/i386/fbt_x86.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include @@ -110,7 +110,7 @@ int fbt_invop(uintptr_t addr, uintptr_t *state, uintptr_t rval) { fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)]; - + for (; fbt != NULL; fbt = fbt->fbtp_hashnext) { if ((uintptr_t)fbt->fbtp_patchpoint == addr) { @@ -180,7 +180,7 @@ fbt_perfCallback( "_dtrace_invop_callsite_post:\n" " .quad Ldtrace_invop_callsite_post_label\n" ".text\n" - ); + ); switch (emul) { case DTRACE_INVOP_NOP: @@ -198,7 +198,7 @@ fbt_perfCallback( case DTRACE_INVOP_LEAVE: /* * Emulate first micro-op of patched leave: mov %rbp,%rsp - * fp points just below the return address slot for target's ret + * fp points just below the return address slot for target's ret * and at the slot holding the frame pointer saved by the target's prologue. 
*/ fp = saved_state->rbp; @@ -247,7 +247,7 @@ fbt_perfCallback( retval = KERN_SUCCESS; break; - + default: retval = KERN_FAILURE; break; @@ -263,7 +263,7 @@ fbt_perfCallback( } void -fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart) +fbt_provide_probe(struct modctl *ctl, const char *modname, const char* symbolName, machine_inst_t* symbolStart, machine_inst_t* instrHigh) { unsigned int j; unsigned int doenable = 0; @@ -272,37 +272,36 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c fbt_probe_t *newfbt, *retfbt, *entryfbt; machine_inst_t *instr, *limit, theInstr, i1, i2, i3; int size; - + /* * Guard against null symbols */ - if (!symbolStart || !instrLow || !instrHigh) { + if (!symbolStart || !instrHigh || instrHigh < symbolStart) { kprintf("dtrace: %s has an invalid address\n", symbolName); return; } for (j = 0, instr = symbolStart, theInstr = 0; - (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); - j++) { + (j < 4) && (instrHigh > (instr + 2)); j++) { theInstr = instr[0]; if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) break; - + if ((size = dtrace_instr_size(instr)) <= 0) break; - + instr += size; } - + if (theInstr != FBT_PUSH_RBP) return; - + i1 = instr[1]; i2 = instr[2]; i3 = instr[3]; - + limit = (machine_inst_t *)instrHigh; - + if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) { instr += 1; /* Advance to the mov %rsp,%rbp */ theInstr = i1; @@ -319,26 +318,26 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c * 000006d8 pushl %ebp * 000006d9 movl $0x00000004,%edx * 000006de movl %esp,%ebp - * + * * Try the next instruction, to see if it is a movl %esp,%ebp */ - + instr += 1; /* Advance past the pushl %ebp */ if ((size = dtrace_instr_size(instr)) <= 0) return; - + instr += size; - + if ((instr + 1) >= limit) 
return; - + i1 = instr[0]; i2 = instr[1]; - + if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) return; - + /* instr already points at the movl %esp,%ebp */ theInstr = i1; } @@ -346,7 +345,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY); newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); - + if (thisid != 0) { /* * The dtrace_probe previously existed, so we have to hook @@ -360,13 +359,13 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c for(; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) { if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval) doenable++; - + if (entryfbt->fbtp_next == NULL) { entryfbt->fbtp_next = newfbt; newfbt->fbtp_id = entryfbt->fbtp_id; break; } - } + } } else { /* @@ -377,7 +376,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt); doenable = 0; } - + newfbt->fbtp_patchpoint = instr; newfbt->fbtp_ctl = ctl; newfbt->fbtp_loadcnt = ctl->mod_loadcnt; @@ -387,18 +386,18 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c newfbt->fbtp_currentval = 0; newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt; - + if (doenable) fbt_enable(NULL, newfbt->fbtp_id, newfbt); - + /* * The fbt entry chain is in place, one entry point per symbol. * The fbt return chain can have multiple return points per symbol. * Here we find the end of the fbt return chain. 
*/ - + doenable=0; - + thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN); if (thisid != 0) { /* The dtrace_probe previously existed, so we have to @@ -420,11 +419,11 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c doenable = 0; retfbt = NULL; } - + again: if (instr >= limit) return; - + /* * If this disassembly fails, then we've likely walked off into * a jump table or some other unsuitable area. Bail out of the @@ -432,7 +431,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c */ if ((size = dtrace_instr_size(instr)) <= 0) return; - + /* * We (desperately) want to avoid erroneously instrumenting a * jump table, especially given that our markers are pretty @@ -447,66 +446,66 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c for (j = 0; j < sizeof (uintptr_t); j++) { uintptr_t check = (uintptr_t)instr - j; uint8_t *ptr; - + if (check < (uintptr_t)symbolStart) break; - + if (check + sizeof (uintptr_t) > (uintptr_t)limit) continue; - + ptr = *(uint8_t **)check; - + if (ptr >= (uint8_t *)symbolStart && ptr < limit) { instr += size; goto again; } } - + /* * OK, it's an instruction. */ theInstr = instr[0]; - + /* Walked onto the start of the next routine? If so, bail out of this function. */ if (theInstr == FBT_PUSH_RBP) return; - + if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) { instr += size; goto again; } - + /* * Found the pop %rbp; or leave. */ machine_inst_t *patch_instr = instr; - + /* * Scan forward for a "ret", or "jmp". */ instr += size; if (instr >= limit) return; - + size = dtrace_instr_size(instr); if (size <= 0) /* Failed instruction decode? 
*/ return; - + theInstr = instr[0]; - + if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) return; - + /* * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! */ newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); - + if (retfbt == NULL) { newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt); @@ -514,12 +513,12 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c retfbt->fbtp_next = newfbt; newfbt->fbtp_id = retfbt->fbtp_id; } - + retfbt = newfbt; newfbt->fbtp_patchpoint = patch_instr; newfbt->fbtp_ctl = ctl; newfbt->fbtp_loadcnt = ctl->mod_loadcnt; - + if (*patch_instr == FBT_POP_RBP) { newfbt->fbtp_rval = DTRACE_INVOP_POP_RBP; } else { @@ -528,87 +527,16 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c } newfbt->fbtp_roffset = (uintptr_t)(patch_instr - (uint8_t *)symbolStart); - + newfbt->fbtp_savedval = *patch_instr; newfbt->fbtp_patchval = FBT_PATCHVAL; newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt; - + if (doenable) fbt_enable(NULL, newfbt->fbtp_id, newfbt); - + instr += size; goto again; } -void -fbt_provide_module_kernel_syms(struct modctl *ctl) -{ - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - kernel_nlist_t *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i; - - mh = (kernel_mach_header_t *)(ctl->mod_address); - modname = ctl->mod_modname; - - if 
(mh->magic != MH_MAGIC_KERNEL) - return; - - cmd = (struct load_command *) &mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT_KERNEL) { - kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single unnamed segment */ - } - else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; - - cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - /* Find extent of the TEXT section */ - instrLow = (uintptr_t)orig_ts->vmaddr; - instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize); - - for (i = 0; i < orig_st->nsyms; i++) { - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; - - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; - - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. */ - if (*name == '_') - name += 1; - - /* - * We're only blacklisting functions in the kernel for now. 
- */ - if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name)) - continue; - - fbt_provide_probe(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value); - } -} diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index 3a8d5dffa..1fa041589 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include diff --git a/bsd/dev/i386/unix_signal.c b/bsd/dev/i386/unix_signal.c index 1c271607f..88e615b8b 100644 --- a/bsd/dev/i386/unix_signal.c +++ b/bsd/dev/i386/unix_signal.c @@ -46,6 +46,9 @@ #include #include #include + +#include + #include /* for thread_abort_safely */ #include @@ -62,8 +65,6 @@ /* Forward: */ -extern boolean_t machine_exception(int, mach_exception_code_t, - mach_exception_subcode_t, int *, mach_exception_subcode_t *); extern kern_return_t thread_getstatus(thread_t act, int flavor, thread_state_t tstate, mach_msg_type_number_t *count); extern kern_return_t thread_setstatus(thread_t thread, int flavor, @@ -99,6 +100,7 @@ struct sigframe32 { int sig; user32_addr_t sinfo; /* siginfo32_t* */ user32_addr_t uctx; /* struct ucontext32 */ + user32_addr_t token; }; /* @@ -190,6 +192,8 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint int stack_size = 0; int infostyle = UC_TRAD; xstate_t sig_xstate; + user_addr_t token_uctx; + kern_return_t kr; thread = current_thread(); ut = get_bsdthread_info(thread); @@ -216,6 +220,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint if (proc_is64bit(p)) { x86_thread_state64_t *tstate64; struct user_ucontext64 uctx64; + user64_addr_t token; flavor = x86_THREAD_STATE64; state_count = x86_THREAD_STATE64_COUNT; @@ -273,6 +278,14 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint */ ua_fp -= sizeof(user_addr_t); + /* + * Generate the validation token for sigreturn + */ + token_uctx = ua_uctxp; + kr = 
machine_thread_siguctx_pointer_convert_to_user(thread, &token_uctx); + assert(kr == KERN_SUCCESS); + token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token; + /* * Build the signal context to be used by sigreturn. */ @@ -318,11 +331,12 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint tstate64->rdx = sig; tstate64->rcx = ua_sip; tstate64->r8 = ua_uctxp; - + tstate64->r9 = token; } else { x86_thread_state32_t *tstate32; struct user_ucontext32 uctx32; struct sigframe32 frame32; + user32_addr_t token; flavor = x86_THREAD_STATE32; state_count = x86_THREAD_STATE32_COUNT; @@ -380,6 +394,15 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint */ ua_fp -= sizeof(frame32.retaddr); + /* + * Generate the validation token for sigreturn + */ + token_uctx = ua_uctxp; + kr = machine_thread_siguctx_pointer_convert_to_user(thread, &token_uctx); + assert(kr == KERN_SUCCESS); + token = CAST_DOWN_EXPLICIT(user32_addr_t, token_uctx) ^ + CAST_DOWN_EXPLICIT(user32_addr_t, ps->ps_sigreturn_token); + /* * Build the argument list for the signal handler. 
* Handler should call sigreturn to get out of it @@ -390,6 +413,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint frame32.catcher = CAST_DOWN_EXPLICIT(user32_addr_t, ua_catcher); frame32.sinfo = CAST_DOWN_EXPLICIT(user32_addr_t, ua_sip); frame32.uctx = CAST_DOWN_EXPLICIT(user32_addr_t, ua_uctxp); + frame32.token = token; if (copyout((caddr_t)&frame32, ua_fp, sizeof (frame32))) goto bad; @@ -674,6 +698,7 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) thread_t thread = current_thread(); struct uthread * ut; + struct sigacts *ps = p->p_sigacts; int error; int onstack = 0; @@ -685,6 +710,9 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) void * fs; int rval = EJUSTRETURN; xstate_t sig_xstate; + uint32_t sigreturn_validation; + user_addr_t token_uctx; + kern_return_t kr; ut = (struct uthread *)get_bsdthread_info(thread); @@ -704,8 +732,15 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) sig_xstate = current_xstate(); + sigreturn_validation = atomic_load_explicit( + &ps->ps_sigreturn_validation, memory_order_relaxed); + token_uctx = uap->uctx; + kr = machine_thread_siguctx_pointer_convert_to_user(thread, &token_uctx); + assert(kr == KERN_SUCCESS); + if (proc_is64bit(p)) { struct user_ucontext64 uctx64; + user64_addr_t token; if ((error = copyin(uap->uctx, (void *)&uctx64, sizeof (uctx64)))) return(error); @@ -724,8 +759,19 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) fs_count = thread_state64[sig_xstate].state_count; fs = (void *)&mctxp->mctx_avx64.fs; + token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token; + if ((user64_addr_t)uap->token != token) { +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn token mismatch: received 0x%llx expected 0x%llx\n", + p->p_comm, p->p_pid, (user64_addr_t)uap->token, token); +#endif /* DEVELOPMENT || DEBUG */ + if (sigreturn_validation != 
PS_SIGRETURN_VALIDATION_DISABLED) { + rval = EINVAL; + } + } } else { struct user_ucontext32 uctx32; + user32_addr_t token; if ((error = copyin(uap->uctx, (void *)&uctx32, sizeof (uctx32)))) return(error); @@ -743,6 +789,18 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) fs_flavor = thread_state32[sig_xstate].flavor; fs_count = thread_state32[sig_xstate].state_count; fs = (void *)&mctxp->mctx_avx32.fs; + + token = CAST_DOWN_EXPLICIT(user32_addr_t, uap->uctx) ^ + CAST_DOWN_EXPLICIT(user32_addr_t, ps->ps_sigreturn_token); + if ((user32_addr_t)uap->token != token) { +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn token mismatch: received 0x%x expected 0x%x\n", + p->p_comm, p->p_pid, (user32_addr_t)uap->token, token); +#endif /* DEVELOPMENT || DEBUG */ + if (sigreturn_validation != PS_SIGRETURN_VALIDATION_DISABLED) { + rval = EINVAL; + } + } } if (onstack) @@ -752,12 +810,21 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) if (ut->uu_siglist & ~ut->uu_sigmask) signal_setast(thread); + + if (rval == EINVAL) { + goto error_ret; + } + /* * thread_set_state() does all the needed checks for the passed in * content */ if (thread_setstatus(thread, ts_flavor, ts, ts_count) != KERN_SUCCESS) { rval = EINVAL; +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn thread_setstatus error %d\n", + p->p_comm, p->p_pid, rval); +#endif /* DEVELOPMENT || DEBUG */ goto error_ret; } @@ -765,6 +832,10 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) if (thread_setstatus(thread, fs_flavor, fs, fs_count) != KERN_SUCCESS) { rval = EINVAL; +#if DEVELOPMENT || DEBUG + printf("process %s[%d] sigreturn thread_setstatus error %d\n", + p->p_comm, p->p_pid, rval); +#endif /* DEVELOPMENT || DEBUG */ goto error_ret; } @@ -774,55 +845,39 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) /* - * machine_exception() performs MD translation - * of a mach exception to a 
unix signal and code. + * machine_exception() performs machine-dependent translation + * of a mach exception to a unix signal. */ - -boolean_t -machine_exception( - int exception, - mach_exception_code_t code, - __unused mach_exception_subcode_t subcode, - int *unix_signal, - mach_exception_code_t *unix_code) +int +machine_exception(int exception, + mach_exception_code_t code, + __unused mach_exception_subcode_t subcode) { - switch(exception) { - - case EXC_BAD_ACCESS: - /* Map GP fault to SIGSEGV, otherwise defer to caller */ - if (code == EXC_I386_GPFLT) { - *unix_signal = SIGSEGV; - *unix_code = code; + case EXC_BAD_ACCESS: + /* Map GP fault to SIGSEGV, otherwise defer to caller */ + if (code == EXC_I386_GPFLT) { + return SIGSEGV; + } break; - } - return(FALSE); - case EXC_BAD_INSTRUCTION: - *unix_signal = SIGILL; - *unix_code = code; - break; + case EXC_BAD_INSTRUCTION: + return SIGILL; - case EXC_ARITHMETIC: - *unix_signal = SIGFPE; - *unix_code = code; - break; + case EXC_ARITHMETIC: + return SIGFPE; - case EXC_SOFTWARE: - if (code == EXC_I386_BOUND) { - /* - * Map #BR, the Bound Range Exceeded exception, to - * SIGTRAP. - */ - *unix_signal = SIGTRAP; - *unix_code = code; + case EXC_SOFTWARE: + if (code == EXC_I386_BOUND) { + /* + * Map #BR, the Bound Range Exceeded exception, to + * SIGTRAP. 
+ */ + return SIGTRAP; + } break; - } - - default: - return(FALSE); } - - return(TRUE); + + return 0; } diff --git a/bsd/dev/monotonic.c b/bsd/dev/monotonic.c index 19a7cf3a6..8a5d276e3 100644 --- a/bsd/dev/monotonic.c +++ b/bsd/dev/monotonic.c @@ -40,149 +40,143 @@ #include #include -static int mt_dev_open(dev_t dev, int flags, int devtype, struct proc *p); -static int mt_dev_close(dev_t dev, int flags, int devtype, struct proc *p); -static int mt_dev_ioctl(dev_t dev, unsigned long cmd, char *uptr, int fflag, - struct proc *p); +static int mt_cdev_open(dev_t dev, int flags, int devtype, proc_t p); +static int mt_cdev_close(dev_t dev, int flags, int devtype, proc_t p); +static int mt_cdev_ioctl(dev_t dev, unsigned long cmd, char *uptr, int fflag, + proc_t p); + +#define MT_NODE "monotonic" static struct cdevsw mt_cdevsw = { - .d_open = mt_dev_open, - .d_close = mt_dev_close, - .d_read = eno_rdwrt, - .d_write = eno_rdwrt, - .d_ioctl = mt_dev_ioctl, - .d_stop = eno_stop, - .d_reset = eno_reset, - .d_ttys = NULL, - .d_select = eno_select, - .d_mmap = eno_mmap, - .d_strategy = eno_strat, - .d_type = 0 + .d_open = mt_cdev_open, + .d_close = mt_cdev_close, + .d_ioctl = mt_cdev_ioctl, + + .d_read = eno_rdwrt, .d_write = eno_rdwrt, .d_stop = eno_stop, + .d_reset = eno_reset, .d_ttys = NULL, .d_select = eno_select, + .d_mmap = eno_mmap, .d_strategy = eno_strat, .d_type = 0 }; /* * Written at initialization, read-only thereafter. 
*/ lck_grp_t *mt_lock_grp = NULL; - static int mt_dev_major; -decl_lck_mtx_data(static, mt_dev_mtxs[MT_NDEVS]); -static bool mt_dev_owned[MT_NDEVS]; + +static mt_device_t +mt_get_device(dev_t devnum) +{ + return &mt_devices[minor(devnum)]; +} + +static void +mt_device_lock(mt_device_t dev) +{ + lck_mtx_lock(&dev->mtd_lock); +} static void -mt_dev_lock(dev_t dev) +mt_device_unlock(mt_device_t dev) { - lck_mtx_lock(&mt_dev_mtxs[minor(dev)]); + lck_mtx_unlock(&dev->mtd_lock); } static void -mt_dev_unlock(dev_t dev) +mt_device_assert_lock_held(__assert_only mt_device_t dev) { - lck_mtx_unlock(&mt_dev_mtxs[minor(dev)]); + LCK_MTX_ASSERT(&dev->mtd_lock, LCK_MTX_ASSERT_OWNED); } static void -mt_dev_assert_lock_held(__assert_only dev_t dev) +mt_device_assert_inuse(__assert_only mt_device_t dev) { - LCK_MTX_ASSERT(&mt_dev_mtxs[minor(dev)], LCK_MTX_ASSERT_OWNED); + assert(dev->mtd_inuse == true); } int mt_dev_init(void) { - lck_grp_attr_t *lock_grp_attr = NULL; - int devices = 0; - - lock_grp_attr = lck_grp_attr_alloc_init(); - mt_lock_grp = lck_grp_alloc_init("monotonic", lock_grp_attr); - lck_grp_attr_free(lock_grp_attr); + mt_lock_grp = lck_grp_alloc_init(MT_NODE, LCK_GRP_ATTR_NULL); + assert(mt_lock_grp != NULL); mt_dev_major = cdevsw_add(-1 /* allocate a major number */, &mt_cdevsw); if (mt_dev_major < 0) { panic("monotonic: cdevsw_add failed: %d", mt_dev_major); - __builtin_trap(); + __builtin_unreachable(); } for (int i = 0; i < MT_NDEVS; i++) { - dev_t dev; - void *dn; - int error; - - error = monotonic_devs[i].mtd_init(); - if (error) { + if (mt_devices[i].mtd_init(&mt_devices[i])) { continue; } - dev = makedev(mt_dev_major, i); - dn = devfs_make_node(dev, - DEVFS_CHAR, UID_ROOT, GID_WINDOWSERVER, 0666, - monotonic_devs[i].mtd_name); - if (dn == NULL) { + assert(mt_devices[i].mtd_ncounters > 0); + + dev_t dev = makedev(mt_dev_major, i); + char name[128]; + snprintf(name, sizeof(name), MT_NODE "/%s", mt_devices[i].mtd_name); + void *node = devfs_make_node(dev, 
DEVFS_CHAR, UID_ROOT, + GID_WINDOWSERVER, 0666, name); + if (!node) { panic("monotonic: devfs_make_node failed for '%s'", - monotonic_devs[i].mtd_name); - __builtin_trap(); + mt_devices[i].mtd_name); + __builtin_unreachable(); } - lck_mtx_init(&mt_dev_mtxs[i], mt_lock_grp, LCK_ATTR_NULL); - - devices++; + lck_mtx_init(&mt_devices[i].mtd_lock, mt_lock_grp, LCK_ATTR_NULL); } return 0; } static int -mt_dev_open(dev_t dev, __unused int flags, __unused int devtype, - __unused struct proc *p) +mt_cdev_open(dev_t devnum, __unused int flags, __unused int devtype, + __unused proc_t p) { int error = 0; - mt_dev_lock(dev); - - if (mt_dev_owned[minor(dev)]) { + mt_device_t dev = mt_get_device(devnum); + mt_device_lock(dev); + if (dev->mtd_inuse) { error = EBUSY; - goto out; + } else { + dev->mtd_inuse = true; } + mt_device_unlock(dev); - mt_dev_owned[minor(dev)] = true; - -out: - mt_dev_unlock(dev); return error; } static int -mt_dev_close(dev_t dev, __unused int flags, __unused int devtype, +mt_cdev_close(dev_t devnum, __unused int flags, __unused int devtype, __unused struct proc *p) { - mt_dev_lock(dev); - - assert(mt_dev_owned[minor(dev)]); - mt_dev_owned[minor(dev)] = false; - - monotonic_devs[minor(dev)].mtd_reset(); + mt_device_t dev = mt_get_device(devnum); - mt_dev_unlock(dev); + mt_device_lock(dev); + mt_device_assert_inuse(dev); + dev->mtd_inuse = false; + dev->mtd_reset(); + mt_device_unlock(dev); return 0; } static int -mt_ctl_add(dev_t dev, user_addr_t uptr, __unused int flags, - __unused struct proc *p) +mt_ctl_add(mt_device_t dev, user_addr_t uptr) { int error; uint32_t ctr; union monotonic_ctl_add ctl; - mt_dev_assert_lock_held(dev); + mt_device_assert_lock_held(dev); error = copyin(uptr, &ctl, sizeof(ctl.in)); if (error) { return error; } - error = monotonic_devs[minor(dev)].mtd_add(&ctl.in.config, &ctr); + error = dev->mtd_add(&ctl.in.config, &ctr); if (error) { return error; } @@ -198,14 +192,12 @@ mt_ctl_add(dev_t dev, user_addr_t uptr, __unused int flags, 
} static int -mt_ctl_counts(dev_t dev, user_addr_t uptr, __unused int flags, - __unused struct proc *p) +mt_ctl_counts(mt_device_t dev, user_addr_t uptr) { int error; - uint64_t ctrs; union monotonic_ctl_counts ctl; - mt_dev_assert_lock_held(dev); + mt_device_assert_lock_held(dev); error = copyin(uptr, &ctl, sizeof(ctl.in)); if (error) { @@ -215,11 +207,12 @@ mt_ctl_counts(dev_t dev, user_addr_t uptr, __unused int flags, if (ctl.in.ctr_mask == 0) { return EINVAL; } - ctrs = __builtin_popcountll(ctl.in.ctr_mask); { - uint64_t counts[ctrs]; - error = monotonic_devs[minor(dev)].mtd_read(ctl.in.ctr_mask, counts); + uint64_t counts[dev->mtd_nmonitors][dev->mtd_ncounters]; + memset(counts, 0, + dev->mtd_ncounters * dev->mtd_nmonitors * sizeof(counts[0][0])); + error = dev->mtd_read(ctl.in.ctr_mask, (uint64_t *)counts); if (error) { return error; } @@ -234,39 +227,40 @@ mt_ctl_counts(dev_t dev, user_addr_t uptr, __unused int flags, } static int -mt_ctl_enable(dev_t dev, user_addr_t uptr) +mt_ctl_enable(mt_device_t dev, user_addr_t uptr) { int error; union monotonic_ctl_enable ctl; - mt_dev_assert_lock_held(dev); + mt_device_assert_lock_held(dev); error = copyin(uptr, &ctl, sizeof(ctl)); if (error) { return error; } - monotonic_devs[minor(dev)].mtd_enable(ctl.in.enable); + dev->mtd_enable(ctl.in.enable); return 0; } static int -mt_ctl_reset(dev_t dev) +mt_ctl_reset(mt_device_t dev) { - mt_dev_assert_lock_held(dev); - monotonic_devs[minor(dev)].mtd_reset(); + mt_device_assert_lock_held(dev); + dev->mtd_reset(); return 0; } static int -mt_dev_ioctl(dev_t dev, unsigned long cmd, char *arg, int flags, - struct proc *p) +mt_cdev_ioctl(dev_t devnum, unsigned long cmd, char *arg, __unused int flags, + __unused proc_t p) { - int error; + int error = ENODEV; user_addr_t uptr = *(user_addr_t *)(void *)arg; - mt_dev_lock(dev); + mt_device_t dev = mt_get_device(devnum); + mt_device_lock(dev); switch (cmd) { case MT_IOC_RESET: @@ -274,7 +268,7 @@ mt_dev_ioctl(dev_t dev, unsigned long 
cmd, char *arg, int flags, break; case MT_IOC_ADD: - error = mt_ctl_add(dev, uptr, flags, p); + error = mt_ctl_add(dev, uptr); break; case MT_IOC_ENABLE: @@ -282,15 +276,26 @@ mt_dev_ioctl(dev_t dev, unsigned long cmd, char *arg, int flags, break; case MT_IOC_COUNTS: - error = mt_ctl_counts(dev, uptr, flags, p); + error = mt_ctl_counts(dev, uptr); break; + case MT_IOC_GET_INFO: { + union monotonic_ctl_info info = { + .out = { + .nmonitors = dev->mtd_nmonitors, + .ncounters = dev->mtd_ncounters, + }, + }; + error = copyout(&info, uptr, sizeof(info)); + break; + } + default: error = ENODEV; break; } - mt_dev_unlock(dev); + mt_device_unlock(dev); return error; } @@ -413,47 +418,26 @@ SYSCTL_DECL(_kern_monotonic); SYSCTL_NODE(_kern, OID_AUTO, monotonic, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "monotonic"); -SYSCTL_PROC(_kern_monotonic, OID_AUTO, supported, - CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, - (void *)MT_SUPPORTED, sizeof(int), mt_sysctl, "I", - "whether monotonic is supported"); +#define MT_SYSCTL(NAME, ARG, SIZE, SIZESTR, DESC) \ + SYSCTL_PROC(_kern_monotonic, OID_AUTO, NAME, \ + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, \ + (void *)(ARG), SIZE, mt_sysctl, SIZESTR, DESC) -SYSCTL_PROC(_kern_monotonic, OID_AUTO, debug, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED, - (void *)MT_DEBUG, sizeof(int), mt_sysctl, "I", +MT_SYSCTL(supported, MT_SUPPORTED, sizeof(int), "I", + "whether monotonic is supported"); +MT_SYSCTL(debug, MT_DEBUG, sizeof(int), "I", "whether monotonic is printing debug messages"); - -SYSCTL_PROC(_kern_monotonic, OID_AUTO, pmis, - CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, - (void *)MT_PMIS, sizeof(uint64_t), mt_sysctl, "Q", - "how many PMIs have been seen"); - -SYSCTL_PROC(_kern_monotonic, OID_AUTO, retrograde_updates, - CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, - (void *)MT_RETROGRADE, sizeof(uint64_t), mt_sysctl, "Q", - "how many times a counter appeared to go backwards"); - 
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, task_thread_counting, - CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, - (void *)MT_TASK_THREAD, sizeof(int), mt_sysctl, "I", - "task and thread counting enabled"); - -SYSCTL_PROC(_kern_monotonic, OID_AUTO, kdebug_test, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, - (void *)MT_KDBG_TEST, sizeof(int), mt_sysctl, "O", - "test that kdebug integration works"); - -SYSCTL_PROC(_kern_monotonic, OID_AUTO, fixed_cpu_perf, - CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, - (void *)MT_FIX_CPU_PERF, sizeof(uint64_t) * 2, mt_sysctl, "O", +MT_SYSCTL(pmis, MT_PMIS, sizeof(uint64_t), "Q", + "number of PMIs seen"); +MT_SYSCTL(retrograde_updates, MT_RETROGRADE, sizeof(uint64_t), "Q", + "number of times a counter appeared to go backwards"); +MT_SYSCTL(task_thread_counting, MT_TASK_THREAD, sizeof(int), "I", + "whether task and thread counting is enabled"); +MT_SYSCTL(kdebug_test, MT_KDBG_TEST, sizeof(int), "O", + "whether task and thread counting is enabled"); +MT_SYSCTL(fixed_cpu_perf, MT_FIX_CPU_PERF, sizeof(uint64_t) * 2, "O", "overhead of accessing the current CPU's counters"); - -SYSCTL_PROC(_kern_monotonic, OID_AUTO, fixed_thread_perf, - CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, - (void *)MT_FIX_THREAD_PERF, sizeof(uint64_t) * 2, mt_sysctl, "O", +MT_SYSCTL(fixed_thread_perf, MT_FIX_THREAD_PERF, sizeof(uint64_t) * 2, "O", "overhead of accessing the current thread's counters"); - -SYSCTL_PROC(_kern_monotonic, OID_AUTO, fixed_task_perf, - CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, - (void *)MT_FIX_TASK_PERF, sizeof(uint64_t) * 2, mt_sysctl, "O", +MT_SYSCTL(fixed_task_perf, MT_FIX_TASK_PERF, sizeof(uint64_t) * 2, "O", "overhead of accessing the current task's counters"); diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index 36b5db056..23e115db0 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -107,17 +107,14 @@ #include #include #include -#include +#include /* for ux_handler_setup() */ #include #include 
#include -#include /* for ux_exception_port */ - #include -#include #include /* for pseudo_inits */ #include #include @@ -130,8 +127,6 @@ #include #include #include /* for thread_resume() */ -#include /* for task_set_exception_ports() */ -#include /* for ux_handler() */ #include /* for ubc_init() */ #include /* for mcache_init() */ #include /* for mbinit() */ @@ -151,8 +146,6 @@ #include /* for gif_init() */ #include /* for vnode_pager_bootstrap() */ #include /* for devfs_kernel_mount() */ -#include /* for host_set_exception_ports() */ -#include /* for host_priv_self() */ #include /* for kmem_suballoc() */ #include /* for psem_lock_init() */ #include /* for log_setsize() */ @@ -201,6 +194,9 @@ #include #include +#if CONFIG_XNUPOST +#include +#endif void * get_user_regs(thread_t); /* XXX kludge for */ void IOKitInitializeTime(void); /* XXX */ @@ -403,6 +399,10 @@ lck_attr_t * proc_lck_attr; lck_mtx_t * proc_list_mlock; lck_mtx_t * proc_klist_mlock; +#if CONFIG_XNUPOST +lck_grp_t * sysctl_debug_test_stackshot_owner_grp; +lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx; +#endif /* !CONFIG_XNUPOST */ extern lck_mtx_t * execargs_cache_lock; @@ -500,6 +500,12 @@ bsd_init(void) #endif proc_kqhashlock_grp = lck_grp_alloc_init("proc-kqhashlock", proc_lck_grp_attr); proc_knhashlock_grp = lck_grp_alloc_init("proc-knhashlock", proc_lck_grp_attr); +#if CONFIG_XNUPOST + sysctl_debug_test_stackshot_owner_grp = lck_grp_alloc_init("test-stackshot-owner-grp", LCK_GRP_ATTR_NULL); + sysctl_debug_test_stackshot_owner_init_mtx = lck_mtx_alloc_init( + sysctl_debug_test_stackshot_owner_grp, + LCK_ATTR_NULL); +#endif /* !CONFIG_XNUPOST */ /* Allocate proc lock attribute */ proc_lck_attr = lck_attr_alloc_init(); #if 0 @@ -553,9 +559,6 @@ bsd_init(void) #endif #endif /* MAC */ - /* Initialize System Override call */ - init_system_override(); - ulock_initialize(); /* @@ -1051,6 +1054,9 @@ bsd_init(void) consider_zone_gc(FALSE); #endif + /* Initialize System Override call */ + 
init_system_override(); + bsd_init_kprintf("done\n"); } @@ -1058,21 +1064,11 @@ void bsdinit_task(void) { proc_t p = current_proc(); - struct uthread *ut; - thread_t thread; process_name("init", p); - ux_handler_init(); - - thread = current_thread(); - (void) host_set_exception_ports(host_priv_self(), - EXC_MASK_ALL & ~(EXC_MASK_RPC_ALERT),//pilotfish (shark) needs this port - (mach_port_t) ux_exception_port, - EXCEPTION_DEFAULT| MACH_EXCEPTION_CODES, - 0); - - ut = (uthread_t)get_bsdthread_info(thread); + /* Set up exception-to-signal reflection */ + ux_handler_setup(); #if CONFIG_MACF mac_cred_label_associate_user(p->p_ucred); @@ -1080,6 +1076,13 @@ bsdinit_task(void) vm_init_before_launchd(); +#if CONFIG_XNUPOST + int result = bsd_list_tests(); + result = bsd_do_post(); + if (result != 0) { + panic("bsd_do_post: Tests failed with result = 0x%08x\n", result); + } +#endif bsd_init_kprintf("bsd_do_post - done"); diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index ebca25271..3e62edb97 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2015 Apple Inc. All rights reserved. + * Copyright (c) 2008-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -486,17 +486,28 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** and return ERANGE in that case */ - size_t read_size = 0; - size_t attr_size = 0; + size_t read_size = 0; + size_t attr_size = 0; uio_t attr_uio = NULL; int err = 0; char *data = NULL; + const bool no_additional_data= ((cp != NULL) + && (cp->cmp_type != 0) + && (cp->cmp_minimal_xattr != 0)); + char uio_buf[ UIO_SIZEOF(1) ]; decmpfs_header *hdr = NULL; - char uio_buf[ UIO_SIZEOF(1) ]; + + /* + * Trace the following parameters on entry with event-id 0x03120004 + * + * @vp->v_id: vnode-id for which to fetch compressed header. + * @no_additional_data: If set true then xattr didn't have any extra data. 
+ * @returnInvalid: return the header even though the type is out of range. + */ + DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FETCH_COMPRESSED_HEADER, vp->v_id, + no_additional_data, returnInvalid); - if ((cp != NULL) && - (cp->cmp_type != 0) && - (cp->cmp_minimal_xattr != 0)) { + if (no_additional_data) { /* this file's xattr didn't have any extra data when we fetched it, so we can synthesize a header from the data in the cnode */ MALLOC(data, char *, sizeof(decmpfs_header), M_TEMP, M_WAITOK); @@ -571,6 +582,13 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header ** } else { *hdrOut = hdr; } + /* + * Trace the following parameters on return with event-id 0x03120004. + * + * @vp->v_id: vnode-id for which to fetch compressed header. + * @err: value returned from this function. + */ + DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FETCH_COMPRESSED_HEADER, vp->v_id, err); return err; } @@ -679,14 +697,15 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) */ int ret = 0; - int error = 0; - uint32_t cmp_state; - struct vnode_attr va_fetch; + int error = 0; + uint32_t cmp_state; + struct vnode_attr va_fetch; decmpfs_header *hdr = NULL; mount_t mp = NULL; - int cnode_locked = 0; + int cnode_locked = 0; int saveInvalid = 0; // save the header data even though the type was out of range uint64_t decompression_flags = 0; + bool is_mounted, is_local_fs; if (vnode_isnamedstream(vp)) { /* @@ -721,9 +740,25 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) ret = FILE_IS_NOT_COMPRESSED; goto done; } - - mp = vnode_mount(vp); - if (mp == NULL) { + + is_mounted = false; + is_local_fs = false; + mp = vnode_mount(vp); + if (mp) + is_mounted = true; + if (is_mounted) + is_local_fs = ((mp->mnt_flag & MNT_LOCAL)); + /* + * Trace the following parameters on entry with event-id 0x03120014. + * + * @vp->v_id: vnode-id of the file being queried. + * @is_mounted: set to true if @vp belongs to a mounted fs. + * @is_local_fs: set to true if @vp belongs to local fs. 
+ */ + DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id, + is_mounted, is_local_fs); + + if (!is_mounted) { /* this should only be true before we mount the root filesystem we short-cut this return to avoid the call to getattr below, which @@ -732,7 +767,8 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) ret = FILE_IS_NOT_COMPRESSED; goto done; } - if ((mp->mnt_flag & MNT_LOCAL) == 0) { + + if (!is_local_fs) { /* compression only supported on local filesystems */ ret = FILE_IS_NOT_COMPRESSED; goto done; @@ -811,17 +847,25 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) if (cnode_locked) decmpfs_unlock_compressed_data(cp, 1); if (hdr) FREE(hdr, M_TEMP); - - switch(ret) { - case FILE_IS_NOT_COMPRESSED: - return 0; - case FILE_IS_COMPRESSED: - case FILE_IS_CONVERTING: - return 1; - default: - /* unknown state, assume file is not compressed */ - ErrorLogWithPath("unknown ret %d\n", ret); - return 0; + /* + * Trace the following parameters on return with event-id 0x03120014. + * + * @vp->v_id: vnode-id of the file being queried. + * @return: set to 1 is file is compressed. + */ + switch(ret) { + case FILE_IS_NOT_COMPRESSED: + DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id, 0); + return 0; + case FILE_IS_COMPRESSED: + case FILE_IS_CONVERTING: + DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id, 1); + return 1; + default: + /* unknown state, assume file is not compressed */ + DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id, 0); + ErrorLogWithPath("unknown ret %d\n", ret); + return 0; } } @@ -1058,7 +1102,20 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h err = 0; goto out; } - + + /* + * Trace the following parameters on entry with event-id 0x03120008. + * + * @vp->v_id: vnode-id of the file being decompressed. + * @hdr->compression_type: compression type. + * @offset: offset from where to fetch uncompressed data. 
+ * @size: amount of uncompressed data to fetch. + * + * Please NOTE: @offset and @size can overflow in theory but + * here it is safe. + */ + DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FETCH_UNCOMPRESSED_DATA, vp->v_id, + hdr->compression_type, (int)offset, (int)size); lck_rw_lock_shared(decompressorsLock); decmpfs_fetch_uncompressed_data_func fetch = decmp_get_func(vp, hdr->compression_type, fetch); if (fetch) { @@ -1079,7 +1136,17 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h err = ENOTSUP; lck_rw_unlock_shared(decompressorsLock); } - + /* + * Trace the following parameters on return with event-id 0x03120008. + * + * @vp->v_id: vnode-id of the file being decompressed. + * @bytes_read: amount of uncompressed bytes fetched in bytes. + * @err: value returned from this function. + * + * Please NOTE: @bytes_read can overflow in theory but here it is safe. + */ + DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FETCH_UNCOMPRESSED_DATA, vp->v_id, + (int)*bytes_read, err); out: return err; } @@ -1512,8 +1579,15 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp) call out to the decompressor to free remove any data associated with this compressed file then delete the file's compression xattr */ - decmpfs_header *hdr = NULL; + + /* + * Trace the following parameters on entry with event-id 0x03120010. + * + * @vp->v_id: vnode-id of the file for which to free compressed data. + */ + DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FREE_COMPRESSED_DATA, vp->v_id); + int err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0); if (err) { ErrorLogWithPath("decmpfs_fetch_compressed_header err %d\n", err); @@ -1532,6 +1606,13 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp) ErrorLogWithPath("decompressor err %d\n", err); } } + /* + * Trace the following parameters on return with event-id 0x03120010. + * + * @vp->v_id: vnode-id of the file for which to free compressed data. + * @err: value returned from this function. 
+ */ + DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FREE_COMPRESSED_DATA, vp->v_id, err); /* delete the xattr */ err = vn_removexattr(vp, DECMPFS_XATTR_NAME, 0, decmpfs_ctx); @@ -1585,10 +1666,23 @@ decmpfs_decompress_file(vnode_t vp, decmpfs_cnode *cp, off_t toSize, int truncat uint32_t new_state = 0; int update_file_state = 0; int allocSize = 0; - decmpfs_header *hdr = NULL; + decmpfs_header *hdr = NULL; int cmpdata_locked = 0; off_t remaining = 0; uint64_t uncompressed_size = 0; + + /* + * Trace the following parameters on entry with event-id 0x03120000. + * + * @vp->v_id: vnode-id of the file being decompressed. + * @toSize: uncompress given bytes of the file. + * @truncate_okay: on error it is OK to truncate. + * @skiplock: compressed data is locked, skip locking again. + * + * Please NOTE: @toSize can overflow in theory but here it is safe. + */ + DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_DECOMPRESS_FILE, vp->v_id, + (int)toSize, truncate_okay, skiplock); if (!skiplock) { decmpfs_lock_compressed_data(cp, 1); cmpdata_locked = 1; @@ -1786,7 +1880,13 @@ decmpfs_decompress_file(vnode_t vp, decmpfs_cnode *cp, off_t toSize, int truncat } if (cmpdata_locked) decmpfs_unlock_compressed_data(cp, 1); - + /* + * Trace the following parameters on return with event-id 0x03120000. + * + * @vp->v_id: vnode-id of the file being decompressed. + * @err: value returned from this function. 
+ */ + DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_DECOMPRESS_FILE, vp->v_id, err); return err; } diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 5b424e719..66361ad0d 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -182,6 +182,12 @@ static void typefilter_reject_all(typefilter_t tf) memset(tf, 0, KDBG_TYPEFILTER_BITMAP_SIZE); } +static void typefilter_allow_all(typefilter_t tf) +{ + assert(tf != NULL); + memset(tf, ~0, KDBG_TYPEFILTER_BITMAP_SIZE); +} + static void typefilter_allow_class(typefilter_t tf, uint8_t class) { assert(tf != NULL); @@ -248,6 +254,8 @@ kdbg_timestamp(void) } } +static int kdbg_debug = 0; + #if KDEBUG_MOJO_TRACE #include static void kdebug_serial_print( /* forward */ @@ -303,7 +311,6 @@ static void delete_buffers(void); extern int tasks_count; extern int threads_count; -extern char *proc_best_name(proc_t p); extern void IOSleep(int); /* trace enable status */ @@ -606,7 +613,7 @@ kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled) /* * Disable wrapping and return true if trace wrapped, false otherwise. 
*/ -boolean_t +static boolean_t disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) { boolean_t wrapped; @@ -626,8 +633,8 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) return wrapped; } -void -enable_wrap(uint32_t old_slowcheck, boolean_t lostevents) +static void +enable_wrap(uint32_t old_slowcheck) { int s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); @@ -637,9 +644,6 @@ enable_wrap(uint32_t old_slowcheck, boolean_t lostevents) if ( !(old_slowcheck & SLOW_NOLOG)) kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; - if (lostevents == TRUE) - kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; - lck_spin_unlock(kds_spin_lock); ml_set_interrupts_enabled(s); } @@ -861,13 +865,20 @@ allocate_storage_unit(int cpu) if (kdsp_actual->kds_bufindx < EVENTS_PER_STORAGE_UNIT) goto out; } - + if ((kdsp = kd_ctrl_page.kds_free_list).raw != KDS_PTR_NULL) { + /* + * If there's a free page, grab it from the free list. + */ kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); kd_ctrl_page.kds_free_list = kdsp_actual->kds_next; kd_ctrl_page.kds_inuse_count++; } else { + /* + * Otherwise, we're going to lose events and repurpose the oldest + * storage unit we can find. 
+ */ if (kd_ctrl_page.kdebug_flags & KDBG_NOWRAP) { kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; kdbp->kd_lostevents = TRUE; @@ -929,7 +940,9 @@ allocate_storage_unit(int cpu) } else kdbp_vict->kd_lostevents = TRUE; - kd_ctrl_page.oldest_time = oldest_ts; + if (kd_ctrl_page.oldest_time < oldest_ts) { + kd_ctrl_page.oldest_time = oldest_ts; + } kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; } kdsp_actual->kds_timestamp = kdbg_timestamp(); @@ -939,7 +952,7 @@ allocate_storage_unit(int cpu) kdsp_actual->kds_lostevents = kdbp->kd_lostevents; kdbp->kd_lostevents = FALSE; - kdsp_actual->kds_bufindx = 0; + kdsp_actual->kds_bufindx = 0; if (kdbp->kd_list_head.raw == KDS_PTR_NULL) kdbp->kd_list_head = kdsp; @@ -1130,24 +1143,72 @@ kernel_debug_enter( } } +/* + * Check if the given debug ID is allowed to be traced on the current process. + * + * Returns true if allowed and false otherwise. + */ +static inline bool +kdebug_debugid_procfilt_allowed(uint32_t debugid) +{ + uint32_t procfilt_flags = kd_ctrl_page.kdebug_flags & + (KDBG_PIDCHECK | KDBG_PIDEXCLUDE); + + if (!procfilt_flags) { + return true; + } + + /* + * DBG_TRACE and MACH_SCHED tracepoints ignore the process filter. + */ + if ((debugid & 0xffff0000) == MACHDBG_CODE(DBG_MACH_SCHED, 0) || + (debugid >> 24 == DBG_TRACE)) { + return true; + } + + struct proc *curproc = current_proc(); + /* + * If the process is missing (early in boot), allow it. + */ + if (!curproc) { + return true; + } + + if (procfilt_flags & KDBG_PIDCHECK) { + /* + * Allow only processes marked with the kdebug bit. + */ + return curproc->p_kdebug; + } else if (procfilt_flags & KDBG_PIDEXCLUDE) { + /* + * Exclude any process marked with the kdebug bit. 
+ */ + return !curproc->p_kdebug; + } else { + panic("kdebug: invalid procfilt flags %x", kd_ctrl_page.kdebug_flags); + __builtin_unreachable(); + } +} + static void kernel_debug_internal( - boolean_t only_filter, - uint32_t debugid, + uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, - uintptr_t arg5) + uintptr_t arg5, + uint64_t flags) { - struct proc *curproc; - uint64_t now; - uint32_t bindx; - kd_buf *kd; - int cpu; + uint64_t now; + uint32_t bindx; + kd_buf *kd; + int cpu; struct kd_bufinfo *kdbp; struct kd_storage *kdsp_actual; - union kds_ptr kds_raw; + union kds_ptr kds_raw; + bool only_filter = flags & KDBG_FLAG_FILTERED; + bool observe_procfilt = !(flags & KDBG_FLAG_NOPROCFILT); if (kd_ctrl_page.kdebug_slowcheck) { if ((kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) || @@ -1156,29 +1217,9 @@ kernel_debug_internal( goto out1; } - if ( !ml_at_interrupt_context()) { - if (kd_ctrl_page.kdebug_flags & KDBG_PIDCHECK) { - /* - * If kdebug flag is not set for current proc, return - */ - curproc = current_proc(); - - if ((curproc && !(curproc->p_kdebug)) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE)) && - (debugid >> 24 != DBG_TRACE)) - goto out1; - } - else if (kd_ctrl_page.kdebug_flags & KDBG_PIDEXCLUDE) { - /* - * If kdebug flag is set for current proc, return - */ - curproc = current_proc(); - - if ((curproc && curproc->p_kdebug) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE)) && - (debugid >> 24 != DBG_TRACE)) - goto out1; - } + if (!ml_at_interrupt_context() && observe_procfilt && + !kdebug_debugid_procfilt_allowed(debugid)) { + goto out1; } if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) { @@ -1186,14 +1227,14 @@ kernel_debug_internal( goto record_event; goto out1; - } else if (only_filter == TRUE) { + } else if (only_filter) { goto out1; } else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { /* Always record trace system info */ if 
(KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) goto record_event; - + if (debugid < kdlog_beg || debugid > kdlog_end) goto out1; } @@ -1201,14 +1242,14 @@ kernel_debug_internal( /* Always record trace system info */ if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) goto record_event; - + if ((debugid & KDBG_EVENTID_MASK) != kdlog_value1 && (debugid & KDBG_EVENTID_MASK) != kdlog_value2 && (debugid & KDBG_EVENTID_MASK) != kdlog_value3 && (debugid & KDBG_EVENTID_MASK) != kdlog_value4) goto out1; } - } else if (only_filter == TRUE) { + } else if (only_filter) { goto out1; } @@ -1237,7 +1278,7 @@ kernel_debug_internal( } else { kdsp_actual = NULL; bindx = EVENTS_PER_STORAGE_UNIT; - } + } if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) { if (allocate_storage_unit(cpu) == FALSE) { @@ -1249,6 +1290,7 @@ kernel_debug_internal( } goto retry_q; } + now = kdbg_timestamp() & KDBG_TIMESTAMP_MASK; if ( !OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) @@ -1296,8 +1338,8 @@ kernel_debug( uintptr_t arg4, __unused uintptr_t arg5) { - kernel_debug_internal(FALSE, debugid, arg1, arg2, arg3, arg4, - (uintptr_t)thread_tid(current_thread())); + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, + (uintptr_t)thread_tid(current_thread()), 0); } void @@ -1309,19 +1351,31 @@ kernel_debug1( uintptr_t arg4, uintptr_t arg5) { - kernel_debug_internal(FALSE, debugid, arg1, arg2, arg3, arg4, arg5); + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0); +} + +void +kernel_debug_flags( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uint64_t flags) +{ + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, + (uintptr_t)thread_tid(current_thread()), flags); } void kernel_debug_filtered( - uint32_t debugid, + uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) { - kernel_debug_internal(TRUE, debugid, arg1, arg2, arg3, arg4, - (uintptr_t)thread_tid(current_thread())); + 
kernel_debug_flags(debugid, arg1, arg2, arg3, arg4, KDBG_FLAG_FILTERED); } void @@ -1358,10 +1412,10 @@ kernel_debug_string_simple(uint32_t eventid, const char *str) debugid |= DBG_FUNC_END; } - kernel_debug_internal(FALSE, debugid, str_buf[0], - str_buf[1], - str_buf[2], - str_buf[3], thread_id); + kernel_debug_internal(debugid, str_buf[0], + str_buf[1], + str_buf[2], + str_buf[3], thread_id, 0); debugid &= KDBG_EVENTID_MASK; int i = 4; @@ -1372,10 +1426,10 @@ kernel_debug_string_simple(uint32_t eventid, const char *str) if ((written + (4 * sizeof(uintptr_t))) >= len) { debugid |= DBG_FUNC_END; } - kernel_debug_internal(FALSE, debugid, str_buf[i], - str_buf[i + 1], - str_buf[i + 2], - str_buf[i + 3], thread_id); + kernel_debug_internal(debugid, str_buf[i], + str_buf[i + 1], + str_buf[i + 2], + str_buf[i + 3], thread_id, 0); } } @@ -1545,6 +1599,7 @@ kdebug_typefilter(__unused struct proc* p, TYPEFILTER_ALLOC_SIZE, // initial size 0, // mask (alignment?) VM_FLAGS_ANYWHERE, // flags + VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE, kdbg_typefilter_memory_entry, // port (memory entry!) 
0, // offset (in memory entry) @@ -1601,12 +1656,9 @@ int kdebug_trace64(__unused struct proc *p, struct kdebug_trace64_args *uap, __u return err; } - kernel_debug_internal(FALSE, uap->code, - (uintptr_t)uap->arg1, - (uintptr_t)uap->arg2, - (uintptr_t)uap->arg3, - (uintptr_t)uap->arg4, - (uintptr_t)thread_tid(current_thread())); + kernel_debug_internal(uap->code, (uintptr_t)uap->arg1, + (uintptr_t)uap->arg2, (uintptr_t)uap->arg3, (uintptr_t)uap->arg4, + (uintptr_t)thread_tid(current_thread()), 0); return(0); } @@ -1651,9 +1703,8 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, /* if the ID is being invalidated, just emit that */ if (str_id != 0 && str_len == 0) { - kernel_debug_internal(FALSE, trace_debugid | DBG_FUNC_START | DBG_FUNC_END, - (uintptr_t)debugid, (uintptr_t)str_id, 0, 0, - thread_id); + kernel_debug_internal(trace_debugid | DBG_FUNC_START | DBG_FUNC_END, + (uintptr_t)debugid, (uintptr_t)str_id, 0, 0, thread_id, 0); return str_id; } @@ -1669,9 +1720,8 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, trace_debugid |= DBG_FUNC_END; } - kernel_debug_internal(FALSE, trace_debugid, (uintptr_t)debugid, - (uintptr_t)str_id, str[0], - str[1], thread_id); + kernel_debug_internal(trace_debugid, (uintptr_t)debugid, (uintptr_t)str_id, + str[0], str[1], thread_id, 0); trace_debugid &= KDBG_EVENTID_MASK; i = 2; @@ -1681,10 +1731,10 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr, if ((written + (4 * sizeof(uintptr_t))) >= str_len) { trace_debugid |= DBG_FUNC_END; } - kernel_debug_internal(FALSE, trace_debugid, str[i], - str[i + 1], - str[i + 2], - str[i + 3], thread_id); + kernel_debug_internal(trace_debugid, str[i], + str[i + 1], + str[i + 2], + str[i + 3], thread_id, 0); } return str_id; @@ -2276,8 +2326,11 @@ kdebug_reset(void) void kdebug_free_early_buf(void) { - /* Must be done with the buffer, so release it back to the VM. 
*/ +#if !CONFIG_EMBEDDED + /* Must be done with the buffer, so release it back to the VM. + * On embedded targets this buffer is freed when the BOOTDATA segment is freed. */ ml_static_mfree((vm_offset_t)&kd_early_buffer, sizeof(kd_early_buffer)); +#endif } int @@ -2468,6 +2521,7 @@ kdbg_enable_typefilter(void) static void kdbg_disable_typefilter(void) { + bool notify_iops = kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK; kd_ctrl_page.kdebug_flags &= ~KDBG_TYPEFILTER_CHECK; if ((kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE))) { @@ -2476,6 +2530,17 @@ kdbg_disable_typefilter(void) kdbg_set_flags(SLOW_CHECKS, 0, FALSE); } commpage_update_kdebug_state(); + + if (notify_iops) { + /* + * Notify IOPs that the typefilter will now allow everything. + * Otherwise, they won't know a typefilter is no longer in + * effect. + */ + typefilter_allow_all(kdbg_typefilter); + kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops, + KD_CALLBACK_TYPEFILTER_CHANGED, kdbg_typefilter); + } } uint32_t @@ -3587,7 +3652,6 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin uint32_t tempbuf_number; uint32_t old_kdebug_flags; uint32_t old_kdebug_slowcheck; - boolean_t lostevents = FALSE; boolean_t out_of_events = FALSE; boolean_t wrapped = FALSE; @@ -3641,14 +3705,11 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin } /* - * If the buffers have wrapped, capture the earliest time where there - * are events for all CPUs and do not emit additional lost events for + * If the buffers have wrapped, do not emit additional lost events for the * oldest storage units. 
*/ if (wrapped) { - barrier_min = kd_ctrl_page.oldest_time; kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; - kd_ctrl_page.oldest_time = 0; for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page.kdebug_cpus; cpu++, kdbp++) { if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { @@ -3658,13 +3719,23 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin kdsp_actual->kds_lostevents = FALSE; } } + /* + * Capture the earliest time where there are events for all CPUs and don't + * emit events with timestamps prior. + */ + barrier_min = kd_ctrl_page.oldest_time; while (count) { tempbuf = kdcopybuf; tempbuf_number = 0; if (wrapped) { - /* Trace a single lost events event for wrapping. */ + /* + * Emit a lost events tracepoint to indicate that previous events + * were lost -- the thread map cannot be trusted. A new one must + * be taken so tools can analyze the trace in a backwards-facing + * fashion. + */ kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0); *tempbuf = lostevent; wrapped = FALSE; @@ -3673,94 +3744,138 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin /* While space left in merged events scratch buffer. */ while (tempbuf_count) { + bool lostevents = false; + int lostcpu = 0; earliest_time = UINT64_MAX; min_kdbp = NULL; min_cpu = 0; - /* Check each CPU's buffers. */ + /* Check each CPU's buffers for the earliest event. */ for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page.kdebug_cpus; cpu++, kdbp++) { - /* Skip CPUs without data. */ + /* Skip CPUs without data in their oldest storage unit. */ if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { next_cpu: continue; } - /* Debugging aid: maintain a copy of the "kdsp" - * index. - */ - volatile union kds_ptr kdsp_shadow; - - kdsp_shadow = kdsp; - /* From CPU data to buffer header to buffer. 
*/ kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - volatile struct kd_storage *kdsp_actual_shadow; - - kdsp_actual_shadow = kdsp_actual; - - /* Skip buffer if there are no events left. */ +next_event: + /* The next event to be read from this buffer. */ rcursor = kdsp_actual->kds_readlast; + /* Skip this buffer if there are no events left. */ if (rcursor == kdsp_actual->kds_bufindx) { continue; } - t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); - - /* Ignore events that have aged out due to wrapping. */ - while (t < barrier_min) { - rcursor = ++kdsp_actual->kds_readlast; - - if (rcursor >= EVENTS_PER_STORAGE_UNIT) { - release_storage_unit(cpu, kdsp.raw); + /* + * Check that this storage unit wasn't stolen and events were + * lost. This must have happened while wrapping was disabled + * in this function. + */ + if (kdsp_actual->kds_lostevents) { + lostevents = true; + kdsp_actual->kds_lostevents = FALSE; - if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { - goto next_cpu; - } - kdsp_shadow = kdsp; - kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - kdsp_actual_shadow = kdsp_actual; - rcursor = kdsp_actual->kds_readlast; + /* + * The earliest event we can trust is the first one in this + * stolen storage unit. + */ + uint64_t lost_time = + kdbg_get_timestamp(&kdsp_actual->kds_records[0]); + if (kd_ctrl_page.oldest_time < lost_time) { + /* + * If this is the first time we've seen lost events for + * this gap, record its timestamp as the oldest + * timestamp we're willing to merge for the lost events + * tracepoint. 
+ */ + kd_ctrl_page.oldest_time = barrier_min = lost_time; + lostcpu = cpu; } - - t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); } + t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); + if ((t > barrier_max) && (barrier_max > 0)) { + if (kdbg_debug) { + printf("kdebug: FUTURE EVENT: debugid %#8x: " + "time %lld from CPU %u " + "(barrier at time %lld, read %lu events)\n", + kdsp_actual->kds_records[rcursor].debugid, + t, cpu, barrier_max, *number + tempbuf_number); + } /* - * Need to flush IOPs again before we - * can sort any more data from the - * buffers. + * Need to flush IOPs again before we can sort any more + * data from the buffers. */ out_of_events = TRUE; break; } if (t < kdsp_actual->kds_timestamp) { /* - * indicates we've not yet completed filling - * in this event... - * this should only occur when we're looking - * at the buf that the record head is utilizing - * we'll pick these events up on the next - * call to kdbg_read - * we bail at this point so that we don't - * get an out-of-order timestream by continuing - * to read events from the other CPUs' timestream(s) + * This indicates the event emitter hasn't completed + * filling in the event (because we're looking at the + * buffer that the record head is using). The max barrier + * timestamp should have saved us from seeing these kinds + * of things, but other CPUs might be slow on the up-take. + * + * Bail out so we don't get out-of-order events by + * continuing to read events from other CPUs' events. */ out_of_events = TRUE; break; } + + /* + * Ignore events that have aged out due to wrapping or storage + * unit exhaustion while merging events.
+ */ + if (t < barrier_min) { + kdsp_actual->kds_readlast++; + + if (kdsp_actual->kds_readlast >= EVENTS_PER_STORAGE_UNIT) { + release_storage_unit(cpu, kdsp.raw); + + if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) { + goto next_cpu; + } + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + } + + goto next_event; + } + + /* + * Don't worry about merging any events -- just walk through + * the CPUs and find the latest timestamp of lost events. + */ + if (lostevents) { + continue; + } + if (t < earliest_time) { earliest_time = t; min_kdbp = kdbp; min_cpu = cpu; } } - if (min_kdbp == NULL || out_of_events == TRUE) { + if (lostevents) { /* - * all buffers ran empty + * If any lost events were hit in the buffers, emit an event + * with the latest timestamp. */ + kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, lostcpu); + *tempbuf = lostevent; + tempbuf->arg1 = 1; + goto nextevent; + } + if (min_kdbp == NULL) { + /* All buffers ran empty. */ out_of_events = TRUE; + } + if (out_of_events) { break; } @@ -3774,11 +3889,12 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin release_storage_unit(min_cpu, kdsp.raw); /* - * Watch for out of order timestamps + * Watch for out of order timestamps (from IOPs). */ if (earliest_time < min_kdbp->kd_prev_timebase) { /* * If we haven't already, emit a retrograde events event. + * Otherwise, ignore this event. */ if (traced_retrograde) { continue; @@ -3803,6 +3919,14 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin break; } if (tempbuf_number) { + /* + * Remember the latest timestamp of events that we've merged so we + * don't think we've lost events later. 
+ */ + uint64_t latest_time = kdbg_get_timestamp(tempbuf - 1); + if (kd_ctrl_page.oldest_time < latest_time) { + kd_ctrl_page.oldest_time = latest_time; + } if (file_version == RAW_VERSION3) { if ( !(kdbg_write_v3_event_chunk_header(buffer, V3_RAW_EVENTS, (tempbuf_number * sizeof(kd_buf)), vp, ctx))) { error = EFAULT; @@ -3820,7 +3944,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin error = kdbg_write_to_vnode((caddr_t)kdcopybuf, write_size, vp, ctx, RAW_file_offset); if (!error) RAW_file_offset += write_size; - + if (RAW_file_written >= RAW_FLUSH_SIZE) { error = VNOP_FSYNC(vp, MNT_NOWAIT, ctx); @@ -3849,7 +3973,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin tempbuf_count = KDCOPYBUF_COUNT; } if ( !(old_kdebug_flags & KDBG_NOWRAP)) { - enable_wrap(old_kdebug_slowcheck, lostevents); + enable_wrap(old_kdebug_slowcheck); } thread_clear_eager_preempt(current_thread()); return (error); @@ -3883,6 +4007,12 @@ kdbg_test(size_t flavor) KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3); code++; KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++; + KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code)); code++; + KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1); code++; + KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2); code++; + KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2, 3); code++; + KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++; + KDBG_DEBUG(KDEBUG_TEST_CODE(code)); code++; KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1); code++; KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2); code++; @@ -3906,6 +4036,7 @@ kdbg_test(size_t flavor) (uintptr_t)thread_tid(current_thread())); code++; break; + default: return ENOTSUP; } @@ -4181,6 +4312,10 @@ SYSCTL_PROC(_kern_kdbg, OID_AUTO, experimental_continuous, sizeof(int), kdbg_sysctl_continuous, "I", "Set kdebug to use mach_continuous_time"); +SYSCTL_INT(_kern_kdbg, OID_AUTO, debug, + CTLFLAG_RW | CTLFLAG_LOCKED, + &kdbg_debug, 0, "Set 
kdebug debug mode"); + SYSCTL_QUAD(_kern_kdbg, OID_AUTO, oldest_time, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, &kd_ctrl_page.oldest_time, diff --git a/bsd/kern/kern_aio.c b/bsd/kern/kern_aio.c index 08f8d135f..b0a82bb82 100644 --- a/bsd/kern/kern_aio.c +++ b/bsd/kern/kern_aio.c @@ -1506,6 +1506,7 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval ) OSIncrementAtomic(&lio_contexts_alloced); #endif /* DEBUG */ + free_context = TRUE; bzero(lio_context, sizeof(aio_lio_context)); aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent); @@ -1527,6 +1528,7 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval ) } /* process list of aio requests */ + free_context = FALSE; lio_context->io_issued = uap->nent; lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */ for ( i = 0; i < uap->nent; i++ ) { @@ -1645,7 +1647,7 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval ) FREE( entryp_listp, M_TEMP ); if ( aiocbpp != NULL ) FREE( aiocbpp, M_TEMP ); - if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) { + if (free_context) { free_lio_context(lio_context); } diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c index d6bf9cf91..574cc24c8 100644 --- a/bsd/kern/kern_authorization.c +++ b/bsd/kern/kern_authorization.c @@ -512,6 +512,10 @@ kauth_authorize_process_callback(kauth_cred_t credential, __unused void *idata, * arg0 is pointer to vnode (vnode *) for file to be closed. * arg1 is pointer to path (char *) of file to be closed. * arg2 is close flags. + * arguments passed to KAUTH_FILEOP_WILL_RENAME listeners + * arg0 is pointer to vnode (vnode *) of the file being renamed + * arg1 is pointer to the "from" path (char *) + * arg2 is pointer to the "to" path (char *) * arguments passed to KAUTH_FILEOP_RENAME listeners * arg0 is pointer to "from" path (char *). * arg1 is pointer to "to" path (char *). 
@@ -550,7 +554,10 @@ kauth_authorize_fileop(kauth_cred_t credential, kauth_action_t action, uintptr_t return(0); } - if (action == KAUTH_FILEOP_OPEN || action == KAUTH_FILEOP_CLOSE || action == KAUTH_FILEOP_EXEC) { + if (action == KAUTH_FILEOP_OPEN || + action == KAUTH_FILEOP_CLOSE || + action == KAUTH_FILEOP_EXEC || + action == KAUTH_FILEOP_WILL_RENAME) { /* get path to the given vnode as a convenience to our listeners. */ namep = get_pathbuff(); @@ -559,8 +566,15 @@ kauth_authorize_fileop(kauth_cred_t credential, kauth_action_t action, uintptr_t release_pathbuff(namep); return(0); } - if (action == KAUTH_FILEOP_CLOSE) { - arg2 = arg1; /* close has some flags that come in via arg1 */ + if (action == KAUTH_FILEOP_CLOSE || + action == KAUTH_FILEOP_WILL_RENAME) { + /* + * - Close has some flags that come in via arg1. + * - Will-rename wants to pass the vnode and + * both paths to the listeners ("to" path + * starts in arg1, moves to arg2). + */ + arg2 = arg1; } arg1 = (uintptr_t)namep; } @@ -948,7 +962,6 @@ kauth_acl_inherit(vnode_t dvp, kauth_acl_t initial, kauth_acl_t *product, int is int kauth_copyinfilesec(user_addr_t xsecurity, kauth_filesec_t *xsecdestpp) { - user_addr_t uaddr, known_bound; int error; kauth_filesec_t fsec; u_int32_t count; @@ -965,10 +978,18 @@ kauth_copyinfilesec(user_addr_t xsecurity, kauth_filesec_t *xsecdestpp) * * The upper bound must be less than KAUTH_ACL_MAX_ENTRIES. The * value here is fairly arbitrary. It's ok to have a zero count. + * + * Because we're just using these values to make a guess about the + * number of entries, the actual address doesn't matter, only their + * relative offsets into the page. We take advantage of this to + * avoid an overflow in the rounding step (this is a user-provided + * parameter, so caution pays off). 
*/ - known_bound = xsecurity + KAUTH_FILESEC_SIZE(0); - uaddr = mach_vm_round_page(known_bound); - count = (uaddr - known_bound) / sizeof(struct kauth_ace); + { + user_addr_t known_bound = (xsecurity & PAGE_MASK) + KAUTH_FILESEC_SIZE(0); + user_addr_t uaddr = mach_vm_round_page(known_bound); + count = (uaddr - known_bound) / sizeof(struct kauth_ace); + } if (count > 32) count = 32; restart: diff --git a/bsd/kern/kern_backtrace.c b/bsd/kern/kern_backtrace.c index 9b175b009..925994950 100644 --- a/bsd/kern/kern_backtrace.c +++ b/bsd/kern/kern_backtrace.c @@ -68,7 +68,7 @@ backtrace_sysctl SYSCTL_HANDLER_ARGS if (!bt) { return ENOBUFS; } - + bzero(bt, sizeof(uintptr_t) * bt_len); err = backtrace_user(bt, bt_len, &bt_filled, &user_64); if (err) { goto out; diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index 73a9a454b..07acd675c 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -134,7 +134,7 @@ process_cpu_type(proc_t core_proc) { cpu_type_t what_we_think; #if defined (__i386__) || defined (__x86_64__) - if (IS_64BIT_PROCESS(core_proc)) { + if (IS_64BIT_PROCESS(core_proc)) { what_we_think = CPU_TYPE_X86_64; } else { what_we_think = CPU_TYPE_I386; @@ -146,6 +146,7 @@ process_cpu_type(proc_t core_proc) what_we_think = CPU_TYPE_ARM; } #endif + return what_we_think; } @@ -154,13 +155,13 @@ process_cpu_subtype(proc_t core_proc) { cpu_type_t what_we_think; #if defined (__i386__) || defined (__x86_64__) - if (IS_64BIT_PROCESS(core_proc)) { + if (IS_64BIT_PROCESS(core_proc)) { what_we_think = CPU_SUBTYPE_X86_64_ALL; } else { what_we_think = CPU_SUBTYPE_I386_ALL; } #elif defined (__arm__) || defined(__arm64__) - if (IS_64BIT_PROCESS(core_proc)) { + if (IS_64BIT_PROCESS(core_proc)) { what_we_think = CPU_SUBTYPE_ARM64_ALL; } else { what_we_think = CPU_SUBTYPE_ARM_ALL; diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index c433affea..141807dc8 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -789,7 +789,7 @@ 
kauth_resolver_getwork_continue(int result) thread = current_thread(); ut = get_bsdthread_info(thread); - message = ut->uu_kevent.uu_kauth.message; + message = ut->uu_save.uus_kauth.message; return(kauth_resolver_getwork2(message)); } @@ -916,7 +916,7 @@ kauth_resolver_getwork(user_addr_t message) thread_t thread = current_thread(); struct uthread *ut = get_bsdthread_info(thread); - ut->uu_kevent.uu_kauth.message = message; + ut->uu_save.uus_kauth.message = message; error = msleep0(&kauth_resolver_unsubmitted, kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue); KAUTH_RESOLVER_UNLOCK(); /* diff --git a/bsd/kern/kern_cs.c b/bsd/kern/kern_cs.c index 92d1c43be..2b40cea3e 100644 --- a/bsd/kern/kern_cs.c +++ b/bsd/kern/kern_cs.c @@ -79,19 +79,42 @@ unsigned long cs_procs_invalidated = 0; int cs_force_kill = 0; int cs_force_hard = 0; int cs_debug = 0; +// If set, AMFI will error out early on unsigned code, before evaluating the normal policy. +int cs_debug_fail_on_unsigned_code = 0; +// If the previous mode is enabled, we count the resulting failures here. +unsigned int cs_debug_unsigned_exec_failures = 0; +unsigned int cs_debug_unsigned_mmap_failures = 0; + #if SECURE_KERNEL -const int cs_enforcement_enable = 1; +/* +Here we split cs_enforcement_enable into cs_system_enforcement_enable and cs_process_enforcement_enable + +cs_system_enforcement_enable governs whether or not system level code signing enforcement mechanisms +are applied on the system. Today, the only such mechanism is code signing enforcement of the dyld shared +cache. + +cs_process_enforcement_enable governs whether code signing enforcement mechanisms are applied to all +processes or only those that opt into such enforcement. + +(On iOS and related, both of these are set by default. On macOS, only cs_system_enforcement_enable +is set by default. Processes can then be opted into code signing enforcement on a case by case basis.)
+ */ +const int cs_system_enforcement_enable = 1; +const int cs_process_enforcement_enable = 1; const int cs_library_val_enable = 1; #else /* !SECURE_KERNEL */ int cs_enforcement_panic=0; int cs_relax_platform_task_ports = 0; #if CONFIG_ENFORCE_SIGNED_CODE -#define DEFAULT_CS_ENFORCEMENT_ENABLE 1 +#define DEFAULT_CS_SYSTEM_ENFORCEMENT_ENABLE 1 +#define DEFAULT_CS_PROCESS_ENFORCEMENT_ENABLE 1 #else -#define DEFAULT_CS_ENFORCEMENT_ENABLE 0 +#define DEFAULT_CS_SYSTEM_ENFORCEMENT_ENABLE 1 +#define DEFAULT_CS_PROCESS_ENFORCEMENT_ENABLE 0 #endif -SECURITY_READ_ONLY_LATE(int) cs_enforcement_enable = DEFAULT_CS_ENFORCEMENT_ENABLE; +SECURITY_READ_ONLY_LATE(int) cs_system_enforcement_enable = DEFAULT_CS_SYSTEM_ENFORCEMENT_ENABLE; +SECURITY_READ_ONLY_LATE(int) cs_process_enforcement_enable = DEFAULT_CS_PROCESS_ENFORCEMENT_ENABLE; #if CONFIG_ENFORCE_LIBRARY_VALIDATION #define DEFAULT_CS_LIBRARY_VA_ENABLE 1 @@ -108,15 +131,22 @@ static lck_grp_t *cs_lockgrp; SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_kill, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_hard, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_debug, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_debug_fail_on_unsigned_code, CTLFLAG_RW | CTLFLAG_LOCKED, + &cs_debug_fail_on_unsigned_code, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, cs_debug_unsigned_exec_failures, CTLFLAG_RD | CTLFLAG_LOCKED, + &cs_debug_unsigned_exec_failures, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, cs_debug_unsigned_mmap_failures, CTLFLAG_RD | CTLFLAG_LOCKED, + &cs_debug_unsigned_mmap_failures, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_all_vnodes, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_all_vnodes, 0, ""); #if !SECURE_KERNEL -SYSCTL_INT(_vm, OID_AUTO, cs_enforcement, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_enforcement_enable, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_system_enforcement, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_system_enforcement_enable, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, 
cs_process_enforcement, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_process_enforcement_enable, 0, ""); SYSCTL_INT(_vm, OID_AUTO, cs_enforcement_panic, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_enforcement_panic, 0, ""); #if !CONFIG_ENFORCE_LIBRARY_VALIDATION -SYSCTL_INT(_vm, OID_AUTO, cs_library_validation, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_library_val_enable, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_library_validation, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_library_val_enable, 0, ""); #endif #endif /* !SECURE_KERNEL */ @@ -125,17 +155,20 @@ int panic_on_cs_killed = 0; void cs_init(void) { -#if MACH_ASSERT && __x86_64__ +#if MACH_ASSERT +#if PLATFORM_WatchOS || __x86_64__ panic_on_cs_killed = 1; -#endif /* MACH_ASSERT && __x86_64__ */ +#endif /* watchos || x86_64 */ +#endif /* MACH_ASSERT */ PE_parse_boot_argn("panic_on_cs_killed", &panic_on_cs_killed, sizeof (panic_on_cs_killed)); #if !SECURE_KERNEL int disable_cs_enforcement = 0; PE_parse_boot_argn("cs_enforcement_disable", &disable_cs_enforcement, sizeof (disable_cs_enforcement)); - if (disable_cs_enforcement) { - cs_enforcement_enable = 0; + if (disable_cs_enforcement && PE_i_can_has_debugger(NULL) != 0) { + cs_system_enforcement_enable = 0; + cs_process_enforcement_enable = 0; } else { int panic = 0; PE_parse_boot_argn("cs_enforcement_panic", &panic, sizeof(panic)); @@ -165,7 +198,7 @@ cs_allow_invalid(struct proc *p) #if MACH_ASSERT lck_mtx_assert(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED); #endif -#if CONFIG_MACF && CONFIG_ENFORCE_SIGNED_CODE +#if CONFIG_MACF /* There needs to be a MAC policy to implement this hook, or else the * kill bits will be cleared here every time. 
If we have * CONFIG_ENFORCE_SIGNED_CODE, we can assume there is a policy @@ -262,10 +295,10 @@ cs_invalid_page(addr64_t vaddr, boolean_t *cs_killed) */ int -cs_enforcement(struct proc *p) +cs_process_enforcement(struct proc *p) { - if (cs_enforcement_enable) + if (cs_process_enforcement_enable) return 1; if (p == NULL) @@ -277,6 +310,18 @@ cs_enforcement(struct proc *p) return 0; } +int +cs_process_global_enforcement(void) +{ + return cs_process_enforcement_enable ? 1 : 0; +} + +int +cs_system_enforcement(void) +{ + return cs_system_enforcement_enable ? 1 : 0; +} + /* * Returns whether a given process is still valid. */ @@ -312,6 +357,18 @@ cs_require_lv(struct proc *p) return 0; } +int +csproc_forced_lv(struct proc* p) +{ + if (p == NULL) { + p = current_proc(); + } + if (p != NULL && (p->p_csflags & CS_FORCED_LV)) { + return 1; + } + return 0; +} + /* * added to allow system level library * validation check at mac_cred_label_update_execve time @@ -610,6 +667,56 @@ csproc_clear_platform_binary(struct proc *p) } #endif +void +csproc_disable_enforcement(struct proc* __unused p) +{ +#if !CONFIG_ENFORCE_SIGNED_CODE + if (p != NULL) { + proc_lock(p); + p->p_csflags &= (~CS_ENFORCEMENT); + proc_unlock(p); + } +#endif +} + +/* Function: csproc_mark_invalid_allowed + * + * Description: Mark the process as being allowed to go invalid. Called as part of + * task_for_pid and ptrace policy. Note CS_INVALID_ALLOWED only matters for + * processes that have been opted into CS_ENFORCEMENT. + */ +void +csproc_mark_invalid_allowed(struct proc* __unused p) +{ +#if !CONFIG_ENFORCE_SIGNED_CODE + if (p != NULL) { + proc_lock(p); + p->p_csflags |= CS_INVALID_ALLOWED; + proc_unlock(p); + } +#endif +} + +/* + * Function: csproc_check_invalid_allowed + * + * Description: Returns 1 if the process has been marked as allowed to go invalid + * because it gave its task port to an allowed process. 
+ */ +int +csproc_check_invalid_allowed(struct proc* __unused p) +{ +#if !CONFIG_ENFORCE_SIGNED_CODE + if (p == NULL) { + p = current_proc(); + } + + if (p != NULL && (p->p_csflags & CS_INVALID_ALLOWED)) + return 1; +#endif + return 0; +} + /* * Function: csproc_get_prod_signed * @@ -908,6 +1015,12 @@ cs_restricted(struct proc *p) return (p->p_csflags & CS_RESTRICT) ? 1 : 0; } +int +csproc_hardened_runtime(struct proc* p) +{ + return (p->p_csflags & CS_RUNTIME) ? 1 : 0; +} + /* * Function: csfg_get_path * diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index 48904239e..efc8616f7 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -776,7 +776,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) char *pop; struct vnode *vp = NULLVP; /* for AUDIT_ARG() at end */ int i, tmp, error, error2, flg = 0; - struct flock fl; + struct flock fl = {}; struct flocktimeout fltimeout; struct timespec *timeout = NULL; struct vfs_context context; @@ -1139,10 +1139,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) case F_GETLK: case F_OFD_GETLK: -#if CONFIG_EMBEDDED case F_GETLKPID: case F_OFD_GETLKPID: -#endif if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; @@ -1553,7 +1551,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) case F_LOG2PHYS: case F_LOG2PHYS_EXT: { - struct log2phys l2p_struct; /* structure for allocate command */ + struct log2phys l2p_struct = {}; /* structure for allocate command */ int devBlockSize; off_t file_offset = 0; @@ -1865,12 +1863,16 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) if (uap->cmd == F_ADDFILESIGS_FOR_DYLD_SIM) { error = ubc_cs_blob_revalidate(vp, blob, NULL, blob_add_flags); if (error) { - vnode_put(vp); - goto outdrop; + blob = NULL; + if (error != EAGAIN) { + vnode_put(vp); + goto outdrop; + } } } + } - } else { + if (blob == NULL) { /* * An arbitrary limit, to prevent someone from 
mapping in a 20GB blob. This should cover * our use cases for the immediate future, but note that at the time of this commit, some @@ -2086,7 +2088,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) .len = CP_MAX_WRAPPEDKEYSIZE, }; - MALLOC(k.key, char *, k.len, M_TEMP, M_WAITOK); + MALLOC(k.key, char *, k.len, M_TEMP, M_WAITOK | M_ZERO); error = VNOP_IOCTL(vp, F_TRANSCODEKEY, (caddr_t)&k, 1, &context); @@ -2168,7 +2170,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) /* For now, special case HFS+ only, since this is SPI. */ src_vp = (struct vnode *)fp->f_data; if (src_vp->v_tag != VT_HFS) { - error = EINVAL; + error = ENOTSUP; goto out; } @@ -2188,7 +2190,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) dst_vp = (struct vnode *)fp2->f_data; if (dst_vp->v_tag != VT_HFS) { fp_drop(p, fd2, fp2, 1); - error = EINVAL; + error = ENOTSUP; goto out; } @@ -2886,7 +2888,7 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) } if (fd < fdp->fd_knlistsize) - knote_fdclose(p, fd, FALSE); + knote_fdclose(p, fd); if (fp->f_flags & FP_WAITEVENT) (void)waitevent_close(p, fp); @@ -4694,9 +4696,8 @@ fdexec(proc_t p, short flags, int self_exec) * If the current thread is bound as a workq/workloop * servicing thread, we need to unbind it first. */ - if (ut->uu_kqueue_bound && self_exec) { - kevent_qos_internal_unbind(p, 0, self, - ut->uu_kqueue_flags); + if (ut->uu_kqr_bound && self_exec) { + kqueue_threadreq_unbind(p, ut->uu_kqr_bound); } proc_fdlock(p); @@ -5048,6 +5049,12 @@ fdfree(proc_t p) assert(fdp->fd_knlistsize == -1); assert(fdp->fd_knhashmask == 0); + /* + * dealloc all workloops that have outstanding retains + * when created with scheduling parameters. 
+ */ + kqworkloops_dealloc(p); + /* close file descriptors */ if (fdp->fd_nfiles > 0 && fdp->fd_ofiles) { for (i = fdp->fd_lastfile; i >= 0; i--) { diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index f07aa6d17..d8096ba03 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -55,7 +55,7 @@ * @(#)kern_event.c 1.0 (3/31/2000) */ #include -#include +#include #include #include @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -102,6 +103,9 @@ #include #include +#include +#include +#include #include #include @@ -119,25 +123,10 @@ extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code)) -/* - * JMM - this typedef needs to be unified with pthread_priority_t - * and mach_msg_priority_t. It also needs to be the same type - * everywhere. - */ -typedef int32_t qos_t; - MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); #define KQ_EVENT NO_EVENT64 -#define KNUSE_NONE 0x0 -#define KNUSE_STEAL_DROP 0x1 -#define KNUSE_BOOST 0x2 -static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags); -static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn); -static int kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags); -static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags); - static int kqueue_read(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx); static int kqueue_write(struct fileproc *fp, struct uio *uio, @@ -166,7 +155,7 @@ static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, s static int kevent_internal(struct proc *p, kqueue_id_t id, kqueue_id_t *id_out, user_addr_t changelist, int nchanges, - user_addr_t eventlist, int nevents, + user_addr_t eventlist, int nevents, user_addr_t data_out, uint64_t data_available, unsigned int flags, user_addr_t utimeout, kqueue_continue_t continuation, @@ -177,39 +166,34 @@ static int 
kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p, unsigned int flags); char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n); +static int kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev); +static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread, + struct knote_lock_ctx *knlc, thread_continue_t cont, + struct _kevent_register *cont_args) __dead2; +static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2; +static void kevent_register_wait_cleanup(struct knote *kn); +static inline void kqueue_release_last(struct proc *p, kqueue_t kqu); static void kqueue_interrupt(struct kqueue *kq); static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp, - void *data); + void *data); static void kevent_continue(struct kqueue *kq, void *data, int error); static void kqueue_scan_continue(void *contp, wait_result_t wait_result); static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data, - struct filt_process_s *process_data, int *countp, struct proc *p); -static struct kqtailq *kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index); -static struct kqtailq *kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index); + struct filt_process_s *process_data, int *countp); static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index); -static struct kqtailq *kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index); +static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn); +static void kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos, int flags); -static void kqworkq_request_thread(struct kqworkq *kqwq, kq_index_t qos_index); -static void kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index); -static void kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t 
override_index); -static void kqworkq_bind_thread_impl(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags); -static void kqworkq_unbind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags); +static void kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, kq_index_t qos); +static void kqworkq_unbind(proc_t p, struct kqrequest *kqr); +static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, struct kqrequest *kqr, thread_t thread); static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index); -enum { - KQWL_UO_NONE = 0, - KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI = 0x1, - KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI = 0x2, - KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS = 0x4, - KQWL_UO_UPDATE_OVERRIDE_LAZY = 0x8 -}; - -static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t qos_index, kq_index_t override_index, uint32_t flags); -static void kqworkloop_bind_thread_impl(struct kqworkloop *kqwl, thread_t thread, unsigned int flags); -static void kqworkloop_unbind_thread(struct kqworkloop *kqwl, thread_t thread, unsigned int flags); -static inline kq_index_t kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *); -static void kqworkloop_update_suppress_sync_count(struct kqrequest *kqr, uint32_t flags); +static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index); +static void kqworkloop_unbind(proc_t p, struct kqworkloop *kwql); +static thread_qos_t kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread); +static kq_index_t kqworkloop_owner_override(struct kqworkloop *kqwl); enum { KQWL_UTQ_NONE, /* @@ -223,6 +207,8 @@ enum { KQWL_UTQ_UPDATE_WAKEUP_QOS, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, + KQWL_UTQ_UNBINDING, /* attempt to rebind */ + KQWL_UTQ_PARKING, /* * The wakeup override is for suppressed knotes that have fired again at * a higher QoS than the one for which they are suppressed already. 
@@ -231,35 +217,26 @@ enum { KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, /* - * The async QoS is the maximum QoS of an event enqueued on this workloop in + * The QoS is the maximum QoS of an event enqueued on this workloop in * userland. It is copied from the only EVFILT_WORKLOOP knote with * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no * such knote, this QoS is 0. */ - KQWL_UTQ_SET_ASYNC_QOS, - /* - * The sync waiters QoS is the maximum QoS of any thread blocked on an - * EVFILT_WORKLOOP knote marked with the NOTE_WL_SYNC_WAIT bit. - * If there is no such knote, this QoS is 0. - */ - KQWL_UTQ_SET_SYNC_WAITERS_QOS, + KQWL_UTQ_SET_QOS_INDEX, KQWL_UTQ_REDRIVE_EVENTS, }; static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos); static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index); +static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags); static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data, - struct filt_process_s *process_data, struct proc *p); -#if 0 -static void knote_put(struct knote *kn); -#endif + struct filt_process_s *process_data); static int kq_add_knote(struct kqueue *kq, struct knote *kn, - struct kevent_internal_s *kev, struct proc *p, int *knoteuse_flags); + struct knote_lock_ctx *knlc, struct proc *p); static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p); -static void kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, kn_status_t *kn_status, uint16_t *kq_state); -static void knote_drop(struct knote *kn, struct proc *p); +static void knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc); static struct knote *knote_alloc(void); static void knote_free(struct knote *kn); @@ -276,109 +253,56 @@ static void knote_suppress(struct knote *kn); static void 
knote_unsuppress(struct knote *kn); static void knote_wakeup(struct knote *kn); -static kq_index_t knote_get_queue_index(struct knote *kn); -static struct kqtailq *knote_get_queue(struct knote *kn); -static kq_index_t knote_get_req_index(struct knote *kn); -static kq_index_t knote_get_qos_index(struct knote *kn); -static void knote_set_qos_index(struct knote *kn, kq_index_t qos_index); +static bool knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn, + int result, thread_qos_t *qos_out); +static void knote_apply_qos_override(struct knote *kn, kq_index_t qos_index); +static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result); +static void knote_reset_priority(struct knote *kn, pthread_priority_t pp); static kq_index_t knote_get_qos_override_index(struct knote *kn); -static kq_index_t knote_get_sync_qos_override_index(struct knote *kn); -static void knote_set_qos_override_index(struct knote *kn, kq_index_t qos_index, boolean_t override_is_sync); static void knote_set_qos_overcommit(struct knote *kn); -static int filt_fileattach(struct knote *kn, struct kevent_internal_s *kev); -SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = { - .f_isfd = 1, - .f_attach = filt_fileattach, -}; +static zone_t knote_zone; +static zone_t kqfile_zone; +static zone_t kqworkq_zone; +static zone_t kqworkloop_zone; +#if DEVELOPMENT || DEBUG +#define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK (1U << 0) +#define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS (1U << 1) +#define KEVENT_PANIC_BOOT_ARG_INITIALIZED (1U << 31) -static void filt_kqdetach(struct knote *kn); -static int filt_kqueue(struct knote *kn, long hint); -static int filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = { - .f_isfd = 1, - .f_detach = filt_kqdetach, - .f_event = filt_kqueue, - .f_touch = 
filt_kqtouch, - .f_process = filt_kqprocess, -}; +#define KEVENT_PANIC_DEFAULT_VALUE (0) +static uint32_t +kevent_debug_flags(void) +{ + static uint32_t flags = KEVENT_PANIC_DEFAULT_VALUE; + + if ((flags & KEVENT_PANIC_BOOT_ARG_INITIALIZED) == 0) { + uint32_t value = 0; + if (!PE_parse_boot_argn("kevent_debug", &value, sizeof(value))) { + value = KEVENT_PANIC_DEFAULT_VALUE; + } + value |= KEVENT_PANIC_BOOT_ARG_INITIALIZED; + os_atomic_store(&flags, value, relaxed); + } + return flags; +} +#endif + +#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) /* placeholder for not-yet-implemented filters */ static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev); +static int filt_badevent(struct knote *kn, long hint); SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = { .f_attach = filt_badattach, }; -static int filt_procattach(struct knote *kn, struct kevent_internal_s *kev); -static void filt_procdetach(struct knote *kn); -static int filt_proc(struct knote *kn, long hint); -static int filt_proctouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = { - .f_attach = filt_procattach, - .f_detach = filt_procdetach, - .f_event = filt_proc, - .f_touch = filt_proctouch, - .f_process = filt_procprocess, -}; - #if CONFIG_MEMORYSTATUS extern const struct filterops memorystatus_filtops; #endif /* CONFIG_MEMORYSTATUS */ - extern const struct filterops fs_filtops; - extern const struct filterops sig_filtops; - -static zone_t knote_zone; -static zone_t kqfile_zone; -static zone_t kqworkq_zone; -static zone_t kqworkloop_zone; - -#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) - -/* Mach portset filter */ extern const struct filterops machport_filtops; - -/* User filter */ -static int filt_userattach(struct knote *kn, struct kevent_internal_s *kev); -static void 
filt_userdetach(struct knote *kn); -static int filt_user(struct knote *kn, long hint); -static int filt_usertouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_userprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = { - .f_attach = filt_userattach, - .f_detach = filt_userdetach, - .f_event = filt_user, - .f_touch = filt_usertouch, - .f_process = filt_userprocess, -}; - -static lck_spin_t _filt_userlock; -static void filt_userlock(void); -static void filt_userunlock(void); - -/* Workloop filter */ -static bool filt_wlneeds_boost(struct kevent_internal_s *kev); -static int filt_wlattach(struct knote *kn, struct kevent_internal_s *kev); -static int filt_wlpost_attach(struct knote *kn, struct kevent_internal_s *kev); -static void filt_wldetach(struct knote *kn); -static int filt_wlevent(struct knote *kn, long hint); -static int filt_wltouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_wldrop_and_unlock(struct knote *kn, struct kevent_internal_s *kev); -static int filt_wlprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = { - .f_needs_boost = filt_wlneeds_boost, - .f_attach = filt_wlattach, - .f_post_attach = filt_wlpost_attach, - .f_detach = filt_wldetach, - .f_event = filt_wlevent, - .f_touch = filt_wltouch, - .f_drop_and_unlock = filt_wldrop_and_unlock, - .f_process = filt_wlprocess, -}; - extern const struct filterops pipe_rfiltops; extern const struct filterops pipe_wfiltops; extern const struct filterops ptsd_kqops; @@ -394,7 +318,12 @@ extern const struct filterops fsevent_filtops; extern const struct filterops vnode_filtops; extern const struct filterops tty_filtops; +const static struct filterops file_filtops; +const static struct filterops kqread_filtops; +const static struct filterops proc_filtops; const static 
struct filterops timer_filtops; +const static struct filterops user_filtops; +const static struct filterops workloop_filtops; /* * @@ -403,170 +332,93 @@ const static struct filterops timer_filtops; * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value) * in the exported section of the header * - Update the EVFILT_SYSCOUNT value to reflect the new addition - * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end + * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end * of the Public Filters section in the array. * Private filters: * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value) * in the XNU_KERNEL_PRIVATE section of the header * - Update the EVFILTID_MAX value to reflect the new addition - * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of - * the Private filters section of the array. + * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of + * the Private filters section of the array. 
*/ SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = { /* Public Filters */ - [~EVFILT_READ] = &file_filtops, - [~EVFILT_WRITE] = &file_filtops, - [~EVFILT_AIO] = &bad_filtops, - [~EVFILT_VNODE] = &file_filtops, - [~EVFILT_PROC] = &proc_filtops, - [~EVFILT_SIGNAL] = &sig_filtops, - [~EVFILT_TIMER] = &timer_filtops, - [~EVFILT_MACHPORT] = &machport_filtops, - [~EVFILT_FS] = &fs_filtops, - [~EVFILT_USER] = &user_filtops, - &bad_filtops, - &bad_filtops, - [~EVFILT_SOCK] = &file_filtops, + [~EVFILT_READ] = &file_filtops, + [~EVFILT_WRITE] = &file_filtops, + [~EVFILT_AIO] = &bad_filtops, + [~EVFILT_VNODE] = &file_filtops, + [~EVFILT_PROC] = &proc_filtops, + [~EVFILT_SIGNAL] = &sig_filtops, + [~EVFILT_TIMER] = &timer_filtops, + [~EVFILT_MACHPORT] = &machport_filtops, + [~EVFILT_FS] = &fs_filtops, + [~EVFILT_USER] = &user_filtops, + &bad_filtops, + [~EVFILT_VM] = &bad_filtops, + [~EVFILT_SOCK] = &file_filtops, #if CONFIG_MEMORYSTATUS - [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops, + [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops, #else - [~EVFILT_MEMORYSTATUS] = &bad_filtops, + [~EVFILT_MEMORYSTATUS] = &bad_filtops, #endif - [~EVFILT_EXCEPT] = &file_filtops, - + [~EVFILT_EXCEPT] = &file_filtops, [~EVFILT_WORKLOOP] = &workloop_filtops, /* Private filters */ - [EVFILTID_KQREAD] = &kqread_filtops, - [EVFILTID_PIPE_R] = &pipe_rfiltops, - [EVFILTID_PIPE_W] = &pipe_wfiltops, - [EVFILTID_PTSD] = &ptsd_kqops, - [EVFILTID_SOREAD] = &soread_filtops, - [EVFILTID_SOWRITE] = &sowrite_filtops, - [EVFILTID_SCK] = &sock_filtops, - [EVFILTID_SOEXCEPT] = &soexcept_filtops, - [EVFILTID_SPEC] = &spec_filtops, - [EVFILTID_BPFREAD] = &bpfread_filtops, - [EVFILTID_NECP_FD] = &necp_fd_rfiltops, - [EVFILTID_FSEVENT] = &fsevent_filtops, - [EVFILTID_VN] = &vnode_filtops, - [EVFILTID_TTY] = &tty_filtops, - [EVFILTID_PTMX] = &ptmx_kqops, + [EVFILTID_KQREAD] = &kqread_filtops, + [EVFILTID_PIPE_R] = &pipe_rfiltops, + [EVFILTID_PIPE_W] = &pipe_wfiltops, + [EVFILTID_PTSD] 
= &ptsd_kqops, + [EVFILTID_SOREAD] = &soread_filtops, + [EVFILTID_SOWRITE] = &sowrite_filtops, + [EVFILTID_SCK] = &sock_filtops, + [EVFILTID_SOEXCEPT] = &soexcept_filtops, + [EVFILTID_SPEC] = &spec_filtops, + [EVFILTID_BPFREAD] = &bpfread_filtops, + [EVFILTID_NECP_FD] = &necp_fd_rfiltops, + [EVFILTID_FSEVENT] = &fsevent_filtops, + [EVFILTID_VN] = &vnode_filtops, + [EVFILTID_TTY] = &tty_filtops, + [EVFILTID_PTMX] = &ptmx_kqops, }; /* waitq prepost callback */ void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos); -#ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG -#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */ -#endif -#ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG -#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */ -#endif -#ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK -#define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */ -#endif -#ifndef _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 -#define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 8 -#endif - -static inline __kdebug_only -uintptr_t -kqr_thread_id(struct kqrequest *kqr) -{ - return (uintptr_t)thread_tid(kqr->kqr_thread); -} - -static inline -boolean_t is_workqueue_thread(thread_t thread) -{ - return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE); -} - -static inline -void knote_canonicalize_kevent_qos(struct knote *kn) -{ - struct kqueue *kq = knote_get_kq(kn); - unsigned long canonical; - - if ((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) - return; - - /* preserve manager and overcommit flags in this case */ - canonical = pthread_priority_canonicalize(kn->kn_qos, FALSE); - kn->kn_qos = (qos_t)canonical; -} - -static inline -kq_index_t qos_index_from_qos(struct knote *kn, qos_t qos, boolean_t propagation) +static inline struct kqworkloop * +kqr_kqworkloop(struct kqrequest *kqr) { - struct kqueue *kq = knote_get_kq(kn); - kq_index_t qos_index; - unsigned long flags = 0; - - if ((kq->kq_state & (KQ_WORKQ | 
KQ_WORKLOOP)) == 0) - return QOS_INDEX_KQFILE; - - qos_index = (kq_index_t)thread_qos_from_pthread_priority( - (unsigned long)qos, &flags); - - if (kq->kq_state & KQ_WORKQ) { - /* workq kqueues support requesting a manager thread (non-propagation) */ - if (!propagation && (flags & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) - return KQWQ_QOS_MANAGER; + if (kqr->kqr_state & KQR_WORKLOOP) { + return __container_of(kqr, struct kqworkloop, kqwl_request); } - - return qos_index; -} - -static inline -qos_t qos_from_qos_index(kq_index_t qos_index) -{ - /* should only happen for KQ_WORKQ */ - if (qos_index == KQWQ_QOS_MANAGER) - return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; - - if (qos_index == 0) - return THREAD_QOS_UNSPECIFIED; - - /* Should have support from pthread kext support */ - return (1 << (qos_index - 1 + - _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32)); + return NULL; } -/* kqr lock must be held */ -static inline -unsigned long pthread_priority_for_kqrequest( - struct kqrequest *kqr, - kq_index_t qos_index) +static inline kqueue_t +kqr_kqueue(proc_t p, struct kqrequest *kqr) { - unsigned long priority = qos_from_qos_index(qos_index); - if (kqr->kqr_state & KQR_THOVERCOMMIT) { - priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG; + kqueue_t kqu; + if (kqr->kqr_state & KQR_WORKLOOP) { + kqu.kqwl = kqr_kqworkloop(kqr); + } else { + kqu.kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue; + assert(kqr >= kqu.kqwq->kqwq_request && + kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS); } - return priority; + return kqu; } -static inline -kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags) +static inline boolean_t +is_workqueue_thread(thread_t thread) { -#pragma unused(thread) - kq_index_t qos_index; - - if (flags & KEVENT_FLAG_WORKQ_MANAGER) - return KQWQ_QOS_MANAGER; - - qos_index = (kq_index_t)qos_class; - assert(qos_index > 0 && qos_index < KQWQ_QOS_MANAGER); - - return qos_index; + return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE); } /* * kqueue/note lock 
implementations * * The kqueue lock guards the kq state, the state of its queues, - * and the kqueue-aware status and use counts of individual knotes. + * and the kqueue-aware status and locks of individual knotes. * * The kqueue workq lock is used to protect state guarding the * interaction of the kqueue with the workq. This state cannot @@ -580,26 +432,47 @@ kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags) * by calling the filter to get a [consistent] snapshot of that * data. */ -lck_grp_attr_t * kq_lck_grp_attr; -lck_grp_t * kq_lck_grp; -lck_attr_t * kq_lck_attr; +static lck_grp_attr_t *kq_lck_grp_attr; +static lck_grp_t *kq_lck_grp; +static lck_attr_t *kq_lck_attr; + +static inline void +kqlock(kqueue_t kqu) +{ + lck_spin_lock(&kqu.kq->kq_lock); +} + +static inline void +kqlock_held(__assert_only kqueue_t kqu) +{ + LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED); +} + +static inline void +kqunlock(kqueue_t kqu) +{ + lck_spin_unlock(&kqu.kq->kq_lock); +} static inline void -kqlock(struct kqueue *kq) +kq_req_lock(kqueue_t kqu) { - lck_spin_lock(&kq->kq_lock); + assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)); + lck_spin_lock(&kqu.kq->kq_reqlock); } static inline void -kqlock_held(__assert_only struct kqueue *kq) +kq_req_unlock(kqueue_t kqu) { - LCK_SPIN_ASSERT(&kq->kq_lock, LCK_ASSERT_OWNED); + assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)); + lck_spin_unlock(&kqu.kq->kq_reqlock); } static inline void -kqunlock(struct kqueue *kq) +kq_req_held(__assert_only kqueue_t kqu) { - lck_spin_unlock(&kq->kq_lock); + assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)); + LCK_SPIN_ASSERT(&kqu.kq->kq_reqlock, LCK_ASSERT_OWNED); } static inline void @@ -614,243 +487,313 @@ knhash_unlock(proc_t p) lck_mtx_unlock(&p->p_fd->fd_knhashlock); } +#pragma mark knote locks /* - * Convert a kq lock to a knote use referece. + * Enum used by the knote_lock_* functions. 
* - If the knote is being dropped, or has - * vanished, we can't get a use reference. - * Just return with it still locked. + * KNOTE_KQ_LOCK_ALWAYS + * The function will always return with the kq lock held. * - * - kq locked at entry - * - unlock on exit if we get the use reference + * KNOTE_KQ_LOCK_ON_SUCCESS + * The function will return with the kq lock held if it was successful + * (knote_lock() is the only function that can fail). + * + * KNOTE_KQ_LOCK_ON_FAILURE + * The function will return with the kq lock held if it was unsuccessful + * (knote_lock() is the only function that can fail). + * + * KNOTE_KQ_UNLOCK: + * The function returns with the kq unlocked. */ -static int -kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags) +#define KNOTE_KQ_LOCK_ALWAYS 0x0 +#define KNOTE_KQ_LOCK_ON_SUCCESS 0x1 +#define KNOTE_KQ_LOCK_ON_FAILURE 0x2 +#define KNOTE_KQ_UNLOCK 0x3 + +#if DEBUG || DEVELOPMENT +__attribute__((noinline, not_tail_called, disable_tail_calls)) +void knote_lock_ctx_chk(struct knote_lock_ctx *knlc) { - if (kn->kn_status & (KN_DROPPING | KN_VANISHED)) - return (0); + /* evil hackery to make sure no one forgets to unlock */ + assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED); +} +#endif - assert(kn->kn_status & KN_ATTACHED); - kn->kn_inuse++; - if (flags & KNUSE_BOOST) { - set_thread_rwlock_boost(); + +static struct knote_lock_ctx * +knote_lock_ctx_find(struct kqueue *kq, struct knote *kn) +{ + struct knote_lock_ctx *ctx; + LIST_FOREACH(ctx, &kq->kq_knlocks, knlc_le) { + if (ctx->knlc_knote == kn) return ctx; } - kqunlock(kq); - return (1); + panic("knote lock context not found: %p", kn); + __builtin_trap(); } -/* - * - kq locked at entry - * - kq unlocked at exit - */ -__disable_tail_calls -static wait_result_t -knoteusewait(struct kqueue *kq, struct knote *kn) -{ - kn->kn_status |= KN_USEWAIT; - waitq_assert_wait64((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_status), - THREAD_UNINT, TIMEOUT_WAIT_FOREVER); +/* slowpath of
knote_lock() */ +__attribute__((noinline)) +static bool __result_use_check +knote_lock_slow(struct kqueue *kq, struct knote *kn, + struct knote_lock_ctx *knlc, int kqlocking) +{ + kqlock_held(kq); + + struct knote_lock_ctx *owner_lc = knote_lock_ctx_find(kq, kn); + thread_t owner_thread = owner_lc->knlc_thread; + +#if DEBUG || DEVELOPMENT + knlc->knlc_state = KNOTE_LOCK_CTX_WAITING; +#endif + + thread_reference(owner_thread); + TAILQ_INSERT_TAIL(&owner_lc->knlc_head, knlc, knlc_tqe); + assert_wait(&kn->kn_status, THREAD_UNINT | THREAD_WAIT_NOREPORT); kqunlock(kq); - return thread_block(THREAD_CONTINUE_NULL); -} -static bool -knoteuse_needs_boost(struct knote *kn, struct kevent_internal_s *kev) -{ - if (knote_fops(kn)->f_needs_boost) { - return knote_fops(kn)->f_needs_boost(kev); + if (thread_handoff_deallocate(owner_thread) == THREAD_RESTART) { + if (kqlocking == KNOTE_KQ_LOCK_ALWAYS || + kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) { + kqlock(kq); + } +#if DEBUG || DEVELOPMENT + assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING); + knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED; +#endif + return false; } - return false; +#if DEBUG || DEVELOPMENT + assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED); +#endif + if (kqlocking == KNOTE_KQ_LOCK_ALWAYS || + kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) { + kqlock(kq); + } + return true; } /* - * Convert from a knote use reference back to kq lock. - * - * Drop a use reference and wake any waiters if - * this is the last one. + * Attempts to take the "knote" lock. * - * If someone is trying to drop the knote, but the - * caller has events they must deliver, take - * responsibility for the drop later - and wake the - * other attempted dropper in a manner that informs - * him of the transfer of responsibility. + * Called with the kqueue lock held. * - * The exit return indicates if the knote is still alive - * (or if not, the other dropper has been given the green - * light to drop it). - * - * The kqueue lock is re-taken unconditionally. 
+ * Returns true if the knote lock is acquired, false if it has been dropped */ -static int -knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags) +static bool __result_use_check +knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, + int kqlocking) { - int dropped = 0; - int steal_drop = (flags & KNUSE_STEAL_DROP); + kqlock_held(kq); - kqlock(kq); - if (flags & KNUSE_BOOST) { - clear_thread_rwlock_boost(); +#if DEBUG || DEVELOPMENT + assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED); +#endif + knlc->knlc_knote = kn; + knlc->knlc_thread = current_thread(); + TAILQ_INIT(&knlc->knlc_head); + + if (__improbable(kn->kn_status & KN_LOCKED)) { + return knote_lock_slow(kq, kn, knlc, kqlocking); } - if (--kn->kn_inuse == 0) { + /* + * When the knote will be dropped, the knote lock is taken before + * KN_DROPPING is set, and then the knote will be removed from any + * hash table that references it before the lock is canceled. + */ + assert((kn->kn_status & KN_DROPPING) == 0); + LIST_INSERT_HEAD(&kq->kq_knlocks, knlc, knlc_le); + kn->kn_status |= KN_LOCKED; +#if DEBUG || DEVELOPMENT + knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED; +#endif - if ((kn->kn_status & KN_ATTACHING) != 0) { - kn->kn_status &= ~KN_ATTACHING; - } + if (kqlocking == KNOTE_KQ_UNLOCK || + kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) { + kqunlock(kq); + } + return true; +} - if ((kn->kn_status & KN_USEWAIT) != 0) { - wait_result_t result; +/* + * Unlocks a knote successfully locked with knote_lock(). + * + * Called with the kqueue lock held. 
+ * + * Returns with the kqueue lock held according to KNOTE_KQ_* flags + */ +static void +knote_unlock(struct kqueue *kq, struct knote *kn, + struct knote_lock_ctx *knlc, int flags) +{ + kqlock_held(kq); - /* If we need to, try and steal the drop */ - if (kn->kn_status & KN_DROPPING) { - if (steal_drop && !(kn->kn_status & KN_STOLENDROP)) { - kn->kn_status |= KN_STOLENDROP; - } else { - dropped = 1; - } - } + assert(knlc->knlc_knote == kn); + assert(kn->kn_status & KN_LOCKED); +#if DEBUG || DEVELOPMENT + assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED); +#endif - /* wakeup indicating if ANY USE stole the drop */ - result = (kn->kn_status & KN_STOLENDROP) ? - THREAD_RESTART : THREAD_AWAKENED; + struct knote_lock_ctx *next_owner_lc = TAILQ_FIRST(&knlc->knlc_head); - kn->kn_status &= ~KN_USEWAIT; - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_status), - result, - WAITQ_ALL_PRIORITIES); - } else { - /* should have seen use-wait if dropping with use refs */ - assert((kn->kn_status & (KN_DROPPING|KN_STOLENDROP)) == 0); - } + LIST_REMOVE(knlc, knlc_le); - } else if (kn->kn_status & KN_DROPPING) { - /* not the last ref but want to steal a drop if present */ - if (steal_drop && ((kn->kn_status & KN_STOLENDROP) == 0)) { - kn->kn_status |= KN_STOLENDROP; + if (next_owner_lc) { + assert(next_owner_lc->knlc_knote == kn); + TAILQ_REMOVE(&knlc->knlc_head, next_owner_lc, knlc_tqe); - /* but we now have to wait to be the last ref */ - knoteusewait(kq, kn); - kqlock(kq); - } else { - dropped = 1; - } + assert(TAILQ_EMPTY(&next_owner_lc->knlc_head)); + TAILQ_CONCAT(&next_owner_lc->knlc_head, &knlc->knlc_head, knlc_tqe); + LIST_INSERT_HEAD(&kq->kq_knlocks, next_owner_lc, knlc_le); +#if DEBUG || DEVELOPMENT + next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED; +#endif + } else { + kn->kn_status &= ~KN_LOCKED; } - - return (!dropped); + if (kn->kn_inuse == 0) { + /* + * No f_event() in flight anymore, we can leave QoS "Merge" mode + * + * See 
knote_should_apply_qos_override() + */ + kn->kn_status &= ~KN_MERGE_QOS; + } + if (flags & KNOTE_KQ_UNLOCK) { + kqunlock(kq); + } + if (next_owner_lc) { + thread_wakeup_thread(&kn->kn_status, next_owner_lc->knlc_thread); + } +#if DEBUG || DEVELOPMENT + knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED; +#endif } /* - * Convert a kq lock to a knote use reference - * (for the purpose of detaching AND vanishing it). + * Aborts all waiters for a knote lock, and unlock the knote. * - * If the knote is being dropped, we can't get - * a detach reference, so wait for the knote to - * finish dropping before returning. + * Called with the kqueue lock held. * - * If the knote is being used for other purposes, - * we cannot detach it until those uses are done - * as well. Again, just wait for them to finish - * (caller will start over at lookup). - * - * - kq locked at entry - * - unlocked on exit + * Returns with the kqueue lock held according to KNOTE_KQ_* flags */ -static int -kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags) +static void +knote_unlock_cancel(struct kqueue *kq, struct knote *kn, + struct knote_lock_ctx *knlc, int kqlocking) { - if ((kn->kn_status & KN_DROPPING) || kn->kn_inuse) { - /* have to wait for dropper or current uses to go away */ - knoteusewait(kq, kn); - return (0); + kqlock_held(kq); + + assert(knlc->knlc_knote == kn); + assert(kn->kn_status & KN_LOCKED); + assert(kn->kn_status & KN_DROPPING); + + LIST_REMOVE(knlc, knlc_le); + kn->kn_status &= ~KN_LOCKED; + + if (kqlocking == KNOTE_KQ_UNLOCK || + kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) { + kqunlock(kq); } - assert((kn->kn_status & KN_VANISHED) == 0); - assert(kn->kn_status & KN_ATTACHED); - kn->kn_status &= ~KN_ATTACHED; - kn->kn_status |= KN_VANISHED; - if (flags & KNUSE_BOOST) { - clear_thread_rwlock_boost(); + if (!TAILQ_EMPTY(&knlc->knlc_head)) { + thread_wakeup_with_result(&kn->kn_status, THREAD_RESTART); } - kn->kn_inuse++; - kqunlock(kq); - return (1); +#if DEBUG || DEVELOPMENT + 
knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED; +#endif } /* - * Convert a kq lock to a knote drop reference. + * Call the f_event hook of a given filter. * - * If the knote is in use, wait for the use count - * to subside. We first mark our intention to drop - * it - keeping other users from "piling on." - * If we are too late, we have to wait for the - * other drop to complete. - * - * - kq locked at entry - * - always unlocked on exit. - * - caller can't hold any locks that would prevent - * the other dropper from completing. + * Takes a use count to protect against concurrent drops. */ -static int -kqlock2knotedrop(struct kqueue *kq, struct knote *kn) +static void +knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint) { - int oktodrop; - wait_result_t result; + int result, dropping = 0; - oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0); - /* if another thread is attaching, they will become the dropping thread */ - kn->kn_status |= KN_DROPPING; - knote_unsuppress(kn); - knote_dequeue(kn); - if (oktodrop) { - if (kn->kn_inuse == 0) { - kqunlock(kq); - return (oktodrop); + kqlock_held(kq); + + if (kn->kn_status & (KN_DROPPING | KN_VANISHED)) + return; + + kn->kn_inuse++; + kqunlock(kq); + result = filter_call(knote_fops(kn), f_event(kn, hint)); + kqlock(kq); + + dropping = (kn->kn_status & KN_DROPPING); + + if (!dropping && (result & FILTER_ACTIVE)) { + if (result & FILTER_ADJUST_EVENT_QOS_BIT) + knote_adjust_qos(kq, kn, result); + knote_activate(kn); + } + + if (--kn->kn_inuse == 0) { + if ((kn->kn_status & KN_LOCKED) == 0) { + /* + * We're the last f_event() call and there's no other f_* call in + * flight, we can leave QoS "Merge" mode. 
+ * + * See knote_should_apply_qos_override() + */ + kn->kn_status &= ~KN_MERGE_QOS; + } + if (dropping) { + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(&kn->kn_inuse), + THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); } } - result = knoteusewait(kq, kn); - /* THREAD_RESTART == another thread stole the knote drop */ - return (result == THREAD_AWAKENED); } -#if 0 /* - * Release a knote use count reference. + * Called by knote_drop() to wait for the last f_event() caller to be done. + * + * - kq locked at entry + * - kq unlocked at exit */ static void -knote_put(struct knote *kn) +knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn) { - struct kqueue *kq = knote_get_kq(kn); + wait_result_t wr = THREAD_NOT_WAITING; - kqlock(kq); - if (--kn->kn_inuse == 0) { - if ((kn->kn_status & KN_USEWAIT) != 0) { - kn->kn_status &= ~KN_USEWAIT; - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_status), - THREAD_AWAKENED, - WAITQ_ALL_PRIORITIES); - } + kqlock_held(kq); + + assert(kn->kn_status & KN_DROPPING); + + if (kn->kn_inuse) { + wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(&kn->kn_inuse), + THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER); } kqunlock(kq); + if (wr == THREAD_WAITING) { + thread_block(THREAD_CONTINUE_NULL); + } } -#endif + +#pragma mark file_filtops static int filt_fileattach(struct knote *kn, struct kevent_internal_s *kev) { - return (fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current())); + return fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current()); } -#define f_flag f_fglob->fg_flag -#define f_msgcount f_fglob->fg_msgcount -#define f_cred f_fglob->fg_cred -#define f_ops f_fglob->fg_ops -#define f_offset f_fglob->fg_offset -#define f_data f_fglob->fg_data - -static void +SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = { + .f_isfd = 1, + .f_attach = filt_fileattach, +}; + +#pragma mark kqread_filtops + +#define f_flag f_fglob->fg_flag +#define f_ops 
f_fglob->fg_ops +#define f_data f_fglob->fg_data + +static void filt_kqdetach(struct knote *kn) { struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data; @@ -861,15 +804,12 @@ filt_kqdetach(struct knote *kn) kqunlock(kq); } -/*ARGSUSED*/ static int filt_kqueue(struct knote *kn, __unused long hint) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; - int count; - count = kq->kq_count; - return (count > 0); + return (kq->kq_count > 0); } static int @@ -881,8 +821,6 @@ filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev) kqlock(kq); kn->kn_data = kq->kq_count; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; res = (kn->kn_data > 0); kqunlock(kq); @@ -910,7 +848,15 @@ filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_inte return res; } -#pragma mark EVFILT_PROC +SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = { + .f_isfd = 1, + .f_detach = filt_kqdetach, + .f_event = filt_kqueue, + .f_touch = filt_kqtouch, + .f_process = filt_kqprocess, +}; + +#pragma mark proc_filtops static int filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev) @@ -920,15 +866,13 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev) assert(PID_MAX < NOTE_PDATAMASK); if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) { - kn->kn_flags = EV_ERROR; - kn->kn_data = ENOTSUP; + knote_set_error(kn, ENOTSUP); return 0; } p = proc_find(kn->kn_id); if (p == NULL) { - kn->kn_flags = EV_ERROR; - kn->kn_data = ESRCH; + knote_set_error(kn, ESRCH); return 0; } @@ -946,8 +890,7 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev) break; /* parent-in-waiting => ok */ proc_rele(p); - kn->kn_flags = EV_ERROR; - kn->kn_data = EACCES; + knote_set_error(kn, EACCES); return 0; } while (0); @@ -1022,7 +965,7 @@ filt_proc(struct knote *kn, long hint) */ return 0; } - } + } /* * if the user is interested in this event, record it. 
@@ -1040,7 +983,7 @@ filt_proc(struct knote *kn, long hint) /* * The kernel has a wrapper in place that returns the same data - * as is collected here, in kn_data. Any changes to how + * as is collected here, in kn_data. Any changes to how * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected * should also be reflected in the proc_pidnoteexit() wrapper. */ @@ -1054,7 +997,7 @@ filt_proc(struct knote *kn, long hint) kn->kn_fflags |= NOTE_EXIT_DETAIL; if ((kn->kn_ptr.p_proc->p_lflag & P_LTERM_DECRYPTFAIL) != 0) { - kn->kn_data |= NOTE_EXIT_DECRYPTFAIL; + kn->kn_data |= NOTE_EXIT_DECRYPTFAIL; } if ((kn->kn_ptr.p_proc->p_lflag & P_LTERM_JETSAM) != 0) { @@ -1103,8 +1046,6 @@ filt_proctouch(struct knote *kn, struct kevent_internal_s *kev) /* accept new filter flags and mask off output events no long interesting */ kn->kn_sfflags = kev->fflags; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* restrict the current results to the (smaller?) set of new interest */ /* @@ -1138,9 +1079,22 @@ filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in return res; } +SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = { + .f_attach = filt_procattach, + .f_detach = filt_procdetach, + .f_event = filt_proc, + .f_touch = filt_proctouch, + .f_process = filt_procprocess, +}; -#pragma mark EVFILT_TIMER +#pragma mark timer_filtops +struct filt_timer_params { + uint64_t deadline; /* deadline in abs/cont time + (or 0 if NOTE_ABSOLUTE and deadline is in past) */ + uint64_t leeway; /* leeway in abstime, or 0 if none */ + uint64_t interval; /* interval in abstime or 0 if non-repeating timer */ +}; /* * Values stored in the knote at rest (using Mach absolute time units) @@ -1150,23 +1104,36 @@ filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in * kn->kn_ext[1] leeway value * kn->kn_sdata interval timer: the interval * absolute/deadline timer: 0 - * kn->kn_data fire count + * kn->kn_hookid timer state 
+ * + * TIMER_IDLE: + * The timer has either never been scheduled or been cancelled. + * It is safe to schedule a new one in this state. + * + * TIMER_ARMED: + * The timer has been scheduled + * + * TIMER_FIRED + * The timer has fired and an event needs to be delivered. + * When in this state, the callout may still be running. + * + * TIMER_IMMEDIATE + * The timer has fired at registration time, and the callout was never + * dispatched. */ +#define TIMER_IDLE 0x0 +#define TIMER_ARMED 0x1 +#define TIMER_FIRED 0x2 +#define TIMER_IMMEDIATE 0x3 -static lck_mtx_t _filt_timerlock; - -static void filt_timerlock(void) { lck_mtx_lock(&_filt_timerlock); } -static void filt_timerunlock(void) { lck_mtx_unlock(&_filt_timerlock); } - -static inline void filt_timer_assert_locked(void) +static void +filt_timer_set_params(struct knote *kn, struct filt_timer_params *params) { - LCK_MTX_ASSERT(&_filt_timerlock, LCK_MTX_ASSERT_OWNED); + kn->kn_ext[0] = params->deadline; + kn->kn_ext[1] = params->leeway; + kn->kn_sdata = params->interval; } -/* state flags stored in kn_hookid */ -#define TIMER_RUNNING 0x1 -#define TIMER_CANCELWAIT 0x2 - /* * filt_timervalidate - process data from user * @@ -1177,20 +1144,21 @@ static inline void filt_timer_assert_locked(void) * kn_sfflags style of timer, unit of measurement * * Output: - * kn_sdata either interval in abstime or 0 if non-repeating timer - * ext[0] fire deadline in abs/cont time - * (or 0 if NOTE_ABSOLUTE and deadline is in past) + * struct filter_timer_params to apply to the filter with + * filt_timer_set_params when changes are ready to be commited. * * Returns: * EINVAL Invalid user data parameters + * ERANGE Various overflows with the parameters * * Called with timer filter lock held. 
*/ static int -filt_timervalidate(struct knote *kn) +filt_timervalidate(const struct kevent_internal_s *kev, + struct filt_timer_params *params) { /* - * There are 4 knobs that need to be chosen for a timer registration: + * There are 5 knobs that need to be chosen for a timer registration: * * A) Units of time (what is the time duration of the specified number) * Absolute and interval take: @@ -1220,13 +1188,11 @@ filt_timervalidate(struct knote *kn) * expires when mach_continuous_time() is > the passed in value. */ - filt_timer_assert_locked(); - uint64_t multiplier; boolean_t use_abstime = FALSE; - switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS|NOTE_MACHTIME)) { + switch (kev->fflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS|NOTE_MACHTIME)) { case NOTE_SECONDS: multiplier = NSEC_PER_SEC; break; @@ -1248,31 +1214,33 @@ filt_timervalidate(struct knote *kn) } /* transform the leeway in kn_ext[1] to same time scale */ - if (kn->kn_sfflags & NOTE_LEEWAY) { + if (kev->fflags & NOTE_LEEWAY) { uint64_t leeway_abs; if (use_abstime) { - leeway_abs = (uint64_t)kn->kn_ext[1]; + leeway_abs = (uint64_t)kev->ext[1]; } else { uint64_t leeway_ns; - if (os_mul_overflow((uint64_t)kn->kn_ext[1], multiplier, &leeway_ns)) + if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) return (ERANGE); nanoseconds_to_absolutetime(leeway_ns, &leeway_abs); } - kn->kn_ext[1] = leeway_abs; + params->leeway = leeway_abs; + } else { + params->leeway = 0; } - if (kn->kn_sfflags & NOTE_ABSOLUTE) { + if (kev->fflags & NOTE_ABSOLUTE) { uint64_t deadline_abs; if (use_abstime) { - deadline_abs = (uint64_t)kn->kn_sdata; + deadline_abs = (uint64_t)kev->data; } else { uint64_t calendar_deadline_ns; - if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &calendar_deadline_ns)) + if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) return (ERANGE); /* calendar_deadline_ns is in nanoseconds since the epoch */ @@ -1306,7 +1274,7 @@ 
filt_timervalidate(struct knote *kn) * it does not change the calendar timebase. */ - if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) + if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) clock_continuoustime_interval_to_deadline(interval_abs, &deadline_abs); else @@ -1317,9 +1285,9 @@ filt_timervalidate(struct knote *kn) } } - kn->kn_ext[0] = deadline_abs; - kn->kn_sdata = 0; /* NOTE_ABSOLUTE is non-repeating */ - } else if (kn->kn_sdata < 0) { + params->deadline = deadline_abs; + params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */ + } else if (kev->data < 0) { /* * Negative interval timers fire immediately, once. * @@ -1333,16 +1301,16 @@ filt_timervalidate(struct knote *kn) * We now skip the power-wasting hot spin phase and go straight to the idle phase. */ - kn->kn_sdata = 0; /* non-repeating */ - kn->kn_ext[0] = 0; /* expire immediately */ + params->deadline = 0; /* expire immediately */ + params->interval = 0; /* non-repeating */ } else { uint64_t interval_abs = 0; if (use_abstime) { - interval_abs = (uint64_t)kn->kn_sdata; + interval_abs = (uint64_t)kev->data; } else { uint64_t interval_ns; - if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &interval_ns)) + if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) return (ERANGE); nanoseconds_to_absolutetime(interval_ns, &interval_abs); @@ -1350,117 +1318,93 @@ filt_timervalidate(struct knote *kn) uint64_t deadline = 0; - if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) + if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) clock_continuoustime_interval_to_deadline(interval_abs, &deadline); else clock_absolutetime_interval_to_deadline(interval_abs, &deadline); - kn->kn_sdata = interval_abs; /* default to a repeating timer */ - kn->kn_ext[0] = deadline; + params->deadline = deadline; + params->interval = interval_abs; } return (0); } - - - /* * filt_timerexpire - the timer callout routine - * - * Just propagate the timer event into the knote - * filter routine (by going through the knote - * 
synchronization point). Pass a hint to - * indicate this is a real event, not just a - * query from above. */ static void filt_timerexpire(void *knx, __unused void *spare) { - struct klist timer_list; struct knote *kn = knx; + int v; - filt_timerlock(); - - kn->kn_hookid &= ~TIMER_RUNNING; - - /* no "object" for timers, so fake a list */ - SLIST_INIT(&timer_list); - SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext); - - KNOTE(&timer_list, 1); - - /* if someone is waiting for timer to pop */ - if (kn->kn_hookid & TIMER_CANCELWAIT) { + if (os_atomic_cmpxchgv(&kn->kn_hookid, TIMER_ARMED, TIMER_FIRED, + &v, relaxed)) { + // our f_event always would say FILTER_ACTIVE, + // so be leaner and just do it. struct kqueue *kq = knote_get_kq(kn); - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_hook), - THREAD_AWAKENED, - WAITQ_ALL_PRIORITIES); - - kn->kn_hookid &= ~TIMER_CANCELWAIT; + kqlock(kq); + knote_activate(kn); + kqunlock(kq); + } else { + /* + * From TIMER_ARMED, the only allowed transition are: + * - to TIMER_FIRED through the timer callout just above + * - to TIMER_IDLE due to filt_timercancel() which will wait for the + * timer callout (and any possible invocation of filt_timerexpire) to + * have finished before the state is changed again. + */ + assert(v == TIMER_IDLE); } - - filt_timerunlock(); } -/* - * Cancel a running timer (or wait for the pop). - * Timer filter lock is held. - * May drop and retake the timer filter lock. 
- */ static void filt_timercancel(struct knote *kn) { - filt_timer_assert_locked(); - - assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0); - - /* if no timer, then we're good */ - if ((kn->kn_hookid & TIMER_RUNNING) == 0) - return; - - thread_call_t callout = (thread_call_t)kn->kn_hook; - - /* cancel the callout if we can */ - if (thread_call_cancel(callout)) { - kn->kn_hookid &= ~TIMER_RUNNING; - return; + if (os_atomic_xchg(&kn->kn_hookid, TIMER_IDLE, relaxed) == TIMER_ARMED) { + /* cancel the thread call and wait for any filt_timerexpire in flight */ + thread_call_cancel_wait((thread_call_t)kn->kn_hook); } +} - /* cancel failed, we have to wait for the in-flight expire routine */ - - kn->kn_hookid |= TIMER_CANCELWAIT; - - struct kqueue *kq = knote_get_kq(kn); - - waitq_assert_wait64((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_hook), - THREAD_UNINT, TIMEOUT_WAIT_FOREVER); +/* + * Does this deadline needs a timer armed for it, or has it expired? + */ +static bool +filt_timer_is_ready(struct knote *kn) +{ + uint64_t now, deadline = kn->kn_ext[0]; - filt_timerunlock(); - thread_block(THREAD_CONTINUE_NULL); - filt_timerlock(); + if (deadline == 0) { + return true; + } - assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0); - assert((kn->kn_hookid & TIMER_RUNNING) == 0); + if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) { + now = mach_continuous_time(); + } else { + now = mach_absolute_time(); + } + return deadline <= now; } +/* + * Arm a timer + * + * It is the responsibility of the caller to make sure the timer call + * has completed or been cancelled properly prior to arming it. 
+ */ static void filt_timerarm(struct knote *kn) { - filt_timer_assert_locked(); - - assert((kn->kn_hookid & TIMER_RUNNING) == 0); - - thread_call_t callout = (thread_call_t)kn->kn_hook; - uint64_t deadline = kn->kn_ext[0]; uint64_t leeway = kn->kn_ext[1]; int filter_flags = kn->kn_sfflags; unsigned int timer_flags = 0; + assert(os_atomic_load(&kn->kn_hookid, relaxed) == TIMER_IDLE); + if (filter_flags & NOTE_CRITICAL) timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL; else if (filter_flags & NOTE_BACKGROUND) @@ -1474,85 +1418,51 @@ filt_timerarm(struct knote *kn) if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) timer_flags |= THREAD_CALL_CONTINUOUS; - thread_call_enter_delayed_with_leeway(callout, NULL, - deadline, leeway, - timer_flags); - - kn->kn_hookid |= TIMER_RUNNING; -} - -/* - * Does this knote need a timer armed for it, or should it be ready immediately? - */ -static boolean_t -filt_timer_is_ready(struct knote *kn) -{ - uint64_t now; - - if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) - now = mach_continuous_time(); - else - now = mach_absolute_time(); - - uint64_t deadline = kn->kn_ext[0]; - - if (deadline < now) - return TRUE; - else - return FALSE; + os_atomic_store(&kn->kn_hookid, TIMER_ARMED, relaxed); + thread_call_enter_delayed_with_leeway((thread_call_t)kn->kn_hook, NULL, + deadline, leeway, timer_flags); } /* * Allocate a thread call for the knote's lifetime, and kick off the timer. 
*/ static int -filt_timerattach(struct knote *kn, __unused struct kevent_internal_s *kev) +filt_timerattach(struct knote *kn, struct kevent_internal_s *kev) { thread_call_t callout; + struct filt_timer_params params; int error; + if ((error = filt_timervalidate(kev, &params)) != 0) { + knote_set_error(kn, error); + return 0; + } + callout = thread_call_allocate_with_options(filt_timerexpire, (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH, THREAD_CALL_OPTIONS_ONCE); if (NULL == callout) { - kn->kn_flags = EV_ERROR; - kn->kn_data = ENOMEM; - return 0; - } - - filt_timerlock(); - - if ((error = filt_timervalidate(kn)) != 0) { - kn->kn_flags = EV_ERROR; - kn->kn_data = error; - filt_timerunlock(); - - __assert_only boolean_t freed = thread_call_free(callout); - assert(freed); + knote_set_error(kn, ENOMEM); return 0; } - kn->kn_hook = (void*)callout; - kn->kn_hookid = 0; + filt_timer_set_params(kn, &params); + kn->kn_hook = callout; kn->kn_flags |= EV_CLEAR; + os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed); /* NOTE_ABSOLUTE implies EV_ONESHOT */ if (kn->kn_sfflags & NOTE_ABSOLUTE) kn->kn_flags |= EV_ONESHOT; - boolean_t timer_ready = FALSE; - - if ((timer_ready = filt_timer_is_ready(kn))) { - /* cause immediate expiration */ - kn->kn_data = 1; + if (filt_timer_is_ready(kn)) { + os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed); + return FILTER_ACTIVE; } else { filt_timerarm(kn); + return 0; } - - filt_timerunlock(); - - return timer_ready; } /* @@ -1561,34 +1471,17 @@ filt_timerattach(struct knote *kn, __unused struct kevent_internal_s *kev) static void filt_timerdetach(struct knote *kn) { - thread_call_t callout; - - filt_timerlock(); - - callout = (thread_call_t)kn->kn_hook; - filt_timercancel(kn); - - filt_timerunlock(); + __assert_only boolean_t freed; - __assert_only boolean_t freed = thread_call_free(callout); + /* + * Unconditionally cancel to make sure there can't be any filt_timerexpire() + * running anymore.
+ */ + thread_call_cancel_wait((thread_call_t)kn->kn_hook); + freed = thread_call_free((thread_call_t)kn->kn_hook); assert(freed); } -/* - * filt_timerevent - post events to a timer knote - * - * Called in the context of filt_timerexpire with - * the filt_timerlock held - */ -static int -filt_timerevent(struct knote *kn, __unused long hint) -{ - filt_timer_assert_locked(); - - kn->kn_data = 1; - return (1); -} - /* * filt_timertouch - update timer knote with new user input * @@ -1597,54 +1490,36 @@ filt_timerevent(struct knote *kn, __unused long hint) * pops have gone off (in kn_data). */ static int -filt_timertouch( - struct knote *kn, - struct kevent_internal_s *kev) +filt_timertouch(struct knote *kn, struct kevent_internal_s *kev) { + struct filt_timer_params params; + uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags); int error; - filt_timerlock(); - - /* - * cancel current call - drops and retakes lock - * TODO: not safe against concurrent touches? - */ - filt_timercancel(kn); + if (changed_flags & NOTE_ABSOLUTE) { + kev->flags |= EV_ERROR; + kev->data = EINVAL; + return 0; + } - /* clear if the timer had previously fired, the user no longer wants to see it */ - kn->kn_data = 0; + if ((error = filt_timervalidate(kev, &params)) != 0) { + kev->flags |= EV_ERROR; + kev->data = error; + return 0; + } /* capture the new values used to compute deadline */ - kn->kn_sdata = kev->data; + filt_timercancel(kn); + filt_timer_set_params(kn, &params); kn->kn_sfflags = kev->fflags; - kn->kn_ext[0] = kev->ext[0]; - kn->kn_ext[1] = kev->ext[1]; - - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; - - /* recalculate deadline */ - error = filt_timervalidate(kn); - if (error) { - /* no way to report error, so mark it in the knote */ - kn->kn_flags |= EV_ERROR; - kn->kn_data = error; - filt_timerunlock(); - return 1; - } - boolean_t timer_ready = FALSE; - - if ((timer_ready = filt_timer_is_ready(kn))) { - /* cause immediate expiration */ - kn->kn_data = 1;
+ if (filt_timer_is_ready(kn)) { + os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed); + return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS; } else { filt_timerarm(kn); + return FILTER_UPDATE_REQ_QOS; } - - filt_timerunlock(); - - return timer_ready; } /* @@ -1660,24 +1535,43 @@ filt_timerprocess( __unused struct filt_process_s *data, struct kevent_internal_s *kev) { - filt_timerlock(); - - if (kn->kn_data == 0 || (kn->kn_hookid & TIMER_CANCELWAIT)) { + /* + * filt_timerprocess is serialized with any filter routine except for + * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED + * transition, and on success, activates the knote. + * + * Hence, we don't need atomic modifications of the state, only to peek at + * whether we see any of the "FIRED" state, and if we do, it is safe to + * do simple state machine transitions. + */ + switch (os_atomic_load(&kn->kn_hookid, relaxed)) { + case TIMER_IDLE: + case TIMER_ARMED: /* - * kn_data = 0: - * The timer hasn't yet fired, so there's nothing to deliver - * TIMER_CANCELWAIT: - * touch is in the middle of canceling the timer, - * so don't deliver or re-arm anything - * * This can happen if a touch resets a timer that had fired * without being processed */ - filt_timerunlock(); return 0; } - if (kn->kn_sdata != 0 && ((kn->kn_flags & EV_ERROR) == 0)) { + os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed); + + /* + * Copy out the interesting kevent state, + * but don't leak out the raw time calculations. + * + * TODO: potential enhancements - tell the user about: + * - deadline to which this timer thought it was expiring + * - return kn_sfflags in the fflags field so the client can know + * under what flags the timer fired + */ + *kev = kn->kn_kevent; + kev->ext[0] = 0; + /* kev->ext[1] = 0; JMM - shouldn't we hide this too? 
*/ + + if (kn->kn_sdata == 0) { + kev->data = 1; + } else { /* * This is a 'repeating' timer, so we have to emit * how many intervals expired between the arm @@ -1687,9 +1581,6 @@ filt_timerprocess( * this could easily be done in the client... */ - /* The timer better have had expired... */ - assert((kn->kn_hookid & TIMER_RUNNING) == 0); - uint64_t now; if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) @@ -1713,18 +1604,11 @@ filt_timerprocess( * and be in repeating mode, so therefore it must have been * more than 'interval' time since the attach or last * successful touch. - * - * An unsuccessful touch would: - * disarm the timer - * clear kn_data - * clear kn_sdata - * set EV_ERROR - * all of which will prevent this code from running. */ assert(num_fired > 0); /* report how many intervals have elapsed to the user */ - kn->kn_data = (int64_t) num_fired; + kev->data = (int64_t)num_fired; /* We only need to re-arm the timer if it's not about to be destroyed */ if ((kn->kn_flags & EV_ONESHOT) == 0) { @@ -1735,62 +1619,33 @@ filt_timerprocess( kn->kn_ext[0] = new_deadline; + /* + * This can't shortcut setting up the thread call, because + * knote_process deactivates EV_CLEAR knotes unconditionnally. + */ filt_timerarm(kn); } } - /* - * Copy out the interesting kevent state, - * but don't leak out the raw time calculations. - * - * TODO: potential enhancements - tell the user about: - * - deadline to which this timer thought it was expiring - * - return kn_sfflags in the fflags field so the client can know - * under what flags the timer fired - */ - *kev = kn->kn_kevent; - kev->ext[0] = 0; - /* kev->ext[1] = 0; JMM - shouldn't we hide this too? 
*/ - - /* we have delivered the event, reset the timer pop count */ - kn->kn_data = 0; - - filt_timerunlock(); - return 1; + return FILTER_ACTIVE; } SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = { + .f_extended_codes = true, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, - .f_event = filt_timerevent, + .f_event = filt_badevent, .f_touch = filt_timertouch, .f_process = filt_timerprocess, }; - -#pragma mark EVFILT_USER - - -static void -filt_userlock(void) -{ - lck_spin_lock(&_filt_userlock); -} - -static void -filt_userunlock(void) -{ - lck_spin_unlock(&_filt_userlock); -} +#pragma mark user_filtops static int filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev) { - /* EVFILT_USER knotes are not attached to anything in the kernel */ - /* Cant discover this knote until after attach - so no lock needed */ - kn->kn_hook = NULL; if (kn->kn_sfflags & NOTE_TRIGGER) { - kn->kn_hookid = 1; + kn->kn_hookid = FILTER_ACTIVE; } else { kn->kn_hookid = 0; } @@ -1804,24 +1659,10 @@ filt_userdetach(__unused struct knote *kn) } static int -filt_user( - __unused struct knote *kn, - __unused long hint) -{ - panic("filt_user"); - return 0; -} - -static int -filt_usertouch( - struct knote *kn, - struct kevent_internal_s *kev) +filt_usertouch(struct knote *kn, struct kevent_internal_s *kev) { uint32_t ffctrl; int fflags; - int active; - - filt_userlock(); ffctrl = kev->fflags & NOTE_FFCTRLMASK; fflags = kev->fflags & NOTE_FFLAGSMASK; @@ -1840,17 +1681,10 @@ filt_usertouch( } kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; - if (kev->fflags & NOTE_TRIGGER) { - kn->kn_hookid = 1; + kn->kn_hookid = FILTER_ACTIVE; } - active = kn->kn_hookid; - - filt_userunlock(); - - return (active); + return (int)kn->kn_hookid; } static int @@ -1859,34 +1693,32 @@ filt_userprocess( __unused struct filt_process_s *data, struct kevent_internal_s *kev) { - filt_userlock(); - - if (kn->kn_hookid == 0) { 
- filt_userunlock(); - return 0; - } + int result = (int)kn->kn_hookid; - *kev = kn->kn_kevent; - kev->fflags = (volatile UInt32)kn->kn_sfflags; - kev->data = kn->kn_sdata; - if (kn->kn_flags & EV_CLEAR) { - kn->kn_hookid = 0; - kn->kn_data = 0; - kn->kn_fflags = 0; + if (result) { + *kev = kn->kn_kevent; + kev->fflags = kn->kn_sfflags; + kev->data = kn->kn_sdata; + if (kn->kn_flags & EV_CLEAR) { + kn->kn_hookid = 0; + kn->kn_data = 0; + kn->kn_fflags = 0; + } } - filt_userunlock(); - return 1; + return result; } -#pragma mark EVFILT_WORKLOOP +SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = { + .f_extended_codes = true, + .f_attach = filt_userattach, + .f_detach = filt_userdetach, + .f_event = filt_badevent, + .f_touch = filt_usertouch, + .f_process = filt_userprocess, +}; -#if DEBUG || DEVELOPMENT -/* - * see src/queue_internal.h in libdispatch - */ -#define DISPATCH_QUEUE_ENQUEUED 0x1ull -#endif +#pragma mark workloop_filtops static inline void filt_wllock(struct kqworkloop *kqwl) @@ -1900,117 +1732,188 @@ filt_wlunlock(struct kqworkloop *kqwl) lck_mtx_unlock(&kqwl->kqwl_statelock); } -static inline void -filt_wlheld(__assert_only struct kqworkloop *kqwl) -{ - LCK_MTX_ASSERT(&kqwl->kqwl_statelock, LCK_MTX_ASSERT_OWNED); -} - -#define WL_OWNER_SUSPENDED ((thread_t)(~0ull)) /* special owner when suspended */ - +/* + * Returns true when the interlock for the turnstile is the workqueue lock + * + * When this is the case, all turnstiles operations are delegated + * to the workqueue subsystem. + * + * This is required because kqueue_threadreq_bind_prepost only holds the + * workqueue lock but needs to move the inheritor from the workloop turnstile + * away from the creator thread, so that this now fulfilled request cannot be + * picked anymore by other threads. 
+ */ static inline bool -filt_wlowner_is_valid(thread_t owner) +filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl) { - return owner != THREAD_NULL && owner != WL_OWNER_SUSPENDED; + struct kqrequest *kqr = &kqwl->kqwl_request; + return (kqr->kqr_state & KQR_THREQUESTED) && + (kqr->kqr_thread == THREAD_NULL); } -static inline bool -filt_wlshould_end_ownership(struct kqworkloop *kqwl, - struct kevent_internal_s *kev, int error) +static void +filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts, + turnstile_update_flags_t flags) { - thread_t owner = kqwl->kqwl_owner; - return (error == 0 || error == ESTALE) && - (kev->fflags & NOTE_WL_END_OWNERSHIP) && - (owner == current_thread() || owner == WL_OWNER_SUSPENDED); -} + turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; + struct kqrequest *kqr = &kqwl->kqwl_request; -static inline bool -filt_wlshould_update_ownership(struct kevent_internal_s *kev, int error) -{ - return error == 0 && (kev->fflags & NOTE_WL_DISCOVER_OWNER) && - kev->ext[EV_EXTIDX_WL_ADDR]; -} + /* + * binding to the workq should always happen through + * workq_kern_threadreq_update_inheritor() + */ + assert(!filt_wlturnstile_interlock_is_workq(kqwl)); -static inline bool -filt_wlshould_set_async_qos(struct kevent_internal_s *kev, int error, - kq_index_t async_qos) -{ - if (error != 0) { - return false; + if ((inheritor = kqwl->kqwl_owner)) { + flags |= TURNSTILE_INHERITOR_THREAD; + } else if ((inheritor = kqr->kqr_thread)) { + flags |= TURNSTILE_INHERITOR_THREAD; } - if (async_qos != THREAD_QOS_UNSPECIFIED) { - return true; - } - if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) { - /* see filt_wlprocess() */ - return true; - } - return false; + + turnstile_update_inheritor(ts, inheritor, flags); } +#define FILT_WLATTACH 0 +#define FILT_WLTOUCH 1 +#define FILT_WLDROP 2 + __result_use_check static int -filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev, - int error, kq_index_t 
async_qos) +filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn, + struct kevent_internal_s *kev, kq_index_t qos_index, int op) { + user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]); struct kqrequest *kqr = &kqwl->kqwl_request; thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL; - kq_index_t cur_override = THREAD_QOS_UNSPECIFIED; - kq_index_t old_owner_override = THREAD_QOS_UNSPECIFIED; - boolean_t ipc_override_is_sync = false; - boolean_t old_owner_override_is_sync = false; - int action = KQWL_UTQ_NONE; + kq_index_t cur_owner_override = THREAD_QOS_UNSPECIFIED; + int action = KQWL_UTQ_NONE, error = 0; + bool needs_wake = false, needs_wllock = false; + uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE]; + uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK]; + uint64_t udata = 0; + + if (kev->fflags & (NOTE_WL_END_OWNERSHIP | NOTE_WL_DISCOVER_OWNER)) { + /* + * If we're maybe going to change the kqwl_owner, + * then we need to hold the filt_wllock(). + */ + needs_wllock = true; + } else if (kqr->kqr_thread == current_thread()) { + /* + * Servicer updates need to be serialized with + * any ownership change too, as the kqr_thread value influences the + * outcome of handling NOTE_WL_DISCOVER_OWNER. + */ + needs_wllock = true; + } - filt_wlheld(kqwl); + if (needs_wllock) { + filt_wllock(kqwl); + /* + * The kqwl owner is set under both the req and filter lock, + * meaning it's fine to look at it under any. + */ + new_owner = cur_owner = kqwl->kqwl_owner; + } else { + new_owner = cur_owner = THREAD_NULL; + } /* - * The owner is only changed under both the filt_wllock and the - * kqwl_req_lock. Looking at it with either one held is fine. + * Phase 1: + * + * If asked, load the uint64 value at the user provided address and compare + * it against the passed in mask and expected value. + * + * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as + * a thread reference. 
+ * + * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is + * the current thread, then end ownership. + * + * Lastly decide whether we need to perform a QoS update. */ - cur_owner = kqwl->kqwl_owner; - if (filt_wlshould_end_ownership(kqwl, kev, error)) { - new_owner = THREAD_NULL; - } else if (filt_wlshould_update_ownership(kev, error)) { - /* - * Decipher the owner port name, and translate accordingly. - * The low 2 bits were borrowed for other flags, so mask them off. - */ - uint64_t udata = kev->ext[EV_EXTIDX_WL_VALUE]; - mach_port_name_t new_owner_name = (mach_port_name_t)udata & ~0x3; - if (new_owner_name != MACH_PORT_NULL) { - new_owner_name = ipc_entry_name_mask(new_owner_name); - } - - if (MACH_PORT_VALID(new_owner_name)) { - new_owner = port_name_to_thread(new_owner_name); - if (new_owner == THREAD_NULL) - return EOWNERDEAD; - extra_thread_ref = new_owner; - } else if (new_owner_name == MACH_PORT_DEAD) { - new_owner = WL_OWNER_SUSPENDED; - } else { + if (uaddr) { + error = copyin_word(uaddr, &udata, sizeof(udata)); + if (error) { + goto out; + } + + /* Update state as copied in. */ + kev->ext[EV_EXTIDX_WL_VALUE] = udata; + + if ((udata & mask) != (kdata & mask)) { + error = ESTALE; + } else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) { /* - * We never want to learn a new owner that is NULL. - * Ownership should be ended with END_OWNERSHIP. + * Decipher the owner port name, and translate accordingly. + * The low 2 bits were borrowed for other flags, so mask them off. + * + * Then attempt translation to a thread reference or fail. 
*/ - new_owner = cur_owner; + mach_port_name_t name = (mach_port_name_t)udata & ~0x3; + if (name != MACH_PORT_NULL) { + name = ipc_entry_name_mask(name); + extra_thread_ref = port_name_to_thread(name); + if (extra_thread_ref == THREAD_NULL) { + error = EOWNERDEAD; + goto out; + } + new_owner = extra_thread_ref; + } } - } else { - new_owner = cur_owner; } - if (filt_wlshould_set_async_qos(kev, error, async_qos)) { - action = KQWL_UTQ_SET_ASYNC_QOS; + if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) { + new_owner = THREAD_NULL; + } + + if (error == 0) { + if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) { + action = KQWL_UTQ_SET_QOS_INDEX; + } else if (qos_index && kqr->kqr_qos_index != qos_index) { + action = KQWL_UTQ_SET_QOS_INDEX; + } + + if (op == FILT_WLTOUCH) { + /* + * Save off any additional fflags/data we just accepted + * But only keep the last round of "update" bits we acted on which helps + * debugging a lot. + */ + kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK; + kn->kn_sfflags |= kev->fflags; + kn->kn_sdata = kev->data; + if (kev->fflags & NOTE_WL_SYNC_WAKE) { + needs_wake = (kn->kn_hook != THREAD_NULL); + } + } else if (op == FILT_WLDROP) { + if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) == + NOTE_WL_SYNC_WAIT) { + /* + * When deleting a SYNC_WAIT knote that hasn't been woken up + * explicitly, issue a wake up. 
+ */ + kn->kn_sfflags |= NOTE_WL_SYNC_WAKE; + needs_wake = (kn->kn_hook != THREAD_NULL); + } + } } - if (cur_owner == new_owner && action == KQWL_UTQ_NONE) { + + /* + * Phase 2: + * + * Commit ownership and QoS changes if any, possibly wake up waiters + */ + + if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) { goto out; } - kqwl_req_lock(kqwl); + kq_req_lock(kqwl); /* If already tracked as servicer, don't track as owner */ - if ((kqr->kqr_state & KQR_BOUND) && new_owner == kqr->kqr_thread) { - kqwl->kqwl_owner = new_owner = THREAD_NULL; + if (new_owner == kqr->kqr_thread) { + new_owner = THREAD_NULL; } if (cur_owner != new_owner) { @@ -2019,30 +1922,24 @@ filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev, /* we just transfered this ref to kqwl_owner */ extra_thread_ref = THREAD_NULL; } - cur_override = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync); - old_owner_override = kqr->kqr_dsync_owner_qos; - old_owner_override_is_sync = kqr->kqr_owner_override_is_sync; + cur_owner_override = kqworkloop_owner_override(kqwl); + + if (cur_owner) { + thread_ends_owning_workloop(cur_owner); + } - if (filt_wlowner_is_valid(new_owner)) { + if (new_owner) { /* override it before we drop the old */ - if (cur_override != THREAD_QOS_UNSPECIFIED) { - thread_add_ipc_override(new_owner, cur_override); + if (cur_owner_override != THREAD_QOS_UNSPECIFIED) { + thread_add_ipc_override(new_owner, cur_owner_override); } - if (ipc_override_is_sync) { - thread_add_sync_ipc_override(new_owner); - } - /* Update the kqr to indicate that owner has sync ipc override */ - kqr->kqr_dsync_owner_qos = cur_override; - kqr->kqr_owner_override_is_sync = ipc_override_is_sync; thread_starts_owning_workloop(new_owner); - if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED) { + if ((kqr->kqr_state & KQR_THREQUESTED) && !kqr->kqr_thread) { if (action == KQWL_UTQ_NONE) { action = KQWL_UTQ_REDRIVE_EVENTS; } } - } else if (new_owner == 
THREAD_NULL) { - kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED; - kqr->kqr_owner_override_is_sync = false; + } else { if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) { if (action == KQWL_UTQ_NONE) { action = KQWL_UTQ_REDRIVE_EVENTS; @@ -2051,74 +1948,100 @@ filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev, } } + struct turnstile *ts = kqwl->kqwl_turnstile; + bool wl_inheritor_updated = false; + if (action != KQWL_UTQ_NONE) { - kqworkloop_update_threads_qos(kqwl, action, async_qos); + kqworkloop_update_threads_qos(kqwl, action, qos_index); } - kqwl_req_unlock(kqwl); - - /* Now that we are unlocked, drop the override and ref on old owner */ - if (new_owner != cur_owner && filt_wlowner_is_valid(cur_owner)) { - if (old_owner_override != THREAD_QOS_UNSPECIFIED) { - thread_drop_ipc_override(cur_owner); + if (cur_owner != new_owner && ts) { + if (action == KQWL_UTQ_REDRIVE_EVENTS) { + /* + * Note that when action is KQWL_UTQ_REDRIVE_EVENTS, + * the code went through workq_kern_threadreq_initiate() + * and the workqueue has set the inheritor already + */ + assert(filt_wlturnstile_interlock_is_workq(kqwl)); + } else if (filt_wlturnstile_interlock_is_workq(kqwl)) { + workq_kern_threadreq_lock(kqwl->kqwl_p); + workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner, + ts, TURNSTILE_IMMEDIATE_UPDATE); + workq_kern_threadreq_unlock(kqwl->kqwl_p); + if (!filt_wlturnstile_interlock_is_workq(kqwl)) { + /* + * If the workq is no longer the interlock, then + * workq_kern_threadreq_update_inheritor() has finished a bind + * and we need to fallback to the regular path. 
+ */ + filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE); + } + wl_inheritor_updated = true; + } else { + filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE); + wl_inheritor_updated = true; } - if (old_owner_override_is_sync) { - thread_drop_sync_ipc_override(cur_owner); + + /* + * We need a turnstile reference because we are dropping the interlock + * and the caller has not called turnstile_prepare. + */ + if (wl_inheritor_updated) { + turnstile_reference(ts); } - thread_ends_owning_workloop(cur_owner); - thread_deallocate(cur_owner); } -out: - if (extra_thread_ref) { - thread_deallocate(extra_thread_ref); + if (needs_wake && ts) { + waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T((event_t)kn), + (thread_t)kn->kn_hook, THREAD_AWAKENED); } - return error; -} -static int -filt_wldebounce( - struct kqworkloop *kqwl, - struct kevent_internal_s *kev, - int default_result) -{ - user_addr_t addr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]); - uint64_t udata; - int error; + kq_req_unlock(kqwl); - /* we must have the workloop state mutex held */ - filt_wlheld(kqwl); + if (wl_inheritor_updated) { + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate(ts); + } - /* Do we have a debounce address to work with? */ - if (addr) { - uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE]; - uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK]; +out: + /* + * Phase 3: + * + * Unlock and cleanup various lingering references and things. 
+ */ + if (needs_wllock) { + filt_wlunlock(kqwl); + } - error = copyin_word(addr, &udata, sizeof(udata)); - if (error) { - return error; - } +#if CONFIG_WORKLOOP_DEBUG + KQWL_HISTORY_WRITE_ENTRY(kqwl, { + .updater = current_thread(), + .servicer = kqr->kqr_thread, /* Note: racy */ + .old_owner = cur_owner, + .new_owner = new_owner, - /* update state as copied in */ - kev->ext[EV_EXTIDX_WL_VALUE] = udata; + .kev_ident = kev->ident, + .error = (int16_t)error, + .kev_flags = kev->flags, + .kev_fflags = kev->fflags, - /* If the masked bits don't match, reject it as stale */ - if ((udata & mask) != (kdata & mask)) { - return ESTALE; - } + .kev_mask = mask, + .kev_value = kdata, + .in_value = udata, + }); +#endif // CONFIG_WORKLOOP_DEBUG -#if DEBUG || DEVELOPMENT - if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && !(kev->flags & EV_DELETE)) { - if ((udata & DISPATCH_QUEUE_ENQUEUED) == 0 && - (udata >> 48) != 0 && (udata >> 48) != 0xffff) { - panic("kevent: workloop %#016llx is not enqueued " - "(kev:%p dq_state:%#016llx)", kev->udata, kev, udata); - } + if (cur_owner && new_owner != cur_owner) { + if (cur_owner_override != THREAD_QOS_UNSPECIFIED) { + thread_drop_ipc_override(cur_owner); } -#endif + thread_deallocate(cur_owner); } - return default_result; + if (extra_thread_ref) { + thread_deallocate(extra_thread_ref); + } + return error; } /* @@ -2129,59 +2052,14 @@ filt_wldebounce( * - data is set to the error if any */ static inline void -filt_wlremember_last_update( - __assert_only struct kqworkloop *kqwl, - struct knote *kn, - struct kevent_internal_s *kev, - int error) +filt_wlremember_last_update(struct knote *kn, struct kevent_internal_s *kev, + int error) { - filt_wlheld(kqwl); kn->kn_fflags = kev->fflags; kn->kn_data = error; memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext)); } -/* - * Return which operations on EVFILT_WORKLOOP need to be protected against - * knoteusewait() causing priority inversions. 
- */ -static bool -filt_wlneeds_boost(struct kevent_internal_s *kev) -{ - if (kev == NULL) { - /* - * this is an f_process() usecount, and it can cause a drop to wait - */ - return true; - } - if (kev->fflags & NOTE_WL_THREAD_REQUEST) { - /* - * All operations on thread requests may starve drops or re-attach of - * the same knote, all of them need boosts. None of what we do under - * thread-request usecount holds blocks anyway. - */ - return true; - } - if (kev->fflags & NOTE_WL_SYNC_WAIT) { - /* - * this may call filt_wlwait() and we don't want to hold any boost when - * woken up, this would cause background threads contending on - * dispatch_sync() to wake up at 64 and be preempted immediately when - * this drops. - */ - return false; - } - - /* - * SYNC_WAIT knotes when deleted don't need to be rushed, there's no - * detach/reattach race with these ever. In addition to this, when the - * SYNC_WAIT knote is dropped, the caller is no longer receiving the - * workloop overrides if any, and we'd rather schedule other threads than - * him, he's not possibly stalling anything anymore. 
- */ - return (kev->flags & EV_DELETE) == 0; -} - static int filt_wlattach(struct knote *kn, struct kevent_internal_s *kev) { @@ -2199,7 +2077,7 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev) if (kev->ident == 0 && kev->udata == 0 && kev->fflags == 0) { struct kqrequest *kqr = &kqwl->kqwl_request; - kqwl_req_lock(kqwl); + kq_req_lock(kqwl); kev->fflags = 0; if (kqr->kqr_dsync_waiters) { kev->fflags |= NOTE_WL_SYNC_WAIT; @@ -2207,21 +2085,16 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev) if (kqr->kqr_qos_index) { kev->fflags |= NOTE_WL_THREAD_REQUEST; } - if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) { - kev->ext[0] = ~0ull; - } else { - kev->ext[0] = thread_tid(kqwl->kqwl_owner); - } + kev->ext[0] = thread_tid(kqwl->kqwl_owner); kev->ext[1] = thread_tid(kqwl->kqwl_request.kqr_thread); kev->ext[2] = thread_owned_workloops_count(current_thread()); kev->ext[3] = kn->kn_kevent.ext[3]; - kqwl_req_unlock(kqwl); + kq_req_unlock(kqwl); error = EBUSY; goto out; } #endif - /* Some simple validation */ int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK); switch (command) { case NOTE_WL_THREAD_REQUEST: @@ -2229,19 +2102,22 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev) error = EINVAL; goto out; } - qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE); - if (qos_index < THREAD_QOS_MAINTENANCE || - qos_index > THREAD_QOS_USER_INTERACTIVE) { + qos_index = _pthread_priority_thread_qos(kn->kn_qos); + if (qos_index == THREAD_QOS_UNSPECIFIED) { error = ERANGE; goto out; } + if (kqwl->kqwl_request.kqr_qos_index) { + /* + * There already is a thread request, and well, you're only allowed + * one per workloop, so fail the attach. 
+ */ + error = EALREADY; + goto out; + } break; case NOTE_WL_SYNC_WAIT: case NOTE_WL_SYNC_WAKE: - if (kq->kq_state & KQ_NO_WQ_THREAD) { - error = ENOTSUP; - goto out; - } if (kn->kn_id == kqwl->kqwl_dynamicid) { error = EINVAL; goto out; @@ -2260,139 +2136,131 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev) goto out; } - filt_wllock(kqwl); - kn->kn_hook = NULL; - - if (command == NOTE_WL_THREAD_REQUEST && kqwl->kqwl_request.kqr_qos_index) { - /* - * There already is a thread request, and well, you're only allowed - * one per workloop, so fail the attach. - * - * Note: kqr_qos_index is always set with the wllock held, so we - * don't need to take the kqr lock. - */ - error = EALREADY; - } else { - /* Make sure user and kernel are in agreement on important state */ - error = filt_wldebounce(kqwl, kev, 0); - } + error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH); - error = filt_wlupdateowner(kqwl, kev, error, qos_index); - filt_wlunlock(kqwl); out: if (error) { - kn->kn_flags |= EV_ERROR; /* If userland wants ESTALE to be hidden, fail the attach anyway */ if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) { error = 0; } - kn->kn_data = error; + knote_set_error(kn, error); return 0; } - + if (command == NOTE_WL_SYNC_WAIT) { + return kevent_register_wait_prepare(kn, kev); + } /* Just attaching the thread request successfully will fire it */ - return command == NOTE_WL_THREAD_REQUEST; + if (command == NOTE_WL_THREAD_REQUEST) { + /* + * Thread Request knotes need an explicit touch to be active again, + * so delivering an event needs to also consume it. 
+ */ + kn->kn_flags |= EV_CLEAR; + return FILTER_ACTIVE; + } + return 0; } -__attribute__((noinline,not_tail_called)) -static int -filt_wlwait(struct kqworkloop *kqwl, - struct knote *kn, - struct kevent_internal_s *kev) +static void __dead2 +filt_wlwait_continue(void *parameter, wait_result_t wr) { - filt_wlheld(kqwl); - assert((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0); + struct _kevent_register *cont_args = parameter; + struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq; + struct kqrequest *kqr = &kqwl->kqwl_request; - /* - * Hint to the wakeup side that this thread is waiting. Also used by - * stackshot for waitinfo. - */ - kn->kn_hook = current_thread(); + kq_req_lock(kqwl); + kqr->kqr_dsync_waiters--; + if (filt_wlturnstile_interlock_is_workq(kqwl)) { + workq_kern_threadreq_lock(kqwl->kqwl_p); + turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL); + workq_kern_threadreq_unlock(kqwl->kqwl_p); + } else { + turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL); + } + kq_req_unlock(kqwl); - thread_set_pending_block_hint(current_thread(), kThreadWaitWorkloopSyncWait); + turnstile_cleanup(); - wait_result_t wr = assert_wait(kn, THREAD_ABORTSAFE); + if (wr == THREAD_INTERRUPTED) { + cont_args->kev.flags |= EV_ERROR; + cont_args->kev.data = EINTR; + } else if (wr != THREAD_AWAKENED) { + panic("Unexpected wait result: %d", wr); + } - if (wr == THREAD_WAITING) { - kq_index_t qos_index = qos_index_from_qos(kn, kev->qos, TRUE); - struct kqrequest *kqr = &kqwl->kqwl_request; + kevent_register_wait_return(cont_args); +} - thread_t thread_to_handoff = THREAD_NULL; /* holds +1 thread ref */ +/* + * Called with the workloop mutex held, most of the time never returns as it + * calls filt_wlwait_continue through a continuation. 
+ */ +static void __dead2 +filt_wlpost_register_wait(struct uthread *uth, struct knote_lock_ctx *knlc, + struct _kevent_register *cont_args) +{ + struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq; + struct kqrequest *kqr = &kqwl->kqwl_request; + struct turnstile *ts; + bool workq_locked = false; - thread_t kqwl_owner = kqwl->kqwl_owner; - if (filt_wlowner_is_valid(kqwl_owner)) { - thread_reference(kqwl_owner); - thread_to_handoff = kqwl_owner; - } + kq_req_lock(kqwl); - kqwl_req_lock(kqwl); + kqr->kqr_dsync_waiters++; - if (qos_index) { - assert(kqr->kqr_dsync_waiters < UINT16_MAX); - kqr->kqr_dsync_waiters++; - if (qos_index > kqr->kqr_dsync_waiters_qos) { - kqworkloop_update_threads_qos(kqwl, - KQWL_UTQ_SET_SYNC_WAITERS_QOS, qos_index); - } - } + if (filt_wlturnstile_interlock_is_workq(kqwl)) { + workq_kern_threadreq_lock(kqwl->kqwl_p); + workq_locked = true; + } - if ((kqr->kqr_state & KQR_BOUND) && thread_to_handoff == THREAD_NULL) { - assert(kqr->kqr_thread != THREAD_NULL); - thread_t servicer = kqr->kqr_thread; + ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile, + TURNSTILE_NULL, TURNSTILE_WORKLOOPS); - thread_reference(servicer); - thread_to_handoff = servicer; + if (workq_locked) { + workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, + &kqwl->kqwl_request, kqwl->kqwl_owner, ts, + TURNSTILE_DELAYED_UPDATE); + if (!filt_wlturnstile_interlock_is_workq(kqwl)) { + /* + * if the interlock is no longer the workqueue lock, + * then we don't need to hold it anymore. + */ + workq_kern_threadreq_unlock(kqwl->kqwl_p); + workq_locked = false; } + } + if (!workq_locked) { + /* + * If the interlock is the workloop's, then it's our responsibility to + * call update_inheritor, so just do it. 
+ */ + filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE); + } - kqwl_req_unlock(kqwl); - - filt_wlunlock(kqwl); - - /* TODO: use continuation based blocking */ - - /* consume a refcount on thread_to_handoff, then thread_block() */ - wr = thread_handoff(thread_to_handoff); - thread_to_handoff = THREAD_NULL; - - filt_wllock(kqwl); - - /* clear waiting state (only one waiting thread - so no race) */ - assert(kn->kn_hook == current_thread()); + thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait); + waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(cont_args->knote), + THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER); - if (qos_index) { - kqwl_req_lock(kqwl); - assert(kqr->kqr_dsync_waiters > 0); - if (--kqr->kqr_dsync_waiters == 0) { - assert(kqr->kqr_dsync_waiters_qos); - kqworkloop_update_threads_qos(kqwl, - KQWL_UTQ_SET_SYNC_WAITERS_QOS, 0); - } - kqwl_req_unlock(kqwl); - } + if (workq_locked) { + workq_kern_threadreq_unlock(kqwl->kqwl_p); } - kn->kn_hook = NULL; - - switch (wr) { - case THREAD_AWAKENED: - return 0; - case THREAD_INTERRUPTED: - return EINTR; - case THREAD_RESTART: - return ECANCELED; - default: - panic("filt_wlattach: unexpected wait result %d", wr); - return EINVAL; + thread_t thread = kqwl->kqwl_owner ?: kqr->kqr_thread; + if (thread) { + thread_reference(thread); } + kq_req_unlock(kqwl); + + kevent_register_wait_block(ts, thread, knlc, filt_wlwait_continue, cont_args); } /* called in stackshot context to report the thread responsible for blocking this thread */ void kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread, - event64_t event, - thread_waitinfo_t *waitinfo) + event64_t event, thread_waitinfo_t *waitinfo) { - struct knote *kn = (struct knote*) event; + struct knote *kn = (struct knote *)event; assert(kdp_is_in_zone(kn, "knote zone")); assert(kn->kn_hook == thread); @@ -2407,9 +2275,7 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread, thread_t kqwl_owner = kqwl->kqwl_owner; thread_t 
servicer = kqr->kqr_thread; - if (kqwl_owner == WL_OWNER_SUSPENDED) { - waitinfo->owner = STACKSHOT_WAITOWNER_SUSPENDED; - } else if (kqwl_owner != THREAD_NULL) { + if (kqwl_owner != THREAD_NULL) { assert(kdp_is_in_zone(kqwl_owner, "threads")); waitinfo->owner = thread_tid(kqwl->kqwl_owner); @@ -2424,205 +2290,82 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread, } waitinfo->context = kqwl->kqwl_dynamicid; - - return; -} - -/* - * Takes kqueue locked, returns locked, may drop in the middle and/or block for a while - */ -static int -filt_wlpost_attach(struct knote *kn, struct kevent_internal_s *kev) -{ - struct kqueue *kq = knote_get_kq(kn); - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - int error = 0; - - if (kev->fflags & NOTE_WL_SYNC_WAIT) { - if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) { - filt_wllock(kqwl); - /* if the wake has already preposted, don't wait */ - if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0) - error = filt_wlwait(kqwl, kn, kev); - filt_wlunlock(kqwl); - knoteuse2kqlock(kq, kn, KNUSE_NONE); - } - } - return error; } static void filt_wldetach(__assert_only struct knote *kn) { assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP); - - /* - * Thread requests have nothing to detach. - * Sync waiters should have been aborted out - * and drop their refs before we could drop/ - * detach their knotes. 
- */ - assert(kn->kn_hook == NULL); -} - -static int -filt_wlevent( - __unused struct knote *kn, - __unused long hint) -{ - panic("filt_wlevent"); - return 0; + if (kn->kn_hook) { + kevent_register_wait_cleanup(kn); + } } static int -filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev) +filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev, + thread_qos_t *qos_index) { int new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK; int sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK; - int error = 0; + + if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) { + return EINVAL; + } + if (kev->fflags & NOTE_WL_UPDATE_QOS) { + if (kev->flags & EV_DELETE) { + return EINVAL; + } + if (sav_commands != NOTE_WL_THREAD_REQUEST) { + return EINVAL; + } + if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) { + return ERANGE; + } + } switch (new_commands) { case NOTE_WL_THREAD_REQUEST: /* thread requests can only update themselves */ - if (sav_commands != new_commands) - error = EINVAL; + if (sav_commands != NOTE_WL_THREAD_REQUEST) + return EINVAL; break; case NOTE_WL_SYNC_WAIT: if (kev->fflags & NOTE_WL_END_OWNERSHIP) - error = EINVAL; - /* FALLTHROUGH */ + return EINVAL; + goto sync_checks; + case NOTE_WL_SYNC_WAKE: - /* waits and wakes can update themselves or their counterparts */ + sync_checks: if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) - error = EINVAL; - if (kev->fflags & NOTE_WL_UPDATE_QOS) - error = EINVAL; + return EINVAL; if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) - error = EINVAL; - if (kev->flags & EV_DELETE) { - /* - * Really this is not supported: there is absolutely no reason - * whatsoever to want to fail the drop of a NOTE_WL_SYNC_WAIT knote. 
- */ - if (kev->ext[EV_EXTIDX_WL_ADDR] && kev->ext[EV_EXTIDX_WL_MASK]) { - error = EINVAL; - } - } + return EINVAL; break; default: - error = EINVAL; - } - if ((kev->flags & EV_DELETE) && (kev->fflags & NOTE_WL_DISCOVER_OWNER)) { - error = EINVAL; + return EINVAL; } - return error; + return 0; } static int -filt_wltouch( - struct knote *kn, - struct kevent_internal_s *kev) +filt_wltouch(struct knote *kn, struct kevent_internal_s *kev) { - struct kqueue *kq = knote_get_kq(kn); - int error = 0; - struct kqworkloop *kqwl; - - assert(kq->kq_state & KQ_WORKLOOP); - kqwl = (struct kqworkloop *)kq; + struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn); + thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED; - error = filt_wlvalidate_kev_flags(kn, kev); + int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index); if (error) { goto out; } - filt_wllock(kqwl); - - /* Make sure user and kernel are in agreement on important state */ - error = filt_wldebounce(kqwl, kev, 0); - if (error) { - error = filt_wlupdateowner(kqwl, kev, error, 0); - goto out_unlock; - } - - int new_command = kev->fflags & NOTE_WL_COMMANDS_MASK; - switch (new_command) { - case NOTE_WL_THREAD_REQUEST: - assert(kqwl->kqwl_request.kqr_qos_index != THREAD_QOS_UNSPECIFIED); - break; - - case NOTE_WL_SYNC_WAIT: - /* - * we need to allow waiting several times on the same knote because - * of EINTR. If it's already woken though, it won't block. - */ - break; - - case NOTE_WL_SYNC_WAKE: - if (kn->kn_sfflags & NOTE_WL_SYNC_WAKE) { - /* disallow waking the same knote twice */ - error = EALREADY; - goto out_unlock; - } - if (kn->kn_hook) { - thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook); - } - break; - - default: - error = EINVAL; - goto out_unlock; - } - - /* - * Save off any additional fflags/data we just accepted - * But only keep the last round of "update" bits we acted on which helps - * debugging a lot. 
- */ - kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK; - kn->kn_sfflags |= kev->fflags; - kn->kn_sdata = kev->data; - - kq_index_t qos_index = THREAD_QOS_UNSPECIFIED; - - if (kev->fflags & NOTE_WL_UPDATE_QOS) { - qos_t qos = pthread_priority_canonicalize(kev->qos, FALSE); - - if (kn->kn_qos != qos) { - qos_index = qos_index_from_qos(kn, qos, FALSE); - if (qos_index == THREAD_QOS_UNSPECIFIED) { - error = ERANGE; - goto out_unlock; - } - kqlock(kq); - if (kn->kn_status & KN_QUEUED) { - knote_dequeue(kn); - knote_set_qos_index(kn, qos_index); - knote_enqueue(kn); - knote_wakeup(kn); - } else { - knote_set_qos_index(kn, qos_index); - } - kn->kn_qos = qos; - kqunlock(kq); - } - } - - error = filt_wlupdateowner(kqwl, kev, 0, qos_index); + error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH); + filt_wlremember_last_update(kn, kev, error); if (error) { - goto out_unlock; - } - - if (new_command == NOTE_WL_SYNC_WAIT) { - /* if the wake has already preposted, don't wait */ - if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0) - error = filt_wlwait(kqwl, kn, kev); + goto out; } -out_unlock: - filt_wlremember_last_update(kqwl, kn, kev, error); - filt_wlunlock(kqwl); out: if (error) { if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) { @@ -2633,144 +2376,46 @@ filt_wltouch( kev->data = error; return 0; } + int command = kev->fflags & NOTE_WL_COMMANDS_MASK; + if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) { + return kevent_register_wait_prepare(kn, kev); + } /* Just touching the thread request successfully will fire it */ - return new_command == NOTE_WL_THREAD_REQUEST; + if (command == NOTE_WL_THREAD_REQUEST) { + if (kev->fflags & NOTE_WL_UPDATE_QOS) { + return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS; + } + return FILTER_ACTIVE; + } + return 0; } -static int -filt_wldrop_and_unlock( - struct knote *kn, - struct kevent_internal_s *kev) +static bool +filt_wlallow_drop(struct knote *kn, struct kevent_internal_s *kev) { - struct kqueue *kq = 
knote_get_kq(kn); - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - int error = 0, knoteuse_flags = KNUSE_NONE; - - kqlock_held(kq); + struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn); - assert(kev->flags & EV_DELETE); - assert(kq->kq_state & KQ_WORKLOOP); - - error = filt_wlvalidate_kev_flags(kn, kev); + int error = filt_wlvalidate_kev_flags(kn, kev, NULL); if (error) { goto out; } - if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) { - knoteuse_flags |= KNUSE_BOOST; - } - - /* take a usecount to allow taking the filt_wllock */ - if (!kqlock2knoteuse(kq, kn, knoteuse_flags)) { - /* knote is being dropped already */ - error = EINPROGRESS; - goto out; - } - - filt_wllock(kqwl); - - /* - * Make sure user and kernel are in agreement on important state - * - * Userland will modify bits to cause this to fail for the touch / drop - * race case (when a drop for a thread request quiescing comes in late after - * the workloop has been woken up again). - */ - error = filt_wldebounce(kqwl, kev, 0); - - if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) { - /* knote is no longer alive */ - error = EINPROGRESS; - goto out_unlock; - } - - if (!error && (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) && kn->kn_inuse) { - /* - * There is a concurrent drop or touch happening, we can't resolve this, - * userland has to redrive. - * - * The race we're worried about here is the following: - * - * f_touch | f_drop_and_unlock - * ------------------------+-------------------------------------------- - * | kqlock() - * | kqlock2knoteuse() - * | filt_wllock() - * | debounces successfully - * kqlock() | - * kqlock2knoteuse | - * filt_wllock() | - * | knoteuse2kqlock() - * | filt_wlunlock() - * | kqlock2knotedrop() - * debounces successfully | - * filt_wlunlock() | - * caller WAKES f_drop | - * | performs drop, but f_touch should have won - * - * So if the usecount is not 0 here, we need to wait for it to drop and - * redrive the whole logic (including looking up the knote again). 
- */ - filt_wlunlock(kqwl); - knoteusewait(kq, kn); - return ERESTART; - } - - /* - * If error is 0 this will set kqr_qos_index to THREAD_QOS_UNSPECIFIED - * - * If error is 0 or ESTALE this may drop ownership and cause a thread - * request redrive, however the kqlock is held which prevents f_process() to - * run until we did the drop for real. - */ - error = filt_wlupdateowner(kqwl, kev, error, 0); + error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP); + filt_wlremember_last_update(kn, kev, error); if (error) { - goto out_unlock; - } - - if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) == - NOTE_WL_SYNC_WAIT) { - /* - * When deleting a SYNC_WAIT knote that hasn't been woken up - * explicitly, issue a wake up. - */ - kn->kn_sfflags |= NOTE_WL_SYNC_WAKE; - if (kn->kn_hook) { - thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook); - } + goto out; } -out_unlock: - filt_wlremember_last_update(kqwl, kn, kev, error); - filt_wlunlock(kqwl); - out: - if (error == 0) { - /* If nothing failed, do the regular knote drop. */ - if (kqlock2knotedrop(kq, kn)) { - knote_drop(kn, current_proc()); - } else { - error = EINPROGRESS; + if (error) { + if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) { + return false; } - } else { - kqunlock(kq); - } - if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) { - error = 0; - } - if (error == EINPROGRESS) { - /* - * filt_wlprocess() makes sure that no event can be delivered for - * NOTE_WL_THREAD_REQUEST knotes once a drop is happening, and - * NOTE_WL_SYNC_* knotes are never fired. - * - * It means that EINPROGRESS is about a state that userland cannot - * observe for this filter (an event being delivered concurrently from - * a drop), so silence the error. 
- */ - error = 0; + kev->flags |= EV_ERROR; + kev->data = error; + return false; } - return error; + return true; } static int @@ -2779,66 +2424,87 @@ filt_wlprocess( __unused struct filt_process_s *data, struct kevent_internal_s *kev) { - struct kqueue *kq = knote_get_kq(kn); - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct kqrequest *kqr = &kqwl->kqwl_request; + struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn); int rc = 0; - assert(kq->kq_state & KQ_WORKLOOP); - - /* only thread requests should get here */ assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST); - if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) { - filt_wllock(kqwl); - assert(kqr->kqr_qos_index != THREAD_QOS_UNSPECIFIED); - if (kqwl->kqwl_owner) { + + filt_wllock(kqwl); + + if (kqwl->kqwl_owner) { + /* + * userspace sometimes due to events being + * delivered but not triggering a drain session can cause a process + * of the thread request knote. + * + * When that happens, the automatic deactivation due to process + * would swallow the event, so we have to activate the knote again. + */ + kqlock(kqwl); + knote_activate(kn); + kqunlock(kqwl); + } else { +#if DEBUG || DEVELOPMENT + if (kevent_debug_flags() & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) { /* - * userspace sometimes due to events being - * delivered but not triggering a drain session can cause a process - * of the thread request knote. - * - * When that happens, the automatic deactivation due to process - * would swallow the event, so we have to activate the knote again. 
+ * see src/queue_internal.h in libdispatch */ - kqlock(kq); - knote_activate(kn); - kqunlock(kq); - } else if (kqr->kqr_qos_index) { -#if DEBUG || DEVELOPMENT +#define DISPATCH_QUEUE_ENQUEUED 0x1ull user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]); task_t t = current_task(); uint64_t val; if (addr && task_is_active(t) && !task_is_halting(t) && copyin_word(addr, &val, sizeof(val)) == 0 && val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 && - (val >> 48) != 0 && (val >> 48) != 0xffff) { + (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) { panic("kevent: workloop %#016llx is not enqueued " "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)", - kn->kn_udata, kn, val, - kn->kn_ext[EV_EXTIDX_WL_VALUE]); + kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]); } -#endif - *kev = kn->kn_kevent; - kev->fflags = kn->kn_sfflags; - kev->data = kn->kn_sdata; - kev->qos = kn->kn_qos; - rc = 1; } - filt_wlunlock(kqwl); +#endif + *kev = kn->kn_kevent; + kev->fflags = kn->kn_sfflags; + kev->data = kn->kn_sdata; + kev->qos = kn->kn_qos; + rc |= FILTER_ACTIVE; + } + + filt_wlunlock(kqwl); + + if (rc & FILTER_ACTIVE) { + workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request); } return rc; } +SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = { + .f_extended_codes = true, + .f_attach = filt_wlattach, + .f_detach = filt_wldetach, + .f_event = filt_badevent, + .f_touch = filt_wltouch, + .f_process = filt_wlprocess, + .f_allow_drop = filt_wlallow_drop, + .f_post_register_wait = filt_wlpost_register_wait, +}; + #pragma mark kevent / knotes /* * JMM - placeholder for not-yet-implemented filters */ +static int +filt_badevent(struct knote *kn, long hint) +{ + panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint); + return 0; +} + static int filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev) { - kn->kn_flags |= EV_ERROR; - kn->kn_data = ENOTSUP; + knote_set_error(kn, ENOTSUP); return 0; } @@ -2849,7 
+2515,6 @@ kqueue_alloc(struct proc *p, unsigned int flags) struct kqueue *kq = NULL; int policy; void *hook = NULL; - uint64_t kq_addr_offset; if (flags & KEVENT_FLAG_WORKQ) { struct kqworkq *kqwq; @@ -2865,16 +2530,29 @@ kqueue_alloc(struct proc *p, unsigned int flags) kqwq->kqwq_state = KQ_WORKQ; for (i = 0; i < KQWQ_NBUCKETS; i++) { - TAILQ_INIT(&kq->kq_queue[i]); + TAILQ_INIT(&kqwq->kqwq_queue[i]); } - for (i = 0; i < KQWQ_NQOS; i++) { + for (i = 0; i < KQWQ_NBUCKETS; i++) { + if (i != KQWQ_QOS_MANAGER) { + /* + * Because of how the bucketized system works, we mix overcommit + * sources with not overcommit: each time we move a knote from + * one bucket to the next due to overrides, we'd had to track + * overcommitness, and it's really not worth it in the workloop + * enabled world that track this faithfully. + * + * Incidentally, this behaves like the original manager-based + * kqwq where event delivery always happened (hence is + * "overcommit") + */ + kqwq->kqwq_request[i].kqr_state |= KQR_THOVERCOMMIT; + } kqwq->kqwq_request[i].kqr_qos_index = i; + TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed); } - lck_spin_init(&kqwq->kqwq_reqlock, kq_lck_grp, kq_lck_attr); policy = SYNC_POLICY_FIFO; hook = (void *)kqwq; - } else if (flags & KEVENT_FLAG_WORKLOOP) { struct kqworkloop *kqwl; int i; @@ -2887,41 +2565,36 @@ kqueue_alloc(struct proc *p, unsigned int flags) kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC; kqwl->kqwl_retains = 1; /* donate a retain to creator */ + kqwl->kqwl_request.kqr_state = KQR_WORKLOOP; kq = &kqwl->kqwl_kqueue; for (i = 0; i < KQWL_NBUCKETS; i++) { - TAILQ_INIT(&kq->kq_queue[i]); + TAILQ_INIT(&kqwl->kqwl_queue[i]); } TAILQ_INIT(&kqwl->kqwl_request.kqr_suppressed); - lck_spin_init(&kqwl->kqwl_reqlock, kq_lck_grp, kq_lck_attr); lck_mtx_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr); policy = SYNC_POLICY_FIFO; - if (flags & KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD) { - policy |= SYNC_POLICY_PREPOST; - kq->kq_state |= KQ_NO_WQ_THREAD; - } else { 
- hook = (void *)kqwl; - } - + hook = (void *)kqwl; } else { struct kqfile *kqf; - + kqf = (struct kqfile *)zalloc(kqfile_zone); if (kqf == NULL) return NULL; kq = &kqf->kqf_kqueue; bzero(kqf, sizeof (struct kqfile)); - TAILQ_INIT(&kq->kq_queue[0]); + TAILQ_INIT(&kqf->kqf_queue); TAILQ_INIT(&kqf->kqf_suppressed); - + policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST; } waitq_set_init(&kq->kq_wqs, policy, NULL, hook); lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr); + lck_spin_init(&kq->kq_reqlock, kq_lck_grp, kq_lck_attr); kq->kq_p = p; if (fdp->fd_knlistsize < 0) { @@ -2931,19 +2604,16 @@ kqueue_alloc(struct proc *p, unsigned int flags) proc_fdunlock(p); } - kq_addr_offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS); - /* Assert that the address can be pointer compacted for use with knote */ - assert(kq_addr_offset < (uint64_t)(1ull << KNOTE_KQ_BITSIZE)); return (kq); } /* * knotes_dealloc - detach all knotes for the process and drop them * - * Called with proc_fdlock held. - * Returns with it locked. - * May drop it temporarily. - * Process is in such a state that it will not try to allocate + * Called with proc_fdlock held. + * Returns with it locked. + * May drop it temporarily. + * Process is in such a state that it will not try to allocate * any more knotes during this process (stopped for exit or exec). */ void @@ -2962,10 +2632,7 @@ knotes_dealloc(proc_t p) kq = knote_get_kq(kn); kqlock(kq); proc_fdunlock(p); - /* drop it ourselves or wait */ - if (kqlock2knotedrop(kq, kn)) { - knote_drop(kn, p); - } + knote_drop(kq, kn, NULL); proc_fdlock(p); } } @@ -2985,10 +2652,7 @@ knotes_dealloc(proc_t p) kq = knote_get_kq(kn); kqlock(kq); knhash_unlock(p); - /* drop it ourselves or wait */ - if (kqlock2knotedrop(kq, kn)) { - knote_drop(kn, p); - } + knote_drop(kq, kn, NULL); knhash_lock(p); } } @@ -3006,11 +2670,43 @@ knotes_dealloc(proc_t p) proc_fdlock(p); } +/* + * kqworkloop_invalidate + * + * Invalidate ownership of a workloop. 
+ * + * This is meant to be used so that any remnant of overrides and ownership + * information is dropped before a kqworkloop can no longer be found in the + * global hash table and have ghost workloop ownership left over. + * + * Possibly returns a thread to deallocate in a safe context. + */ +static thread_t +kqworkloop_invalidate(struct kqworkloop *kqwl) +{ + thread_t cur_owner = kqwl->kqwl_owner; + + assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed)); + if (cur_owner) { + /* + * If the kqueue had an owner that prevented the thread request to + * go through, then no unbind happened, and we may have lingering + * overrides to drop. + */ + if (kqworkloop_owner_override(kqwl) != THREAD_QOS_UNSPECIFIED) { + thread_drop_ipc_override(cur_owner); + } + thread_ends_owning_workloop(cur_owner); + kqwl->kqwl_owner = THREAD_NULL; + } + + return cur_owner; +} /* * kqueue_dealloc - detach all knotes from a kqueue and free it * - * We walk each list looking for knotes referencing this + * We walk each list looking for knotes referencing this * this kqueue. If we find one, we try to drop it. But * if we fail to get a drop reference, that will wait * until it is dropped. So, we can just restart again @@ -3039,7 +2735,13 @@ kqueue_dealloc(struct kqueue *kq) p = kq->kq_p; fdp = p->p_fd; + /* + * Workloops are refcounted by their knotes, so there's no point + * spending a lot of time under these locks just to deallocate one. 
+ */ if ((kq->kq_state & KQ_WORKLOOP) == 0) { + KNOTE_LOCK_CTX(knlc); + proc_fdlock(p); for (i = 0; i < fdp->fd_knlistsize; i++) { kn = SLIST_FIRST(&fdp->fd_knlist[i]); @@ -3047,9 +2749,8 @@ kqueue_dealloc(struct kqueue *kq) if (kq == knote_get_kq(kn)) { kqlock(kq); proc_fdunlock(p); - /* drop it ourselves or wait */ - if (kqlock2knotedrop(kq, kn)) { - knote_drop(kn, p); + if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { + knote_drop(kq, kn, &knlc); } proc_fdlock(p); /* start over at beginning of list */ @@ -3059,6 +2760,7 @@ kqueue_dealloc(struct kqueue *kq) kn = SLIST_NEXT(kn, kn_link); } } + knhash_lock(p); proc_fdunlock(p); @@ -3069,9 +2771,8 @@ kqueue_dealloc(struct kqueue *kq) if (kq == knote_get_kq(kn)) { kqlock(kq); knhash_unlock(p); - /* drop it ourselves or wait */ - if (kqlock2knotedrop(kq, kn)) { - knote_drop(kn, p); + if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { + knote_drop(kq, kn, &knlc); } knhash_lock(p); /* start over at beginning of list */ @@ -3087,28 +2788,17 @@ kqueue_dealloc(struct kqueue *kq) if (kq->kq_state & KQ_WORKLOOP) { struct kqworkloop *kqwl = (struct kqworkloop *)kq; - struct kqrequest *kqr = &kqwl->kqwl_request; - thread_t cur_owner = kqwl->kqwl_owner; + thread_t cur_owner = kqworkloop_invalidate(kqwl); - assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed)); - if (filt_wlowner_is_valid(cur_owner)) { - /* - * If the kqueue had an owner that prevented the thread request to - * go through, then no unbind happened, and we may have lingering - * overrides to drop. 
- */ - if (kqr->kqr_dsync_owner_qos != THREAD_QOS_UNSPECIFIED) { - thread_drop_ipc_override(cur_owner); - kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED; - } + if (cur_owner) thread_deallocate(cur_owner); - if (kqr->kqr_owner_override_is_sync) { - thread_drop_sync_ipc_override(cur_owner); - kqr->kqr_owner_override_is_sync = 0; - } - thread_ends_owning_workloop(cur_owner); - thread_deallocate(cur_owner); - kqwl->kqwl_owner = THREAD_NULL; + if (kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) { + struct turnstile *ts; + turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, &ts); + turnstile_cleanup(); + turnstile_deallocate(ts); + } else { + assert(kqwl->kqwl_turnstile == NULL); } } @@ -3118,23 +2808,18 @@ kqueue_dealloc(struct kqueue *kq) */ waitq_set_deinit(&kq->kq_wqs); lck_spin_destroy(&kq->kq_lock, kq_lck_grp); + lck_spin_destroy(&kq->kq_reqlock, kq_lck_grp); if (kq->kq_state & KQ_WORKQ) { - struct kqworkq *kqwq = (struct kqworkq *)kq; - - lck_spin_destroy(&kqwq->kqwq_reqlock, kq_lck_grp); - zfree(kqworkq_zone, kqwq); + zfree(kqworkq_zone, (struct kqworkq *)kq); } else if (kq->kq_state & KQ_WORKLOOP) { struct kqworkloop *kqwl = (struct kqworkloop *)kq; assert(kqwl->kqwl_retains == 0); - lck_spin_destroy(&kqwl->kqwl_reqlock, kq_lck_grp); lck_mtx_destroy(&kqwl->kqwl_statelock, kq_lck_grp); zfree(kqworkloop_zone, kqwl); } else { - struct kqfile *kqf = (struct kqfile *)kq; - - zfree(kqfile_zone, kqf); + zfree(kqfile_zone, (struct kqfile *)kq); } } @@ -3159,18 +2844,16 @@ kqueue_retain(struct kqueue *kq) #define KQUEUE_MIGHT_BE_LAST_REF 1 static inline int -kqueue_release(struct kqueue *kq, __assert_only int possibly_last) +kqueue_release(kqueue_t kqu, __assert_only int possibly_last) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - - if ((kq->kq_state & KQ_DYNAMIC) == 0) { + if ((kqu.kq->kq_state & KQ_DYNAMIC) == 0) { return 0; } - assert(kq->kq_state & KQ_WORKLOOP); /* for now */ - uint32_t refs = OSDecrementAtomic(&kqwl->kqwl_retains); + 
assert(kqu.kq->kq_state & KQ_WORKLOOP); /* for now */ + uint32_t refs = OSDecrementAtomic(&kqu.kqwl->kqwl_retains); if (__improbable(refs == 0)) { - panic("kq(%p) over-release", kq); + panic("kq(%p) over-release", kqu.kq); } if (refs == 1) { assert(possibly_last); @@ -3219,7 +2902,7 @@ kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval) static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p, - unsigned int flags) + unsigned int flags) { int advance; int error; @@ -3271,7 +2954,7 @@ kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p kevp->data = kev64.data; kevp->ext[0] = kev64.ext[0]; kevp->ext[1] = kev64.ext[1]; - + } else { struct kevent_qos_s kevqos; @@ -3301,13 +2984,13 @@ kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p, - unsigned int flags) + unsigned int flags) { user_addr_t addr = *addrp; int advance; int error; - /* + /* * fully initialize the differnt output event structure * types from the internal kevent (and some universal * defaults for fields not represented in the internal @@ -3321,7 +3004,7 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * advance = sizeof (kev64); bzero(&kev64, advance); - + /* * deal with the special case of a user-supplied * value of (uintptr_t)-1. 
@@ -3367,7 +3050,7 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * error = copyout((caddr_t)&kev64, addr, advance); } else { struct kevent_qos_s kevqos; - + advance = sizeof (struct kevent_qos_s); if (flags & KEVENT_FLAG_STACK_EVENTS) { addr -= advance; @@ -3397,10 +3080,11 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc * } static int -kevent_get_data_size(struct proc *p, - uint64_t data_available, - unsigned int flags, - user_size_t *residp) +kevent_get_data_size( + struct proc *p, + uint64_t data_available, + unsigned int flags, + user_size_t *residp) { user_size_t resid; int error = 0; @@ -3427,10 +3111,11 @@ kevent_get_data_size(struct proc *p, } static int -kevent_put_data_size(struct proc *p, - uint64_t data_available, - unsigned int flags, - user_size_t resid) +kevent_put_data_size( + struct proc *p, + uint64_t data_available, + unsigned int flags, + user_size_t resid) { int error = 0; @@ -3453,7 +3138,6 @@ kevent_put_data_size(struct proc *p, * * assume we inherit a use count on the kq fileglob. 
*/ - __attribute__((noreturn)) static void kevent_continue(__unused struct kqueue *kq, void *data, int error) @@ -3553,13 +3237,13 @@ kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval) retval); } -int -kevent_qos_internal(struct proc *p, int fd, +int +kevent_qos_internal(struct proc *p, int fd, user_addr_t changelist, int nchanges, user_addr_t eventlist, int nevents, user_addr_t data_out, user_size_t *data_available, - unsigned int flags, - int32_t *retval) + unsigned int flags, + int32_t *retval) { return kevent_internal(p, (kqueue_id_t)fd, NULL, @@ -3594,8 +3278,8 @@ kevent_id_internal(struct proc *p, kqueue_id_t *id, user_addr_t changelist, int nchanges, user_addr_t eventlist, int nevents, user_addr_t data_out, user_size_t *data_available, - unsigned int flags, - int32_t *retval) + unsigned int flags, + int32_t *retval) { return kevent_internal(p, *id, id, @@ -3607,7 +3291,7 @@ kevent_id_internal(struct proc *p, kqueue_id_t *id, NULL, retval); } - + static int kevent_get_timeout(struct proc *p, user_addr_t utimeout, @@ -3805,13 +3489,16 @@ kqueue_hash_lookup(struct proc *p, kqueue_id_t id) } static inline void -kqueue_release_last(struct proc *p, struct kqueue *kq) +kqueue_release_last(struct proc *p, kqueue_t kqu) { + struct kqueue *kq = kqu.kq; if (kq->kq_state & KQ_DYNAMIC) { kqhash_lock(p); if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) { + thread_t cur_owner = kqworkloop_invalidate(kqu.kqwl); kqueue_hash_remove(p, kq); kqhash_unlock(p); + if (cur_owner) thread_deallocate(cur_owner); kqueue_dealloc(kq); } else { kqhash_unlock(p); @@ -3819,93 +3506,118 @@ kqueue_release_last(struct proc *p, struct kqueue *kq) } } -static struct kqueue * -kevent_get_bound_kq(__assert_only struct proc *p, thread_t thread, - unsigned int kev_flags, unsigned int kq_flags) +/* + * kqworkloops_dealloc - rebalance retains on kqworkloops created with + * scheduling parameters + * + * Called with proc_fdlock held. + * Returns with it locked. 
+ * Process is in such a state that it will not try to allocate + * any more knotes during this process (stopped for exit or exec). + */ +void +kqworkloops_dealloc(proc_t p) { - struct kqueue *kq; - struct uthread *ut = get_bsdthread_info(thread); + struct filedesc *fdp = p->p_fd; + struct kqlist *list; + struct kqworkloop *kqwl, *kqwln; + struct kqlist tofree; + int i; + + if (!(fdp->fd_flags & FD_WORKLOOP)) { + return; + } + + SLIST_INIT(&tofree); + + kqhash_lock(p); + assert(fdp->fd_kqhashmask != 0); - assert(p == get_bsdthreadtask_info(thread)); + for (i = 0; i <= (int)fdp->fd_kqhashmask; i++) { + list = &fdp->fd_kqhash[i]; + SLIST_FOREACH_SAFE(kqwl, list, kqwl_hashlink, kqwln) { + /* + * kqworkloops that have scheduling parameters have an + * implicit retain from kqueue_workloop_ctl that needs + * to be balanced on process exit. + */ + assert(kqwl->kqwl_params); + SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink); + SLIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink); + } + } - if (!(ut->uu_kqueue_flags & kev_flags)) - return NULL; + kqhash_unlock(p); - kq = ut->uu_kqueue_bound; - if (!kq) - return NULL; + SLIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) { + struct kqueue *kq = (struct kqueue *)kqwl; + __assert_only bool released; + released = kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF); + assert(released); + kqueue_dealloc(kq); + } +} - if (!(kq->kq_state & kq_flags)) - return NULL; +static struct kqueue * +kevent_get_bound_kqworkloop(thread_t thread) +{ + struct uthread *ut = get_bsdthread_info(thread); + struct kqrequest *kqr = ut->uu_kqr_bound; - return kq; + return kqr ? 
(struct kqueue *)kqr_kqworkloop(kqr) : NULL; } static int -kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct fileproc **fpp, int *fdp, struct kqueue **kqp) +kevent_get_kq(struct proc *p, kqueue_id_t id, workq_threadreq_param_t *trp, + unsigned int flags, struct fileproc **fpp, int *fdp, + struct kqueue **kqp) { struct filedesc *descp = p->p_fd; struct fileproc *fp = NULL; - struct kqueue *kq; + struct kqueue *kq = NULL; int fd = 0; int error = 0; + thread_t th = current_thread(); + + assert(!trp || (flags & KEVENT_FLAG_WORKLOOP)); /* Was the workloop flag passed? Then it is for sure only a workloop */ if (flags & KEVENT_FLAG_DYNAMIC_KQUEUE) { assert(flags & KEVENT_FLAG_WORKLOOP); + assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)); + kq = kevent_get_bound_kqworkloop(th); + + /* + * when kevent_id_internal is called from within the + * kernel, and the passed 'id' value is '-1' then we + * look for the currently bound workloop kq. + */ if (id == (kqueue_id_t)-1 && (flags & KEVENT_FLAG_KERNEL) && (flags & KEVENT_FLAG_WORKLOOP)) { - assert(is_workqueue_thread(current_thread())); - - /* - * when kevent_id_internal is called from within the - * kernel, and the passed 'id' value is '-1' then we - * look for the currently bound workloop kq. - * - * Until pthread kext avoids calling in to kevent_id_internal - * for threads whose fulfill is canceled, calling in unbound - * can't be fatal. 
- */ - kq = kevent_get_bound_kq(p, current_thread(), - KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP); - if (kq) { - kqueue_retain(kq); - } else { - struct uthread *ut = get_bsdthread_info(current_thread()); - - /* If thread is unbound due to cancel, just return an error */ - if (ut->uu_kqueue_flags == KEVENT_FLAG_WORKLOOP_CANCELED) { - ut->uu_kqueue_flags = 0; - error = ECANCELED; - } else { - panic("Unbound thread called kevent_internal with id=-1" - " uu_kqueue_flags:0x%x, uu_kqueue_bound:%p", - ut->uu_kqueue_flags, ut->uu_kqueue_bound); - } + if (!is_workqueue_thread(th) || !kq) { + return EINVAL; } - *fpp = NULL; - *fdp = 0; - *kqp = kq; - return error; + kqueue_retain(kq); + goto out; + } + + if (id == 0 || id == (kqueue_id_t)-1) { + return EINVAL; } /* try shortcut on kq lookup for bound threads */ - kq = kevent_get_bound_kq(p, current_thread(), KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP); if (kq != NULL && ((struct kqworkloop *)kq)->kqwl_dynamicid == id) { if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) { - error = EEXIST; - kq = NULL; - goto out; + return EEXIST; } /* retain a reference while working with this kq. 
*/ assert(kq->kq_state & KQ_DYNAMIC); kqueue_retain(kq); - error = 0; goto out; } @@ -3916,39 +3628,45 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro kqhash_unlock(p); if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST) { - error = ENOENT; - goto out; + return ENOENT; } struct kqueue *alloc_kq; alloc_kq = kqueue_alloc(p, flags); - if (alloc_kq) { - kqhash_lock(p); - kqueue_hash_init_if_needed(p); - kq = kqueue_hash_lookup(p, id); - if (kq == NULL) { - /* insert our new one */ - kq = alloc_kq; - kqueue_hash_insert(p, id, kq); - kqhash_unlock(p); - } else { - /* lost race, retain existing workloop */ - kqueue_retain(kq); - kqhash_unlock(p); - kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF); - kqueue_dealloc(alloc_kq); - } - } else { - error = ENOMEM; - goto out; + if (!alloc_kq) { + return ENOMEM; + } + + kqhash_lock(p); + kqueue_hash_init_if_needed(p); + kq = kqueue_hash_lookup(p, id); + if (kq == NULL) { + /* insert our new one */ + kq = alloc_kq; + if (trp) { + struct kqworkloop *kqwl = (struct kqworkloop *)kq; + kqwl->kqwl_params = trp->trp_value; + } + kqueue_hash_insert(p, id, kq); + kqhash_unlock(p); + } else if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) { + /* lost race and caller wants an error */ + kqhash_unlock(p); + kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF); + kqueue_dealloc(alloc_kq); + return EEXIST; + } else { + /* lost race, retain existing workloop */ + kqueue_retain(kq); + kqhash_unlock(p); + kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF); + kqueue_dealloc(alloc_kq); } } else { if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) { kqhash_unlock(p); - kq = NULL; - error = EEXIST; - goto out; + return EEXIST; } /* retain a reference while working with this kq. */ @@ -3956,7 +3674,7 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro kqueue_retain(kq); kqhash_unlock(p); } - + } else if (flags & KEVENT_FLAG_WORKQ) { /* must already exist for bound threads. 
*/ if (flags & KEVENT_FLAG_KERNEL) { @@ -3972,8 +3690,9 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro kq = descp->fd_wqkqueue; if (kq == NULL) { struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ); - if (alloc_kq == NULL) + if (alloc_kq == NULL) { return ENOMEM; + } knhash_lock(p); if (descp->fd_wqkqueue == NULL) { @@ -3996,13 +3715,13 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro if (fp != NULL) fp_drop(p, fd, fp, 0); return error; - } + } out: *fpp = fp; *fdp = fd; *kqp = kq; - + return error; } @@ -4048,7 +3767,7 @@ kevent_exit_on_workloop_ownership_leak(thread_t thread) proc_t p = current_proc(); struct filedesc *fdp = p->p_fd; kqueue_id_t workloop_id = 0; - os_reason_t reason; + os_reason_t reason = OS_REASON_NULL; mach_vm_address_t addr; uint32_t reason_size; @@ -4067,7 +3786,6 @@ kevent_exit_on_workloop_ownership_leak(thread_t thread) } } kqhash_unlock(p); - assert(workloop_id); reason = os_reason_create(OS_REASON_LIBSYSTEM, OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK); @@ -4082,21 +3800,26 @@ kevent_exit_on_workloop_ownership_leak(thread_t thread) goto out; } - struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor; + if (workloop_id) { + struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor; - if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID, - sizeof(workloop_id), &addr) == KERN_SUCCESS) { - kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id)); - } + if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID, + sizeof(workloop_id), &addr) == KERN_SUCCESS) { + kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id)); + } - uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id); - if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO, - sizeof(serial_no), &addr) == KERN_SUCCESS) { - kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no)); + uint64_t serial_no = kevent_workloop_serial_no_copyin(p, 
workloop_id); + if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO, + sizeof(serial_no), &addr) == KERN_SUCCESS) { + kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no)); + } } - out: #if DEVELOPMENT || DEBUG + if (kevent_debug_flags() & KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK) { + panic("thread %p in task %p is leaked workloop 0x%016llx ownership", + thread, p->task, workloop_id); + } psignal_try_thread_with_reason(p, thread, SIGABRT, reason); return 0; #else @@ -4105,139 +3828,8 @@ kevent_exit_on_workloop_ownership_leak(thread_t thread) #endif } - -static int -kevent_servicer_detach_preflight(thread_t thread, unsigned int flags, struct kqueue *kq) -{ - int error = 0; - struct kqworkloop *kqwl; - struct uthread *ut; - struct kqrequest *kqr; - - if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP)) - return EINVAL; - - /* only kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */ - if (!(kq->kq_state & KQ_NO_WQ_THREAD)) - return EINVAL; - - /* allow detach only on not wq threads */ - if (is_workqueue_thread(thread)) - return EINVAL; - - /* check that the current thread is bound to the requested wq */ - ut = get_bsdthread_info(thread); - if (ut->uu_kqueue_bound != kq) - return EINVAL; - - kqwl = (struct kqworkloop *)kq; - kqwl_req_lock(kqwl); - kqr = &kqwl->kqwl_request; - - /* check that the wq is bound to the thread */ - if ((kqr->kqr_state & KQR_BOUND) == 0 || (kqr->kqr_thread != thread)) - error = EINVAL; - - kqwl_req_unlock(kqwl); - - return error; -} - -static void -kevent_servicer_detach_thread(struct proc *p, kqueue_id_t id, thread_t thread, - unsigned int flags, struct kqueue *kq) -{ - struct kqworkloop *kqwl; - struct uthread *ut; - - assert((flags & KEVENT_FLAG_WORKLOOP) && (kq->kq_state & KQ_WORKLOOP)); - - /* allow detach only on not wqthreads threads */ - assert(!is_workqueue_thread(thread)); - - /* only kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace 
can have attached threads */ - assert(kq->kq_state & KQ_NO_WQ_THREAD); - - /* check that the current thread is bound to the requested kq */ - ut = get_bsdthread_info(thread); - assert(ut->uu_kqueue_bound == kq); - - kqwl = (struct kqworkloop *)kq; - - kqlock(kq); - - /* unbind the thread. - * unbind itself checks if still processing and ends it. - */ - kqworkloop_unbind_thread(kqwl, thread, flags); - - kqunlock(kq); - - kevent_put_kq(p, id, NULL, kq); - - return; -} - -static int -kevent_servicer_attach_thread(thread_t thread, unsigned int flags, struct kqueue *kq) -{ - int error = 0; - struct kqworkloop *kqwl; - struct uthread *ut; - struct kqrequest *kqr; - - if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP)) - return EINVAL; - - /* only kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads*/ - if (!(kq->kq_state & KQ_NO_WQ_THREAD)) - return EINVAL; - - /* allow attach only on not wqthreads */ - if (is_workqueue_thread(thread)) - return EINVAL; - - /* check that the thread is not already bound */ - ut = get_bsdthread_info(thread); - if (ut->uu_kqueue_bound != NULL) - return EINVAL; - - assert(ut->uu_kqueue_flags == 0); - - kqlock(kq); - kqwl = (struct kqworkloop *)kq; - kqwl_req_lock(kqwl); - kqr = &kqwl->kqwl_request; - - /* check that the kqueue is not already bound */ - if (kqr->kqr_state & (KQR_BOUND | KQR_THREQUESTED | KQR_DRAIN)) { - error = EINVAL; - goto out; - } - - assert(kqr->kqr_thread == NULL); - assert((kqr->kqr_state & KQR_PROCESSING) == 0); - - kqr->kqr_state |= KQR_THREQUESTED; - kqr->kqr_qos_index = THREAD_QOS_UNSPECIFIED; - kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED; - kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED; - kqr->kqr_owner_override_is_sync = 0; - - kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP); - - /* get a ref on the wlkq on behalf of the attached thread */ - kqueue_retain(kq); - -out: - kqwl_req_unlock(kqwl); - kqunlock(kq); - - return error; -} - 
-static inline -boolean_t kevent_args_requesting_events(unsigned int flags, int nevents) +static inline boolean_t +kevent_args_requesting_events(unsigned int flags, int nevents) { return (!(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0); } @@ -4248,55 +3840,51 @@ kevent_internal(struct proc *p, user_addr_t changelist, int nchanges, user_addr_t ueventlist, int nevents, user_addr_t data_out, uint64_t data_available, - unsigned int flags, + unsigned int flags, user_addr_t utimeout, kqueue_continue_t continuation, int32_t *retval) { - struct _kevent *cont_args; uthread_t ut; struct kqueue *kq; struct fileproc *fp = NULL; int fd = 0; struct kevent_internal_s kev; - int error, noutputs; + int error, noutputs, register_rc; + bool needs_end_processing = false; struct timeval atv; user_size_t data_size; user_size_t data_resid; thread_t thread = current_thread(); + KNOTE_LOCK_CTX(knlc); /* Don't allow user-space threads to process output events from the workq kqs */ if (((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ) && kevent_args_requesting_events(flags, nevents)) return EINVAL; + if (flags & KEVENT_FLAG_PARKING) { + if (!kevent_args_requesting_events(flags, nevents) || id != (kqueue_id_t)-1) + return EINVAL; + } + /* restrict dynamic kqueue allocation to workloops (for now) */ if ((flags & (KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP)) == KEVENT_FLAG_DYNAMIC_KQUEUE) return EINVAL; if ((flags & (KEVENT_FLAG_WORKLOOP)) && (flags & (KEVENT_FLAG_WORKQ))) - return EINVAL; + return EINVAL; - if (flags & (KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH | KEVENT_FLAG_WORKLOOP_SERVICER_DETACH | - KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST | KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD)) { + if (flags & (KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) { /* allowed only on workloops when calling kevent_id from user-space */ if (!(flags & KEVENT_FLAG_WORKLOOP) || (flags & KEVENT_FLAG_KERNEL) || !(flags & 
KEVENT_FLAG_DYNAMIC_KQUEUE)) return EINVAL; - - /* cannot attach and detach simultaneously*/ - if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) && (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH)) - return EINVAL; - - /* cannot ask for events and detach */ - if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) && kevent_args_requesting_events(flags, nevents)) - return EINVAL; - } /* prepare to deal with stack-wise allocation of out events */ if (flags & KEVENT_FLAG_STACK_EVENTS) { - int scale = ((flags & KEVENT_FLAG_LEGACY32) ? + int scale = ((flags & KEVENT_FLAG_LEGACY32) ? (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) : sizeof(struct user32_kevent)) : ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) : @@ -4308,47 +3896,56 @@ kevent_internal(struct proc *p, error = kevent_get_timeout(p, utimeout, flags, &atv); if (error) return error; - + /* copyin initial value of data residual from data_available */ error = kevent_get_data_size(p, data_available, flags, &data_size); if (error) return error; /* get the kq we are going to be working on */ - error = kevent_get_kq(p, id, flags, &fp, &fd, &kq); + error = kevent_get_kq(p, id, NULL, flags, &fp, &fd, &kq); +#if CONFIG_WORKLOOP_DEBUG + ut = (uthread_t)get_bsdthread_info(thread); + UU_KEVENT_HISTORY_WRITE_ENTRY(ut, { + .uu_kqid = id, + .uu_kq = error ? 
NULL : kq, + .uu_error = error, + .uu_nchanges = nchanges, + .uu_nevents = nevents, + .uu_flags = flags, + }); +#endif // CONFIG_WORKLOOP_DEBUG if (error) return error; /* only bound threads can receive events on workloops */ - if ((flags & KEVENT_FLAG_WORKLOOP) && kevent_args_requesting_events(flags, nevents)) { - ut = (uthread_t)get_bsdthread_info(thread); - if (ut->uu_kqueue_bound != kq) { - error = EXDEV; - goto out; - } + if (flags & KEVENT_FLAG_WORKLOOP) { + struct kqworkloop *kqwl = (struct kqworkloop *)kq; + struct kqrequest *kqr = &kqwl->kqwl_request; - } + assert(kq->kq_state & KQ_WORKLOOP); - /* attach the current thread if necessary */ - if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) { - error = kevent_servicer_attach_thread(thread, flags, kq); - if (error) - goto out; - } - else { - /* before processing events and committing to the system call, return an error if the thread cannot be detached when requested */ - if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) { - error = kevent_servicer_detach_preflight(thread, flags, kq); - if (error) + if (kevent_args_requesting_events(flags, nevents)) { + if (kq != kevent_get_bound_kqworkloop(thread)) { + error = EXDEV; goto out; + } + + kq_req_lock(kqwl); + /* + * Disable the R2K notification while doing a register, if the + * caller wants events too, we don't want the AST to be set if we + * will process these events soon. + */ + kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED; + needs_end_processing = true; + kq_req_unlock(kq); + } + + if (id_out) { + *id_out = kqwl->kqwl_dynamicid; } - } - if (id_out && kq && (flags & KEVENT_FLAG_WORKLOOP)) { - assert(kq->kq_state & KQ_WORKLOOP); - struct kqworkloop *kqwl; - kqwl = (struct kqworkloop *)kq; - *id_out = kqwl->kqwl_dynamicid; } /* register all the change requests the user provided... 
*/ @@ -4361,11 +3958,43 @@ kevent_internal(struct proc *p, /* Make sure user doesn't pass in any system flags */ kev.flags &= ~EV_SYSFLAGS; - kevent_register(kq, &kev, p); + register_rc = kevent_register(kq, &kev, &knlc); + if (register_rc & FILTER_REGISTER_WAIT) { + kqlock_held(kq); + + // f_post_register_wait is meant to call a continuation and not to + // return, which is why we don't support FILTER_REGISTER_WAIT if + // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that + // waits isn't the last. + // + // It is implementable, but not used by any userspace code at the + // moment, so for now return ENOTSUP if someone tries to do it. + if (nchanges == 1 && nevents >= 1 && (flags & KEVENT_FLAG_ERROR_EVENTS)) { + struct _kevent_register *cont_args; + /* store the continuation/completion data in the uthread */ + ut = (uthread_t)get_bsdthread_info(thread); + cont_args = &ut->uu_save.uus_kevent_register; + cont_args->kev = kev; + cont_args->kq = kq; + cont_args->fp = fp; + cont_args->fd = fd; + cont_args->ueventlist = ueventlist; + cont_args->flags = flags; + cont_args->retval = retval; + cont_args->eventcount = nevents; + cont_args->eventout = noutputs; + knote_fops(cont_args->knote)->f_post_register_wait(ut, &knlc, cont_args); + panic("f_post_register_wait returned (kev: %p)", &kev); + } + + kev.flags |= EV_ERROR; + kev.data = ENOTSUP; + knote_unlock(kq, knlc.knlc_knote, &knlc, KNOTE_KQ_UNLOCK); + } - if (nevents > 0 && - ((kev.flags & EV_ERROR) || (kev.flags & EV_RECEIPT))) { - if (kev.flags & EV_RECEIPT) { + // keep in sync with kevent_register_wait_return() + if (nevents > 0 && (kev.flags & (EV_ERROR|EV_RECEIPT))) { + if ((kev.flags & EV_ERROR) == 0) { kev.flags |= EV_ERROR; kev.data = 0; } @@ -4386,9 +4015,10 @@ kevent_internal(struct proc *p, /* process pending events */ if (nevents > 0 && noutputs == 0 && error == 0) { + struct _kevent *cont_args; /* store the continuation/completion data in the uthread */ ut = (uthread_t)get_bsdthread_info(thread); 
- cont_args = &ut->uu_kevent.ss_kevent; + cont_args = &ut->uu_save.uus_kevent; cont_args->fp = fp; cont_args->fd = fd; cont_args->retval = retval; @@ -4402,6 +4032,11 @@ kevent_internal(struct proc *p, cont_args->process_data.fp_data_size = data_size; cont_args->process_data.fp_data_resid = data_size; + /* + * kqworkloop_end_processing() will happen at the end of kqueue_scan() + */ + needs_end_processing = false; + error = kqueue_scan(kq, kevent_callback, continuation, cont_args, &cont_args->process_data, @@ -4418,13 +4053,16 @@ kevent_internal(struct proc *p, } } - /* detach the current thread if necessary */ - if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) { - assert(fp == NULL); - kevent_servicer_detach_thread(p, id, thread, flags, kq); - } - out: + if (__improbable(needs_end_processing)) { + /* + * If we didn't through kqworkloop_end_processing(), + * we need to do it here. + */ + kqlock(kq); + kqworkloop_end_processing((struct kqworkloop *)kq, 0, 0); + kqunlock(kq); + } kevent_put_kq(p, id, fp, kq); /* don't restart after signals... */ @@ -4446,7 +4084,7 @@ kevent_internal(struct proc *p, */ static int kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, - void *data) + void *data) { struct _kevent *cont_args; int error; @@ -4493,6 +4131,122 @@ kevent_description(struct kevent_internal_s *kevp, char *s, size_t n) return (s); } +static int +kevent_register_validate_priority(struct kqueue *kq, struct knote *kn, + struct kevent_internal_s *kev) +{ + /* We don't care about the priority of a disabled or deleted knote */ + if (kev->flags & (EV_DISABLE | EV_DELETE)) { + return 0; + } + + if (kq->kq_state & KQ_WORKLOOP) { + /* + * Workloops need valid priorities with a QOS (excluding manager) for + * any enabled knote. + * + * When it is pre-existing, just make sure it has a valid QoS as + * kevent_register() will not use the incoming priority (filters who do + * have the responsibility to validate it again, see filt_wltouch). 
+ * + * If the knote is being made, validate the incoming priority. + */ + if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) { + return ERANGE; + } + } + + return 0; +} + +/* + * Prepare a filter for waiting after register. + * + * The f_post_register_wait hook will be called later by kevent_register() + * and should call kevent_register_wait_block() + */ +static int +kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev) +{ + thread_t thread = current_thread(); + struct uthread *uth = get_bsdthread_info(thread); + + assert(knote_fops(kn)->f_extended_codes); + + if (kn->kn_hook == NULL) { + thread_reference(thread); + kn->kn_hook = thread; + } else if (kn->kn_hook != thread) { + /* + * kn_hook may be set from a previous aborted wait + * However, it has to be from the same thread. + */ + kev->flags |= EV_ERROR; + kev->data = EXDEV; + return 0; + } + + uth->uu_save.uus_kevent_register.knote = kn; + return FILTER_REGISTER_WAIT; +} + +/* + * Cleanup a kevent_register_wait_prepare() effect for threads that have been + * aborted instead of properly woken up with thread_wakeup_thread(). + */ +static void +kevent_register_wait_cleanup(struct knote *kn) +{ + thread_t thread = kn->kn_hook; + kn->kn_hook = NULL; + thread_deallocate(thread); +} + +/* + * Must be called at the end of a f_post_register_wait call from a filter. + */ +static void +kevent_register_wait_block(struct turnstile *ts, thread_t thread, + struct knote_lock_ctx *knlc, thread_continue_t cont, + struct _kevent_register *cont_args) +{ + knote_unlock(cont_args->kq, cont_args->knote, knlc, KNOTE_KQ_UNLOCK); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + cont_args->handoff_thread = thread; + thread_handoff_parameter(thread, cont, cont_args); +} + +/* + * Called by Filters using a f_post_register_wait to return from their wait. 
+ */ +static void +kevent_register_wait_return(struct _kevent_register *cont_args) +{ + struct kqueue *kq = cont_args->kq; + proc_t p = kq->kq_p; + struct kevent_internal_s *kev = &cont_args->kev; + int error = 0; + + if (cont_args->handoff_thread) { + thread_deallocate(cont_args->handoff_thread); + } + + if (kev->flags & (EV_ERROR|EV_RECEIPT)) { + if ((kev->flags & EV_ERROR) == 0) { + kev->flags |= EV_ERROR; + kev->data = 0; + } + error = kevent_copyout(kev, &cont_args->ueventlist, p, cont_args->flags); + if (error == 0) cont_args->eventout++; + } + + kevent_put_kq(p, cont_args->fd, cont_args->fp, kq); + if (error == 0) { + *cont_args->retval = cont_args->eventout; + } + unix_syscall_return(error); +} + /* * kevent_register - add a new event to a kqueue * @@ -4507,17 +4261,15 @@ kevent_description(struct kevent_internal_s *kevp, char *s, size_t n) * caller holds a reference on the kqueue */ -void +int kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, - __unused struct proc *ctxp) + struct knote_lock_ctx *knlc) { struct proc *p = kq->kq_p; const struct filterops *fops; struct knote *kn = NULL; - int result = 0; - int error = 0; + int result = 0, error = 0; unsigned short kev_flags = kev->flags; - int knoteuse_flags = KNUSE_NONE; if (kev->filter < 0) { if (kev->filter + EVFILT_SYSCOUNT < 0) { @@ -4532,7 +4284,7 @@ kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, /* restrict EV_VANISHED to adding udata-specific dispatch kevents */ if ((kev->flags & EV_VANISHED) && - (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) { + (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) { error = EINVAL; goto out; } @@ -4557,279 +4309,249 @@ kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, } restart: - /* find the matching knote from the fd tables/hashes */ kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p); + error = kevent_register_validate_priority(kq, kn, kev); + result = 0; + if (error) { + 
goto out; + } - if (kn == NULL) { - if (kev->flags & EV_ADD) { - struct fileproc *knote_fp = NULL; + if (kn == NULL && (kev->flags & EV_ADD) == 0) { + /* + * No knote found, EV_ADD wasn't specified + */ - /* grab a file reference for the new knote */ - if (fops->f_isfd) { - if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) { - goto out; - } - } + if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) && + (kq->kq_state & KQ_WORKLOOP)) { + /* + * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete + * that doesn't care about ENOENT, so just pretend the deletion + * happened. + */ + } else { + error = ENOENT; + } + goto out; + + } else if (kn == NULL) { + /* + * No knote found, need to attach a new one (attach) + */ + + struct fileproc *knote_fp = NULL; - kn = knote_alloc(); - if (kn == NULL) { - error = ENOMEM; - if (knote_fp != NULL) - fp_drop(p, kev->ident, knote_fp, 0); + /* grab a file reference for the new knote */ + if (fops->f_isfd) { + if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) { goto out; } + } - kn->kn_fp = knote_fp; - knote_set_kq(kn, kq); - kqueue_retain(kq); /* retain a kq ref */ - kn->kn_filtid = ~kev->filter; - kn->kn_inuse = 1; /* for f_attach() */ - kn->kn_status = KN_ATTACHING | KN_ATTACHED; - - /* was vanish support requested */ - if (kev->flags & EV_VANISHED) { - kev->flags &= ~EV_VANISHED; - kn->kn_status |= KN_REQVANISH; - } + kn = knote_alloc(); + if (kn == NULL) { + error = ENOMEM; + if (knote_fp != NULL) + fp_drop(p, kev->ident, knote_fp, 0); + goto out; + } - /* snapshot matching/dispatching protcol flags into knote */ - if (kev->flags & EV_DISPATCH) - kn->kn_status |= KN_DISPATCH; - if (kev->flags & EV_UDATA_SPECIFIC) - kn->kn_status |= KN_UDATA_SPECIFIC; + kn->kn_fp = knote_fp; + kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq; + kqueue_retain(kq); /* retain a kq ref */ + kn->kn_filtid = ~kev->filter; + kn->kn_status = KN_ATTACHING | KN_ATTACHED; - /* - * copy the kevent state into knote - * protocol is 
that fflags and data - * are saved off, and cleared before - * calling the attach routine. - */ - kn->kn_kevent = *kev; - kn->kn_sfflags = kev->fflags; - kn->kn_sdata = kev->data; - kn->kn_fflags = 0; - kn->kn_data = 0; + /* was vanish support requested */ + if (kev->flags & EV_VANISHED) { + kev->flags &= ~EV_VANISHED; + kn->kn_status |= KN_REQVANISH; + } - /* invoke pthread kext to convert kevent qos to thread qos */ - knote_canonicalize_kevent_qos(kn); - knote_set_qos_index(kn, qos_index_from_qos(kn, kn->kn_qos, FALSE)); + /* snapshot matching/dispatching protcol flags into knote */ + if (kev->flags & EV_DISPATCH) + kn->kn_status |= KN_DISPATCH; + if (kev->flags & EV_UDATA_SPECIFIC) + kn->kn_status |= KN_UDATA_SPECIFIC; + if (kev->flags & EV_DISABLE) + kn->kn_status |= KN_DISABLED; - /* before anyone can find it */ - if (kev->flags & EV_DISABLE) { - /* - * do this before anyone can find it, - * this can't call knote_disable() because it expects having - * the kqlock held - */ - kn->kn_status |= KN_DISABLED; - } + /* + * copy the kevent state into knote + * protocol is that fflags and data + * are saved off, and cleared before + * calling the attach routine. 
+ */ + kn->kn_kevent = *kev; + kn->kn_sfflags = kev->fflags; + kn->kn_sdata = kev->data; + kn->kn_fflags = 0; + kn->kn_data = 0; + knote_reset_priority(kn, kev->qos); - /* Add the knote for lookup thru the fd table */ - error = kq_add_knote(kq, kn, kev, p, &knoteuse_flags); - if (error) { - (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); - knote_free(kn); - if (knote_fp != NULL) - fp_drop(p, kev->ident, knote_fp, 0); - - if (error == ERESTART) { - error = 0; - goto restart; - } - goto out; + /* Add the knote for lookup thru the fd table */ + error = kq_add_knote(kq, kn, knlc, p); + if (error) { + (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); + knote_free(kn); + if (knote_fp != NULL) + fp_drop(p, kev->ident, knote_fp, 0); + + if (error == ERESTART) { + goto restart; } + goto out; + } + + /* fp reference count now applies to knote */ - /* fp reference count now applies to knote */ - /* rwlock boost is now held */ + /* + * we can't use filter_call() because f_attach can change the filter ops + * for a filter that supports f_extended_codes, so we need to reload + * knote_fops() and not use `fops`. + */ + result = fops->f_attach(kn, kev); + if (result && !knote_fops(kn)->f_extended_codes) { + result = FILTER_ACTIVE; + } - /* call filter attach routine */ - result = fops->f_attach(kn, kev); + kqlock(kq); + if (kn->kn_flags & EV_ERROR) { /* - * Trade knote use count for kq lock. - * Cannot be dropped because we held - * KN_ATTACHING throughout. + * Failed to attach correctly, so drop. */ - knoteuse2kqlock(kq, kn, KNUSE_STEAL_DROP | knoteuse_flags); + kn->kn_status &= ~(KN_ATTACHED | KN_ATTACHING); + error = kn->kn_data; + knote_drop(kq, kn, knlc); + result = 0; + goto out; + } - if (kn->kn_flags & EV_ERROR) { - /* - * Failed to attach correctly, so drop. - * All other possible users/droppers - * have deferred to us. Save the error - * to return to our caller. 
- */ - kn->kn_status &= ~KN_ATTACHED; - kn->kn_status |= KN_DROPPING; - error = kn->kn_data; - kqunlock(kq); - knote_drop(kn, p); - goto out; - } - - /* end "attaching" phase - now just attached */ - kn->kn_status &= ~KN_ATTACHING; - - if (kn->kn_status & KN_DROPPING) { - /* - * Attach succeeded, but someone else - * deferred their drop - now we have - * to do it for them. - */ - kqunlock(kq); - knote_drop(kn, p); - goto out; - } + /* + * end "attaching" phase - now just attached + * + * Mark the thread request overcommit, if appropos + * + * If the attach routine indicated that an + * event is already fired, activate the knote. + */ + kn->kn_status &= ~KN_ATTACHING; + knote_set_qos_overcommit(kn); - /* Mark the thread request overcommit - if appropos */ - knote_set_qos_overcommit(kn); + if (result & FILTER_ACTIVE) { + if (result & FILTER_ADJUST_EVENT_QOS_BIT) + knote_adjust_qos(kq, kn, result); + knote_activate(kn); + } - /* - * If the attach routine indicated that an - * event is already fired, activate the knote. - */ - if (result) - knote_activate(kn); + } else if (!knote_lock(kq, kn, knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { - if (knote_fops(kn)->f_post_attach) { - error = knote_fops(kn)->f_post_attach(kn, kev); - if (error) { - kqunlock(kq); - goto out; - } - } + /* + * The knote was dropped while we were waiting for the lock, + * we need to re-evaluate entirely + */ - } else { - if ((kev_flags & (EV_ADD | EV_DELETE)) == (EV_ADD | EV_DELETE) && - (kq->kq_state & KQ_WORKLOOP)) { - /* - * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete - * that doesn't care about ENOENT, so just pretend the deletion - * happened. - */ - } else { - error = ENOENT; - } - goto out; - } + goto restart; - } else { - /* existing knote: kqueue lock already taken by kq_find_knote_and_kq_lock */ + } else if (kev->flags & EV_DELETE) { + /* + * Deletion of a knote (drop) + * + * If the filter wants to filter drop events, let it do so. 
+ * + * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote, + * we must wait for the knote to be re-enabled (unless it is being + * re-enabled atomically here). + */ - if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) { - /* - * The knote is not in a stable state, wait for that - * transition to complete and then redrive the lookup. - */ - knoteusewait(kq, kn); - goto restart; - } + if (knote_fops(kn)->f_allow_drop) { + bool drop; - if (kev->flags & EV_DELETE) { + kqunlock(kq); + drop = knote_fops(kn)->f_allow_drop(kn, kev); + kqlock(kq); - /* - * If attempting to delete a disabled dispatch2 knote, - * we must wait for the knote to be re-enabled (unless - * it is being re-enabled atomically here). - */ - if ((kev->flags & EV_ENABLE) == 0 && - (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) == - (KN_DISPATCH2 | KN_DISABLED)) { - kn->kn_status |= KN_DEFERDELETE; - kqunlock(kq); - error = EINPROGRESS; - } else if (knote_fops(kn)->f_drop_and_unlock) { - /* - * The filter has requested to handle EV_DELETE events - * - * ERESTART means the kevent has to be re-evaluated - */ - error = knote_fops(kn)->f_drop_and_unlock(kn, kev); - if (error == ERESTART) { - error = 0; - goto restart; - } - } else if (kqlock2knotedrop(kq, kn)) { - /* standard/default EV_DELETE path */ - knote_drop(kn, p); - } else { - /* - * The kqueue is unlocked, it's not being - * dropped, and kqlock2knotedrop returned 0: - * this means that someone stole the drop of - * the knote from us. - */ - error = EINPROGRESS; - } - goto out; + if (!drop) goto out_unlock; } - /* - * If we are re-enabling a deferred-delete knote, - * just enable it now and avoid calling the - * filter touch routine (it has delivered its - * last event already). 
- */ - if ((kev->flags & EV_ENABLE) && - (kn->kn_status & KN_DEFERDELETE)) { - assert(kn->kn_status & KN_DISABLED); - knote_activate(kn); - knote_enable(kn); - kqunlock(kq); - goto out; + if ((kev->flags & EV_ENABLE) == 0 && + (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) == + (KN_DISPATCH2 | KN_DISABLED)) { + kn->kn_status |= KN_DEFERDELETE; + error = EINPROGRESS; + goto out_unlock; } - /* - * If we are disabling, do it before unlocking and - * calling the touch routine (so no processing can - * see the new kevent state before the disable is - * applied). - */ - if (kev->flags & EV_DISABLE) - knote_disable(kn); + knote_drop(kq, kn, knlc); + goto out; + } else { /* - * Convert the kqlock to a use reference on the - * knote so we can call the filter touch routine. + * Regular update of a knote (touch) + * + * Call touch routine to notify filter of changes in filter values + * (and to re-determine if any events are fired). + * + * If the knote is in defer-delete, avoid calling the filter touch + * routine (it has delivered its last event already). + * + * If the touch routine had no failure, + * apply the requested side effects to the knote. */ - if (knoteuse_needs_boost(kn, kev)) { - knoteuse_flags |= KNUSE_BOOST; - } - if (kqlock2knoteuse(kq, kn, knoteuse_flags)) { - /* - * Call touch routine to notify filter of changes - * in filter values (and to re-determine if any - * events are fired). - */ - result = knote_fops(kn)->f_touch(kn, kev); - /* Get the kq lock back (don't defer droppers). 
*/ - if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) { - kqunlock(kq); - goto out; + if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) { + if (kev->flags & EV_ENABLE) { + result = FILTER_ACTIVE; } + } else { + kqunlock(kq); + result = filter_call(knote_fops(kn), f_touch(kn, kev)); + kqlock(kq); + } - /* Handle errors during touch routine */ - if (kev->flags & EV_ERROR) { - error = kev->data; - kqunlock(kq); - goto out; + if (kev->flags & EV_ERROR) { + result = 0; + } else { + /* accept new kevent state */ + if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) + kn->kn_udata = kev->udata; + if (kev->flags & EV_DISABLE) + knote_disable(kn); + if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) + knote_dequeue(kn); + if ((result & FILTER_UPDATE_REQ_QOS) && + kev->qos && kev->qos != kn->kn_qos) { + knote_reset_priority(kn, kev->qos); } - - /* Activate it if the touch routine said to */ - if (result) + if (result & FILTER_ACTIVE) { + thread_qos_t qos; + if (result & FILTER_ADJUST_EVENT_QOS_BIT) { + if (knote_should_apply_qos_override(kq, kn, result, &qos)) { + knote_apply_qos_override(kn, qos); + } + } knote_activate(kn); + } + if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) { + if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) { + knote_wakeup(kn); + } + } + if (kev->flags & EV_ENABLE) + knote_enable(kn); } - - /* Enable the knote if called for */ - if (kev->flags & EV_ENABLE) - knote_enable(kn); - } - /* still have kqlock held and knote is valid */ - kqunlock(kq); +out_unlock: + if ((result & FILTER_REGISTER_WAIT) == 0) { + /* + * When the filter asked for a post-register wait, + * we leave the knote and kqueue locked for kevent_register() + * to call the filter's f_post_register_wait hook. 
+ */ + knote_unlock(kq, kn, knlc, KNOTE_KQ_UNLOCK); + } out: /* output local errors through the kevent */ @@ -4837,9 +4559,9 @@ kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, kev->flags |= EV_ERROR; kev->data = error; } + return result; } - /* * knote_process - process a triggered event * @@ -4861,16 +4583,17 @@ kevent_register(struct kqueue *kq, struct kevent_internal_s *kev, * kqueue locked on entry and exit - but may be dropped */ static int -knote_process(struct knote *kn, +knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data, - struct filt_process_s *process_data, - struct proc *p) + struct filt_process_s *process_data) { struct kevent_internal_s kev; struct kqueue *kq = knote_get_kq(kn); - int result = 0; + KNOTE_LOCK_CTX(knlc); + int result = FILTER_ACTIVE; int error = 0; + bool drop = false; bzero(&kev, sizeof(kev)); @@ -4897,110 +4620,93 @@ knote_process(struct knote *kn, kn->kn_status | (kn->kn_id << 32), kn->kn_filtid); } + if ((kn->kn_status & KN_DROPPING) || + !knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) { + /* + * When the knote is dropping or has dropped, + * then there's nothing we want to process. + */ + return EJUSTRETURN; + } + /* * For deferred-drop or vanished events, we just create a fake * event to acknowledge end-of-life. Otherwise, we call the * filter's process routine to snapshot the kevent state under * the filter's locking protocol. + * + * suppress knotes to avoid returning the same event multiple times in + * a single call. */ + knote_suppress(kn); + if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) { /* create fake event */ kev.filter = kn->kn_filter; kev.ident = kn->kn_id; - kev.qos = kn->kn_qos; - kev.flags = (kn->kn_status & KN_DEFERDELETE) ? - EV_DELETE : EV_VANISHED; + kev.flags = (kn->kn_status & KN_DEFERDELETE) ? 
EV_DELETE : EV_VANISHED; kev.flags |= (EV_DISPATCH2 | EV_ONESHOT); kev.udata = kn->kn_udata; - result = 1; - - knote_suppress(kn); } else { - int flags = KNUSE_NONE; /* deactivate - so new activations indicate a wakeup */ knote_deactivate(kn); - /* suppress knotes to avoid returning the same event multiple times in a single call. */ - knote_suppress(kn); - - if (knoteuse_needs_boost(kn, NULL)) { - flags |= KNUSE_BOOST; - } - /* convert lock to a knote use reference */ - if (!kqlock2knoteuse(kq, kn, flags)) - panic("dropping knote found on queue\n"); - - /* call out to the filter to process with just a ref */ - result = knote_fops(kn)->f_process(kn, process_data, &kev); - if (result) flags |= KNUSE_STEAL_DROP; + kqunlock(kq); + result = filter_call(knote_fops(kn), f_process(kn, process_data, &kev)); + kqlock(kq); + } - /* - * convert our reference back to a lock. accept drop - * responsibility from others if we've committed to - * delivering event data. - */ - if (!knoteuse2kqlock(kq, kn, flags)) { - /* knote dropped */ - kn = NULL; + /* + * Determine how to dispatch the knote for future event handling. + * not-fired: just return (do not callout, leave deactivated). + * One-shot: If dispatch2, enter deferred-delete mode (unless this is + * is the deferred delete event delivery itself). Otherwise, + * drop it. + * Dispatch: don't clear state, just mark it disabled. + * Cleared: just leave it deactivated. + * Others: re-activate as there may be more events to handle. + * This will not wake up more handlers right now, but + * at the completion of handling events it may trigger + * more handler threads (TODO: optimize based on more than + * just this one event being detected by the filter). + */ + if ((result & FILTER_ACTIVE) == 0) { + if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) { + /* + * Stay active knotes should not be unsuppressed or we'd create an + * infinite loop. 
+ * + * Some knotes (like EVFILT_WORKLOOP) can be reactivated from + * within f_process() but that doesn't necessarily make them + * ready to process, so we should leave them be. + * + * For other knotes, since we will not return an event, + * there's no point keeping the knote suppressed. + */ + knote_unsuppress(kn); } + knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); + return EJUSTRETURN; } - if (kn != NULL) { - /* - * Determine how to dispatch the knote for future event handling. - * not-fired: just return (do not callout, leave deactivated). - * One-shot: If dispatch2, enter deferred-delete mode (unless this is - * is the deferred delete event delivery itself). Otherwise, - * drop it. - * stolendrop:We took responsibility for someone else's drop attempt. - * treat this just like one-shot and prepare to turn it back - * into a deferred delete if required. - * Dispatch: don't clear state, just mark it disabled. - * Cleared: just leave it deactivated. - * Others: re-activate as there may be more events to handle. - * This will not wake up more handlers right now, but - * at the completion of handling events it may trigger - * more handler threads (TODO: optimize based on more than - * just this one event being detected by the filter). 
- */ + if (result & FILTER_ADJUST_EVENT_QOS_BIT) + knote_adjust_qos(kq, kn, result); + kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override); - if (result == 0) - return (EJUSTRETURN); - - if ((kev.flags & EV_ONESHOT) || (kn->kn_status & KN_STOLENDROP)) { - if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) { - /* defer dropping non-delete oneshot dispatch2 events */ - kn->kn_status |= KN_DEFERDELETE; - knote_disable(kn); - - /* if we took over another's drop clear those flags here */ - if (kn->kn_status & KN_STOLENDROP) { - assert(kn->kn_status & KN_DROPPING); - /* - * the knote will be dropped when the - * deferred deletion occurs - */ - kn->kn_status &= ~(KN_DROPPING|KN_STOLENDROP); - } - } else if (kn->kn_status & KN_STOLENDROP) { - /* We now own the drop of the knote. */ - assert(kn->kn_status & KN_DROPPING); - knote_unsuppress(kn); - kqunlock(kq); - knote_drop(kn, p); - kqlock(kq); - } else if (kqlock2knotedrop(kq, kn)) { - /* just EV_ONESHOT, _not_ DISPATCH2 */ - knote_drop(kn, p); - kqlock(kq); - } - } else if (kn->kn_status & KN_DISPATCH) { - /* disable all dispatch knotes */ + if (kev.flags & EV_ONESHOT) { + if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) { + /* defer dropping non-delete oneshot dispatch2 events */ + kn->kn_status |= KN_DEFERDELETE; knote_disable(kn); - } else if ((kev.flags & EV_CLEAR) == 0) { - /* re-activate in case there are more events */ - knote_activate(kn); + } else { + drop = true; } + } else if (kn->kn_status & KN_DISPATCH) { + /* disable all dispatch knotes */ + knote_disable(kn); + } else if ((kev.flags & EV_CLEAR) == 0) { + /* re-activate in case there are more events */ + knote_activate(kn); } /* @@ -5008,80 +4714,137 @@ knote_process(struct knote *kn, * If we have to detach and drop the knote, do * it while we have the kq unlocked. 
*/ - if (result) { - kqunlock(kq); - error = (callback)(kq, &kev, callback_data); - kqlock(kq); + if (drop) { + knote_drop(kq, kn, &knlc); + } else { + knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK); } - return (error); -} + if (kev.flags & EV_VANISHED) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED), + kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32), + kn->kn_filtid); + } + + error = (callback)(kq, &kev, callback_data); + kqlock(kq); + return error; +} /* - * Return 0 to indicate that processing should proceed, - * -1 if there is nothing to process. - * - * Called with kqueue locked and returns the same way, - * but may drop lock temporarily. + * Returns -1 if the kqueue was unbound and processing should not happen */ +#define KQWQAE_BEGIN_PROCESSING 1 +#define KQWQAE_END_PROCESSING 2 +#define KQWQAE_UNBIND 3 static int -kqworkq_begin_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags) +kqworkq_acknowledge_events(struct kqworkq *kqwq, struct kqrequest *kqr, + int kevent_flags, int kqwqae_op) { - struct kqrequest *kqr; - thread_t self = current_thread(); - __assert_only struct uthread *ut = get_bsdthread_info(self); - - assert(kqwq->kqwq_state & KQ_WORKQ); - assert(qos_index < KQWQ_NQOS); + thread_qos_t old_override = THREAD_QOS_UNSPECIFIED; + thread_t thread = kqr->kqr_thread; + struct knote *kn; + int rc = 0; + bool seen_stayactive = false, unbind; - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START, - flags, qos_index); + kqlock_held(&kqwq->kqwq_kqueue); - kqwq_req_lock(kqwq); + if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) { + /* + * Return suppressed knotes to their original state. + * For workq kqueues, suppressed ones that are still + * truly active (not just forced into the queue) will + * set flags we check below to see if anything got + * woken up. 
+ */ + while ((kn = TAILQ_FIRST(&kqr->kqr_suppressed)) != NULL) { + assert(kn->kn_status & KN_SUPPRESSED); + knote_unsuppress(kn); + if (kn->kn_status & KN_STAYACTIVE) { + seen_stayactive = true; + } + } + } - kqr = kqworkq_get_request(kqwq, qos_index); + kq_req_lock(kqwq); - /* manager skips buckets that haven't asked for its help */ - if (flags & KEVENT_FLAG_WORKQ_MANAGER) { +#if DEBUG || DEVELOPMENT + thread_t self = current_thread(); + struct uthread *ut = get_bsdthread_info(self); - /* If nothing for manager to do, just return */ - if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END, - 0, kqr->kqr_state); - kqwq_req_unlock(kqwq); - return -1; + assert(kqr->kqr_state & KQR_THREQUESTED); + assert(kqr->kqr_thread == self); + assert(ut->uu_kqr_bound == kqr); +#endif // DEBUG || DEVELOPMENT + + if (kqwqae_op == KQWQAE_UNBIND) { + unbind = true; + } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) { + unbind = false; + } else if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive) { + /* + * When we unsuppress stayactive knotes, for the kind that are hooked + * through select, we need to process once before we can assert there's + * no event pending. Hence we can't unbind during BEGIN PROCESSING. + */ + unbind = false; + } else { + unbind = ((kqr->kqr_state & KQR_WAKEUP) == 0); + } + if (unbind) { + old_override = kqworkq_unbind_locked(kqwq, kqr, thread); + rc = -1; + /* + * request a new thread if we didn't process the whole queue or real events + * have happened (not just putting stay-active events back). 
+ */ + if (kqr->kqr_state & KQR_WAKEUP) { + kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, + kqr->kqr_qos_index, 0); } - /* bind manager thread from this time on */ - kqworkq_bind_thread_impl(kqwq, qos_index, self, flags); + } - } else { - /* We should already be bound to this kqueue */ - assert(kqr->kqr_state & KQR_BOUND); - assert(kqr->kqr_thread == self); - assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq); - assert(ut->uu_kqueue_qos_index == qos_index); - assert((ut->uu_kqueue_flags & flags) == ut->uu_kqueue_flags); + if (rc == 0) { + /* + * Reset wakeup bit to notice events firing while we are processing, + * as we cannot rely on the bucket queue emptiness because of stay + * active knotes. + */ + kqr->kqr_state &= ~KQR_WAKEUP; } - /* - * we should have been requested to be here - * and nobody else should still be processing - */ - assert(kqr->kqr_state & KQR_WAKEUP); - assert(kqr->kqr_state & KQR_THREQUESTED); - assert((kqr->kqr_state & KQR_PROCESSING) == 0); + kq_req_unlock(kqwq); - /* reset wakeup trigger to catch new events after we start processing */ - kqr->kqr_state &= ~KQR_WAKEUP; + if (old_override) { + thread_drop_ipc_override(thread); + } - /* convert to processing mode */ - kqr->kqr_state |= KQR_PROCESSING; + return rc; +} + +/* + * Return 0 to indicate that processing should proceed, + * -1 if there is nothing to process. + * + * Called with kqueue locked and returns the same way, + * but may drop lock temporarily. 
+ */ +static int +kqworkq_begin_processing(struct kqworkq *kqwq, struct kqrequest *kqr, + int kevent_flags) +{ + int rc = 0; + + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START, + 0, kqr->kqr_qos_index); + + rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags, + KQWQAE_BEGIN_PROCESSING); KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END, - kqr_thread_id(kqr), kqr->kqr_state); + thread_tid(kqr->kqr_thread), kqr->kqr_state); - kqwq_req_unlock(kqwq); - return 0; + return rc; } static inline bool @@ -5102,10 +4865,11 @@ kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl) return false; } -static void -kqworkloop_acknowledge_events(struct kqworkloop *kqwl, boolean_t clear_ipc_override) +static thread_qos_t +kqworkloop_acknowledge_events(struct kqworkloop *kqwl) { struct kqrequest *kqr = &kqwl->kqwl_request; + kq_index_t qos = THREAD_QOS_UNSPECIFIED; struct knote *kn, *tmp; kqlock_held(&kqwl->kqwl_kqueue); @@ -5119,48 +4883,112 @@ kqworkloop_acknowledge_events(struct kqworkloop *kqwl, boolean_t clear_ipc_overr if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) && (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 && (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) { - /* - * When called from unbind, clear the sync ipc override on the knote - * for events which are delivered. 
- */ - if (clear_ipc_override) { - knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE); - } + qos = MAX(qos, knote_get_qos_override_index(kn)); continue; } knote_unsuppress(kn); } + + return qos; } static int -kqworkloop_begin_processing(struct kqworkloop *kqwl, - __assert_only unsigned int flags) +kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags) { struct kqrequest *kqr = &kqwl->kqwl_request; struct kqueue *kq = &kqwl->kqwl_kqueue; + thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override; + thread_t thread = kqr->kqr_thread; + int rc = 0, op = KQWL_UTQ_NONE; kqlock_held(kq); KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START, - kqwl->kqwl_dynamicid, flags, 0); - - kqwl_req_lock(kqwl); + kqwl->kqwl_dynamicid, 0, 0); /* nobody else should still be processing */ - assert((kqr->kqr_state & KQR_PROCESSING) == 0); assert((kq->kq_state & KQ_PROCESSING) == 0); - kqr->kqr_state |= KQR_PROCESSING | KQR_R2K_NOTIF_ARMED; kq->kq_state |= KQ_PROCESSING; - kqwl_req_unlock(kqwl); + if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) { + op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE; + } + + if (kevent_flags & KEVENT_FLAG_PARKING) { + /* + * When "parking" we want to process events and if no events are found + * unbind. + * + * However, non overcommit threads sometimes park even when they have + * more work so that the pool can narrow. For these, we need to unbind + * early, so that calling kqworkloop_update_threads_qos() can ask the + * workqueue subsystem whether the thread should park despite having + * pending events. 
+ */ + if (kqr->kqr_state & KQR_THOVERCOMMIT) { + op = KQWL_UTQ_PARKING; + } else { + op = KQWL_UTQ_UNBINDING; + } + } + if (op == KQWL_UTQ_NONE) { + goto done; + } + + qos_override = kqworkloop_acknowledge_events(kqwl); - kqworkloop_acknowledge_events(kqwl, FALSE); + kq_req_lock(kqwl); + + if (op == KQWL_UTQ_UNBINDING) { + old_override = kqworkloop_unbind_locked(kqwl, thread); + (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF); + } + kqworkloop_update_threads_qos(kqwl, op, qos_override); + if (op == KQWL_UTQ_PARKING) { + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) { + /* + * We cannot trust KQR_WAKEUP when looking at stay active knotes. + * We need to process once, and kqworkloop_end_processing will + * handle the unbind. + */ + } else if ((kqr->kqr_state & KQR_WAKEUP) == 0 || kqwl->kqwl_owner) { + old_override = kqworkloop_unbind_locked(kqwl, thread); + (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF); + rc = -1; + } + } else if (op == KQWL_UTQ_UNBINDING) { + if (kqr->kqr_thread == thread) { + /* + * The thread request fired again, passed the admission check and + * got bound to the current thread again. + */ + } else { + rc = -1; + } + } + if (rc == 0) { + /* + * Reset wakeup bit to notice stay active events firing while we are + * processing, as we cannot rely on the stayactive bucket emptiness. + */ + kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT; + } else { + kq->kq_state &= ~KQ_PROCESSING; + } + + kq_req_unlock(kqwl); + + if (old_override) { + thread_drop_ipc_override(thread); + } + +done: KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END, - kqwl->kqwl_dynamicid, flags, 0); + kqwl->kqwl_dynamicid, 0, 0); - return 0; + return rc; } /* @@ -5172,22 +5000,15 @@ kqworkloop_begin_processing(struct kqworkloop *kqwl, * May block. 
*/ static int -kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags) +kqfile_begin_processing(struct kqueue *kq) { struct kqtailq *suppressq; kqlock_held(kq); - if (kq->kq_state & KQ_WORKQ) { - return kqworkq_begin_processing((struct kqworkq *)kq, qos_index, flags); - } else if (kq->kq_state & KQ_WORKLOOP) { - return kqworkloop_begin_processing((struct kqworkloop*)kq, flags); - } - + assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0); KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START, - VM_KERNEL_UNSLIDE_OR_PERM(kq), flags); - - assert(qos_index == QOS_INDEX_KQFILE); + VM_KERNEL_UNSLIDE_OR_PERM(kq), 0); /* wait to become the exclusive processing thread */ for (;;) { @@ -5202,11 +5023,11 @@ kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int fl /* if someone else is processing the queue, wait */ kq->kq_state |= KQ_PROCWAIT; - suppressq = kqueue_get_suppressed_queue(kq, qos_index); + suppressq = kqueue_get_suppressed_queue(kq, NULL); waitq_assert_wait64((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(suppressq), - THREAD_UNINT, TIMEOUT_WAIT_FOREVER); - + CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT, + TIMEOUT_WAIT_FOREVER); + kqunlock(kq); thread_block(THREAD_CONTINUE_NULL); kqlock(kq); @@ -5219,7 +5040,7 @@ kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int fl kq->kq_state &= ~KQ_WAKEUP; /* anything left to process? */ - if (kqueue_queue_empty(kq, qos_index)) { + if (kqueue_queue_empty(kq, QOS_INDEX_KQFILE)) { KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END, VM_KERNEL_UNSLIDE_OR_PERM(kq), 1); return -1; @@ -5235,697 +5056,364 @@ kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int fl } /* - * kqworkq_end_processing - Complete the processing of a workq kqueue - * - * We may have to request new threads. 
- * This can happen there are no waiting processing threads and: - * - there were active events we never got to (count > 0) - * - we pended waitq hook callouts during processing - * - we pended wakeups while processing (or unsuppressing) + * Try to end the processing, only called when a workq thread is attempting to + * park (KEVENT_FLAG_PARKING is set). * - * Called with kqueue lock held. + * When returning -1, the kqworkq is setup again so that it is ready to be + * processed. */ -static void -kqworkq_end_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags) +static int +kqworkq_end_processing(struct kqworkq *kqwq, struct kqrequest *kqr, + int kevent_flags) { -#pragma unused(flags) - - struct kqueue *kq = &kqwq->kqwq_kqueue; - struct kqtailq *suppressq = kqueue_get_suppressed_queue(kq, qos_index); - - thread_t self = current_thread(); - struct uthread *ut = get_bsdthread_info(self); - struct knote *kn; - struct kqrequest *kqr; - thread_t thread; - - assert(kqwq->kqwq_state & KQ_WORKQ); - assert(qos_index < KQWQ_NQOS); - - /* Are we really bound to this kqueue? */ - if (ut->uu_kqueue_bound != kq) { - assert(ut->uu_kqueue_bound == kq); - return; - } - - kqr = kqworkq_get_request(kqwq, qos_index); - - kqwq_req_lock(kqwq); - - /* Do we claim to be manager? 
*/ - if (flags & KEVENT_FLAG_WORKQ_MANAGER) { - - /* bail if not bound that way */ - if (ut->uu_kqueue_qos_index != KQWQ_QOS_MANAGER || - (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0) { - assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER); - assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER); - kqwq_req_unlock(kqwq); - return; - } - - /* bail if this request wasn't already getting manager help */ - if ((kqr->kqr_state & KQWQ_THMANAGER) == 0 || - (kqr->kqr_state & KQR_PROCESSING) == 0) { - kqwq_req_unlock(kqwq); - return; - } - } else { - if (ut->uu_kqueue_qos_index != qos_index || - (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER)) { - assert(ut->uu_kqueue_qos_index == qos_index); - assert((ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0); - kqwq_req_unlock(kqwq); - return; - } - } - - assert(kqr->kqr_state & KQR_BOUND); - thread = kqr->kqr_thread; - assert(thread == self); - - assert(kqr->kqr_state & KQR_PROCESSING); - - /* If we didn't drain the whole queue, re-mark a wakeup being needed */ - if (!kqueue_queue_empty(kq, qos_index)) + if (!kqueue_queue_empty(&kqwq->kqwq_kqueue, kqr->kqr_qos_index)) { + /* remember we didn't process everything */ + kq_req_lock(kqwq); kqr->kqr_state |= KQR_WAKEUP; - - kqwq_req_unlock(kqwq); - - /* - * Return suppressed knotes to their original state. - * For workq kqueues, suppressed ones that are still - * truly active (not just forced into the queue) will - * set flags we check below to see if anything got - * woken up. - */ - while ((kn = TAILQ_FIRST(suppressq)) != NULL) { - assert(kn->kn_status & KN_SUPPRESSED); - knote_unsuppress(kn); + kq_req_unlock(kqwq); } - kqwq_req_lock(kqwq); - - /* Indicate that we are done processing this request */ - kqr->kqr_state &= ~KQR_PROCESSING; - - /* - * Drop our association with this one request and its - * override on us. 
- */ - kqworkq_unbind_thread(kqwq, qos_index, thread, flags); - - /* - * request a new thread if we didn't process the whole - * queue or real events have happened (not just putting - * stay-active events back). - */ - if (kqr->kqr_state & KQR_WAKEUP) { - if (kqueue_queue_empty(kq, qos_index)) { - kqr->kqr_state &= ~KQR_WAKEUP; - } else { - kqworkq_request_thread(kqwq, qos_index); - } - } - kqwq_req_unlock(kqwq); -} - -static void -kqworkloop_end_processing(struct kqworkloop *kqwl, int nevents, - unsigned int flags) -{ - struct kqrequest *kqr = &kqwl->kqwl_request; - struct kqueue *kq = &kqwl->kqwl_kqueue; - - kqlock_held(kq); - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START, - kqwl->kqwl_dynamicid, flags, 0); - - if ((kq->kq_state & KQ_NO_WQ_THREAD) && nevents == 0 && - (flags & KEVENT_FLAG_IMMEDIATE) == 0) { + if (kevent_flags & KEVENT_FLAG_PARKING) { /* - * We may soon block, but have returned no - * kevents that need to be kept supressed for overriding purposes. - * - * It is hence safe to acknowledge events and unsuppress everything, so - * that if we block we can observe all events firing. + * if acknowledge events "succeeds" it means there are events, + * which is a failure condition for end_processing. */ - kqworkloop_acknowledge_events(kqwl, TRUE); - } - - kqwl_req_lock(kqwl); - - assert(kqr->kqr_state & KQR_PROCESSING); - assert(kq->kq_state & KQ_PROCESSING); - - kq->kq_state &= ~KQ_PROCESSING; - kqr->kqr_state &= ~KQR_PROCESSING; - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0); - - kqwl_req_unlock(kqwl); - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END, - kqwl->kqwl_dynamicid, flags, 0); -} - -/* - * Called with kqueue lock held. 
- */ -static void -kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index, - int nevents, unsigned int flags) -{ - struct knote *kn; - struct kqtailq *suppressq; - int procwait; - - kqlock_held(kq); - - assert((kq->kq_state & KQ_WORKQ) == 0); - - if (kq->kq_state & KQ_WORKLOOP) { - return kqworkloop_end_processing((struct kqworkloop *)kq, nevents, flags); - } - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END), - VM_KERNEL_UNSLIDE_OR_PERM(kq), flags); - - assert(qos_index == QOS_INDEX_KQFILE); - - /* - * Return suppressed knotes to their original state. - */ - suppressq = kqueue_get_suppressed_queue(kq, qos_index); - while ((kn = TAILQ_FIRST(suppressq)) != NULL) { - assert(kn->kn_status & KN_SUPPRESSED); - knote_unsuppress(kn); + int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags, + KQWQAE_END_PROCESSING); + if (rc == 0) { + return -1; + } } - procwait = (kq->kq_state & KQ_PROCWAIT); - kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT); - - if (procwait) { - /* first wake up any thread already waiting to process */ - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(suppressq), - THREAD_AWAKENED, - WAITQ_ALL_PRIORITIES); - } + return 0; } /* - * kqwq_internal_bind - bind thread to processing workq kqueue + * Try to end the processing, only called when a workq thread is attempting to + * park (KEVENT_FLAG_PARKING is set). * - * Determines if the provided thread will be responsible for - * servicing the particular QoS class index specified in the - * parameters. Once the binding is done, any overrides that may - * be associated with the cooresponding events can be applied. + * When returning -1, the kqworkq is setup again so that it is ready to be + * processed (as if kqworkloop_begin_processing had just been called). * - * This should be called as soon as the thread identity is known, - * preferably while still at high priority during creation. 
- * - * - caller holds a reference on the process (and workq kq) - * - the thread MUST call kevent_qos_internal after being bound - * or the bucket of events may never be delivered. - * - Nothing locked - * (unless this is a synchronous bind, then the request is locked) + * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags, + * the kqworkloop is unbound from its servicer as a side effect. */ static int -kqworkq_internal_bind( - struct proc *p, - kq_index_t qos_index, - thread_t thread, - unsigned int flags) -{ - struct kqueue *kq; - struct kqworkq *kqwq; - struct kqrequest *kqr; - struct uthread *ut = get_bsdthread_info(thread); - - /* If no process workq, can't be our thread. */ - kq = p->p_fd->fd_wqkqueue; - - if (kq == NULL) - return 0; - - assert(kq->kq_state & KQ_WORKQ); - kqwq = (struct kqworkq *)kq; - - /* - * No need to bind the manager thread to any specific - * bucket, but still claim the thread. - */ - if (qos_index == KQWQ_QOS_MANAGER) { - assert(ut->uu_kqueue_bound == NULL); - assert(flags & KEVENT_FLAG_WORKQ_MANAGER); - ut->uu_kqueue_bound = kq; - ut->uu_kqueue_qos_index = qos_index; - ut->uu_kqueue_flags = flags; - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), - thread_tid(thread), flags, qos_index); - - return 1; - } - - /* - * If this is a synchronous bind callback, the request - * lock is already held, so just do the bind. - */ - if (flags & KEVENT_FLAG_SYNCHRONOUS_BIND) { - kqwq_req_held(kqwq); - /* strip out synchronout bind flag */ - flags &= ~KEVENT_FLAG_SYNCHRONOUS_BIND; - kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags); - return 1; - } - - /* - * check the request that corresponds to our qos_index - * to see if there is an outstanding request. 
- */ - kqr = kqworkq_get_request(kqwq, qos_index); - assert(kqr->kqr_qos_index == qos_index); - kqwq_req_lock(kqwq); - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), - thread_tid(thread), flags, qos_index, kqr->kqr_state); - - if ((kqr->kqr_state & KQR_THREQUESTED) && - (kqr->kqr_state & KQR_PROCESSING) == 0) { - - if ((kqr->kqr_state & KQR_BOUND) && - thread == kqr->kqr_thread) { - /* duplicate bind - claim the thread */ - assert(ut->uu_kqueue_bound == kq); - assert(ut->uu_kqueue_qos_index == qos_index); - kqwq_req_unlock(kqwq); - return 1; - } - if ((kqr->kqr_state & (KQR_BOUND | KQWQ_THMANAGER)) == 0) { - /* ours to bind to */ - kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags); - kqwq_req_unlock(kqwq); - return 1; - } - } - kqwq_req_unlock(kqwq); - return 0; -} - -static void -kqworkloop_bind_thread_impl(struct kqworkloop *kqwl, - thread_t thread, - __assert_only unsigned int flags) +kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags) { - assert(flags & KEVENT_FLAG_WORKLOOP); - - /* the request object must be locked */ - kqwl_req_held(kqwl); - + struct kqueue *kq = &kqwl->kqwl_kqueue; struct kqrequest *kqr = &kqwl->kqwl_request; - struct uthread *ut = get_bsdthread_info(thread); - boolean_t ipc_override_is_sync; - kq_index_t qos_index = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync); - - /* nobody else bound so finally bind (as a workloop) */ - assert(kqr->kqr_state & KQR_THREQUESTED); - assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == 0); - assert(thread != kqwl->kqwl_owner); - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), - kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread), - qos_index, - (uintptr_t)(((uintptr_t)kqr->kqr_override_index << 16) | - (((uintptr_t)kqr->kqr_state) << 8) | - ((uintptr_t)ipc_override_is_sync))); + thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override; + thread_t thread = kqr->kqr_thread; + int rc = 0; - kqr->kqr_state |= KQR_BOUND | KQR_R2K_NOTIF_ARMED; - 
kqr->kqr_thread = thread; + kqlock_held(kq); - /* bind the workloop to the uthread */ - ut->uu_kqueue_bound = (struct kqueue *)kqwl; - ut->uu_kqueue_flags = flags; - ut->uu_kqueue_qos_index = qos_index; - assert(ut->uu_kqueue_override_is_sync == 0); - ut->uu_kqueue_override_is_sync = ipc_override_is_sync; - if (qos_index) { - thread_add_ipc_override(thread, qos_index); - } - if (ipc_override_is_sync) { - thread_add_sync_ipc_override(thread); - } -} + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START, + kqwl->kqwl_dynamicid, 0, 0); -/* - * workloop_fulfill_threadreq - bind thread to processing workloop - * - * The provided thread will be responsible for delivering events - * associated with the given kqrequest. Bind it and get ready for - * the thread to eventually arrive. - * - * If WORKLOOP_FULFILL_THREADREQ_SYNC is specified, the callback - * within the context of the pthread_functions->workq_threadreq - * callout. In this case, the request structure is already locked. - */ -int -workloop_fulfill_threadreq(struct proc *p, - workq_threadreq_t req, - thread_t thread, - int flags) -{ - int sync = (flags & WORKLOOP_FULFILL_THREADREQ_SYNC); - int cancel = (flags & WORKLOOP_FULFILL_THREADREQ_CANCEL); - struct kqrequest *kqr; - struct kqworkloop *kqwl; + if (flags & KQ_PROCESSING) { + assert(kq->kq_state & KQ_PROCESSING); - kqwl = (struct kqworkloop *)((uintptr_t)req - - offsetof(struct kqworkloop, kqwl_request) - - offsetof(struct kqrequest, kqr_req)); - kqr = &kqwl->kqwl_request; + /* + * If we still have queued stayactive knotes, remember we didn't finish + * processing all of them. This should be extremely rare and would + * require to have a lot of them registered and fired. 
+ */ + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) { + kq_req_lock(kqwl); + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, + KQWL_BUCKET_STAYACTIVE); + kq_req_unlock(kqwl); + } - /* validate we're looking at something valid */ - if (kqwl->kqwl_p != p || - (kqwl->kqwl_state & KQ_WORKLOOP) == 0) { - assert(kqwl->kqwl_p == p); - assert(kqwl->kqwl_state & KQ_WORKLOOP); - return EINVAL; + /* + * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while + * still under the lock. + * + * So we do everything kqworkloop_unbind() would do, but because we're + * inside kqueue_process(), if the workloop actually received events + * while our locks were dropped, we have the opportunity to fail the end + * processing and loop again. + * + * This avoids going through the process-wide workqueue lock hence + * scales better. + */ + if (kevent_flags & KEVENT_FLAG_PARKING) { + qos_override = kqworkloop_acknowledge_events(kqwl); + } } - - if (!sync) - kqwl_req_lock(kqwl); - /* Should be a pending request */ - if ((kqr->kqr_state & KQR_BOUND) || - (kqr->kqr_state & KQR_THREQUESTED) == 0) { + kq_req_lock(kqwl); - assert((kqr->kqr_state & KQR_BOUND) == 0); - assert(kqr->kqr_state & KQR_THREQUESTED); - if (!sync) - kqwl_req_unlock(kqwl); - return EINPROGRESS; + if (kevent_flags & KEVENT_FLAG_PARKING) { + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override); + if ((kqr->kqr_state & KQR_WAKEUP) && !kqwl->kqwl_owner) { + /* + * Reset wakeup bit to notice stay active events firing while we are + * processing, as we cannot rely on the stayactive bucket emptiness. 
+ */ + kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT; + rc = -1; + } else { + old_override = kqworkloop_unbind_locked(kqwl, thread); + (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF); + kq->kq_state &= ~flags; + } + } else { + kq->kq_state &= ~flags; + kqr->kqr_state |= KQR_R2K_NOTIF_ARMED; + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0); } - assert((kqr->kqr_state & KQR_DRAIN) == 0); + kq_req_unlock(kqwl); - /* - * Is it a cancel indication from pthread. - * If so, we must be exiting/exec'ing. Forget - * our pending request. - */ - if (cancel) { - kqr->kqr_state &= ~KQR_THREQUESTED; - kqr->kqr_state |= KQR_DRAIN; - } else { - /* do the actual bind? */ - kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP); + if (old_override) { + thread_drop_ipc_override(thread); } - if (!sync) - kqwl_req_unlock(kqwl); - - if (cancel) - kqueue_release_last(p, &kqwl->kqwl_kqueue); /* may dealloc kq */ + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END, + kqwl->kqwl_dynamicid, 0, 0); - return 0; + return rc; } - /* - * kevent_qos_internal_bind - bind thread to processing kqueue - * - * Indicates that the provided thread will be responsible for - * servicing the particular QoS class index specified in the - * parameters. Once the binding is done, any overrides that may - * be associated with the cooresponding events can be applied. - * - * This should be called as soon as the thread identity is known, - * preferably while still at high priority during creation. - * - * - caller holds a reference on the kqueue. - * - the thread MUST call kevent_qos_internal after being bound - * or the bucket of events may never be delivered. - * - Nothing locked (may take mutex or block). + * Called with kqueue lock held. 
*/ - -int -kevent_qos_internal_bind( - struct proc *p, - int qos_class, - thread_t thread, - unsigned int flags) -{ - kq_index_t qos_index; - - assert(flags & KEVENT_FLAG_WORKQ); - - if (thread == THREAD_NULL || (flags & KEVENT_FLAG_WORKQ) == 0) { - return EINVAL; - } - - /* get the qos index we're going to service */ - qos_index = qos_index_for_servicer(qos_class, thread, flags); - - if (kqworkq_internal_bind(p, qos_index, thread, flags)) - return 0; - - return EINPROGRESS; -} - - static void -kqworkloop_internal_unbind( - struct proc *p, - thread_t thread, - unsigned int flags) +kqfile_end_processing(struct kqueue *kq) { - struct kqueue *kq; - struct kqworkloop *kqwl; - struct uthread *ut = get_bsdthread_info(thread); - - assert(ut->uu_kqueue_bound != NULL); - kq = ut->uu_kqueue_bound; - assert(kq->kq_state & KQ_WORKLOOP); - kqwl = (struct kqworkloop *)kq; - - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), - kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread), - flags, 0); - - if (!(kq->kq_state & KQ_NO_WQ_THREAD)) { - assert(is_workqueue_thread(thread)); - - kqlock(kq); - kqworkloop_unbind_thread(kqwl, thread, flags); - kqunlock(kq); - - /* If last reference, dealloc the workloop kq */ - kqueue_release_last(p, kq); - } else { - assert(!is_workqueue_thread(thread)); - kevent_servicer_detach_thread(p, kqwl->kqwl_dynamicid, thread, flags, kq); - } -} - -static void -kqworkq_internal_unbind( - struct proc *p, - kq_index_t qos_index, - thread_t thread, - unsigned int flags) -{ - struct kqueue *kq; - struct kqworkq *kqwq; - struct uthread *ut; - kq_index_t end_index; - - assert(thread == current_thread()); - ut = get_bsdthread_info(thread); - - kq = p->p_fd->fd_wqkqueue; - assert(kq->kq_state & KQ_WORKQ); - assert(ut->uu_kqueue_bound == kq); + struct knote *kn; + struct kqtailq *suppressq; + int procwait; - kqwq = (struct kqworkq *)kq; + kqlock_held(kq); - /* end servicing any requests we might own */ - end_index = (qos_index == KQWQ_QOS_MANAGER) ? 
- 0 : qos_index; - kqlock(kq); + assert((kq->kq_state & (KQ_WORKQ|KQ_WORKLOOP)) == 0); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), - (uintptr_t)thread_tid(thread), flags, qos_index); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END), + VM_KERNEL_UNSLIDE_OR_PERM(kq), 0); - do { - kqworkq_end_processing(kqwq, qos_index, flags); - } while (qos_index-- > end_index); + /* + * Return suppressed knotes to their original state. + */ + suppressq = kqueue_get_suppressed_queue(kq, NULL); + while ((kn = TAILQ_FIRST(suppressq)) != NULL) { + assert(kn->kn_status & KN_SUPPRESSED); + knote_unsuppress(kn); + } - ut->uu_kqueue_bound = NULL; - ut->uu_kqueue_qos_index = 0; - ut->uu_kqueue_flags = 0; + procwait = (kq->kq_state & KQ_PROCWAIT); + kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT); - kqunlock(kq); + if (procwait) { + /* first wake up any thread already waiting to process */ + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, + CAST_EVENT64_T(suppressq), + THREAD_AWAKENED, + WAITQ_ALL_PRIORITIES); + } } -/* - * kevent_qos_internal_unbind - unbind thread from processing kqueue - * - * End processing the per-QoS bucket of events and allow other threads - * to be requested for future servicing. - * - * caller holds a reference on the kqueue. - * thread is the current thread. 
- */ - -int -kevent_qos_internal_unbind( - struct proc *p, - int qos_class, - thread_t thread, - unsigned int flags) +static int +kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options, + struct kqueue_workloop_params *params, int *retval) { -#pragma unused(qos_class) - - struct uthread *ut; + int error = 0; + int fd; + struct fileproc *fp; struct kqueue *kq; - unsigned int bound_flags; - bool check_flags; + struct kqworkloop *kqwl; + struct filedesc *fdp = p->p_fd; + workq_threadreq_param_t trp = { }; - ut = get_bsdthread_info(thread); - if (ut->uu_kqueue_bound == NULL) { - /* early out if we are already unbound */ - assert(ut->uu_kqueue_flags == 0); - assert(ut->uu_kqueue_qos_index == 0); - assert(ut->uu_kqueue_override_is_sync == 0); - return EALREADY; - } + switch (cmd) { + case KQ_WORKLOOP_CREATE: + if (!params->kqwlp_flags) { + error = EINVAL; + break; + } - assert(flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)); - assert(thread == current_thread()); + if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) && + (params->kqwlp_sched_pri < 1 || + params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) { + error = EINVAL; + break; + } - check_flags = flags & KEVENT_FLAG_UNBIND_CHECK_FLAGS; + if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) && + invalid_policy(params->kqwlp_sched_pol)) { + error = EINVAL; + break; + } - /* Get the kqueue we started with */ - kq = ut->uu_kqueue_bound; - assert(kq != NULL); - assert(kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)); + if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) && + (params->kqwlp_cpu_percent <= 0 || + params->kqwlp_cpu_percent > 100 || + params->kqwlp_cpu_refillms <= 0 || + params->kqwlp_cpu_refillms > 0x00ffffff)) { + error = EINVAL; + break; + } - /* get flags and QoS parameters we started with */ - bound_flags = ut->uu_kqueue_flags; + if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) { + trp.trp_flags |= TRP_PRIORITY; + trp.trp_pri = params->kqwlp_sched_pri; + } + if 
(params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) { + trp.trp_flags |= TRP_POLICY; + trp.trp_pol = params->kqwlp_sched_pol; + } + if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) { + trp.trp_flags |= TRP_CPUPERCENT; + trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent; + trp.trp_refillms = params->kqwlp_cpu_refillms; + } - /* Unbind from the class of workq */ - if (kq->kq_state & KQ_WORKQ) { - if (check_flags && !(flags & KEVENT_FLAG_WORKQ)) { - return EINVAL; + error = kevent_get_kq(p, params->kqwlp_id, &trp, + KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP | + KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST , &fp, &fd, &kq); + if (error) { + break; } - kqworkq_internal_unbind(p, ut->uu_kqueue_qos_index, thread, bound_flags); - } else { - if (check_flags && !(flags & KEVENT_FLAG_WORKLOOP)) { - return EINVAL; + if (!(fdp->fd_flags & FD_WORKLOOP)) { + /* FD_WORKLOOP indicates we've ever created a workloop + * via this syscall but its only ever added to a process, never + * removed. 
+ */ + proc_fdlock(p); + fdp->fd_flags |= FD_WORKLOOP; + proc_fdunlock(p); + } + break; + case KQ_WORKLOOP_DESTROY: + error = kevent_get_kq(p, params->kqwlp_id, NULL, + KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP | + KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST , &fp, &fd, &kq); + if (error) { + break; + } + kqlock(kq); + kqwl = (struct kqworkloop *)kq; + trp.trp_value = kqwl->kqwl_params; + if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) { + trp.trp_flags |= TRP_RELEASED; + kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); + } else { + error = EINVAL; } + kqunlock(kq); + kqueue_release_last(p, kq); + break; + } + *retval = 0; + return error; +} + +int +kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval) +{ + struct kqueue_workloop_params params = { + .kqwlp_id = 0, + }; + if (uap->sz < sizeof(params.kqwlp_version)) { + return EINVAL; + } - kqworkloop_internal_unbind(p, thread, bound_flags); + size_t copyin_sz = MIN(sizeof(params), uap->sz); + int rv = copyin(uap->addr, &params, copyin_sz); + if (rv) { + return rv; } - return 0; + if (params.kqwlp_version != (int)uap->sz) { + return EINVAL; + } + + return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params, + retval); } /* * kqueue_process - process the triggered events in a kqueue * - * Walk the queued knotes and validate that they are - * really still triggered events by calling the filter - * routines (if necessary). Hold a use reference on - * the knote to avoid it being detached. For each event - * that is still considered triggered, invoke the - * callback routine provided. + * Walk the queued knotes and validate that they are really still triggered + * events by calling the filter routines (if necessary). + * + * For each event that is still considered triggered, invoke the callback + * routine provided. * * caller holds a reference on the kqueue. 
* kqueue locked on entry and exit - but may be dropped * kqueue list locked (held for duration of call) */ - static int kqueue_process(struct kqueue *kq, - kevent_callback_t callback, - void *callback_data, - struct filt_process_s *process_data, - int *countp, - struct proc *p) + kevent_callback_t callback, + void *callback_data, + struct filt_process_s *process_data, + int *countp) { - unsigned int flags = process_data ? process_data->fp_flags : 0; struct uthread *ut = get_bsdthread_info(current_thread()); - kq_index_t start_index, end_index, i; + struct kqrequest *kqr = ut->uu_kqr_bound; struct knote *kn; - int nevents = 0; - int error = 0; + unsigned int flags = process_data ? process_data->fp_flags : 0; + int nevents = 0, error = 0, rc = 0; + struct kqtailq *base_queue, *queue; + kqueue_t kqu = { .kq = kq }; +#if DEBUG || DEVELOPMENT + int retries = 64; +#endif - /* - * Based on the mode of the kqueue and the bound QoS of the servicer, - * determine the range of thread requests that need checking - */ if (kq->kq_state & KQ_WORKQ) { - if (flags & KEVENT_FLAG_WORKQ_MANAGER) { - start_index = KQWQ_QOS_MANAGER; - } else if (ut->uu_kqueue_bound != kq) { + if (kqr == NULL || (kqr->kqr_state & KQR_WORKLOOP)) { return EJUSTRETURN; - } else { - start_index = ut->uu_kqueue_qos_index; } - - /* manager services every request in a workq kqueue */ - assert(start_index > 0 && start_index <= KQWQ_QOS_MANAGER); - end_index = (start_index == KQWQ_QOS_MANAGER) ? 
0 : start_index; - + rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags); } else if (kq->kq_state & KQ_WORKLOOP) { - if (ut->uu_kqueue_bound != kq) + if (ut->uu_kqr_bound != &kqu.kqwl->kqwl_request) { return EJUSTRETURN; - - /* - * Single request servicing - * we want to deliver all events, regardless of the QOS - */ - start_index = end_index = THREAD_QOS_UNSPECIFIED; + } + rc = kqworkloop_begin_processing(kqu.kqwl, flags); } else { - start_index = end_index = QOS_INDEX_KQFILE; + rc = kqfile_begin_processing(kq); } - - i = start_index; - do { - if (kqueue_begin_processing(kq, i, flags) == -1) { - *countp = 0; - /* Nothing to process */ - continue; - } + if (rc == -1) { + /* Nothing to process */ + *countp = 0; + return 0; + } - /* - * loop through the enqueued knotes associated with this request, - * processing each one. Each request may have several queues - * of knotes to process (depending on the type of kqueue) so we - * have to loop through all the queues as long as we have additional - * space. - */ - error = 0; + /* + * loop through the enqueued knotes associated with this request, + * processing each one. Each request may have several queues + * of knotes to process (depending on the type of kqueue) so we + * have to loop through all the queues as long as we have additional + * space. 
+ */ - struct kqtailq *base_queue = kqueue_get_base_queue(kq, i); - struct kqtailq *queue = kqueue_get_high_queue(kq, i); - do { - while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) { - error = knote_process(kn, callback, callback_data, process_data, p); - if (error == EJUSTRETURN) { - error = 0; - } else { - nevents++; - } - /* error is EWOULDBLOCK when the out event array is full */ - } - } while (error == 0 && queue-- > base_queue); +process_again: + if (kq->kq_state & KQ_WORKQ) { + base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->kqr_qos_index]; + } else if (kq->kq_state & KQ_WORKLOOP) { + base_queue = &kqu.kqwl->kqwl_queue[0]; + queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1]; + } else { + base_queue = queue = &kq->kq_queue[QOS_INDEX_KQFILE]; + } - if ((kq->kq_state & KQ_WORKQ) == 0) { - kqueue_end_processing(kq, i, nevents, flags); + do { + while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) { + error = knote_process(kn, callback, callback_data, process_data); + if (error == EJUSTRETURN) { + error = 0; + } else { + nevents++; + } + /* error is EWOULDBLOCK when the out event array is full */ } if (error == EWOULDBLOCK) { @@ -5933,10 +5421,40 @@ kqueue_process(struct kqueue *kq, error = 0; break; } - } while (i-- > end_index); + } while (queue-- > base_queue); *countp = nevents; - return (error); + + /* + * If KEVENT_FLAG_PARKING is set, and no kevents have been returned, + * we want to unbind the kqrequest from the thread. + * + * However, because the kq locks are dropped several times during process, + * new knotes may have fired again, in which case, we want to fail the end + * processing and process again, until it converges. + * + * If we returned events however, end processing never fails. 
+ */ + if (error || nevents) flags &= ~KEVENT_FLAG_PARKING; + if (kq->kq_state & KQ_WORKQ) { + rc = kqworkq_end_processing(kqu.kqwq, kqr, flags); + } else if (kq->kq_state & KQ_WORKLOOP) { + rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags); + } else { + kqfile_end_processing(kq); + rc = 0; + } + if (rc == -1) { + assert(flags & KEVENT_FLAG_PARKING); +#if DEBUG || DEVELOPMENT + if (retries-- == 0) { + panic("kevent: way too many knote_process retries, kq: %p (0x%02x)", + kq, kq->kq_state); + } +#endif + goto process_again; + } + return error; } static void @@ -5944,7 +5462,7 @@ kqueue_scan_continue(void *data, wait_result_t wait_result) { thread_t self = current_thread(); uthread_t ut = (uthread_t)get_bsdthread_info(self); - struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan; + struct _kqueue_scan * cont_args = &ut->uu_save.uus_kqueue_scan; struct kqueue *kq = (struct kqueue *)data; struct filt_process_s *process_data = cont_args->process_data; int error; @@ -5955,8 +5473,8 @@ kqueue_scan_continue(void *data, wait_result_t wait_result) case THREAD_AWAKENED: { kqlock(kq); retry: - error = kqueue_process(kq, cont_args->call, cont_args->data, - process_data, &count, current_proc()); + error = kqueue_process(kq, cont_args->call, cont_args->data, + process_data, &count); if (error == 0 && count == 0) { if (kq->kq_state & KQ_DRAIN) { kqunlock(kq); @@ -6011,7 +5529,6 @@ kqueue_scan_continue(void *data, wait_result_t wait_result) * The callback routine must be valid. * The caller must hold a use-count reference on the kq. 
*/ - int kqueue_scan(struct kqueue *kq, kevent_callback_t callback, @@ -6019,7 +5536,7 @@ kqueue_scan(struct kqueue *kq, void *callback_data, struct filt_process_s *process_data, struct timeval *atvp, - struct proc *p) + __unused struct proc *p) { thread_continue_t cont = THREAD_CONTINUE_NULL; unsigned int flags; @@ -6047,7 +5564,7 @@ kqueue_scan(struct kqueue *kq, */ kqlock(kq); error = kqueue_process(kq, callback, callback_data, - process_data, &count, p); + process_data, &count); if (error || count) break; /* lock still held */ @@ -6070,12 +5587,12 @@ kqueue_scan(struct kqueue *kq, deadline -= now; clock_absolutetime_interval_to_deadline(deadline, &deadline); } else { - deadline = 0; /* block forever */ + deadline = 0; /* block forever */ } if (continuation) { uthread_t ut = (uthread_t)get_bsdthread_info(current_thread()); - struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan; + struct _kqueue_scan *cont_args = &ut->uu_save.uus_kqueue_scan; cont_args->call = callback; cont_args->cont = continuation; @@ -6134,9 +5651,9 @@ kqueue_scan(struct kqueue *kq, /*ARGSUSED*/ static int kqueue_read(__unused struct fileproc *fp, - __unused struct uio *uio, - __unused int flags, - __unused vfs_context_t ctx) + __unused struct uio *uio, + __unused int flags, + __unused vfs_context_t ctx) { return (ENXIO); } @@ -6144,9 +5661,9 @@ kqueue_read(__unused struct fileproc *fp, /*ARGSUSED*/ static int kqueue_write(__unused struct fileproc *fp, - __unused struct uio *uio, - __unused int flags, - __unused vfs_context_t ctx) + __unused struct uio *uio, + __unused int flags, + __unused vfs_context_t ctx) { return (ENXIO); } @@ -6154,9 +5671,9 @@ kqueue_write(__unused struct fileproc *fp, /*ARGSUSED*/ static int kqueue_ioctl(__unused struct fileproc *fp, - __unused u_long com, - __unused caddr_t data, - __unused vfs_context_t ctx) + __unused u_long com, + __unused caddr_t data, + __unused vfs_context_t ctx) { return (ENOTTY); } @@ -6164,7 +5681,7 @@ kqueue_ioctl(__unused struct 
fileproc *fp, /*ARGSUSED*/ static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id, - __unused vfs_context_t ctx) + __unused vfs_context_t ctx) { struct kqueue *kq = (struct kqueue *)fp->f_data; struct kqtailq *queue; @@ -6189,7 +5706,7 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, */ if (wq_link_id != NULL) { thread_t cur_act = current_thread(); - struct uthread * ut = get_bsdthread_info(cur_act); + struct uthread * ut = get_bsdthread_info(cur_act); kq->kq_state |= KQ_SEL; waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset, @@ -6212,12 +5729,12 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, memcpy(wq_link_id, (void *)&wqptr, sizeof(void *)); } - if (kqueue_begin_processing(kq, QOS_INDEX_KQFILE, 0) == -1) { + if (kqfile_begin_processing(kq) == -1) { kqunlock(kq); return (0); } - queue = kqueue_get_base_queue(kq, QOS_INDEX_KQFILE); + queue = &kq->kq_queue[QOS_INDEX_KQFILE]; if (!TAILQ_EMPTY(queue)) { /* * there is something queued - but it might be a @@ -6239,26 +5756,27 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, * There were no regular events on the queue, so take * a deeper look at the stay-queued ones we suppressed. 
*/ - suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE); + suppressq = kqueue_get_suppressed_queue(kq, NULL); while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) { - unsigned peek = 1; - - assert(!knoteuse_needs_boost(kn, NULL)); + KNOTE_LOCK_CTX(knlc); + int result = 0; /* If didn't vanish while suppressed - peek at it */ - if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) { - peek = knote_fops(kn)->f_peek(kn); - - /* if it dropped while getting lock - move on */ - if (!knoteuse2kqlock(kq, kn, KNUSE_NONE)) - continue; + if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc, + KNOTE_KQ_LOCK_ON_FAILURE)) { + continue; } + result = filter_call(knote_fops(kn), f_peek(kn)); + + kqlock(kq); + knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS); + /* unsuppress it */ knote_unsuppress(kn); /* has data or it has to report a vanish */ - if (peek > 0) { + if (result & FILTER_ACTIVE) { retnum = 1; goto out; } @@ -6266,7 +5784,7 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id, } out: - kqueue_end_processing(kq, QOS_INDEX_KQFILE, retnum, 0); + kqfile_end_processing(kq); kqunlock(kq); return (retnum); } @@ -6286,6 +5804,13 @@ kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx) return (0); } +/* + * Max depth of the nested kq path that can be created. + * Note that this has to be less than the size of kq_level + * to avoid wrapping around and mislabeling the level. 
+ */ +#define MAX_NESTED_KQ 1000 + /*ARGSUSED*/ /* * The callers has taken a use-count reference on this kqueue and will donate it @@ -6299,13 +5824,12 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data; struct kqueue *kq = &kqf->kqf_kqueue; struct kqueue *parentkq = knote_get_kq(kn); + uint16_t plevel = 0; assert((kqf->kqf_state & KQ_WORKQ) == 0); - if (parentkq == kq || - kn->kn_filter != EVFILT_READ) { - kn->kn_flags = EV_ERROR; - kn->kn_data = EINVAL; + if (parentkq == kq || kn->kn_filter != EVFILT_READ) { + knote_set_error(kn, EINVAL); return 0; } @@ -6318,6 +5842,8 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, * into another kqueue at a lower level than the potenial * child (because it could indicate a cycle). If that test * passes, we just mark the nesting levels accordingly. + * + * Only up to MAX_NESTED_KQ can be nested. */ kqlock(parentkq); @@ -6325,15 +5851,21 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, parentkq->kq_level < kq->kq_level) { kqunlock(parentkq); - kn->kn_flags = EV_ERROR; - kn->kn_data = EINVAL; + knote_set_error(kn, EINVAL); return 0; } else { /* set parent level appropriately */ - if (parentkq->kq_level == 0) - parentkq->kq_level = 2; - if (parentkq->kq_level < kq->kq_level + 1) - parentkq->kq_level = kq->kq_level + 1; + plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level; + if (plevel < kq->kq_level + 1) { + if (kq->kq_level + 1 > MAX_NESTED_KQ) { + kqunlock(parentkq); + knote_set_error(kn, EINVAL); + return 0; + } + plevel = kq->kq_level + 1; + } + + parentkq->kq_level = plevel; kqunlock(parentkq); kn->kn_filtid = EVFILTID_KQREAD; @@ -6408,10 +5940,8 @@ kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p) } /* - * Interact with the pthread kext to request a servicing there. - * Eventually, this will request threads at specific QoS levels. 
- * For now, it only requests a dispatch-manager-QoS thread, and - * only one-at-a-time. + * Interact with the pthread kext to request a servicing there at a specific QoS + * level. * * - Caller holds the workq request lock * @@ -6419,279 +5949,296 @@ kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p) * so cannot do anything that could recurse on that. */ static void -kqworkq_request_thread( - struct kqworkq *kqwq, - kq_index_t qos_index) +kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, + kq_index_t qos, int flags) { - struct kqrequest *kqr; - - assert(kqwq->kqwq_state & KQ_WORKQ); - assert(qos_index < KQWQ_NQOS); - - kqr = kqworkq_get_request(kqwq, qos_index); - assert(kqr->kqr_state & KQR_WAKEUP); + assert(kqr->kqr_thread == THREAD_NULL); + assert((kqr->kqr_state & KQR_THREQUESTED) == 0); + struct turnstile *ts = TURNSTILE_NULL; - /* - * If we have already requested a thread, and it hasn't - * started processing yet, there's no use hammering away - * on the pthread kext. - */ - if (kqr->kqr_state & KQR_THREQUESTED) + if (workq_is_exiting(kq->kq_p)) { return; + } - assert((kqr->kqr_state & KQR_BOUND) == 0); - - /* request additional workq threads if appropriate */ - if (pthread_functions != NULL && - pthread_functions->workq_reqthreads != NULL) { - unsigned int flags = KEVENT_FLAG_WORKQ; - unsigned long priority; - thread_t wqthread; + /* Add a thread request reference on the kqueue. */ + kqueue_retain(kq); - /* Compute the appropriate pthread priority */ - priority = qos_from_qos_index(qos_index); + kq_req_held(kq); -#if 0 - /* JMM - for now remain compatible with old invocations */ - /* set the over-commit flag on the request if needed */ - if (kqr->kqr_state & KQR_THOVERCOMMIT) - priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG; -#endif /* 0 */ - - /* Compute a priority based on qos_index. 
*/ - struct workq_reqthreads_req_s request = { - .priority = priority, - .count = 1 - }; - - /* mark that we are making a request */ - kqr->kqr_state |= KQR_THREQUESTED; - if (qos_index == KQWQ_QOS_MANAGER) - kqr->kqr_state |= KQWQ_THMANAGER; + if (kq->kq_state & KQ_WORKLOOP) { + __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq; + assert(kqwl->kqwl_owner == THREAD_NULL); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST), + kqwl->kqwl_dynamicid, 0, qos, kqr->kqr_state); + ts = kqwl->kqwl_turnstile; + } else { + assert(kq->kq_state & KQ_WORKQ); KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST), - 0, qos_index, - (((uintptr_t)kqr->kqr_override_index << 8) | - (uintptr_t)kqr->kqr_state)); - wqthread = (*pthread_functions->workq_reqthreads)(kqwq->kqwq_p, 1, &request); - - /* We've been switched to the emergency/manager thread */ - if (wqthread == (thread_t)-1) { - assert(qos_index != KQWQ_QOS_MANAGER); - kqr->kqr_state |= KQWQ_THMANAGER; - return; - } + -1, 0, qos, kqr->kqr_state); + } + + kqr->kqr_state |= KQR_THREQUESTED; + /* + * New-style thread request supported. + * Provide the pthread kext a pointer to a workq_threadreq_s structure for + * its use until a corresponding kqueue_threadreq_bind callback. + */ + if ((kq->kq_state & KQ_WORKLOOP) && current_proc() == kq->kq_p) { + flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE; + } + if (qos == KQWQ_QOS_MANAGER) { + qos = WORKQ_THREAD_QOS_MANAGER; + } + if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) { /* - * bind the returned thread identity - * This goes away when we switch to synchronous callback - * binding from the pthread kext. + * Process is shutting down or exec'ing. + * All the kqueues are going to be cleaned up + * soon. Forget we even asked for a thread - + * and make sure we don't ask for more. 
*/ - if (wqthread != NULL) { - kqworkq_bind_thread_impl(kqwq, qos_index, wqthread, flags); - } + kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); + kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); } } /* - * If we aren't already busy processing events [for this QoS], - * request workq thread support as appropriate. + * kqueue_threadreq_bind_prepost - prepost the bind to kevent * - * TBD - for now, we don't segregate out processing by QoS. + * This is used when kqueue_threadreq_bind may cause a lock inversion. + */ +void +kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t req, + thread_t thread) +{ + struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); + struct uthread *ut = get_bsdthread_info(thread); + + req->tr_binding_thread = thread; + ut->uu_kqr_bound = kqr; + req->tr_state = TR_STATE_BINDING; + + struct kqworkloop *kqwl = kqr_kqworkloop(kqr); + if (kqwl && kqwl->kqwl_turnstile) { + struct turnstile *ts = kqwl->kqwl_turnstile; + /* + * While a thread request is in flight, the workqueue + * is the interlock for the turnstile and can update the inheritor. + */ + turnstile_update_inheritor(ts, thread, TURNSTILE_IMMEDIATE_UPDATE | + TURNSTILE_INHERITOR_THREAD); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + } +} + +/* + * kqueue_threadreq_bind_commit - commit a bind prepost * - * - May be called with the kqueue's wait queue set locked, - * so cannot do anything that could recurse on that. + * The workq code has to commit any binding prepost before the thread has + * a chance to come back to userspace (and do kevent syscalls) or be aborted. 
*/ +void +kqueue_threadreq_bind_commit(struct proc *p, thread_t thread) +{ + struct uthread *ut = get_bsdthread_info(thread); + struct kqrequest *kqr = ut->uu_kqr_bound; + kqueue_t kqu = kqr_kqueue(p, kqr); + + kq_req_lock(kqu); + if (kqr->kqr_req.tr_state == TR_STATE_BINDING) { + kqueue_threadreq_bind(p, &kqr->kqr_req, thread, 0); + } + kq_req_unlock(kqu); +} + static void -kqworkq_request_help( - struct kqworkq *kqwq, - kq_index_t qos_index) +kqueue_threadreq_modify(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos) { - struct kqrequest *kqr; + assert(kqr->kqr_state & KQR_THREQUESTED); + assert(kqr->kqr_thread == THREAD_NULL); - /* convert to thread qos value */ - assert(qos_index < KQWQ_NQOS); - - kqwq_req_lock(kqwq); - kqr = kqworkq_get_request(kqwq, qos_index); + kq_req_held(kq); - if ((kqr->kqr_state & KQR_WAKEUP) == 0) { - /* Indicate that we needed help from this request */ - kqr->kqr_state |= KQR_WAKEUP; + int flags = 0; + if ((kq->kq_state & KQ_WORKLOOP) && kq->kq_p == current_proc()) { + flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE; + } + workq_kern_threadreq_modify(kq->kq_p, kqr, qos, flags); +} + +/* + * kqueue_threadreq_bind - bind thread to processing kqrequest + * + * The provided thread will be responsible for delivering events + * associated with the given kqrequest. Bind it and get ready for + * the thread to eventually arrive. 
+ */ +void +kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, thread_t thread, + unsigned int flags) +{ + struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); + kqueue_t kqu = kqr_kqueue(p, kqr); + struct uthread *ut = get_bsdthread_info(thread); + + kq_req_held(kqu); + + assert(kqr->kqr_state & KQR_THREQUESTED); + assert(kqr->kqr_thread == THREAD_NULL); + assert(ut->uu_kqueue_override == 0); + + if (kqr->kqr_req.tr_state == TR_STATE_BINDING) { + assert(ut->uu_kqr_bound == kqr); + assert(kqr->kqr_req.tr_binding_thread == thread); + kqr->kqr_req.tr_state = TR_STATE_IDLE; + kqr->kqr_req.tr_binding_thread = NULL; + } else { + assert(ut->uu_kqr_bound == NULL); + } + + ut->uu_kqr_bound = kqr; + kqr->kqr_thread = thread; + + if (kqu.kq->kq_state & KQ_WORKLOOP) { + struct turnstile *ts = kqu.kqwl->kqwl_turnstile; + + if (__improbable(thread == kqu.kqwl->kqwl_owner)) { + /* + * shows that asserting here is not ok. + * + * This is not supposed to happen for correct use of the interface, + * but it is sadly possible for userspace (with the help of memory + * corruption, such as over-release of a dispatch queue) to make + * the creator thread the "owner" of a workloop. + * + * Once that happens, and that creator thread picks up the same + * workloop as a servicer, we trip this codepath. We need to fixup + * the state to forget about this thread being the owner, as the + * entire workloop state machine expects servicers to never be + * owners and everything would basically go downhill from here. + */ + kqu.kqwl->kqwl_owner = THREAD_NULL; + if (kqworkloop_owner_override(kqu.kqwl)) { + thread_drop_ipc_override(thread); + } + thread_ends_owning_workloop(thread); + } - /* Go assure a thread request has been made */ - kqworkq_request_thread(kqwq, qos_index); + if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) { + /* + * Past this point, the interlock is the kq req lock again, + * so we can fix the inheritor for good. 
+ */ + filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + } + + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid, + thread_tid(thread), kqr->kqr_qos_index, + (kqr->kqr_override_index << 16) | kqr->kqr_state); + + ut->uu_kqueue_override = kqr->kqr_override_index; + if (kqr->kqr_override_index) { + thread_add_ipc_override(thread, kqr->kqr_override_index); + } + } else { + assert(kqr->kqr_override_index == 0); + + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1, + thread_tid(thread), kqr->kqr_qos_index, + (kqr->kqr_override_index << 16) | kqr->kqr_state); } - kqwq_req_unlock(kqwq); } -static void -kqworkloop_threadreq_impl(struct kqworkloop *kqwl, kq_index_t qos_index) +/* + * kqueue_threadreq_cancel - abort a pending thread request + * + * Called when exiting/exec'ing. Forget our pending request. + */ +void +kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req) { - struct kqrequest *kqr = &kqwl->kqwl_request; - unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index); - int op, ret; + struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); + kqueue_t kqu = kqr_kqueue(p, kqr); - assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED); + kq_req_lock(kqu); - /* - * New-style thread request supported. Provide - * the pthread kext a pointer to a workq_threadreq_s - * structure for its use until a corresponding - * workloop_fulfill_threqreq callback. 
- */ - if (current_proc() == kqwl->kqwl_kqueue.kq_p) { - op = WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL; - } else { - op = WORKQ_THREADREQ_WORKLOOP; - } -again: - ret = (*pthread_functions->workq_threadreq)(kqwl->kqwl_p, &kqr->kqr_req, - WORKQ_THREADREQ_WORKLOOP, pri, 0); - switch (ret) { - case ENOTSUP: - assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL); - op = WORKQ_THREADREQ_WORKLOOP; - goto again; - - case ECANCELED: - case EINVAL: - /* - * Process is shutting down or exec'ing. - * All the kqueues are going to be cleaned up - * soon. Forget we even asked for a thread - - * and make sure we don't ask for more. - */ - kqueue_release((struct kqueue *)kqwl, KQUEUE_CANT_BE_LAST_REF); - kqr->kqr_state &= ~KQR_THREQUESTED; - kqr->kqr_state |= KQR_DRAIN; - break; + assert(kqr->kqr_thread == THREAD_NULL); + assert(kqr->kqr_state & KQR_THREQUESTED); + kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); - case EAGAIN: - assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL); - act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ); - break; + kq_req_unlock(kqu); - default: - assert(ret == 0); - } + kqueue_release_last(p, kqu); /* may dealloc kqu */ } -static void -kqworkloop_threadreq_modify(struct kqworkloop *kqwl, kq_index_t qos_index) +workq_threadreq_param_t +kqueue_threadreq_workloop_param(workq_threadreq_t req) { - struct kqrequest *kqr = &kqwl->kqwl_request; - unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index); - int ret, op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL; + struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req); + struct kqworkloop *kqwl; + workq_threadreq_param_t trp; - assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED); + assert(kqr->kqr_state & KQR_WORKLOOP); + kqwl = __container_of(kqr, struct kqworkloop, kqwl_request); + trp.trp_value = kqwl->kqwl_params; + return trp; +} - if (current_proc() == kqwl->kqwl_kqueue.kq_p) { - op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL; 
+/* + * kqueue_threadreq_unbind - unbind thread from processing kqueue + * + * End processing the per-QoS bucket of events and allow other threads + * to be requested for future servicing. + * + * caller holds a reference on the kqueue. + */ +void +kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr) +{ + if (kqr->kqr_state & KQR_WORKLOOP) { + kqworkloop_unbind(p, kqr_kqworkloop(kqr)); } else { - op = WORKQ_THREADREQ_CHANGE_PRI; - } -again: - ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p, - &kqr->kqr_req, op, pri, 0); - switch (ret) { - case ENOTSUP: - assert(op == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL); - op = WORKQ_THREADREQ_CHANGE_PRI; - goto again; - - case EAGAIN: - assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL); - act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ); - break; - - case ECANCELED: - case EINVAL: - case 0: - break; - - default: - assert(ret == 0); + kqworkq_unbind(p, kqr); } } /* - * Interact with the pthread kext to request a servicing thread. - * This will request a single thread at the highest QoS level - * for which there is work (whether that was the requested QoS - * for an event or an override applied to a lower-QoS request). + * If we aren't already busy processing events [for this QoS], + * request workq thread support as appropriate. * - * - Caller holds the workloop request lock + * TBD - for now, we don't segregate out processing by QoS. * * - May be called with the kqueue's wait queue set locked, * so cannot do anything that could recurse on that. 
*/ static void -kqworkloop_request_thread(struct kqworkloop *kqwl, kq_index_t qos_index) +kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index) { struct kqrequest *kqr; - assert(kqwl->kqwl_state & KQ_WORKLOOP); - - kqr = &kqwl->kqwl_request; - - assert(kqwl->kqwl_owner == THREAD_NULL); - assert((kqr->kqr_state & KQR_BOUND) == 0); - assert((kqr->kqr_state & KQR_THREQUESTED) == 0); - assert(!(kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD)); - - /* If we're draining thread requests, just bail */ - if (kqr->kqr_state & KQR_DRAIN) - return; - - if (pthread_functions != NULL && - pthread_functions->workq_threadreq != NULL) { - /* - * set request state flags, etc... before calling pthread - * This assures they are set before a possible synchronous - * callback to workloop_fulfill_threadreq(). - */ - kqr->kqr_state |= KQR_THREQUESTED; + /* convert to thread qos value */ + assert(qos_index < KQWQ_NBUCKETS); - /* Add a thread request reference on the kqueue. */ - kqueue_retain((struct kqueue *)kqwl); + kq_req_lock(kqwq); + kqr = kqworkq_get_request(kqwq, qos_index); - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST), - kqwl->kqwl_dynamicid, - 0, qos_index, kqr->kqr_state); - kqworkloop_threadreq_impl(kqwl, qos_index); - } else { - panic("kqworkloop_request_thread"); - return; + if ((kqr->kqr_state & KQR_WAKEUP) == 0) { + kqr->kqr_state |= KQR_WAKEUP; + if ((kqr->kqr_state & KQR_THREQUESTED) == 0) { + kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0); + } } + kq_req_unlock(kqwq); } -static void -kqworkloop_update_sync_override_state(struct kqworkloop *kqwl, boolean_t sync_ipc_override) -{ - struct kqrequest *kqr = &kqwl->kqwl_request; - kqwl_req_lock(kqwl); - kqr->kqr_has_sync_override = sync_ipc_override; - kqwl_req_unlock(kqwl); - -} - -static inline kq_index_t -kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *ipc_override_is_sync) +static kq_index_t +kqworkloop_owner_override(struct kqworkloop *kqwl) { struct kqrequest *kqr = 
&kqwl->kqwl_request; - kq_index_t override; - - *ipc_override_is_sync = FALSE; - override = MAX(MAX(kqr->kqr_qos_index, kqr->kqr_override_index), - kqr->kqr_dsync_waiters_qos); - - if (kqr->kqr_sync_suppress_count > 0 || kqr->kqr_has_sync_override) { - *ipc_override_is_sync = TRUE; - override = THREAD_QOS_USER_INTERACTIVE; - } - return override; + return MAX(kqr->kqr_qos_index, kqr->kqr_override_index); } static inline void @@ -6699,12 +6246,10 @@ kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl) { struct kqrequest *kqr = &kqwl->kqwl_request; - kqwl_req_held(kqwl); + kq_req_held(kqwl); if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) { - assert(kqr->kqr_state & KQR_BOUND); assert(kqr->kqr_thread); - kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED; act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL); } @@ -6713,17 +6258,13 @@ kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl) static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) { - const uint8_t KQWL_STAYACTIVE_FIRED_BIT = (1 << 0); - struct kqrequest *kqr = &kqwl->kqwl_request; - boolean_t old_ipc_override_is_sync = FALSE; - kq_index_t old_qos = kqworkloop_combined_qos(kqwl, &old_ipc_override_is_sync); struct kqueue *kq = &kqwl->kqwl_kqueue; - bool static_thread = (kq->kq_state & KQ_NO_WQ_THREAD); + kq_index_t old_owner_override = kqworkloop_owner_override(kqwl); kq_index_t i; /* must hold the kqr lock */ - kqwl_req_held(kqwl); + kq_req_held(kqwl); switch (op) { case KQWL_UTQ_UPDATE_WAKEUP_QOS: @@ -6742,7 +6283,6 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) kqr->kqr_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT; qos = kqr->kqr_stayactive_qos; assert(qos); - assert(!static_thread); } if (kqr->kqr_wakeup_indexes & (1 << qos)) { assert(kqr->kqr_state & KQR_WAKEUP); @@ -6752,7 +6292,7 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) kqr->kqr_wakeup_indexes |= (1 << qos); kqr->kqr_state |= 
KQR_WAKEUP; kqworkloop_request_fire_r2k_notification(kqwl); - goto recompute_async; + goto recompute; case KQWL_UTQ_UPDATE_STAYACTIVE_QOS: assert(qos); @@ -6761,19 +6301,25 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) { assert(kqr->kqr_state & KQR_WAKEUP); kqr->kqr_wakeup_indexes |= (1 << qos); - goto recompute_async; + goto recompute; } } break; + case KQWL_UTQ_PARKING: + case KQWL_UTQ_UNBINDING: + kqr->kqr_override_index = qos; + /* FALLTHROUGH */ case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS: - kqlock_held(kq); // to look at kq_queues - kqr->kqr_has_sync_override = FALSE; + if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) { + assert(qos == THREAD_QOS_UNSPECIFIED); + } + kqlock_held(kqwl); // to look at kq_queues i = KQWL_BUCKET_STAYACTIVE; if (TAILQ_EMPTY(&kqr->kqr_suppressed)) { kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED; } - if (!TAILQ_EMPTY(&kq->kq_queue[i]) && + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) && (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) { /* * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active @@ -6787,13 +6333,8 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) kqr->kqr_wakeup_indexes = 0; } for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) { - if (!TAILQ_EMPTY(&kq->kq_queue[i])) { + if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) { kqr->kqr_wakeup_indexes |= (1 << i); - struct knote *kn = TAILQ_FIRST(&kqwl->kqwl_kqueue.kq_queue[i]); - if (i == THREAD_QOS_USER_INTERACTIVE && - kn->kn_qos_override_is_sync) { - kqr->kqr_has_sync_override = TRUE; - } } } if (kqr->kqr_wakeup_indexes) { @@ -6802,20 +6343,18 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) } else { kqr->kqr_state &= ~KQR_WAKEUP; } - assert(qos == THREAD_QOS_UNSPECIFIED); - goto recompute_async; + goto recompute; case KQWL_UTQ_RESET_WAKEUP_OVERRIDE: - kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED; - 
assert(qos == THREAD_QOS_UNSPECIFIED); - goto recompute_async; + kqr->kqr_override_index = qos; + goto recompute; case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE: - recompute_async: + recompute: /* - * When modifying the wakeup QoS or the async override QoS, we always - * need to maintain our invariant that kqr_override_index is at least as - * large as the highest QoS for which an event is fired. + * When modifying the wakeup QoS or the override QoS, we always need to + * maintain our invariant that kqr_override_index is at least as large + * as the highest QoS for which an event is fired. * * However this override index can be larger when there is an overriden * suppressed knote pushing on the kqueue. @@ -6831,96 +6370,44 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) case KQWL_UTQ_REDRIVE_EVENTS: break; - case KQWL_UTQ_SET_ASYNC_QOS: - filt_wlheld(kqwl); + case KQWL_UTQ_SET_QOS_INDEX: kqr->kqr_qos_index = qos; break; - case KQWL_UTQ_SET_SYNC_WAITERS_QOS: - filt_wlheld(kqwl); - kqr->kqr_dsync_waiters_qos = qos; - break; - default: panic("unknown kqwl thread qos update operation: %d", op); } - boolean_t new_ipc_override_is_sync = FALSE; - kq_index_t new_qos = kqworkloop_combined_qos(kqwl, &new_ipc_override_is_sync); thread_t kqwl_owner = kqwl->kqwl_owner; thread_t servicer = kqr->kqr_thread; - __assert_only int ret; + boolean_t qos_changed = FALSE; + kq_index_t new_owner_override = kqworkloop_owner_override(kqwl); /* * Apply the diffs to the owner if applicable */ - if (filt_wlowner_is_valid(kqwl_owner)) { + if (kqwl_owner) { #if 0 /* JMM - need new trace hooks for owner overrides */ KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), - kqwl->kqwl_dynamicid, - (kqr->kqr_state & KQR_BOUND) ? 
thread_tid(kqwl_owner) : 0, - (kqr->kqr_qos_index << 8) | new_qos, - (kqr->kqr_override_index << 8) | kqr->kqr_state); + kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->kqr_qos_index, + (kqr->kqr_override_index << 16) | kqr->kqr_state); #endif - if (new_qos == kqr->kqr_dsync_owner_qos) { + if (new_owner_override == old_owner_override) { // nothing to do - } else if (kqr->kqr_dsync_owner_qos == THREAD_QOS_UNSPECIFIED) { - thread_add_ipc_override(kqwl_owner, new_qos); - } else if (new_qos == THREAD_QOS_UNSPECIFIED) { + } else if (old_owner_override == THREAD_QOS_UNSPECIFIED) { + thread_add_ipc_override(kqwl_owner, new_owner_override); + } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) { thread_drop_ipc_override(kqwl_owner); - } else /* kqr->kqr_dsync_owner_qos != new_qos */ { - thread_update_ipc_override(kqwl_owner, new_qos); - } - kqr->kqr_dsync_owner_qos = new_qos; - - if (new_ipc_override_is_sync && - !kqr->kqr_owner_override_is_sync) { - thread_add_sync_ipc_override(kqwl_owner); - } else if (!new_ipc_override_is_sync && - kqr->kqr_owner_override_is_sync) { - thread_drop_sync_ipc_override(kqwl_owner); + } else /* old_owner_override != new_owner_override */ { + thread_update_ipc_override(kqwl_owner, new_owner_override); } - kqr->kqr_owner_override_is_sync = new_ipc_override_is_sync; } /* * apply the diffs to the servicer */ - if (static_thread) { - /* - * Statically bound thread - * - * These threads don't participates in QoS overrides today, just wakeup - * the thread blocked on this kqueue if a new event arrived. 
- */ - - switch (op) { - case KQWL_UTQ_UPDATE_WAKEUP_QOS: - case KQWL_UTQ_UPDATE_STAYACTIVE_QOS: - case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS: - break; - - case KQWL_UTQ_RESET_WAKEUP_OVERRIDE: - case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE: - case KQWL_UTQ_REDRIVE_EVENTS: - case KQWL_UTQ_SET_ASYNC_QOS: - case KQWL_UTQ_SET_SYNC_WAITERS_QOS: - panic("should never be called"); - break; - } - - kqlock_held(kq); - - if ((kqr->kqr_state & KQR_BOUND) && (kqr->kqr_state & KQR_WAKEUP)) { - assert(servicer && !is_workqueue_thread(servicer)); - if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) { - kq->kq_state &= ~(KQ_SLEEP | KQ_SEL); - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT, - THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); - } - } - } else if ((kqr->kqr_state & KQR_THREQUESTED) == 0) { + if ((kqr->kqr_state & KQR_THREQUESTED) == 0) { /* * No servicer, nor thread-request * @@ -6929,70 +6416,54 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos) * first place. */ - if (kqwl_owner == THREAD_NULL && (kqr->kqr_state & KQR_WAKEUP)) { - kqworkloop_request_thread(kqwl, new_qos); + if (kqwl_owner == NULL && (kqr->kqr_state & KQR_WAKEUP)) { + int initiate_flags = 0; + if (op == KQWL_UTQ_UNBINDING) { + initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND; + } + kqueue_threadreq_initiate(kq, kqr, new_owner_override, + initiate_flags); } - } else if ((kqr->kqr_state & KQR_BOUND) == 0 && - (kqwl_owner || (kqr->kqr_state & KQR_WAKEUP) == 0)) { + } else if (servicer) { /* - * No servicer, thread request in flight we want to cancel + * Servicer in flight * - * We just got rid of the last knote of the kqueue or noticed an owner - * with a thread request still in flight, take it back. 
+ * Just apply the diff to the servicer */ - ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p, - &kqr->kqr_req, WORKQ_THREADREQ_CANCEL, 0, 0); - if (ret == 0) { - kqr->kqr_state &= ~KQR_THREQUESTED; - kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF); + struct uthread *ut = get_bsdthread_info(servicer); + if (ut->uu_kqueue_override != kqr->kqr_override_index) { + if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) { + thread_add_ipc_override(servicer, kqr->kqr_override_index); + } else if (kqr->kqr_override_index == THREAD_QOS_UNSPECIFIED) { + thread_drop_ipc_override(servicer); + } else /* ut->uu_kqueue_override != kqr->kqr_override_index */ { + thread_update_ipc_override(servicer, kqr->kqr_override_index); + } + ut->uu_kqueue_override = kqr->kqr_override_index; + qos_changed = TRUE; } - } else { - boolean_t qos_changed = FALSE; - + } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) { /* - * Servicer or request is in flight + * No events to deliver anymore. * - * Just apply the diff to the servicer or the thread request + * However canceling with turnstiles is challenging, so the fact that + * the request isn't useful will be discovered by the servicer himself + * later on. 
*/ - if (kqr->kqr_state & KQR_BOUND) { - servicer = kqr->kqr_thread; - struct uthread *ut = get_bsdthread_info(servicer); - if (ut->uu_kqueue_qos_index != new_qos) { - if (ut->uu_kqueue_qos_index == THREAD_QOS_UNSPECIFIED) { - thread_add_ipc_override(servicer, new_qos); - } else if (new_qos == THREAD_QOS_UNSPECIFIED) { - thread_drop_ipc_override(servicer); - } else /* ut->uu_kqueue_qos_index != new_qos */ { - thread_update_ipc_override(servicer, new_qos); - } - ut->uu_kqueue_qos_index = new_qos; - qos_changed = TRUE; - } + } else if (old_owner_override != new_owner_override) { + /* + * Request is in flight + * + * Apply the diff to the thread request + */ + kqueue_threadreq_modify(kq, kqr, new_owner_override); + qos_changed = TRUE; + } - if (new_ipc_override_is_sync != ut->uu_kqueue_override_is_sync) { - if (new_ipc_override_is_sync && - !ut->uu_kqueue_override_is_sync) { - thread_add_sync_ipc_override(servicer); - } else if (!new_ipc_override_is_sync && - ut->uu_kqueue_override_is_sync) { - thread_drop_sync_ipc_override(servicer); - } - ut->uu_kqueue_override_is_sync = new_ipc_override_is_sync; - qos_changed = TRUE; - } - } else if (old_qos != new_qos) { - assert(new_qos); - kqworkloop_threadreq_modify(kqwl, new_qos); - qos_changed = TRUE; - } - if (qos_changed) { - servicer = kqr->kqr_thread; - KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), - kqwl->kqwl_dynamicid, - (kqr->kqr_state & KQR_BOUND) ? 
thread_tid(servicer) : 0, - (kqr->kqr_qos_index << 16) | (new_qos << 8) | new_ipc_override_is_sync, - (kqr->kqr_override_index << 8) | kqr->kqr_state); - } + if (qos_changed) { + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid, + thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, + (kqr->kqr_override_index << 16) | kqr->kqr_state); } } @@ -7002,179 +6473,179 @@ kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index) /* convert to thread qos value */ assert(qos_index < KQWL_NBUCKETS); - kqwl_req_lock(kqwl); + kq_req_lock(kqwl); kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index); - kqwl_req_unlock(kqwl); + kq_req_unlock(kqwl); } -/* - * These arrays described the low and high qindexes for a given qos_index. - * The values come from the chart in (must stay in sync). - */ -static kq_index_t _kqwq_base_index[KQWQ_NQOS] = {0, 0, 6, 11, 15, 18, 20, 21}; -static kq_index_t _kqwq_high_index[KQWQ_NQOS] = {0, 5, 10, 14, 17, 19, 20, 21}; - static struct kqtailq * -kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index) +kqueue_get_queue(struct kqueue *kq, kq_index_t qos_index) { if (kq->kq_state & KQ_WORKQ) { - assert(qos_index < KQWQ_NQOS); - return &kq->kq_queue[_kqwq_base_index[qos_index]]; + assert(qos_index < KQWQ_NBUCKETS); } else if (kq->kq_state & KQ_WORKLOOP) { assert(qos_index < KQWL_NBUCKETS); - return &kq->kq_queue[qos_index]; } else { assert(qos_index == QOS_INDEX_KQFILE); - return &kq->kq_queue[QOS_INDEX_KQFILE]; } + static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue), + "struct kqueue::kq_queue must be exactly at the end"); + return &kq->kq_queue[qos_index]; +} + +static int +kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index) +{ + return TAILQ_EMPTY(kqueue_get_queue(kq, qos_index)); } static struct kqtailq * -kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index) +kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn) { - if (kq->kq_state & KQ_WORKQ) { - 
assert(qos_index < KQWQ_NQOS); - return &kq->kq_queue[_kqwq_high_index[qos_index]]; - } else if (kq->kq_state & KQ_WORKLOOP) { - assert(qos_index < KQWL_NBUCKETS); - return &kq->kq_queue[KQWL_BUCKET_STAYACTIVE]; + if (kq.kq->kq_state & KQ_WORKQ) { + return &kqworkq_get_request(kq.kqwq, kn->kn_qos_index)->kqr_suppressed; + } else if (kq.kq->kq_state & KQ_WORKLOOP) { + return &kq.kqwl->kqwl_request.kqr_suppressed; } else { - assert(qos_index == QOS_INDEX_KQFILE); - return &kq->kq_queue[QOS_INDEX_KQFILE]; + return &kq.kqf->kqf_suppressed; } } -static int -kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index) +static struct turnstile * +kqueue_get_turnstile(kqueue_t kqu, bool can_alloc) { - struct kqtailq *base_queue = kqueue_get_base_queue(kq, qos_index); - struct kqtailq *queue = kqueue_get_high_queue(kq, qos_index); + uint8_t kqr_state; - do { - if (!TAILQ_EMPTY(queue)) - return 0; - } while (queue-- > base_queue); - return 1; -} + if ((kqu.kq->kq_state & KQ_WORKLOOP) == 0) { + return TURNSTILE_NULL; + } -static struct kqtailq * -kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index) -{ - struct kqtailq *res; - struct kqrequest *kqr; + kqr_state = os_atomic_load(&kqu.kqwl->kqwl_request.kqr_state, relaxed); + if (kqr_state & KQR_ALLOCATED_TURNSTILE) { + /* force a dependency to pair with the atomic or with release below */ + return os_atomic_load_with_dependency_on(&kqu.kqwl->kqwl_turnstile, + kqr_state); + } - if (kq->kq_state & KQ_WORKQ) { - struct kqworkq *kqwq = (struct kqworkq *)kq; + if (!can_alloc) { + return TURNSTILE_NULL; + } - kqr = kqworkq_get_request(kqwq, qos_index); - res = &kqr->kqr_suppressed; - } else if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; + struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL; + + kq_req_lock(kqu); + if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) { + workq_kern_threadreq_lock(kqu.kqwl->kqwl_p); + } - kqr = &kqwl->kqwl_request; - res = 
&kqr->kqr_suppressed; + if (kqu.kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) { + free_ts = ts; + ts = kqu.kqwl->kqwl_turnstile; } else { - struct kqfile *kqf = (struct kqfile *)kq; - res = &kqf->kqf_suppressed; + ts = turnstile_prepare((uintptr_t)kqu.kqwl, &kqu.kqwl->kqwl_turnstile, + ts, TURNSTILE_WORKLOOPS); + + /* release-barrier to pair with the unlocked load of kqwl_turnstile above */ + os_atomic_or(&kqu.kqwl->kqwl_request.kqr_state, + KQR_ALLOCATED_TURNSTILE, release); } - return res; -} -static kq_index_t -knote_get_queue_index(struct knote *kn) -{ - kq_index_t override_index = knote_get_qos_override_index(kn); - kq_index_t qos_index = knote_get_qos_index(kn); - struct kqueue *kq = knote_get_kq(kn); - kq_index_t res; + if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) { + workq_kern_threadreq_unlock(kqu.kqwl->kqwl_p); + } + kq_req_unlock(kqu.kqwl); - if (kq->kq_state & KQ_WORKQ) { - res = _kqwq_base_index[qos_index]; - if (override_index > qos_index) - res += override_index - qos_index; - assert(res <= _kqwq_high_index[qos_index]); - } else if (kq->kq_state & KQ_WORKLOOP) { - res = MAX(override_index, qos_index); - assert(res < KQWL_NBUCKETS); - } else { - assert(qos_index == QOS_INDEX_KQFILE); - assert(override_index == QOS_INDEX_KQFILE); - res = QOS_INDEX_KQFILE; + if (free_ts) { + turnstile_deallocate(free_ts); } - return res; + return ts; } -static struct kqtailq * -knote_get_queue(struct knote *kn) +struct turnstile * +kqueue_turnstile(struct kqueue *kq) { - kq_index_t qindex = knote_get_queue_index(kn); - - return &(knote_get_kq(kn))->kq_queue[qindex]; + return kqueue_get_turnstile(kq, false); } -static kq_index_t -knote_get_req_index(struct knote *kn) +struct turnstile * +kqueue_alloc_turnstile(struct kqueue *kq) { - return kn->kn_req_index; + return kqueue_get_turnstile(kq, true); } -static kq_index_t -knote_get_qos_index(struct knote *kn) +static struct kqtailq * +knote_get_queue(struct knote *kn) { - return kn->kn_qos_index; + return 
kqueue_get_queue(knote_get_kq(kn), kn->kn_qos_index); } static void -knote_set_qos_index(struct knote *kn, kq_index_t qos_index) +knote_reset_priority(struct knote *kn, pthread_priority_t pp) { struct kqueue *kq = knote_get_kq(kn); + kq_index_t qos = _pthread_priority_thread_qos(pp); - assert(qos_index < KQWQ_NQOS); assert((kn->kn_status & KN_QUEUED) == 0); if (kq->kq_state & KQ_WORKQ) { - assert(qos_index > THREAD_QOS_UNSPECIFIED); + if (qos == THREAD_QOS_UNSPECIFIED) { + /* On workqueues, outside of QoS means MANAGER */ + qos = KQWQ_QOS_MANAGER; + pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; + } else { + pp = _pthread_priority_normalize(pp); + } } else if (kq->kq_state & KQ_WORKLOOP) { - /* XXX this policy decision shouldn't be here */ - if (qos_index == THREAD_QOS_UNSPECIFIED) - qos_index = THREAD_QOS_LEGACY; - } else - qos_index = QOS_INDEX_KQFILE; + assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0); + pp = _pthread_priority_normalize(pp); + } else { + pp = _pthread_unspecified_priority(); + qos = THREAD_QOS_UNSPECIFIED; + } - /* always set requested */ - kn->kn_req_index = qos_index; + kn->kn_qos = pp; + kn->kn_req_index = qos; + + if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) { + /* Never lower QoS when in "Merge" mode */ + kn->kn_qos_override = qos; + } /* only adjust in-use qos index when not suppressed */ - if ((kn->kn_status & KN_SUPPRESSED) == 0) - kn->kn_qos_index = qos_index; + if ((kn->kn_status & KN_SUPPRESSED) == 0) { + kn->kn_qos_index = qos; + } else if (kq->kq_state & KQ_WORKQ) { + kqworkq_update_override((struct kqworkq *)kq, kn, qos); + } else if (kq->kq_state & KQ_WORKLOOP) { + kqworkloop_update_override((struct kqworkloop *)kq, qos); + } } static void knote_set_qos_overcommit(struct knote *kn) { struct kqueue *kq = knote_get_kq(kn); - struct kqrequest *kqr; /* turn overcommit on for the appropriate thread request? 
*/ - if (kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) { - if (kq->kq_state & KQ_WORKQ) { - kq_index_t qos_index = knote_get_qos_index(kn); - struct kqworkq *kqwq = (struct kqworkq *)kq; - - kqr = kqworkq_get_request(kqwq, qos_index); - - kqwq_req_lock(kqwq); - kqr->kqr_state |= KQR_THOVERCOMMIT; - kqwq_req_unlock(kqwq); - } else if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; + if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) && + (kq->kq_state & KQ_WORKLOOP)) { + struct kqworkloop *kqwl = (struct kqworkloop *)kq; + struct kqrequest *kqr = &kqwl->kqwl_request; - kqr = &kqwl->kqwl_request; + /* + * This test is racy, but since we never remove this bit, + * it allows us to avoid taking a lock. + */ + if (kqr->kqr_state & KQR_THOVERCOMMIT) { + return; + } - kqwl_req_lock(kqwl); - kqr->kqr_state |= KQR_THOVERCOMMIT; - kqwl_req_unlock(kqwl); + kq_req_lock(kqwl); + kqr->kqr_state |= KQR_THOVERCOMMIT; + if (!kqr->kqr_thread && (kqr->kqr_state & KQR_THREQUESTED)) { + kqueue_threadreq_modify(kq, kqr, kqr->kqr_req.tr_qos); } + kq_req_unlock(kqwl); } } @@ -7185,490 +6656,309 @@ knote_get_qos_override_index(struct knote *kn) } static void -knote_set_qos_override_index(struct knote *kn, kq_index_t override_index, - boolean_t override_is_sync) -{ - struct kqueue *kq = knote_get_kq(kn); - kq_index_t qos_index = knote_get_qos_index(kn); - kq_index_t old_override_index = knote_get_qos_override_index(kn); - boolean_t old_override_is_sync = kn->kn_qos_override_is_sync; - uint32_t flags = 0; - - assert((kn->kn_status & KN_QUEUED) == 0); - - if (override_index == KQWQ_QOS_MANAGER) { - assert(qos_index == KQWQ_QOS_MANAGER); - } else { - assert(override_index < KQWQ_QOS_MANAGER); - } - - kn->kn_qos_override = override_index; - kn->kn_qos_override_is_sync = override_is_sync; - - /* - * If this is a workq/workloop kqueue, apply the override to the - * servicing thread. 
- */ - if (kq->kq_state & KQ_WORKQ) { - struct kqworkq *kqwq = (struct kqworkq *)kq; - - assert(qos_index > THREAD_QOS_UNSPECIFIED); - kqworkq_update_override(kqwq, qos_index, override_index); - } else if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - - if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) { - flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS; - - if (override_index == THREAD_QOS_USER_INTERACTIVE - && override_is_sync) { - flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI; - } - - if (old_override_index == THREAD_QOS_USER_INTERACTIVE - && old_override_is_sync) { - flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI; - } - } - - assert(qos_index > THREAD_QOS_UNSPECIFIED); - kqworkloop_update_override(kqwl, qos_index, override_index, flags); - } -} - -static kq_index_t -knote_get_sync_qos_override_index(struct knote *kn) -{ - return kn->kn_qos_sync_override; -} - -static void -kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index) +kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, + kq_index_t override_index) { struct kqrequest *kqr; kq_index_t old_override_index; + kq_index_t queue_index = kn->kn_qos_index; - if (override_index <= qos_index) { + if (override_index <= queue_index) { return; } - kqr = kqworkq_get_request(kqwq, qos_index); + kqr = kqworkq_get_request(kqwq, queue_index); - kqwq_req_lock(kqwq); + kq_req_lock(kqwq); old_override_index = kqr->kqr_override_index; if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) { kqr->kqr_override_index = override_index; /* apply the override to [incoming?] 
servicing thread */ - if (kqr->kqr_state & KQR_BOUND) { - thread_t wqthread = kqr->kqr_thread; - - /* only apply if non-manager */ - assert(wqthread); - if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { - if (old_override_index) - thread_update_ipc_override(wqthread, override_index); - else - thread_add_ipc_override(wqthread, override_index); - } + if (kqr->kqr_thread) { + if (old_override_index) + thread_update_ipc_override(kqr->kqr_thread, override_index); + else + thread_add_ipc_override(kqr->kqr_thread, override_index); } } - kqwq_req_unlock(kqwq); + kq_req_unlock(kqwq); } -/* called with the kqworkq lock held */ static void -kqworkq_bind_thread_impl( - struct kqworkq *kqwq, - kq_index_t qos_index, - thread_t thread, - unsigned int flags) +kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index) { - /* request lock must be held */ - kqwq_req_held(kqwq); - - struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index); - assert(kqr->kqr_state & KQR_THREQUESTED); - - if (qos_index == KQWQ_QOS_MANAGER) - flags |= KEVENT_FLAG_WORKQ_MANAGER; - - struct uthread *ut = get_bsdthread_info(thread); - - /* - * If this is a manager, and the manager request bit is - * not set, assure no other thread is bound. If the bit - * is set, make sure the old thread is us (or not set). 
- */ - if (flags & KEVENT_FLAG_WORKQ_MANAGER) { - if ((kqr->kqr_state & KQR_BOUND) == 0) { - kqr->kqr_state |= (KQR_BOUND | KQWQ_THMANAGER); - TAILQ_INIT(&kqr->kqr_suppressed); - kqr->kqr_thread = thread; - ut->uu_kqueue_bound = (struct kqueue *)kqwq; - ut->uu_kqueue_qos_index = KQWQ_QOS_MANAGER; - ut->uu_kqueue_flags = (KEVENT_FLAG_WORKQ | - KEVENT_FLAG_WORKQ_MANAGER); - } else { - assert(kqr->kqr_state & KQR_BOUND); - assert(thread == kqr->kqr_thread); - assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq); - assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER); - assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER); - } - return; - } - - /* Just a normal one-queue servicing thread */ - assert(kqr->kqr_state & KQR_THREQUESTED); - assert(kqr->kqr_qos_index == qos_index); - - if ((kqr->kqr_state & KQR_BOUND) == 0) { - kqr->kqr_state |= KQR_BOUND; - TAILQ_INIT(&kqr->kqr_suppressed); - kqr->kqr_thread = thread; - - /* apply an ipc QoS override if one is needed */ - if (kqr->kqr_override_index) { - assert(kqr->kqr_qos_index); - assert(kqr->kqr_override_index > kqr->kqr_qos_index); - assert(thread_get_ipc_override(thread) == THREAD_QOS_UNSPECIFIED); - thread_add_ipc_override(thread, kqr->kqr_override_index); - } - - /* indicate that we are processing in the uthread */ - ut->uu_kqueue_bound = (struct kqueue *)kqwq; - ut->uu_kqueue_qos_index = qos_index; - ut->uu_kqueue_flags = flags; - } else { - /* - * probably syncronously bound AND post-request bound - * this logic can go away when we get rid of post-request bind - */ - assert(kqr->kqr_state & KQR_BOUND); - assert(thread == kqr->kqr_thread); - assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq); - assert(ut->uu_kqueue_qos_index == qos_index); - assert((ut->uu_kqueue_flags & flags) == flags); - } + kq_req_lock(kqwl); + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE, + override_index); + kq_req_unlock(kqwl); } -static void -kqworkloop_update_override( - struct kqworkloop *kqwl, - kq_index_t 
qos_index, - kq_index_t override_index, - uint32_t flags) +static thread_qos_t +kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread) { + struct uthread *ut = get_bsdthread_info(thread); struct kqrequest *kqr = &kqwl->kqwl_request; + kq_index_t ipc_override = ut->uu_kqueue_override; - kqwl_req_lock(kqwl); - - /* Do not override on attached threads */ - if (kqr->kqr_state & KQR_BOUND) { - assert(kqr->kqr_thread); - - if (kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD) { - kqwl_req_unlock(kqwl); - assert(!is_workqueue_thread(kqr->kqr_thread)); - return; - } - } - - /* Update sync ipc counts on kqr for suppressed knotes */ - if (flags & KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS) { - kqworkloop_update_suppress_sync_count(kqr, flags); - } + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid, + thread_tid(thread), 0, 0); - if ((flags & KQWL_UO_UPDATE_OVERRIDE_LAZY) == 0) { - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE, - MAX(qos_index, override_index)); - } - kqwl_req_unlock(kqwl); -} + kq_req_held(kqwl); + assert(ut->uu_kqr_bound == kqr); + ut->uu_kqr_bound = NULL; + ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED; -static void -kqworkloop_update_suppress_sync_count( - struct kqrequest *kqr, - uint32_t flags) -{ - if (flags & KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI) { - kqr->kqr_sync_suppress_count++; + if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) { + turnstile_update_inheritor(kqwl->kqwl_turnstile, + TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(kqwl->kqwl_turnstile, + TURNSTILE_INTERLOCK_HELD); } - if (flags & KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI) { - assert(kqr->kqr_sync_suppress_count > 0); - kqr->kqr_sync_suppress_count--; - } + kqr->kqr_thread = NULL; + kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); + return ipc_override; } /* - * kqworkloop_unbind_thread - Unbind the servicer thread of a workloop kqueue + * kqworkloop_unbind - Unbind the servicer thread of a 
workloop kqueue * - * It will end the processing phase in case it was still processing: - * - * We may have to request a new thread for not KQ_NO_WQ_THREAD workloop. - * This can happen if : - * - there were active events at or above our QoS we never got to (count > 0) + * It will acknowledge events, and possibly request a new thread if: + * - there were active events left * - we pended waitq hook callouts during processing * - we pended wakeups while processing (or unsuppressing) * * Called with kqueue lock held. */ - static void -kqworkloop_unbind_thread( - struct kqworkloop *kqwl, - thread_t thread, - __unused unsigned int flags) +kqworkloop_unbind(proc_t p, struct kqworkloop *kqwl) { struct kqueue *kq = &kqwl->kqwl_kqueue; struct kqrequest *kqr = &kqwl->kqwl_request; + thread_t thread = kqr->kqr_thread; + int op = KQWL_UTQ_PARKING; + kq_index_t ipc_override, qos_override = THREAD_QOS_UNSPECIFIED; - kqlock_held(kq); + assert(thread == current_thread()); - assert((kq->kq_state & KQ_PROCESSING) == 0); - if (kq->kq_state & KQ_PROCESSING) { - return; - } + kqlock(kqwl); /* * Forcing the KQ_PROCESSING flag allows for QoS updates because of * unsuppressing knotes not to be applied until the eventual call to * kqworkloop_update_threads_qos() below. 
*/ - kq->kq_state |= KQ_PROCESSING; - kqworkloop_acknowledge_events(kqwl, TRUE); - kq->kq_state &= ~KQ_PROCESSING; - - kqwl_req_lock(kqwl); - - /* deal with extraneous unbinds in release kernels */ - assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == KQR_BOUND); - if ((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) != KQR_BOUND) { - kqwl_req_unlock(kqwl); - return; + assert((kq->kq_state & KQ_PROCESSING) == 0); + if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) { + kq->kq_state |= KQ_PROCESSING; + qos_override = kqworkloop_acknowledge_events(kqwl); + kq->kq_state &= ~KQ_PROCESSING; } - assert(thread == current_thread()); - assert(kqr->kqr_thread == thread); - if (kqr->kqr_thread != thread) { - kqwl_req_unlock(kqwl); - return; - } + kq_req_lock(kqwl); - struct uthread *ut = get_bsdthread_info(thread); - kq_index_t old_qos_index = ut->uu_kqueue_qos_index; - boolean_t ipc_override_is_sync = ut->uu_kqueue_override_is_sync; - ut->uu_kqueue_bound = NULL; - ut->uu_kqueue_qos_index = 0; - ut->uu_kqueue_override_is_sync = 0; - ut->uu_kqueue_flags = 0; - - /* unbind the servicer thread, drop overrides */ - kqr->kqr_thread = NULL; - kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0); + ipc_override = kqworkloop_unbind_locked(kqwl, thread); + kqworkloop_update_threads_qos(kqwl, op, qos_override); + + kq_req_unlock(kqwl); - kqwl_req_unlock(kqwl); + kqunlock(kqwl); /* * Drop the override on the current thread last, after the call to * kqworkloop_update_threads_qos above. 
*/ - if (old_qos_index) { + if (ipc_override) { thread_drop_ipc_override(thread); } - if (ipc_override_is_sync) { - thread_drop_sync_ipc_override(thread); - } + + /* If last reference, dealloc the workloop kq */ + kqueue_release_last(p, kqwl); } -/* called with the kqworkq lock held */ -static void -kqworkq_unbind_thread( - struct kqworkq *kqwq, - kq_index_t qos_index, - thread_t thread, - __unused unsigned int flags) +static thread_qos_t +kqworkq_unbind_locked(__assert_only struct kqworkq *kqwq, + struct kqrequest *kqr, thread_t thread) { - struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index); - kq_index_t override_index = 0; - - /* request lock must be held */ - kqwq_req_held(kqwq); - - assert(thread == current_thread()); - - if ((kqr->kqr_state & KQR_BOUND) == 0) { - assert(kqr->kqr_state & KQR_BOUND); - return; - } - - assert(kqr->kqr_thread == thread); - assert(TAILQ_EMPTY(&kqr->kqr_suppressed)); + struct uthread *ut = get_bsdthread_info(thread); + kq_index_t old_override = kqr->kqr_override_index; - /* - * If there is an override, drop it from the current thread - * and then we are free to recompute (a potentially lower) - * minimum override to apply to the next thread request. 
- */ - if (kqr->kqr_override_index) { - struct kqtailq *base_queue = kqueue_get_base_queue(&kqwq->kqwq_kqueue, qos_index); - struct kqtailq *queue = kqueue_get_high_queue(&kqwq->kqwq_kqueue, qos_index); + KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1, + thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, 0); - /* if not bound to a manager thread, drop the current ipc override */ - if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) { - thread_drop_ipc_override(thread); - } + kq_req_held(kqwq); + assert(ut->uu_kqr_bound == kqr); + ut->uu_kqr_bound = NULL; + kqr->kqr_thread = NULL; + kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED); + kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED; - /* recompute the new override */ - do { - if (!TAILQ_EMPTY(queue)) { - override_index = queue - base_queue + qos_index; - break; - } - } while (queue-- > base_queue); - } + return old_override; +} - /* Mark it unbound */ - kqr->kqr_thread = NULL; - kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQWQ_THMANAGER); +/* + * kqworkq_unbind - unbind of a workq kqueue from a thread + * + * We may have to request new threads. 
+ * This can happen there are no waiting processing threads and: + * - there were active events we never got to (count > 0) + * - we pended waitq hook callouts during processing + * - we pended wakeups while processing (or unsuppressing) + */ +static void +kqworkq_unbind(proc_t p, struct kqrequest *kqr) +{ + struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue; + __assert_only int rc; - /* apply the new override */ - if (override_index > kqr->kqr_qos_index) { - kqr->kqr_override_index = override_index; - } else { - kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED; - } + kqlock(kqwq); + rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND); + assert(rc == -1); + kqunlock(kqwq); } struct kqrequest * kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index) { - assert(qos_index < KQWQ_NQOS); + assert(qos_index < KQWQ_NBUCKETS); return &kqwq->kqwq_request[qos_index]; } -void -knote_adjust_qos(struct knote *kn, qos_t new_qos, qos_t new_override, kq_index_t sync_override_index) +static void +knote_apply_qos_override(struct knote *kn, kq_index_t qos_index) { - struct kqueue *kq = knote_get_kq(kn); - boolean_t override_is_sync = FALSE; - - if (kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) { - kq_index_t new_qos_index; - kq_index_t new_override_index; - kq_index_t servicer_qos_index; - - new_qos_index = qos_index_from_qos(kn, new_qos, FALSE); - new_override_index = qos_index_from_qos(kn, new_override, TRUE); + assert((kn->kn_status & KN_QUEUED) == 0); - /* make sure the servicer qos acts as a floor */ - servicer_qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE); - if (servicer_qos_index > new_qos_index) - new_qos_index = servicer_qos_index; - if (servicer_qos_index > new_override_index) - new_override_index = servicer_qos_index; - if (sync_override_index >= new_override_index) { - new_override_index = sync_override_index; - override_is_sync = TRUE; - } + kn->kn_qos_override = qos_index; - kqlock(kq); - if (new_qos_index != knote_get_req_index(kn) || - 
new_override_index != knote_get_qos_override_index(kn) || - override_is_sync != kn->kn_qos_override_is_sync) { - if (kn->kn_status & KN_QUEUED) { - knote_dequeue(kn); - knote_set_qos_index(kn, new_qos_index); - knote_set_qos_override_index(kn, new_override_index, override_is_sync); - knote_enqueue(kn); - knote_wakeup(kn); - } else { - knote_set_qos_index(kn, new_qos_index); - knote_set_qos_override_index(kn, new_override_index, override_is_sync); - } + if (kn->kn_status & KN_SUPPRESSED) { + struct kqueue *kq = knote_get_kq(kn); + /* + * For suppressed events, the kn_qos_index field cannot be touched as it + * allows us to know on which supress queue the knote is for a kqworkq. + * + * Also, there's no natural push applied on the kqueues when this field + * changes anyway. We hence need to apply manual overrides in this case, + * which will be cleared when the events are later acknowledged. + */ + if (kq->kq_state & KQ_WORKQ) { + kqworkq_update_override((struct kqworkq *)kq, kn, qos_index); + } else { + kqworkloop_update_override((struct kqworkloop *)kq, qos_index); } - kqunlock(kq); + } else { + kn->kn_qos_index = qos_index; } } -void -knote_adjust_sync_qos(struct knote *kn, kq_index_t sync_qos, boolean_t lock_kq) +static bool +knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn, int result, + thread_qos_t *qos_out) { - struct kqueue *kq = knote_get_kq(kn); - kq_index_t old_sync_override; - kq_index_t qos_index = knote_get_qos_index(kn); - uint32_t flags = 0; - - /* Tracking only happens for UI qos */ - if (sync_qos != THREAD_QOS_USER_INTERACTIVE && - sync_qos != THREAD_QOS_UNSPECIFIED) { - return; - } - - if (lock_kq) - kqlock(kq); + thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7; - if (kq->kq_state & KQ_WORKLOOP) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; + kqlock_held(kq); - old_sync_override = knote_get_sync_qos_override_index(kn); - if (old_sync_override != sync_qos) { - kn->kn_qos_sync_override = sync_qos; + 
assert(result & FILTER_ADJUST_EVENT_QOS_BIT); + assert(qos_index < THREAD_QOS_LAST); - /* update sync ipc counters for suppressed knotes */ - if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) { - flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS; + /* + * Early exit for knotes that should not change QoS + * + * It is safe to test kn_req_index against MANAGER / STAYACTIVE because + * knotes with such kn_req_index values never change for their entire + * lifetime. + */ + if (__improbable(!knote_fops(kn)->f_adjusts_qos)) { + panic("filter %d cannot change QoS", kn->kn_filtid); + } else if (kq->kq_state & KQ_WORKLOOP) { + if (kn->kn_req_index == KQWL_BUCKET_STAYACTIVE) { + return false; + } + } else if (kq->kq_state & KQ_WORKQ) { + if (kn->kn_req_index == KQWQ_QOS_MANAGER) { + return false; + } + } else { + return false; + } - /* Do not recalculate kqwl override, it would be done later */ - flags = flags | KQWL_UO_UPDATE_OVERRIDE_LAZY; + /* + * knotes with the FALLBACK flag will only use their registration QoS if the + * incoming event has no QoS, else, the registration QoS acts as a floor. + */ + if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) { + if (qos_index == THREAD_QOS_UNSPECIFIED) + qos_index = kn->kn_req_index; + } else { + if (qos_index < kn->kn_req_index) + qos_index = kn->kn_req_index; + } + if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) { + /* Never lower QoS when in "Merge" mode */ + return false; + } - if (sync_qos == THREAD_QOS_USER_INTERACTIVE) { - flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI; - } + if ((kn->kn_status & KN_LOCKED) && kn->kn_inuse) { + /* + * When we're trying to update the QoS override and that both an + * f_event() and other f_* calls are running concurrently, any of these + * in flight calls may want to perform overrides that aren't properly + * serialized with each other. 
+ * + * The first update that observes this racy situation enters a "Merge" + * mode which causes subsequent override requests to saturate the + * override instead of replacing its value. + * + * This mode is left when knote_unlock() or knote_call_filter_event() + * observe that no other f_* routine is in flight. + */ + kn->kn_status |= KN_MERGE_QOS; + } - if (old_sync_override == THREAD_QOS_USER_INTERACTIVE) { - flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI; - } + if (kn->kn_qos_override == qos_index) { + return false; + } - kqworkloop_update_override(kqwl, qos_index, sync_qos, - flags); - } + *qos_out = qos_index; + return true; +} +static void +knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result) +{ + thread_qos_t qos; + if (knote_should_apply_qos_override(kq, kn, result, &qos)) { + knote_dequeue(kn); + knote_apply_qos_override(kn, qos); + if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) { + knote_wakeup(kn); } } - if (lock_kq) - kqunlock(kq); } static void knote_wakeup(struct knote *kn) { struct kqueue *kq = knote_get_kq(kn); - kq_index_t qos_index = knote_get_qos_index(kn); kqlock_held(kq); if (kq->kq_state & KQ_WORKQ) { - /* request a servicing thread */ struct kqworkq *kqwq = (struct kqworkq *)kq; - kqworkq_request_help(kqwq, qos_index); - + kqworkq_request_help(kqwq, kn->kn_qos_index); } else if (kq->kq_state & KQ_WORKLOOP) { - /* request a servicing thread */ struct kqworkloop *kqwl = (struct kqworkloop *)kq; - if (kqworkloop_is_processing_on_current_thread(kqwl)) { - /* - * kqworkloop_end_processing() will perform the required QoS - * computations when it unsets the processing mode. - */ - return; + /* + * kqworkloop_end_processing() will perform the required QoS + * computations when it unsets the processing mode. 
+ */ + if (!kqworkloop_is_processing_on_current_thread(kqwl)) { + kqworkloop_request_help(kqwl, kn->kn_qos_index); } - kqworkloop_request_help(kqwl, qos_index); } else { struct kqfile *kqf = (struct kqfile *)kq; @@ -7679,10 +6969,8 @@ knote_wakeup(struct knote *kn) /* wakeup a thread waiting on this queue */ if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) { kq->kq_state &= ~(KQ_SLEEP | KQ_SEL); - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - KQ_EVENT, - THREAD_AWAKENED, - WAITQ_ALL_PRIORITIES); + waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT, + THREAD_AWAKENED, WAITQ_ALL_PRIORITIES); } /* wakeup other kqueues/select sets we're inside */ @@ -7714,8 +7002,8 @@ kqueue_interrupt(struct kqueue *kq) assert(kq->kq_state & KQ_PROCESSING); kq->kq_state &= ~KQ_PROCWAIT; - suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE); - (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, + suppressq = kqueue_get_suppressed_queue(kq, NULL); + (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, CAST_EVENT64_T(suppressq), THREAD_RESTART, WAITQ_ALL_PRIORITIES); @@ -7744,7 +7032,6 @@ waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos) struct kqworkq *kqwq = (struct kqworkq *)kq; kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER); - } else if (kq->kq_state & KQ_WORKLOOP) { struct kqworkloop *kqwl = (struct kqworkloop *)kq; @@ -7768,8 +7055,7 @@ klist_init(struct klist *list) * the hint) and not deadlock itself. * * The object lock should also hold off pending - * detach/drop operations. But we'll prevent it here - * too (by taking a use reference) - just in case. + * detach/drop operations. 
*/ void knote(struct klist *list, long hint) @@ -7778,23 +7064,8 @@ knote(struct klist *list, long hint) SLIST_FOREACH(kn, list, kn_selnext) { struct kqueue *kq = knote_get_kq(kn); - kqlock(kq); - - assert(!knoteuse_needs_boost(kn, NULL)); - - /* If we can get a use reference - deliver event */ - if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) { - int result; - - /* call the event with only a use count */ - result = knote_fops(kn)->f_event(kn, hint); - - /* if its not going away and triggered */ - if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result) - knote_activate(kn); - /* kq lock held */ - } + knote_call_filter_event(kq, kn, hint); kqunlock(kq); } } @@ -7845,32 +7116,45 @@ knote_vanish(struct klist *list) SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) { struct kqueue *kq = knote_get_kq(kn); - int result; kqlock(kq); - - assert(!knoteuse_needs_boost(kn, NULL)); - - if ((kn->kn_status & KN_DROPPING) == 0) { + if (kn->kn_status & KN_REQVANISH) { /* If EV_VANISH supported - prepare to deliver one */ - if (kn->kn_status & KN_REQVANISH) { - kn->kn_status |= KN_VANISHED; - knote_activate(kn); - - } else if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) { - /* call the event with only a use count */ - result = knote_fops(kn)->f_event(kn, NOTE_REVOKE); - - /* if its not going away and triggered */ - if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result) - knote_activate(kn); - /* lock held again */ - } + kn->kn_status |= KN_VANISHED; + knote_activate(kn); + } else { + knote_call_filter_event(kq, kn, NOTE_REVOKE); } kqunlock(kq); } } +/* + * Force a lazy allocation of the waitqset link + * of the kq_wqs associated with the kn + * if it wasn't already allocated. + * + * This allows knote_link_waitq to never block + * if reserved_link is not NULL. + */ +void +knote_link_waitqset_lazy_alloc(struct knote *kn) +{ + struct kqueue *kq = knote_get_kq(kn); + waitq_set_lazy_init_link(&kq->kq_wqs); +} + +/* + * Check if a lazy allocation for the waitqset link + * of the kq_wqs is needed. 
+ */ +boolean_t +knote_link_waitqset_should_lazy_alloc(struct knote *kn) +{ + struct kqueue *kq = knote_get_kq(kn); + return waitq_set_should_lazy_init_link(&kq->kq_wqs); +} + /* * For a given knote, link a provided wait queue directly with the kqueue. * Wakeups will happen via recursive wait queue support. But nothing will move @@ -7880,7 +7164,8 @@ knote_vanish(struct klist *list) * kqueue and knote references are held by caller. * waitq locked by caller. * - * caller provides the wait queue link structure. + * caller provides the wait queue link structure and ensures that the kq->kq_wqs + * is linked by previously calling knote_link_waitqset_lazy_alloc. */ int knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link) @@ -7920,17 +7205,15 @@ knote_unlink_waitq(struct knote *kn, struct waitq *wq) /* * remove all knotes referencing a specified fd * - * Essentially an inlined knote_remove & knote_drop - * when we know for sure that the thing is a file - * * Entered with the proc_fd lock already held. * It returns the same way, but may drop it temporarily. */ void -knote_fdclose(struct proc *p, int fd, int force) +knote_fdclose(struct proc *p, int fd) { struct klist *list; struct knote *kn; + KNOTE_LOCK_CTX(knlc); restart: list = &p->p_fd->fd_knlist[fd]; @@ -7948,45 +7231,28 @@ knote_fdclose(struct proc *p, int fd, int force) * transition it to vanished mode (or skip over * it if already vanished).
*/ - if (!force && (kn->kn_status & KN_REQVANISH)) { - - if ((kn->kn_status & KN_VANISHED) == 0) { - proc_fdunlock(p); - - assert(!knoteuse_needs_boost(kn, NULL)); - - /* get detach reference (also marks vanished) */ - if (kqlock2knotedetach(kq, kn, KNUSE_NONE)) { - /* detach knote and drop fp use reference */ - knote_fops(kn)->f_detach(kn); - if (knote_fops(kn)->f_isfd) - fp_drop(p, kn->kn_id, kn->kn_fp, 0); - - /* activate it if it's still in existence */ - if (knoteuse2kqlock(kq, kn, KNUSE_NONE)) { - knote_activate(kn); - } - kqunlock(kq); - } - proc_fdlock(p); - goto restart; - } else { - kqunlock(kq); - continue; - } + if (kn->kn_status & KN_VANISHED) { + kqunlock(kq); + continue; } proc_fdunlock(p); + if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) { + /* the knote was dropped by someone, nothing to do */ + } else if (kn->kn_status & KN_REQVANISH) { + kn->kn_status |= KN_VANISHED; + kn->kn_status &= ~KN_ATTACHED; - /* - * Convert the kq lock to a drop ref. - * If we get it, go ahead and drop it. - * Otherwise, we waited for the blocking - * condition to complete. Either way, - * we dropped the fdlock so start over. - */ - if (kqlock2knotedrop(kq, kn)) { - knote_drop(kn, p); + kqunlock(kq); + knote_fops(kn)->f_detach(kn); + if (knote_fops(kn)->f_isfd) + fp_drop(p, kn->kn_id, kn->kn_fp, 0); + kqlock(kq); + + knote_activate(kn); + knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK); + } else { + knote_drop(kq, kn, &knlc); } proc_fdlock(p); @@ -7994,7 +7260,7 @@ knote_fdclose(struct proc *p, int fd, int force) } } -/* +/* * knote_fdfind - lookup a knote in the fd table for process * * If the filter is file-based, lookup based on fd index. 
@@ -8009,15 +7275,15 @@ knote_fdclose(struct proc *p, int fd, int force) */ static struct knote * knote_fdfind(struct kqueue *kq, - struct kevent_internal_s *kev, - bool is_fd, - struct proc *p) + struct kevent_internal_s *kev, + bool is_fd, + struct proc *p) { struct filedesc *fdp = p->p_fd; struct klist *list = NULL; struct knote *kn = NULL; - /* + /* * determine where to look for the knote */ if (is_fd) { @@ -8036,7 +7302,7 @@ knote_fdfind(struct kqueue *kq, if (list != NULL) { SLIST_FOREACH(kn, list, kn_link) { if (kq == knote_get_kq(kn) && - kev->ident == kn->kn_id && + kev->ident == kn->kn_id && kev->filter == kn->kn_filter) { if (kev->flags & EV_UDATA_SPECIFIC) { if ((kn->kn_status & KN_UDATA_SPECIFIC) && @@ -8067,9 +7333,8 @@ knote_fdfind(struct kqueue *kq, * Takes a rwlock boost if inserting the knote is successful. */ static int -kq_add_knote(struct kqueue *kq, struct knote *kn, - struct kevent_internal_s *kev, - struct proc *p, int *knoteuse_flags) +kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc, + struct proc *p) { struct filedesc *fdp = p->p_fd; struct klist *list = NULL; @@ -8081,7 +7346,7 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, else knhash_lock(p); - if (knote_fdfind(kq, kev, is_fd, p) != NULL) { + if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) { /* found an existing knote: we can't add this one */ ret = ERESTART; goto out_locked; @@ -8092,8 +7357,7 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, if (fdp->fd_knhashmask == 0) { u_long size = 0; - list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, - &size); + list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size); if (list == NULL) { ret = ENOMEM; goto out_locked; @@ -8154,11 +7418,10 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, } out_locked: - if (ret == 0 && knoteuse_needs_boost(kn, kev)) { - set_thread_rwlock_boost(); - *knoteuse_flags = KNUSE_BOOST; - } else { - *knoteuse_flags = KNUSE_NONE; + if (ret == 0) { + kqlock(kq); + 
assert((kn->kn_status & KN_LOCKED) == 0); + (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK); } if (is_fd) proc_fdunlock(p); @@ -8170,8 +7433,6 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, /* * kq_remove_knote - remove a knote from the fd table for process - * and copy kn_status an kq_state while holding kqlock and - * fd table locks. * * If the filter is file-based, remove based on fd index. * Otherwise remove from the hash based on the ident. @@ -8180,10 +7441,11 @@ kq_add_knote(struct kqueue *kq, struct knote *kn, */ static void kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, - kn_status_t *kn_status, uint16_t *kq_state) + struct knote_lock_ctx *knlc) { struct filedesc *fdp = p->p_fd; struct klist *list = NULL; + uint16_t kq_state; bool is_fd; is_fd = knote_fops(kn)->f_isfd; @@ -8202,14 +7464,19 @@ kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, SLIST_REMOVE(list, kn, knote, kn_link); kqlock(kq); - *kn_status = kn->kn_status; - *kq_state = kq->kq_state; - kqunlock(kq); - + kq_state = kq->kq_state; + if (knlc) { + knote_unlock_cancel(kq, kn, knlc, KNOTE_KQ_UNLOCK); + } else { + kqunlock(kq); + } if (is_fd) proc_fdunlock(p); else knhash_unlock(p); + + if (kq_state & KQ_DYNAMIC) + kqueue_release_last(p, kq); } /* @@ -8220,10 +7487,8 @@ kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, */ static struct knote * -kq_find_knote_and_kq_lock(struct kqueue *kq, - struct kevent_internal_s *kev, - bool is_fd, - struct proc *p) +kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, + bool is_fd, struct proc *p) { struct knote * ret; @@ -8248,71 +7513,41 @@ kq_find_knote_and_kq_lock(struct kqueue *kq, /* * knote_drop - disconnect and drop the knote * - * Called with the kqueue unlocked and holding a - * "drop reference" on the knote in question. - * This reference is most often aquired thru a call - * to kqlock2knotedrop(). 
But it can also be acquired - * through stealing a drop reference via a call to - * knoteuse2knotedrop() or during the initial attach - * of the knote. + * Called with the kqueue locked, returns with the kqueue unlocked. + * + * If a knote locking context is passed, it is canceled. * * The knote may have already been detached from * (or not yet attached to) its source object. */ static void -knote_drop(struct knote *kn, __unused struct proc *ctxp) +knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc) { - struct kqueue *kq = knote_get_kq(kn); struct proc *p = kq->kq_p; - kn_status_t kn_status; - uint16_t kq_state; + + kqlock_held(kq); + + assert((kn->kn_status & KN_DROPPING) == 0); + if (knlc == NULL) { + assert((kn->kn_status & KN_LOCKED) == 0); + } + kn->kn_status |= KN_DROPPING; + + knote_unsuppress(kn); + knote_dequeue(kn); + knote_wait_for_filter_events(kq, kn); /* If we are attached, disconnect from the source first */ if (kn->kn_status & KN_ATTACHED) { knote_fops(kn)->f_detach(kn); } - /* Remove the source from the appropriate hash */ - kq_remove_knote(kq, kn, p, &kn_status, &kq_state); - - /* - * If a kqueue_dealloc is happening in parallel for the kq - * pointed by the knote the kq could be aready deallocated - * at this point. - * Do not access the kq after the kq_remove_knote if it is - * not a KQ_DYNAMIC. - */ - - /* determine if anyone needs to know about the drop */ - assert((kn_status & (KN_DROPPING | KN_SUPPRESSED | KN_QUEUED)) == KN_DROPPING); - - /* - * If KN_USEWAIT is set, some other thread was trying to drop the kn. - * Or it was in kqueue_dealloc, so the kqueue_dealloc did not happen - * because that thread was waiting on this wake, or it was a drop happening - * because of a kevent_register that takes a reference on the kq, and therefore - * the kq cannot be deallocated in parallel. - * - * It is safe to access kq->kq_wqs if needswakeup is set. 
- */ - if (kn_status & KN_USEWAIT) - waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, - CAST_EVENT64_T(&kn->kn_status), - THREAD_RESTART, - WAITQ_ALL_PRIORITIES); - + /* kq may be freed when kq_remove_knote() returns */ + kq_remove_knote(kq, kn, p, knlc); if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0)) fp_drop(p, kn->kn_id, kn->kn_fp, 0); knote_free(kn); - - /* - * release reference on dynamic kq (and free if last). - * Will only be last if this is from fdfree, etc... - * because otherwise processing thread has reference. - */ - if (kq_state & KQ_DYNAMIC) - kqueue_release_last(p, kq); } /* called with kqueue lock held */ @@ -8350,9 +7585,6 @@ knote_enable(struct knote *kn) kn->kn_status &= ~KN_DISABLED; if (kn->kn_status & KN_SUPPRESSED) { - /* Clear the sync qos on the knote */ - knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE); - /* * it is possible for userland to have knotes registered for a given * workloop `wl_orig` but really handled on another workloop `wl_new`. 
@@ -8401,18 +7633,8 @@ knote_suppress(struct knote *kn) knote_dequeue(kn); kn->kn_status |= KN_SUPPRESSED; - suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn)); + suppressq = kqueue_get_suppressed_queue(kq, kn); TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe); - - if ((kq->kq_state & KQ_WORKLOOP) && - knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE && - kn->kn_qos_override_is_sync) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - /* update the sync qos override counter for suppressed knotes */ - kqworkloop_update_override(kqwl, knote_get_qos_index(kn), - knote_get_qos_override_index(kn), - (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI)); - } } /* called with kqueue lock held */ @@ -8427,70 +7649,41 @@ knote_unsuppress(struct knote *kn) if ((kn->kn_status & KN_SUPPRESSED) == 0) return; - /* Clear the sync qos on the knote */ - knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE); - kn->kn_status &= ~KN_SUPPRESSED; - suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn)); + suppressq = kqueue_get_suppressed_queue(kq, kn); TAILQ_REMOVE(suppressq, kn, kn_tqe); - /* udate in-use qos to equal requested qos */ - kn->kn_qos_index = kn->kn_req_index; + /* + * If the knote is no longer active, reset its push, + * and resynchronize kn_qos_index with kn_qos_override + */ + if ((kn->kn_status & KN_ACTIVE) == 0) { + kn->kn_qos_override = kn->kn_req_index; + } + kn->kn_qos_index = kn->kn_qos_override; /* don't wakeup if unsuppressing just a stay-active knote */ if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) { knote_wakeup(kn); } - if ((kq->kq_state & KQ_WORKLOOP) && !(kq->kq_state & KQ_NO_WQ_THREAD) && - knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE && - kn->kn_qos_override_is_sync) { + if ((kq->kq_state & KQ_WORKLOOP) && TAILQ_EMPTY(suppressq)) { struct kqworkloop *kqwl = (struct kqworkloop *)kq; - /* update the sync qos override counter for suppressed knotes */ - 
kqworkloop_update_override(kqwl, knote_get_qos_index(kn), - knote_get_qos_override_index(kn), - (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI)); - } - - if (TAILQ_EMPTY(suppressq) && (kq->kq_state & KQ_WORKLOOP) && - !(kq->kq_state & KQ_NO_WQ_THREAD)) { - struct kqworkloop *kqwl = (struct kqworkloop *)kq; if (kqworkloop_is_processing_on_current_thread(kqwl)) { /* - * kqworkloop_end_processing() will perform the required QoS - * computations when it unsets the processing mode. + * kqworkloop_end_processing() or kqworkloop_begin_processing() + * will perform the required QoS computations when it unsets the + * processing mode. */ } else { - kqwl_req_lock(kqwl); + kq_req_lock(kqwl); kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, 0); - kqwl_req_unlock(kqwl); + kq_req_unlock(kqwl); } } } -/* called with kqueue lock held */ -static void -knote_update_sync_override_state(struct knote *kn) -{ - struct kqtailq *queue = knote_get_queue(kn); - struct kqueue *kq = knote_get_kq(kn); - - if (!(kq->kq_state & KQ_WORKLOOP) || - knote_get_queue_index(kn) != THREAD_QOS_USER_INTERACTIVE) - return; - - /* Update the sync ipc state on workloop */ - struct kqworkloop *kqwl = (struct kqworkloop *)kq; - boolean_t sync_ipc_override = FALSE; - if (!TAILQ_EMPTY(queue)) { - struct knote *kn_head = TAILQ_FIRST(queue); - if (kn_head->kn_qos_override_is_sync) - sync_ipc_override = TRUE; - } - kqworkloop_update_sync_override_state(kqwl, sync_ipc_override); -} - /* called with kqueue lock held */ static int knote_enqueue(struct knote *kn) @@ -8504,15 +7697,9 @@ knote_enqueue(struct knote *kn) struct kqueue *kq = knote_get_kq(kn); kqlock_held(kq); - /* insert at head for sync ipc waiters */ - if (kn->kn_qos_override_is_sync) { - TAILQ_INSERT_HEAD(queue, kn, kn_tqe); - } else { - TAILQ_INSERT_TAIL(queue, kn, kn_tqe); - } + TAILQ_INSERT_TAIL(queue, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; - knote_update_sync_override_state(kn); return 1; 
} return ((kn->kn_status & KN_STAYACTIVE) != 0); @@ -8535,7 +7722,6 @@ knote_dequeue(struct knote *kn) TAILQ_REMOVE(queue, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; - knote_update_sync_override_state(kn); } void @@ -8561,12 +7747,6 @@ knote_init(void) /* Allocate kq lock attribute */ kq_lck_attr = lck_attr_alloc_init(); - /* Initialize the timer filter lock */ - lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr); - - /* Initialize the user filter lock */ - lck_spin_init(&_filt_userlock, kq_lck_grp, kq_lck_attr); - #if CONFIG_MEMORYSTATUS /* Initialize the memorystatus list lock */ memorystatus_kevent_init(kq_lck_grp, kq_lck_attr); @@ -8583,15 +7763,16 @@ knote_fops(struct knote *kn) static struct knote * knote_alloc(void) { - struct knote *kn; - kn = ((struct knote *)zalloc(knote_zone)); - *kn = (struct knote) { .kn_qos_override = 0, .kn_qos_sync_override = 0, .kn_qos_override_is_sync = 0 }; + struct knote *kn = ((struct knote *)zalloc(knote_zone)); + bzero(kn, sizeof(struct knote)); return kn; } static void knote_free(struct knote *kn) { + assert(kn->kn_inuse == 0); + assert((kn->kn_status & KN_LOCKED) == 0); zfree(knote_zone, kn); } @@ -8623,7 +7804,7 @@ static lck_rw_t *kev_rwlock = &kev_lck_data; static int kev_attach(struct socket *so, int proto, struct proc *p); static int kev_detach(struct socket *so); static int kev_control(struct socket *so, u_long cmd, caddr_t data, - struct ifnet *ifp, struct proc *p); + struct ifnet *ifp, struct proc *p); static lck_mtx_t * event_getlock(struct socket *, int); static int event_lock(struct socket *, int, void *); static int event_unlock(struct socket *, int, void *); @@ -8658,8 +7839,8 @@ SYSCTL_NODE(_net_systm, OID_AUTO, kevt, struct kevtstat kevtstat; SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, - kevt_getstat, "S,kevtstat", ""); + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + kevt_getstat, "S,kevtstat", ""); 
SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, @@ -8906,7 +8087,7 @@ kev_detach(struct socket *so) */ errno_t kev_vendor_code_find( const char *string, - u_int32_t *out_vendor_code) + u_int32_t *out_vendor_code) { if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) { return (EINVAL); @@ -8925,7 +8106,7 @@ kev_msg_post(struct kev_msg *event_msg) if (event_msg == NULL) return (EINVAL); - /* + /* * Limit third parties to posting events for registered vendor codes * only */ @@ -9050,10 +8231,10 @@ kev_post_msg(struct kev_msg *event_msg) static int kev_control(struct socket *so, - u_long cmd, - caddr_t data, - __unused struct ifnet *ifp, - __unused struct proc *p) + u_long cmd, + caddr_t data, + __unused struct ifnet *ifp, + __unused struct proc *p) { struct kev_request *kev_req = (struct kev_request *) data; struct kern_event_pcb *ev_pcb; @@ -9255,6 +8436,7 @@ fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi) { struct kqworkloop *kqwl = (struct kqworkloop *)kq; struct kqrequest *kqr = &kqwl->kqwl_request; + workq_threadreq_param_t trp = {}; int err; if ((kq->kq_state & KQ_WORKLOOP) == 0) { @@ -9265,25 +8447,33 @@ fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi) return err; } - kqwl_req_lock(kqwl); - - if (kqr->kqr_thread) { - kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread); - } - - if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) { - kqdi->kqdi_owner = ~0ull; - } else { - kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner); - } + kq_req_lock(kqwl); + kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread); + kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner); kqdi->kqdi_request_state = kqr->kqr_state; kqdi->kqdi_async_qos = kqr->kqr_qos_index; kqdi->kqdi_events_qos = kqr->kqr_override_index; kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters; - kqdi->kqdi_sync_waiter_qos = kqr->kqr_dsync_waiters_qos; + kqdi->kqdi_sync_waiter_qos = 0; + + trp.trp_value = kqwl->kqwl_params; + if (trp.trp_flags & 
TRP_PRIORITY) + kqdi->kqdi_pri = trp.trp_pri; + else + kqdi->kqdi_pri = 0; - kqwl_req_unlock(kqwl); + if (trp.trp_flags & TRP_POLICY) + kqdi->kqdi_pol = trp.trp_pol; + else + kqdi->kqdi_pol = 0; + + if (trp.trp_flags & TRP_CPUPERCENT) + kqdi->kqdi_cpupercent = trp.trp_cpupercent; + else + kqdi->kqdi_cpupercent = 0; + + kq_req_unlock(kqwl); return 0; } @@ -9293,6 +8483,7 @@ void knote_markstayactive(struct knote *kn) { struct kqueue *kq = knote_get_kq(kn); + kq_index_t qos; kqlock(kq); kn->kn_status |= KN_STAYACTIVE; @@ -9302,20 +8493,28 @@ knote_markstayactive(struct knote *kn) * established before it is fully attached. */ assert(kn->kn_status & KN_ATTACHING); + assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0); /* handle all stayactive knotes on the (appropriate) manager */ if (kq->kq_state & KQ_WORKQ) { - knote_set_qos_index(kn, KQWQ_QOS_MANAGER); + qos = KQWQ_QOS_MANAGER; } else if (kq->kq_state & KQ_WORKLOOP) { struct kqworkloop *kqwl = (struct kqworkloop *)kq; - kqwl_req_lock(kqwl); - assert(kn->kn_req_index && kn->kn_req_index < THREAD_QOS_LAST); - kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, - kn->kn_req_index); - kqwl_req_unlock(kqwl); - knote_set_qos_index(kn, KQWL_BUCKET_STAYACTIVE); + + qos = _pthread_priority_thread_qos(kn->kn_qos); + assert(qos && qos < THREAD_QOS_LAST); + kq_req_lock(kq); + kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos); + kq_req_unlock(kq); + qos = KQWL_BUCKET_STAYACTIVE; + } else { + qos = THREAD_QOS_UNSPECIFIED; } + kn->kn_req_index = qos; + kn->kn_qos_override = qos; + kn->kn_qos_index = qos; + knote_activate(kn); kqunlock(kq); } @@ -9546,7 +8745,7 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf, assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes)); err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes)); - out: +out: if (kqext) { kfree(kqext, buflen * sizeof(struct kevent_extinfo)); kqext = NULL; @@ -9631,14 
+8830,6 @@ kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize) return (int)nuptrs; } -static void -kevent_redrive_proc_thread_request(proc_t p) -{ - __assert_only int ret; - ret = (*pthread_functions->workq_threadreq)(p, NULL, WORKQ_THREADREQ_REDRIVE, 0, 0); - assert(ret == 0 || ret == ECANCELED); -} - static void kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread) { @@ -9649,12 +8840,8 @@ kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread) uint64_t ast_flags64 = 0; struct uthread *ut = get_bsdthread_info(thread); - if (ut->uu_kqueue_bound != NULL) { - if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKLOOP) { - ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS; - } else if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ) { - ast_flags64 |= R2K_WORKQ_PENDING_EVENTS; - } + if (ut->uu_kqr_bound != NULL) { + ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS; } if (ast_flags64 == 0) { @@ -9685,7 +8872,7 @@ kevent_ast(thread_t thread, uint16_t bits) proc_t p = current_proc(); if (bits & AST_KEVENT_REDRIVE_THREADREQ) { - kevent_redrive_proc_thread_request(p); + workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS); } if (bits & AST_KEVENT_RETURN_TO_KERNEL) { kevent_set_return_to_kernel_user_tsd(p, thread); @@ -9702,8 +8889,6 @@ kevent_sysctl SYSCTL_HANDLER_ARGS #pragma unused(oidp, arg2) uintptr_t type = (uintptr_t)arg1; uint64_t bound_id = 0; - struct uthread *ut; - struct kqueue *kq; if (type != KEVENT_SYSCTL_BOUND_ID) { return EINVAL; @@ -9713,16 +8898,16 @@ kevent_sysctl SYSCTL_HANDLER_ARGS return EINVAL; } - ut = get_bsdthread_info(current_thread()); + struct uthread *ut = get_bsdthread_info(current_thread()); if (!ut) { return EFAULT; } - kq = ut->uu_kqueue_bound; - if (kq) { - if (kq->kq_state & KQ_WORKLOOP) { - bound_id = ((struct kqworkloop *)kq)->kqwl_dynamicid; - } else if (kq->kq_state & KQ_WORKQ) { + struct kqrequest *kqr = ut->uu_kqr_bound; + if (kqr) { + if (kqr->kqr_state & KQR_WORKLOOP) { + bound_id = 
kqr_kqworkloop(kqr)->kqwl_dynamicid; + } else { bound_id = -1; } } diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index e5ac6b2c7..5e145cfac 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -162,6 +162,8 @@ #include #endif +extern boolean_t vm_darkwake_mode; + #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ extern void dtrace_proc_exec(proc_t); @@ -177,7 +179,13 @@ static void (*dtrace_proc_waitfor_hook)(proc_t) = NULL; #endif /* support for child creation in exec after vfork */ -thread_t fork_create_child(task_t parent_task, coalition_t *parent_coalition, proc_t child_proc, int inherit_memory, int is64bit, int in_exec); +thread_t fork_create_child(task_t parent_task, + coalition_t *parent_coalition, + proc_t child_proc, + int inherit_memory, + int is_64bit_addr, + int is_64bit_data, + int in_exec); void vfork_exit(proc_t p, int rv); extern void proc_apply_task_networkbg_internal(proc_t, thread_t); extern void task_set_did_exec_flag(task_t task); @@ -727,11 +735,10 @@ activate_exec_state(task_t task, proc_t p, thread_t thread, load_result_t *resul int ret; task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0); - if (result->is64bit) { - task_set_64bit(task, TRUE); + task_set_64bit(task, result->is_64bit_addr, result->is_64bit_data); + if (result->is_64bit_addr) { OSBitOrAtomic(P_LP64, &p->p_flag); } else { - task_set_64bit(task, FALSE); OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag); } @@ -833,7 +840,7 @@ exec_mach_imgact(struct image_params *imgp) vm_map_t old_map = VM_MAP_NULL; vm_map_t map = VM_MAP_NULL; load_return_t lret; - load_result_t load_result; + load_result_t load_result = {}; struct _posix_spawnattr *psa = NULL; int spawn = (imgp->ip_flags & IMGPF_SPAWN); int vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC); @@ -879,8 +886,9 @@ exec_mach_imgact(struct image_params *imgp) thread = current_thread(); uthread = get_bsdthread_info(thread); - if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64) - 
imgp->ip_flags |= IMGPF_IS_64BIT; + if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64) { + imgp->ip_flags |= IMGPF_IS_64BIT_ADDR | IMGPF_IS_64BIT_DATA; + } /* If posix_spawn binprefs exist, respect those prefs. */ psa = (struct _posix_spawnattr *) imgp->ip_px_sa; @@ -913,6 +921,8 @@ exec_mach_imgact(struct image_params *imgp) goto bad; } + + /* Copy in arguments/environment from the old process */ error = exec_extract_strings(imgp); if (error) @@ -931,7 +941,13 @@ exec_mach_imgact(struct image_params *imgp) * new child process. */ if (vfexec) { - imgp->ip_new_thread = fork_create_child(task, NULL, p, FALSE, (imgp->ip_flags & IMGPF_IS_64BIT), FALSE); + imgp->ip_new_thread = fork_create_child(task, + NULL, + p, + FALSE, + (imgp->ip_flags & IMGPF_IS_64BIT_ADDR), + (imgp->ip_flags & IMGPF_IS_64BIT_DATA), + FALSE); /* task and thread ref returned, will be released in __mac_execve */ if (imgp->ip_new_thread == NULL) { error = ENOMEM; @@ -1002,7 +1018,7 @@ exec_mach_imgact(struct image_params *imgp) imgp->ip_csflags |= load_result.csflags & (CS_VALID|CS_SIGNED|CS_DEV_CODE| CS_HARD|CS_KILL|CS_RESTRICT|CS_ENFORCEMENT|CS_REQUIRE_LV| - CS_ENTITLEMENTS_VALIDATED|CS_DYLD_PLATFORM| + CS_FORCED_LV|CS_ENTITLEMENTS_VALIDATED|CS_DYLD_PLATFORM|CS_RUNTIME| CS_ENTITLEMENT_FLAGS| CS_EXEC_SET_HARD|CS_EXEC_SET_KILL|CS_EXEC_SET_ENFORCEMENT); } else { @@ -1027,7 +1043,9 @@ exec_mach_imgact(struct image_params *imgp) /* * Set up the system reserved areas in the new address space. */ - vm_map_exec(map, task, load_result.is64bit, (void *)p->p_fd->fd_rdir, cpu_type()); + int cpu_subtype; + cpu_subtype = 0; /* all cpu_subtypes use the same shared region */ + vm_map_exec(map, task, load_result.is_64bit_addr, (void *)p->p_fd->fd_rdir, cpu_type(), cpu_subtype); /* * Close file descriptors which specify close-on-exec. @@ -1129,7 +1147,7 @@ exec_mach_imgact(struct image_params *imgp) if (load_result.dynlinker) { uint64_t ap; - int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 
8 : 4; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4; /* Adjust the stack */ ap = thread_adjuserstack(thread, -new_ptr_size); @@ -1201,6 +1219,12 @@ exec_mach_imgact(struct image_params *imgp) } #endif /* CONFIG_SECLUDED_MEMORY */ +#if __arm64__ + if (load_result.legacy_footprint) { + task_set_legacy_footprint(task, TRUE); + } +#endif /* __arm64__ */ + pal_dbg_set_task_name(task); /* @@ -1525,14 +1549,20 @@ exec_activate_image(struct image_params *imgp) } } - /* - * Call out to allow 3rd party notification of exec. - * Ignore result of kauth_authorize_fileop call. - */ - if (error == 0 && kauth_authorize_fileop_has_listeners()) { - kauth_authorize_fileop(vfs_context_ucred(imgp->ip_vfs_context), - KAUTH_FILEOP_EXEC, - (uintptr_t)ndp->ni_vp, 0); + if (error == 0) { + if (imgp->ip_flags & IMGPF_INTERPRET && ndp->ni_vp) { + AUDIT_ARG(vnpath, ndp->ni_vp, ARG_VNODE2); + } + + /* + * Call out to allow 3rd party notification of exec. + * Ignore result of kauth_authorize_fileop call. + */ + if (kauth_authorize_fileop_has_listeners()) { + kauth_authorize_fileop(vfs_context_ucred(imgp->ip_vfs_context), + KAUTH_FILEOP_EXEC, + (uintptr_t)ndp->ni_vp, 0); + } } bad: proc_transend(p, 0); @@ -2228,6 +2258,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) int portwatch_count = 0; ipc_port_t * portwatch_ports = NULL; vm_size_t px_sa_offset = offsetof(struct _posix_spawnattr, psa_ports); + task_t old_task = current_task(); task_t new_task = NULL; boolean_t should_release_proc_ref = FALSE; void *inherit = NULL; @@ -2255,7 +2286,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) imgp->ip_vattr = vap; imgp->ip_origvattr = origvap; imgp->ip_vfs_context = &context; - imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE); + imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT_ADDR : IMGPF_NONE); imgp->ip_seg = (is_64 ? 
UIO_USERSPACE64 : UIO_USERSPACE32); imgp->ip_mac_return = 0; imgp->ip_px_persona = NULL; @@ -2296,9 +2327,10 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * This is a bit fragile: */ - if ((error = copyin(px_args.attrp, &px_sa, px_sa_offset) != 0)) - goto bad; - + if ((error = copyin(px_args.attrp, &px_sa, px_sa_offset)) != 0) { + goto bad; + } + bzero( (void *)( (unsigned long) &px_sa + px_sa_offset), sizeof(px_sa) - px_sa_offset ); imgp->ip_px_sa = &px_sa; @@ -2588,8 +2620,13 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * During exec any transition from new_task -> proc is fine, but don't allow * transition from proc->task, since it will modify old_task. */ - imgp->ip_new_thread = fork_create_child(current_task(), - NULL, p, FALSE, p->p_flag & P_LP64, TRUE); + imgp->ip_new_thread = fork_create_child(old_task, + NULL, + p, + FALSE, + p->p_flag & P_LP64, + task_get_64bit_data(old_task), + TRUE); /* task and thread ref returned by fork_create_child */ if (imgp->ip_new_thread == NULL) { error = ENOMEM; @@ -2797,9 +2834,18 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) error = exec_activate_image(imgp); if (error == 0 && !spawn_no_exec) { - p = proc_exec_switch_task(p, current_task(), new_task, imgp->ip_new_thread); + p = proc_exec_switch_task(p, old_task, new_task, imgp->ip_new_thread); /* proc ref returned */ should_release_proc_ref = TRUE; + + /* + * Need to transfer pending watch port boosts to the new task while still making + * sure that the old task remains in the importance linkage. Create an importance + * linkage from old task to new task, then switch the task importance base + * of old task and new task. After the switch the port watch boost will be + * boosting the new task and new task will be donating importance to old task. 
+ */ + inherit = ipc_importance_exec_switch_task(old_task, new_task); } if (error == 0) { @@ -2926,6 +2972,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) } #endif /* CONFIG_MEMORYSTATUS */ + if (imgp->ip_px_sa != NULL && px_sa.psa_thread_limit > 0) { + task_set_thread_limit(new_task, (uint16_t)px_sa.psa_thread_limit); + } } /* @@ -2966,14 +3015,14 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) error = proc_transstart(p, 0, 0); if (error == 0) { - task_bank_init(get_threadtask(imgp->ip_new_thread)); + task_bank_init(new_task); proc_transend(p, 0); } } /* Inherit task role from old task to new task for exec */ if (error == 0 && !spawn_no_exec) { - proc_inherit_task_role(get_threadtask(imgp->ip_new_thread), current_task()); + proc_inherit_task_role(new_task, old_task); } /* @@ -2993,20 +3042,20 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) } /* - * Need to transfer pending watch port boosts to the new task while still making - * sure that the old task remains in the importance linkage. Create an importance - * linkage from old task to new task, then switch the task importance base - * of old task and new task. After the switch the port watch boost will be - * boosting the new task and new task will be donating importance to old task. + * Apply the requested maximum address. 
*/ - if (error == 0 && task_did_exec(current_task())) { - inherit = ipc_importance_exec_switch_task(current_task(), get_threadtask(imgp->ip_new_thread)); + if (error == 0 && imgp->ip_px_sa != NULL) { + struct _posix_spawnattr *psa = (struct _posix_spawnattr *) imgp->ip_px_sa; + + if (psa->psa_max_addr) { + vm_map_set_max_addr(get_task_map(new_task), psa->psa_max_addr); + } } if (error == 0) { - /* Apply the main thread qos */ + /* Apply the main thread qos */ thread_t main_thread = imgp->ip_new_thread; - task_set_main_thread_qos(get_threadtask(imgp->ip_new_thread), main_thread); + task_set_main_thread_qos(new_task, main_thread); #if CONFIG_MACF /* @@ -3014,7 +3063,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * a jumbo-size map. */ if (mac_proc_check_map_anon(p, 0, 0, 0, MAP_JIT, NULL) == 0) { - vm_map_set_jumbo(get_task_map(p->task)); + vm_map_set_jumbo(get_task_map(new_task)); } #endif /* CONFIG_MACF */ } @@ -3129,11 +3178,22 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) (*dtrace_proc_waitfor_hook)(p); } #endif + +#if CONFIG_AUDIT + if (!error && AUDIT_ENABLED() && p) { + /* Add the CDHash of the new process to the audit record */ + uint8_t *cdhash = cs_get_cdhash(p); + if (cdhash) { + AUDIT_ARG(data, cdhash, sizeof(uint8_t), CS_CDHASH_LEN); + } + } +#endif + /* * clear bsd_info from old task if it did exec. */ - if (task_did_exec(current_task())) { - set_bsdtask_info(current_task(), NULL); + if (task_did_exec(old_task)) { + set_bsdtask_info(old_task, NULL); } /* clear bsd_info from new task and terminate it if exec failed */ @@ -3177,9 +3237,9 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * switch the tasks, terminating the current task without the switch would * result in loosing the SIGKILL status. 
*/ - if (task_did_exec(current_task())) { + if (task_did_exec(old_task)) { /* Terminate the current task, since exec will start in new task */ - task_terminate_internal(current_task()); + task_terminate_internal(old_task); } /* Release the thread ref returned by fork_create_child/fork1 */ @@ -3413,6 +3473,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) int is_64 = IS_64BIT_PROCESS(p); struct vfs_context context; struct uthread *uthread; + task_t old_task = current_task(); task_t new_task = NULL; boolean_t should_release_proc_ref = FALSE; boolean_t exec_done = FALSE; @@ -3441,7 +3502,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) imgp->ip_vattr = vap; imgp->ip_origvattr = origvap; imgp->ip_vfs_context = &context; - imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE); + imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT_ADDR : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE); imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32); imgp->ip_mac_return = 0; imgp->ip_cs_error = OS_REASON_NULL; @@ -3487,8 +3548,13 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) * During exec any transition from new_task -> proc is fine, but don't allow * transition from proc->task, since it will modify old_task. 
*/ - imgp->ip_new_thread = fork_create_child(current_task(), - NULL, p, FALSE, p->p_flag & P_LP64, TRUE); + imgp->ip_new_thread = fork_create_child(old_task, + NULL, + p, + FALSE, + p->p_flag & P_LP64, + task_get_64bit_data(old_task), + TRUE); /* task and thread ref returned by fork_create_child */ if (imgp->ip_new_thread == NULL) { error = ENOMEM; @@ -3511,9 +3577,18 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) } if (!error && !in_vfexec) { - p = proc_exec_switch_task(p, current_task(), new_task, imgp->ip_new_thread); + p = proc_exec_switch_task(p, old_task, new_task, imgp->ip_new_thread); /* proc ref returned */ should_release_proc_ref = TRUE; + + /* + * Need to transfer pending watch port boosts to the new task while still making + * sure that the old task remains in the importance linkage. Create an importance + * linkage from old task to new task, then switch the task importance base + * of old task and new task. After the switch the port watch boost will be + * boosting the new task and new task will be donating importance to old task. 
+ */ + inherit = ipc_importance_exec_switch_task(old_task, new_task); } kauth_cred_unref(&context.vc_ucred); @@ -3562,7 +3637,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) } if (!error) { - task_bank_init(get_threadtask(imgp->ip_new_thread)); + task_bank_init(new_task); proc_transend(p, 0); /* Sever any extant thread affinity */ @@ -3570,7 +3645,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) /* Inherit task role from old task to new task for exec */ if (!in_vfexec) { - proc_inherit_task_role(get_threadtask(imgp->ip_new_thread), current_task()); + proc_inherit_task_role(new_task, old_task); } thread_t main_thread = imgp->ip_new_thread; @@ -3587,6 +3662,14 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) } #endif /* CONFIG_MACF */ + if (vm_darkwake_mode == TRUE) { + /* + * This process is being launched when the system + * is in darkwake. So mark it specially. This will + * cause all its pages to be entered in the background Q. + */ + task_set_darkwake_mode(new_task, vm_darkwake_mode); + } #if CONFIG_DTRACE dtrace_thread_didexec(imgp->ip_new_thread); @@ -3595,6 +3678,16 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) (*dtrace_proc_waitfor_hook)(p); #endif +#if CONFIG_AUDIT + if (!error && AUDIT_ENABLED() && p) { + /* Add the CDHash of the new process to the audit record */ + uint8_t *cdhash = cs_get_cdhash(p); + if (cdhash) { + AUDIT_ARG(data, cdhash, sizeof(uint8_t), CS_CDHASH_LEN); + } + } +#endif + if (in_vfexec) { vfork_return(p, retval, p->p_pid); } @@ -3607,8 +3700,8 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) /* * clear bsd_info from old task if it did exec. 
*/ - if (task_did_exec(current_task())) { - set_bsdtask_info(current_task(), NULL); + if (task_did_exec(old_task)) { + set_bsdtask_info(old_task, NULL); } /* clear bsd_info from new task and terminate it if exec failed */ @@ -3617,26 +3710,15 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) task_terminate_internal(new_task); } - /* - * Need to transfer pending watch port boosts to the new task while still making - * sure that the old task remains in the importance linkage. Create an importance - * linkage from old task to new task, then switch the task importance base - * of old task and new task. After the switch the port watch boost will be - * boosting the new task and new task will be donating importance to old task. - */ - if (error == 0 && task_did_exec(current_task())) { - inherit = ipc_importance_exec_switch_task(current_task(), get_threadtask(imgp->ip_new_thread)); - } - if (imgp != NULL) { /* * Do not terminate the current task, if proc_exec_switch_task did not * switch the tasks, terminating the current task without the switch would * result in loosing the SIGKILL status. */ - if (task_did_exec(current_task())) { + if (task_did_exec(old_task)) { /* Terminate the current task, since exec will start in new task */ - task_terminate_internal(current_task()); + task_terminate_internal(old_task); } /* Release the thread ref returned by fork_create_child */ @@ -3813,7 +3895,7 @@ static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) { proc_t p = vfs_context_proc(imgp->ip_vfs_context); - int ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; + int ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4; int ptr_area_size; void *ptr_buffer_start, *ptr_buffer; int string_size; @@ -3887,8 +3969,7 @@ exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) * Need room for one pointer for each string, plus * one for the NULLs terminating the argv, envv, and apple areas. 
*/ - ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + 3) * - ptr_size; + ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + 3) * ptr_size; stack -= ptr_area_size; ptr_area = stack; @@ -4014,8 +4095,8 @@ static int exec_extract_strings(struct image_params *imgp) { int error = 0; - int ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4; - int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; + int ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT_ADDR) ? 8 : 4; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4; user_addr_t argv = imgp->ip_user_argv; user_addr_t envv = imgp->ip_user_envv; @@ -4220,6 +4301,12 @@ exec_extract_strings(struct image_params *imgp) #define ENTROPY_VALUES 2 #define ENTROPY_KEY "malloc_entropy=" +/* + * libplatform needs a random pointer-obfuscation value when it is initialized. + */ +#define PTR_MUNGE_VALUES 1 +#define PTR_MUNGE_KEY "ptr_munge=" + /* * System malloc engages nanozone for UIAPP. */ @@ -4278,7 +4365,7 @@ exec_add_apple_strings(struct image_params *imgp, const load_result_t *load_result) { int error; - int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; + int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4; /* exec_save_path stored the first string */ imgp->ip_applec = 1; @@ -4335,6 +4422,16 @@ exec_add_apple_strings(struct image_params *imgp, } imgp->ip_applec++; + /* + * Supply libpthread & libplatform with a random value to use for pointer + * obfuscation. + */ + error = exec_add_entropy_key(imgp, PTR_MUNGE_KEY, PTR_MUNGE_VALUES, FALSE); + if (error) { + goto bad; + } + imgp->ip_applec++; + /* * Add MAIN_STACK_KEY: Supplies the address and size of the main thread's * stack if it was allocated by the kernel. 
@@ -5050,7 +5147,7 @@ load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path) } if (proc_is64bit(p)) { - user64_addr_t argv64bit[3]; + user64_addr_t argv64bit[3] = {}; argv64bit[0] = argv0; argv64bit[1] = argv1; @@ -5060,7 +5157,7 @@ load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path) if (error) return error; } else { - user32_addr_t argv32bit[3]; + user32_addr_t argv32bit[3] = {}; argv32bit[0] = (user32_addr_t)argv0; argv32bit[1] = (user32_addr_t)argv1; @@ -5694,7 +5791,7 @@ static void exec_prefault_data(proc_t p __unused, struct image_params *imgp, loa FALSE, VM_KERN_MEMORY_NONE, THREAD_UNINT, NULL, 0); - if (imgp->ip_flags & IMGPF_IS_64BIT) { + if (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) { expected_all_image_infos_size = sizeof(struct user64_dyld_all_image_infos); } else { expected_all_image_infos_size = sizeof(struct user32_dyld_all_image_infos); @@ -5740,7 +5837,7 @@ static void exec_prefault_data(proc_t p __unused, struct image_params *imgp, loa user_addr_t dyld_all_image_infos_address; user_addr_t dyld_slide_amount; - if (imgp->ip_flags & IMGPF_IS_64BIT) { + if (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) { notification_address = all_image_infos.infos64.notification; dyld_image_address = all_image_infos.infos64.dyldImageLoadAddress; dyld_version_address = all_image_infos.infos64.dyldVersion; diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index b2e226f06..edffa1854 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -312,6 +312,21 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset * unsigned int pflags = 0; uint64_t max_footprint_mb; uint64_t max_footprint; + + uint64_t ledger_internal; + uint64_t ledger_internal_compressed; + uint64_t ledger_iokit_mapped; + uint64_t ledger_alternate_accounting; + uint64_t ledger_alternate_accounting_compressed; + uint64_t ledger_purgeable_nonvolatile; + uint64_t ledger_purgeable_nonvolatile_compressed; + uint64_t 
ledger_page_table; + uint64_t ledger_phys_footprint; + uint64_t ledger_phys_footprint_lifetime_max; + uint64_t ledger_network_nonvolatile; + uint64_t ledger_network_nonvolatile_compressed; + uint64_t ledger_wired_mem; + void *crash_info_ptr = task_get_corpseinfo(corpse_task); #if CONFIG_MEMORYSTATUS @@ -412,6 +427,72 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset * kcdata_memcpy(crash_info_ptr, uaddr, &max_footprint_mb, sizeof(max_footprint_mb)); } + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT_LIFETIME_MAX, sizeof(ledger_phys_footprint_lifetime_max), &uaddr)) { + ledger_phys_footprint_lifetime_max = get_task_phys_footprint_lifetime_max(p->task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_phys_footprint_lifetime_max, sizeof(ledger_phys_footprint_lifetime_max)); + } + + // In the forking case, the current ledger info is copied into the corpse while the original task is suspended for consistency + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_INTERNAL, sizeof(ledger_internal), &uaddr)) { + ledger_internal = get_task_internal(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_internal, sizeof(ledger_internal)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_INTERNAL_COMPRESSED, sizeof(ledger_internal_compressed), &uaddr)) { + ledger_internal_compressed = get_task_internal_compressed(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_internal_compressed, sizeof(ledger_internal_compressed)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_IOKIT_MAPPED, sizeof(ledger_iokit_mapped), &uaddr)) { + ledger_iokit_mapped = get_task_iokit_mapped(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_iokit_mapped, sizeof(ledger_iokit_mapped)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, 
TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING, sizeof(ledger_alternate_accounting), &uaddr)) { + ledger_alternate_accounting = get_task_alternate_accounting(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_alternate_accounting, sizeof(ledger_alternate_accounting)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING_COMPRESSED, sizeof(ledger_alternate_accounting_compressed), &uaddr)) { + ledger_alternate_accounting_compressed = get_task_alternate_accounting_compressed(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_alternate_accounting_compressed, sizeof(ledger_alternate_accounting_compressed)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE, sizeof(ledger_purgeable_nonvolatile), &uaddr)) { + ledger_purgeable_nonvolatile = get_task_purgeable_nonvolatile(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_purgeable_nonvolatile, sizeof(ledger_purgeable_nonvolatile)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE_COMPRESSED, sizeof(ledger_purgeable_nonvolatile_compressed), &uaddr)) { + ledger_purgeable_nonvolatile_compressed = get_task_purgeable_nonvolatile_compressed(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_purgeable_nonvolatile_compressed, sizeof(ledger_purgeable_nonvolatile_compressed)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PAGE_TABLE, sizeof(ledger_page_table), &uaddr)) { + ledger_page_table = get_task_page_table(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_page_table, sizeof(ledger_page_table)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT, sizeof(ledger_phys_footprint), &uaddr)) { + ledger_phys_footprint = get_task_phys_footprint(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, 
&ledger_phys_footprint, sizeof(ledger_phys_footprint)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE, sizeof(ledger_network_nonvolatile), &uaddr)) { + ledger_network_nonvolatile = get_task_network_nonvolatile(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_network_nonvolatile, sizeof(ledger_network_nonvolatile)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED, sizeof(ledger_network_nonvolatile_compressed), &uaddr)) { + ledger_network_nonvolatile_compressed = get_task_network_nonvolatile_compressed(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_network_nonvolatile_compressed, sizeof(ledger_network_nonvolatile_compressed)); + } + + if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_WIRED_MEM, sizeof(ledger_wired_mem), &uaddr)) { + ledger_wired_mem = get_task_wired_mem(corpse_task); + kcdata_memcpy(crash_info_ptr, uaddr, &ledger_wired_mem, sizeof(ledger_wired_mem)); + } + bzero(&pwqinfo, sizeof(struct proc_workqueueinfo)); retval = fill_procworkqueue(p, &pwqinfo); if (retval == 0) { @@ -614,7 +695,7 @@ abort_with_payload_internal(proc_t p, reason_code, 0, 0); exit_reason = build_userspace_exit_reason(reason_namespace, reason_code, - payload, payload_size, reason_string, reason_flags); + payload, payload_size, reason_string, reason_flags | OS_REASON_FLAG_ABORT); if (internal_flags & OS_REASON_IFLAG_USER_FAULT) { mach_exception_code_t code = 0; @@ -1065,7 +1146,7 @@ proc_exit(proc_t p) /* if any pending cpu limits action, clear it */ task_clear_cpuusage(p->task, TRUE); - workqueue_mark_exiting(p); + workq_mark_exiting(p); _aio_exit( p ); @@ -1079,7 +1160,7 @@ proc_exit(proc_t p) * Once all the knotes, kqueues & workloops are destroyed, get rid of the * workqueue. 
*/ - workqueue_exit(p); + workq_exit(p); if (uth->uu_lowpri_window) { /* @@ -1361,8 +1442,6 @@ proc_exit(proc_t p) proc_limitdrop(p, 1); p->p_limit = NULL; - vm_purgeable_disown(p->task); - /* * Finish up by terminating the task * and halt this thread (only if a @@ -1432,6 +1511,7 @@ proc_exit(proc_t p) * The write is to an int and is coherent. Also parent is * keyed off of list lock for reaping */ + DTRACE_PROC2(exited, proc_t, p, int, exitval); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END, pid, exitval, 0, 0, 0); @@ -1455,6 +1535,7 @@ proc_exit(proc_t p) * The write is to an int and is coherent. Also parent is * keyed off of list lock for reaping */ + DTRACE_PROC2(exited, proc_t, p, int, exitval); proc_list_lock(); KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END, @@ -1716,7 +1797,7 @@ wait1continue(int result) thread = current_thread(); uth = (struct uthread *)get_bsdthread_info(thread); - wait4_data = &uth->uu_kevent.uu_wait4_data; + wait4_data = &uth->uu_save.uus_wait4_data; uap = wait4_data->args; retval = wait4_data->retval; return(wait4_nocancel(p, uap, retval)); @@ -1763,6 +1844,14 @@ wait4_nocancel(proc_t q, struct wait4_nocancel_args *uap, int32_t *retval) /* XXX This is racy because we don't get the lock!!!! */ if (p->p_listflag & P_LIST_WAITING) { + + /* we're not using a continuation here but we still need to stash + * the args for stackshot. */ + uth = current_uthread(); + wait4_data = &uth->uu_save.uus_wait4_data; + wait4_data->args = uap; + thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess); + (void)msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0); goto loop1; } @@ -1897,10 +1986,11 @@ wait4_nocancel(proc_t q, struct wait4_nocancel_args *uap, int32_t *retval) /* Save arguments for continuation. 
Backing storage is in uthread->uu_arg, and will not be deallocated */ uth = current_uthread(); - wait4_data = &uth->uu_kevent.uu_wait4_data; + wait4_data = &uth->uu_save.uus_wait4_data; wait4_data->args = uap; wait4_data->retval = retval; + thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess); if ((error = msleep0((caddr_t)q, proc_list_mlock, PWAIT | PCATCH | PDROP, "wait", 0, wait1continue))) return (error); @@ -1937,7 +2027,7 @@ waitidcontinue(int result) thread = current_thread(); uth = (struct uthread *)get_bsdthread_info(thread); - waitid_data = &uth->uu_kevent.uu_waitid_data; + waitid_data = &uth->uu_save.uus_waitid_data; uap = waitid_data->args; retval = waitid_data->retval; return(waitid_nocancel(p, uap, retval)); @@ -2161,7 +2251,7 @@ waitid_nocancel(proc_t q, struct waitid_nocancel_args *uap, /* Save arguments for continuation. Backing storage is in uthread->uu_arg, and will not be deallocated */ uth = current_uthread(); - waitid_data = &uth->uu_kevent.uu_waitid_data; + waitid_data = &uth->uu_save.uus_waitid_data; waitid_data->args = uap; waitid_data->retval = retval; @@ -2725,3 +2815,20 @@ munge_user32_rusage(struct rusage *a_rusage_p, struct user32_rusage *a_user_rusa a_user_rusage_p->ru_nvcsw = a_rusage_p->ru_nvcsw; a_user_rusage_p->ru_nivcsw = a_rusage_p->ru_nivcsw; } + +void +kdp_wait4_find_process(thread_t thread, __unused event64_t wait_event, thread_waitinfo_t *waitinfo) +{ + assert(thread != NULL); + assert(waitinfo != NULL); + + struct uthread *ut = get_bsdthread_info(thread); + waitinfo->context = 0; + // ensure wmesg is consistent with a thread waiting in wait4 + assert(!strcmp(ut->uu_wmesg, "waitcoll") || !strcmp(ut->uu_wmesg, "wait")); + struct wait4_nocancel_args *args = ut->uu_save.uus_wait4_data.args; + // May not actually contain a pid; this is just the argument to wait4. + // See man wait4 for other valid wait4 arguments. 
+ waitinfo->owner = args->pid; +} + diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index 952b6f8fb..2fb8a03d6 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -158,7 +158,13 @@ extern boolean_t task_is_exec_copy(task_t); thread_t cloneproc(task_t, coalition_t *, proc_t, int, int); proc_t forkproc(proc_t); void forkproc_free(proc_t); -thread_t fork_create_child(task_t parent_task, coalition_t *parent_coalitions, proc_t child, int inherit_memory, int is64bit, int in_exec); +thread_t fork_create_child(task_t parent_task, + coalition_t *parent_coalitions, + proc_t child, + int inherit_memory, + int is_64bit_addr, + int is_64bit_data, + int in_exec); void proc_vfork_begin(proc_t parent_proc); void proc_vfork_end(proc_t parent_proc); @@ -738,14 +744,15 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval) * * Parameters: parent_task parent task * parent_coalitions parent's set of coalitions - * child_proc child process + * child_proc child process * inherit_memory TRUE, if the parents address space is - * to be inherited by the child - * is64bit TRUE, if the child being created will - * be associated with a 64 bit process - * rather than a 32 bit process - * in_exec TRUE, if called from execve or posix spawn set exec - * FALSE, if called from fork or vfexec + * to be inherited by the child + * is_64bit_addr TRUE, if the child being created will + * be associated with a 64 bit address space + * is_64bit_data TRUE if the child being created will use a + 64-bit register state + * in_exec TRUE, if called from execve or posix spawn set exec + * FALSE, if called from fork or vfexec * * Note: This code is called in the fork() case, from the execve() call * graph, if implementing an execve() following a vfork(), from @@ -764,7 +771,13 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval) * in this case, 'inherit_memory' MUST be FALSE. 
*/ thread_t -fork_create_child(task_t parent_task, coalition_t *parent_coalitions, proc_t child_proc, int inherit_memory, int is64bit, int in_exec) +fork_create_child(task_t parent_task, + coalition_t *parent_coalitions, + proc_t child_proc, + int inherit_memory, + int is_64bit_addr, + int is_64bit_data, + int in_exec) { thread_t child_thread = NULL; task_t child_task; @@ -774,7 +787,8 @@ fork_create_child(task_t parent_task, coalition_t *parent_coalitions, proc_t chi result = task_create_internal(parent_task, parent_coalitions, inherit_memory, - is64bit, + is_64bit_addr, + is_64bit_data, TF_LRETURNWAIT | TF_LRETURNWAITER, /* All created threads will wait in task_wait_to_return */ in_exec ? TPF_EXEC_COPY : TPF_NONE, /* Mark the task exec copy if in execve */ &child_task); @@ -968,7 +982,26 @@ cloneproc(task_t parent_task, coalition_t *parent_coalitions, proc_t parent_proc goto bad; } - child_thread = fork_create_child(parent_task, parent_coalitions, child_proc, inherit_memory, parent_proc->p_flag & P_LP64, FALSE); + /* + * In the case where the parent_task is TASK_NULL (during the init path) + * we make the assumption that the register size will be the same as the + * address space size since there's no way to determine the possible + * register size until an image is exec'd. + * + * The only architecture that has different address space and register sizes + * (arm64_32) isn't being used within kernel-space, so the above assumption + * always holds true for the init path. + */ + const int parent_64bit_addr = parent_proc->p_flag & P_LP64; + const int parent_64bit_data = (parent_task == TASK_NULL) ? 
parent_64bit_addr : task_get_64bit_data(parent_task); + + child_thread = fork_create_child(parent_task, + parent_coalitions, + child_proc, + inherit_memory, + parent_64bit_addr, + parent_64bit_data, + FALSE); if (child_thread == NULL) { /* @@ -980,11 +1013,9 @@ cloneproc(task_t parent_task, coalition_t *parent_coalitions, proc_t parent_proc } child_task = get_threadtask(child_thread); - if (parent_proc->p_flag & P_LP64) { - task_set_64bit(child_task, TRUE); + if (parent_64bit_addr) { OSBitOrAtomic(P_LP64, (UInt32 *)&child_proc->p_flag); } else { - task_set_64bit(child_task, FALSE); OSBitAndAtomic(~((uint32_t)P_LP64), (UInt32 *)&child_proc->p_flag); } @@ -1110,7 +1141,10 @@ forkproc_free(proc_t p) /* Free allocated memory */ FREE_ZONE(p->p_sigacts, sizeof *p->p_sigacts, M_SIGACTS); + p->p_sigacts = NULL; FREE_ZONE(p->p_stats, sizeof *p->p_stats, M_PSTATS); + p->p_stats = NULL; + proc_checkdeadrefs(p); FREE_ZONE(p, sizeof *p, M_PROC); } @@ -1162,6 +1196,7 @@ forkproc(proc_t parent_proc) if (child_proc->p_sigacts == NULL) { printf("forkproc: M_SUBPROC zone exhausted (p_sigacts)\n"); FREE_ZONE(child_proc->p_stats, sizeof *child_proc->p_stats, M_PSTATS); + child_proc->p_stats = NULL; FREE_ZONE(child_proc, sizeof *child_proc, M_PROC); child_proc = NULL; goto bad; @@ -1171,7 +1206,9 @@ forkproc(proc_t parent_proc) child_proc->p_rcall = thread_call_allocate((thread_call_func_t)realitexpire, child_proc); if (child_proc->p_rcall == NULL) { FREE_ZONE(child_proc->p_sigacts, sizeof *child_proc->p_sigacts, M_SIGACTS); + child_proc->p_sigacts = NULL; FREE_ZONE(child_proc->p_stats, sizeof *child_proc->p_stats, M_PSTATS); + child_proc->p_stats = NULL; FREE_ZONE(child_proc, sizeof *child_proc, M_PROC); child_proc = NULL; goto bad; @@ -1266,7 +1303,7 @@ forkproc(proc_t parent_proc) if (parent_proc->p_flag & P_PROFIL) startprofclock(child_proc); - child_proc->p_vfs_iopolicy = (parent_proc->p_vfs_iopolicy & (P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY)); + child_proc->p_vfs_iopolicy = 
(parent_proc->p_vfs_iopolicy & (P_VFS_IOPOLICY_VALID_MASK)); /* * Note that if the current thread has an assumed identity, this @@ -1416,7 +1453,7 @@ forkproc(proc_t parent_proc) child_proc->p_memstat_memlimit_active = 0; child_proc->p_memstat_memlimit_inactive = 0; #if CONFIG_FREEZE - child_proc->p_memstat_suspendedfootprint = 0; + child_proc->p_memstat_freeze_sharedanon_pages = 0; #endif child_proc->p_memstat_dirty = 0; child_proc->p_memstat_idledeadline = 0; @@ -1646,12 +1683,8 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info) */ assert(uth->uu_ar == NULL); - if (uth->uu_kqueue_bound) { - kevent_qos_internal_unbind(p, - 0, /* didn't save qos_class */ - uth->uu_thread, - uth->uu_kqueue_flags); - assert(uth->uu_kqueue_override_is_sync == 0); + if (uth->uu_kqr_bound) { + kqueue_threadreq_unbind(p, uth->uu_kqr_bound); } sel = &uth->uu_select; diff --git a/bsd/kern/kern_guarded.c b/bsd/kern/kern_guarded.c index ea583e9cf..795eb5667 100644 --- a/bsd/kern/kern_guarded.c +++ b/bsd/kern/kern_guarded.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * Copyright (c) 2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,6 +53,7 @@ #include #include #include +#include #endif @@ -1202,11 +1203,39 @@ vng_file_label_destroy(struct label *label) lck_rw_unlock_exclusive(&llock); } +static os_reason_t +vng_reason_from_pathname(const char *path, uint32_t pathlen) +{ + os_reason_t r = os_reason_create(OS_REASON_GUARD, GUARD_REASON_VNODE); + if (NULL == r) + return (r); + /* + * If the pathname is very long, just keep the trailing part + */ + const uint32_t pathmax = 3 * EXIT_REASON_USER_DESC_MAX_LEN / 4; + if (pathlen > pathmax) { + path += (pathlen - pathmax); + pathlen = pathmax; + } + uint32_t rsize = kcdata_estimate_required_buffer_size(1, pathlen); + if (0 == os_reason_alloc_buffer(r, rsize)) { + struct kcdata_descriptor *kcd = &r->osr_kcd_descriptor; + mach_vm_address_t addr; + if (kcdata_get_memory_addr(kcd, + EXIT_REASON_USER_DESC, pathlen, &addr) == KERN_SUCCESS) { + kcdata_memcpy(kcd, addr, path, pathlen); + return (r); + } + } + os_reason_free(r); + return (OS_REASON_NULL); +} + static int vng_policy_flags; static int vng_guard_violation(const struct vng_info *vgi, - unsigned opval, const char *nm) + unsigned opval, vnode_t vp) { int retval = 0; @@ -1215,7 +1244,7 @@ vng_guard_violation(const struct vng_info *vgi, retval = EPERM; } - if (vng_policy_flags & kVNG_POLICY_LOGMSG) { + if (vng_policy_flags & (kVNG_POLICY_LOGMSG|kVNG_POLICY_UPRINTMSG)) { /* log a message */ const char *op; switch (opval) { @@ -1244,16 +1273,33 @@ vng_guard_violation(const struct vng_info *vgi, op = "(unknown)"; break; } + + const char *nm = vnode_getname(vp); proc_t p = current_proc(); const struct vng_owner *vgo; TAILQ_FOREACH(vgo, &vgi->vgi_owners, vgo_link) { - printf("%s[%d]: %s%s: '%s' guarded by %s[%d] (0x%llx)\n", - proc_name_address(p), proc_pid(p), op, - 0 != retval ? " denied" : "", - NULL != nm ? 
nm : "(unknown)", - proc_name_address(vgo->vgo_p), proc_pid(vgo->vgo_p), - vgi->vgi_guard); + const char fmt[] = + "%s[%d]: %s%s: '%s' guarded by %s[%d] (0x%llx)\n"; + + if (vng_policy_flags & kVNG_POLICY_LOGMSG) { + printf(fmt, + proc_name_address(p), proc_pid(p), op, + 0 != retval ? " denied" : "", + NULL != nm ? nm : "(unknown)", + proc_name_address(vgo->vgo_p), + proc_pid(vgo->vgo_p), vgi->vgi_guard); + } + if (vng_policy_flags & kVNG_POLICY_UPRINTMSG) { + uprintf(fmt, + proc_name_address(p), proc_pid(p), op, + 0 != retval ? " denied" : "", + NULL != nm ? nm : "(unknown)", + proc_name_address(vgo->vgo_p), + proc_pid(vgo->vgo_p), vgi->vgi_guard); + } } + if (NULL != nm) + vnode_putname(nm); } if (vng_policy_flags & (kVNG_POLICY_EXC|kVNG_POLICY_EXC_CORPSE)) { @@ -1270,8 +1316,20 @@ vng_guard_violation(const struct vng_info *vgi, subcode = vgi->vgi_guard; if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) { - task_violated_guard(code, subcode, NULL); - /* not fatal */ + char *path; + int len = MAXPATHLEN; + MALLOC(path, char *, len, M_TEMP, M_WAITOK); + os_reason_t r = NULL; + if (NULL != path) { + vn_getpath(vp, path, &len); + if (*path && len) + r = vng_reason_from_pathname(path, len); + } + task_violated_guard(code, subcode, r); /* not fatal */ + if (NULL != r) + os_reason_free(r); + if (NULL != path) + FREE(path, M_TEMP); } else { thread_t t = current_thread(); thread_guard_violation(t, code, subcode); @@ -1281,11 +1339,11 @@ vng_guard_violation(const struct vng_info *vgi, psignal(p, SIGKILL); } - return retval; + return (retval); } /* - * A vnode guard was tripped on this thread. + * A fatal vnode guard was tripped on this thread. * * (Invoked before returning to userland from the syscall handler.) 
*/ @@ -1305,11 +1363,11 @@ vn_guard_ast(thread_t __unused t, static int vng_vnode_check_rename(kauth_cred_t __unused cred, struct vnode *__unused dvp, struct label *__unused dlabel, - struct vnode *__unused vp, struct label *label, - struct componentname *cnp, + struct vnode *vp, struct label *label, + struct componentname *__unused cnp, struct vnode *__unused tdvp, struct label *__unused tdlabel, - struct vnode *__unused tvp, struct label *tlabel, - struct componentname *tcnp) + struct vnode *tvp, struct label *tlabel, + struct componentname *__unused tcnp) { int error = 0; if (NULL != label || NULL != tlabel) { @@ -1317,17 +1375,16 @@ vng_vnode_check_rename(kauth_cred_t __unused cred, const struct vng_info *vgi = vng_lbl_get_withattr(label, VNG_RENAME_FROM); if (NULL != vgi) - error = vng_guard_violation(vgi, - VNG_RENAME_FROM, cnp->cn_nameptr); + error = vng_guard_violation(vgi, VNG_RENAME_FROM, vp); if (0 == error) { vgi = vng_lbl_get_withattr(tlabel, VNG_RENAME_TO); if (NULL != vgi) error = vng_guard_violation(vgi, - VNG_RENAME_TO, tcnp->cn_nameptr); + VNG_RENAME_TO, tvp); } lck_rw_unlock_shared(&llock); } - return error; + return (error); } static int @@ -1340,21 +1397,17 @@ vng_vnode_check_link(kauth_cred_t __unused cred, lck_rw_lock_shared(&llock); const struct vng_info *vgi = vng_lbl_get_withattr(label, VNG_LINK); - if (vgi) { - const char *nm = vnode_getname(vp); - error = vng_guard_violation(vgi, VNG_LINK, nm); - if (nm) - vnode_putname(nm); - } + if (vgi) + error = vng_guard_violation(vgi, VNG_LINK, vp); lck_rw_unlock_shared(&llock); } - return error; + return (error); } static int vng_vnode_check_unlink(kauth_cred_t __unused cred, struct vnode *__unused dvp, struct label *__unused dlabel, - struct vnode *__unused vp, struct label *label, struct componentname *cnp) + struct vnode *vp, struct label *label, struct componentname *__unused cnp) { int error = 0; if (NULL != label) { @@ -1362,11 +1415,10 @@ vng_vnode_check_unlink(kauth_cred_t __unused cred, 
const struct vng_info *vgi = vng_lbl_get_withattr(label, VNG_UNLINK); if (vgi) - error = vng_guard_violation(vgi, VNG_UNLINK, - cnp->cn_nameptr); + error = vng_guard_violation(vgi, VNG_UNLINK, vp); lck_rw_unlock_shared(&llock); } - return error; + return (error); } /* @@ -1388,16 +1440,12 @@ vng_vnode_check_write(kauth_cred_t __unused actv_cred, if (vgo->vgo_p == p) goto done; } - const char *nm = vnode_getname(vp); - error = vng_guard_violation(vgi, - VNG_WRITE_OTHER, nm); - if (nm) - vnode_putname(nm); + error = vng_guard_violation(vgi, VNG_WRITE_OTHER, vp); } done: lck_rw_unlock_shared(&llock); } - return error; + return (error); } /* @@ -1420,11 +1468,7 @@ vng_vnode_check_truncate(kauth_cred_t __unused actv_cred, if (vgo->vgo_p == p) goto done; } - const char *nm = vnode_getname(vp); - error = vng_guard_violation(vgi, - VNG_TRUNC_OTHER, nm); - if (nm) - vnode_putname(nm); + error = vng_guard_violation(vgi, VNG_TRUNC_OTHER, vp); } done: lck_rw_unlock_shared(&llock); @@ -1442,26 +1486,28 @@ vng_vnode_check_exchangedata(kauth_cred_t __unused cred, lck_rw_lock_shared(&llock); const struct vng_info *vgi = vng_lbl_get_withattr(flabel, VNG_EXCHDATA); - if (NULL != vgi) { - const char *nm = vnode_getname(fvp); - error = vng_guard_violation(vgi, - VNG_EXCHDATA, nm); - if (nm) - vnode_putname(nm); - } + if (NULL != vgi) + error = vng_guard_violation(vgi, VNG_EXCHDATA, fvp); if (0 == error) { vgi = vng_lbl_get_withattr(slabel, VNG_EXCHDATA); - if (NULL != vgi) { - const char *nm = vnode_getname(svp); + if (NULL != vgi) error = vng_guard_violation(vgi, - VNG_EXCHDATA, nm); - if (nm) - vnode_putname(nm); - } + VNG_EXCHDATA, svp); } lck_rw_unlock_shared(&llock); } - return error; + return (error); +} + +/* Intercept open-time truncations (by "other") of a guarded vnode */ + +static int +vng_vnode_check_open(kauth_cred_t cred, + struct vnode *vp, struct label *label, int acc_mode) +{ + if (0 == (acc_mode & O_TRUNC)) + return (0); + return (vng_vnode_check_truncate(cred, NULL, 
vp, label)); } /* @@ -1484,6 +1530,7 @@ SECURITY_READ_ONLY_EARLY(static struct mac_policy_ops) vng_policy_ops = { .mpo_vnode_check_write = vng_vnode_check_write, .mpo_vnode_check_truncate = vng_vnode_check_truncate, .mpo_vnode_check_exchangedata = vng_vnode_check_exchangedata, + .mpo_vnode_check_open = vng_vnode_check_open, .mpo_policy_syscall = vng_policy_syscall, .mpo_policy_init = vng_init, @@ -1513,7 +1560,8 @@ vnguard_policy_init(void) { if (0 == PE_i_can_has_debugger(NULL)) return; - vng_policy_flags = kVNG_POLICY_LOGMSG | kVNG_POLICY_EXC_CORPSE; + vng_policy_flags = kVNG_POLICY_LOGMSG | + kVNG_POLICY_EXC_CORPSE | kVNG_POLICY_UPRINTMSG; PE_parse_boot_argn("vnguard", &vng_policy_flags, sizeof (vng_policy_flags)); if (vng_policy_flags) mac_policy_register(&vng_policy_conf, &vng_policy_handle, NULL); diff --git a/bsd/kern/kern_kpc.c b/bsd/kern/kern_kpc.c index 098b7349f..96700a924 100644 --- a/bsd/kern/kern_kpc.c +++ b/bsd/kern/kern_kpc.c @@ -66,24 +66,10 @@ static lck_grp_attr_t *sysctl_lckgrp_attr = NULL; static lck_grp_t *sysctl_lckgrp = NULL; static lck_mtx_t sysctl_lock; -#if defined(__x86_64__) -/* 18 cores, 7 counters each */ -#define KPC_MAX_COUNTERS_COPIED (18 * 7) -#elif defined(__arm64__) -#include -#if defined(CPU_COUNT) -#define KPC_MAX_COUNTERS_COPIED (CPU_COUNT * 10) -#else /* defined(CPU_COUNT) */ -#define KPC_MAX_COUNTERS_COPIED (2 * 10) -#endif /* !defined(CPU_COUNT) */ -#elif defined(__arm__) -#define KPC_MAX_COUNTERS_COPIED (16) -#else /* !defined(__arm__) && !defined(__arm64__) && !defined(__x86_64__) */ -#error "unknown architecture for kpc buffer sizes" -#endif /* !defined(__arm__) && !defined(__arm64__) && !defined(__x86_64__) */ - -static_assert((KPC_MAX_COUNTERS_COPIED * sizeof(uint64_t)) < 1024, - "kpc's stack could grow too large"); +/* + * Another element is needed to hold the CPU number when getting counter values. 
+ */ +#define KPC_MAX_BUF_LEN (KPC_MAX_COUNTERS_COPIED + 1) typedef int (*setget_func_t)(int); @@ -101,6 +87,29 @@ kpc_init(void) kpc_initted = 1; } +static uint64_t * +kpc_get_bigarray(uint32_t *size_out) +{ + static uint64_t *bigarray = NULL; + + LCK_MTX_ASSERT(&sysctl_lock, LCK_MTX_ASSERT_OWNED); + + uint32_t size = kpc_get_counterbuf_size() + sizeof(uint64_t); + *size_out = size; + + if (bigarray) { + return bigarray; + } + + /* + * Another element is needed to hold the CPU number when getting counter + * values. + */ + bigarray = kalloc_tag(size, VM_KERN_MEMORY_DIAG); + assert(bigarray != NULL); + return bigarray; +} + /* abstract sysctl handlers */ static int sysctl_get_int( struct sysctl_oid *oidp, struct sysctl_req *req, @@ -276,8 +285,8 @@ static int sysctl_get_bigarray(struct sysctl_req *req, int (*get_fn)(uint32_t, uint32_t*, void*)) { - uint64_t buf[KPC_MAX_COUNTERS_COPIED] = {}; - uint32_t bufsize = sizeof(buf); + uint32_t bufsize = 0; + uint64_t *buf = kpc_get_bigarray(&bufsize); uint32_t arg = 0; /* get the argument */ @@ -286,9 +295,9 @@ sysctl_get_bigarray(struct sysctl_req *req, return error; } - error = get_fn(arg, &bufsize, &buf); + error = get_fn(arg, &bufsize, buf); if (!error) { - error = SYSCTL_OUT(req, &buf, bufsize); + error = SYSCTL_OUT(req, buf, bufsize); } return error; @@ -318,10 +327,11 @@ sysctl_getset_bigarray(struct sysctl_req *req, int (*size_fn)(uint32_t arg), int (*get_fn)(uint32_t, void*), int (*set_fn)(uint32_t, void*)) { int error = 0; - uint64_t buf[KPC_MAX_COUNTERS_COPIED] = {}; - uint32_t bufsize = sizeof(buf); uint64_t arg; + uint32_t bufsize = 0; + uint64_t *buf = kpc_get_bigarray(&bufsize); + /* get the config word */ error = SYSCTL_IN(req, &arg, sizeof(arg)); if (error) { @@ -337,11 +347,11 @@ sysctl_getset_bigarray(struct sysctl_req *req, int (*size_fn)(uint32_t arg), /* if writing */ if (req->newptr) { /* copy the rest -- SYSCTL_IN knows the copyin should be shifted */ - error = SYSCTL_IN(req, &buf, regsize); + error 
= SYSCTL_IN(req, buf, regsize); /* SYSCTL_IN failure means only need to read */ if (!error) { - error = set_fn((uint32_t)arg, &buf); + error = set_fn((uint32_t)arg, buf); if (error) { return error; } @@ -350,12 +360,12 @@ sysctl_getset_bigarray(struct sysctl_req *req, int (*size_fn)(uint32_t arg), /* if reading */ if (req->oldptr) { - error = get_fn((uint32_t)arg, &buf); + error = get_fn((uint32_t)arg, buf); if (error) { return error; } - error = SYSCTL_OUT(req, &buf, regsize); + error = SYSCTL_OUT(req, buf, regsize); } return error; @@ -369,8 +379,13 @@ kpc_sysctl SYSCTL_HANDLER_ARGS // __unused struct sysctl_oid *unused_oidp = oidp; (void)arg2; - if( !kpc_initted ) + if (!kpc_initted) { panic("kpc_init not called"); + } + + if (!kpc_supported) { + return ENOTSUP; + } ktrace_lock(); @@ -486,74 +501,74 @@ SYSCTL_NODE(, OID_AUTO, kpc, CTLFLAG_RW|CTLFLAG_LOCKED, 0, /* values */ SYSCTL_PROC(_kpc, OID_AUTO, classes, - CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_ANYBODY, + CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_CLASSES, sizeof(int), kpc_sysctl, "I", "Available classes"); SYSCTL_PROC(_kpc, OID_AUTO, counting, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_COUNTING, sizeof(int), kpc_sysctl, "I", "PMCs counting"); SYSCTL_PROC(_kpc, OID_AUTO, thread_counting, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_THREAD_COUNTING, sizeof(int), kpc_sysctl, "I", "Thread accumulation"); SYSCTL_PROC(_kpc, OID_AUTO, pmu_version, - CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_ANYBODY, + CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void *)REQ_PMU_VERSION, sizeof(int), kpc_sysctl, "I", "PMU version for hardware"); /* faux values */ SYSCTL_PROC(_kpc, OID_AUTO, config_count, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, 
(void*)REQ_CONFIG_COUNT, sizeof(int), kpc_sysctl, "S", "Config count"); SYSCTL_PROC(_kpc, OID_AUTO, counter_count, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_COUNTER_COUNT, sizeof(int), kpc_sysctl, "S", "Counter count"); SYSCTL_PROC(_kpc, OID_AUTO, sw_inc, - CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_SW_INC, sizeof(int), kpc_sysctl, "S", "Software increment"); /* arrays */ SYSCTL_PROC(_kpc, OID_AUTO, thread_counters, - CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY, + CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_THREAD_COUNTERS, sizeof(uint64_t), kpc_sysctl, "QU", "Current thread counters"); SYSCTL_PROC(_kpc, OID_AUTO, counters, - CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY, + CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_COUNTERS, sizeof(uint64_t), kpc_sysctl, "QU", "Current counters"); SYSCTL_PROC(_kpc, OID_AUTO, shadow_counters, - CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY, + CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_SHADOW_COUNTERS, sizeof(uint64_t), kpc_sysctl, "QU", "Current shadow counters"); SYSCTL_PROC(_kpc, OID_AUTO, config, - CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY, + CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_CONFIG, sizeof(uint64_t), kpc_sysctl, "QU", "Set counter configs"); SYSCTL_PROC(_kpc, OID_AUTO, period, - CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY, + CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_PERIOD, sizeof(uint64_t), kpc_sysctl, "QU", "Set counter periods"); SYSCTL_PROC(_kpc, OID_AUTO, actionid, - CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY, + CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED, (void*)REQ_ACTIONID, sizeof(uint32_t), kpc_sysctl, "QU", "Set counter actionids"); diff --git a/bsd/kern/kern_lockf.c 
b/bsd/kern/kern_lockf.c index 5284f060c..32ab96fd7 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -330,12 +330,10 @@ lf_advlock(struct vnop_advlock_args *ap) FREE(lock, M_LOCKF); break; -#if CONFIG_EMBEDDED case F_GETLKPID: error = lf_getlock(lock, fl, fl->l_pid); FREE(lock, M_LOCKF); break; -#endif default: FREE(lock, M_LOCKF); diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index 6e28f3f77..565839738 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -303,7 +303,8 @@ const char *memname[] = { "Event Handler",/* 125 M_EVENTHANDLER */ "Link Layer Table", /* 126 M_LLTABLE */ "Network Work Queue", /* 127 M_NWKWQ */ - "" + "Content Filter", /* 128 M_CFIL */ + "" }; /* for use with kmzones.kz_zalloczone */ @@ -491,6 +492,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 125 M_EVENTHANDLER */ { 0, KMZ_MALLOC, FALSE }, /* 126 M_LLTABLE */ { 0, KMZ_MALLOC, FALSE }, /* 127 M_NWKWQ */ + { 0, KMZ_MALLOC, FALSE }, /* 128 M_CFIL */ #undef SOS #undef SOX }; diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index a2de71f0f..f52c05c1a 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -71,20 +71,23 @@ #include #include +#include /* For logging clarity */ static const char *memorystatus_kill_cause_name[] = { - "" , - "jettisoned" , /* kMemorystatusKilled */ - "highwater" , /* kMemorystatusKilledHiwat */ - "vnode-limit" , /* kMemorystatusKilledVnodes */ - "vm-pageshortage" , /* kMemorystatusKilledVMPageShortage */ - "vm-thrashing" , /* kMemorystatusKilledVMThrashing */ - "fc-thrashing" , /* kMemorystatusKilledFCThrashing */ - "per-process-limit" , /* kMemorystatusKilledPerProcessLimit */ - "diagnostic" , /* kMemorystatusKilledDiagnostic */ - "idle-exit" , /* kMemorystatusKilledIdleExit */ - "zone-map-exhaustion" , /* kMemorystatusKilledZoneMapExhaustion */ + "" , /* kMemorystatusInvalid */ + "jettisoned" , /* kMemorystatusKilled */ + "highwater" , /* kMemorystatusKilledHiwat */ + 
"vnode-limit" , /* kMemorystatusKilledVnodes */ + "vm-pageshortage" , /* kMemorystatusKilledVMPageShortage */ + "proc-thrashing" , /* kMemorystatusKilledProcThrashing */ + "fc-thrashing" , /* kMemorystatusKilledFCThrashing */ + "per-process-limit" , /* kMemorystatusKilledPerProcessLimit */ + "disk-space-shortage" , /* kMemorystatusKilledDiskSpaceShortage */ + "idle-exit" , /* kMemorystatusKilledIdleExit */ + "zone-map-exhaustion" , /* kMemorystatusKilledZoneMapExhaustion */ + "vm-compressor-thrashing" , /* kMemorystatusKilledVMCompressorThrashing */ + "vm-compressor-space-shortage" , /* kMemorystatusKilledVMCompressorSpaceShortage */ }; static const char * @@ -115,8 +118,9 @@ static boolean_t is_reason_thrashing(unsigned cause) { switch (cause) { - case kMemorystatusKilledVMThrashing: case kMemorystatusKilledFCThrashing: + case kMemorystatusKilledVMCompressorThrashing: + case kMemorystatusKilledVMCompressorSpaceShortage: return TRUE; default: return FALSE; @@ -280,12 +284,11 @@ boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear); void memorystatus_send_low_swap_note(void); -int memorystatus_wakeup = 0; - unsigned int memorystatus_level = 0; static int memorystatus_list_count = 0; + #define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1) typedef struct memstat_bucket { @@ -303,8 +306,16 @@ int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1; int applications_aging_band = JETSAM_PRIORITY_IDLE; #define isProcessInAgingBands(p) ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band))) -#define isApp(p) (! 
(p->p_memstat_dirty & P_DIRTY_TRACK)) -#define isSysProc(p) ((p->p_memstat_dirty & P_DIRTY_TRACK)) + +/* + * Checking the p_memstat_state almost always requires the proc_list_lock + * because the jetsam thread could be on the other core changing the state. + * + * App -- almost always managed by a system process. Always have dirty tracking OFF. Can include extensions too. + * System Processes -- not managed by anybody. Always have dirty tracking ON. Can include extensions (here) too. + */ +#define isApp(p) ((p->p_memstat_state & P_MEMSTAT_MANAGED) || ! (p->p_memstat_dirty & P_DIRTY_TRACK)) +#define isSysProc(p) ( ! (p->p_memstat_state & P_MEMSTAT_MANAGED) || (p->p_memstat_dirty & P_DIRTY_TRACK)) #define kJetsamAgingPolicyNone (0) #define kJetsamAgingPolicyLegacy (1) @@ -598,9 +609,12 @@ static uint32_t kill_under_pressure_cause = 0; * default jetsam snapshot support */ static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot; +static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy; #define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries static unsigned int memorystatus_jetsam_snapshot_count = 0; +static unsigned int memorystatus_jetsam_snapshot_copy_count = 0; static unsigned int memorystatus_jetsam_snapshot_max = 0; +static unsigned int memorystatus_jetsam_snapshot_size = 0; static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0; static uint64_t memorystatus_jetsam_snapshot_timeout = 0; #define JETSAM_SNAPSHOT_TIMEOUT_SECS 30 @@ -615,7 +629,7 @@ static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memory static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime); static void memorystatus_clear_errors(void); -static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages); +static void memorystatus_get_task_page_counts(task_t task, 
uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages); static void memorystatus_get_task_phys_footprint_page_counts(task_t task, uint64_t *internal_pages, uint64_t *internal_compressed_pages, uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages, @@ -629,7 +643,7 @@ static uint32_t memorystatus_build_state(proc_t p); static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, uint32_t *errors); static boolean_t memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors); -static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors); +static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors); static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged); static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause); @@ -691,7 +705,14 @@ int32_t max_kill_priority = JETSAM_PRIORITY_IDLE; #endif /* CONFIG_JETSAM */ unsigned int memorystatus_frozen_count = 0; +unsigned int memorystatus_frozen_processes_max = 0; +unsigned int memorystatus_frozen_shared_mb = 0; +unsigned int memorystatus_frozen_shared_mb_max = 0; +unsigned int memorystatus_freeze_shared_mb_per_process_max = 0; /* Max. MB allowed per process to be freezer-eligible. */ +unsigned int memorystatus_freeze_private_shared_pages_ratio = 2; /* Ratio of private:shared pages for a process to be freezer-eligible. */ unsigned int memorystatus_suspended_count = 0; +unsigned int memorystatus_thaw_count = 0; +unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. 
have state on disk & in-memory */ #if VM_PRESSURE_EVENTS @@ -715,6 +736,21 @@ boolean_t memorystatus_hwm_candidates = 0; static int memorystatus_send_note(int event_code, void *data, size_t data_length); +/* + * This value is the threshold that a process must meet to be considered for scavenging. + */ +#if CONFIG_EMBEDDED +#define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */ +#else /* CONFIG_EMBEDDED */ +#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */ +#endif /* CONFIG_EMBEDDED */ + +uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE; + +#if DEVELOPMENT || DEBUG +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, ""); +#endif /* DEVELOPMENT || DEBUG */ + #endif /* VM_PRESSURE_EVENTS */ @@ -728,12 +764,24 @@ extern boolean_t kill_on_no_paging_space; #endif /* DEVELOPMENT || DEBUG */ +/* + * Table that expresses the probability of a process + * being used in the next hour. + */ +typedef struct memorystatus_internal_probabilities { + char proc_name[MAXCOMLEN + 1]; + int use_probability; +} memorystatus_internal_probabilities_t; + +static memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL; +static size_t memorystatus_global_probabilities_size = 0; + /* Freeze */ #if CONFIG_FREEZE - boolean_t memorystatus_freeze_enabled = FALSE; int memorystatus_freeze_wakeup = 0; +int memorystatus_freeze_jetsam_band = 0; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */ lck_grp_attr_t *freezer_lck_grp_attr; lck_grp_t *freezer_lck_grp; @@ -741,8 +789,11 @@ static lck_mtx_t freezer_mutex; static inline boolean_t memorystatus_can_freeze_processes(void); static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low); - +static boolean_t memorystatus_is_process_eligible_for_freeze(proc_t p); static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused); +static boolean_t 
memorystatus_freeze_thread_should_run(void); + +void memorystatus_disable_freeze(void); /* Thresholds */ static unsigned int memorystatus_freeze_threshold = 0; @@ -753,24 +804,37 @@ static unsigned int memorystatus_freeze_pages_max = 0; static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT; static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT; +static uint64_t memorystatus_freeze_budget_pages_remaining = 0; //remaining # of pages that can be frozen to disk +static boolean_t memorystatus_freeze_degradation = FALSE; //protected by the freezer mutex. Signals we are in a degraded freeze mode. + +static unsigned int memorystatus_max_frozen_demotions_daily = 0; +static unsigned int memorystatus_thaw_count_demotion_threshold = 0; /* Stats */ -static uint64_t memorystatus_freeze_count = 0; static uint64_t memorystatus_freeze_pageouts = 0; /* Throttling */ +#define DEGRADED_WINDOW_MINS (30) +#define NORMAL_WINDOW_MINS (24 * 60) + static throttle_interval_t throttle_intervals[] = { - { 60, 8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */ - { 24 * 60, 1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */ + { DEGRADED_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, + { NORMAL_WINDOW_MINS, 1, 0, 0, { 0, 0 }}, }; +throttle_interval_t *degraded_throttle_window = &throttle_intervals[0]; +throttle_interval_t *normal_throttle_window = &throttle_intervals[1]; -static uint64_t memorystatus_freeze_throttle_count = 0; +extern uint64_t vm_swap_get_free_space(void); +extern boolean_t vm_swap_max_budget(uint64_t *); -static unsigned int memorystatus_suspended_footprint_total = 0; /* pages */ +static void memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed); -extern uint64_t vm_swap_get_free_space(void); +static uint64_t memorystatus_freezer_thread_next_run_ts = 0; -static boolean_t memorystatus_freeze_update_throttle(void); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, 
CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_frozen_count, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_budget_pages_remaining, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_budget_pages_remaining, ""); #endif /* CONFIG_FREEZE */ @@ -1135,18 +1199,41 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|C #if CONFIG_FREEZE +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, ""); -SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW|CTLFLAG_LOCKED, 
&memorystatus_frozen_processes_max, 0, ""); + +/* + * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band. + * "0" means no limit. + * Default is 10% of system-wide task limit. + */ + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, ""); + SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, ""); +/* + * max. # of frozen process demotions we will allow in our daily cycle. + */ +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, ""); +/* + * min # of thaws needed by a process to protect it from getting demoted into the IDLE band. 
+ */ +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, ""); + boolean_t memorystatus_freeze_throttle_enabled = TRUE; SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, ""); @@ -1160,8 +1247,10 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS #pragma unused(arg1, arg2) int error, pid = 0; proc_t p; + int freezer_error_code = 0; if (memorystatus_freeze_enabled == FALSE) { + printf("sysctl_freeze: Freeze is DISABLED\n"); return ENOTSUP; } @@ -1179,21 +1268,22 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS p = proc_find(pid); if (p != NULL) { - uint32_t purgeable, wired, clean, dirty; - boolean_t shared; - uint32_t max_pages = 0; + uint32_t purgeable, wired, clean, dirty, shared; + uint32_t max_pages = 0, state = 0; if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - - unsigned int avail_swap_space = 0; /* in pages. */ - /* * Freezer backed by the compressor and swap file(s) - * while will hold compressed data. + * will hold compressed data. + * + * We don't care about the global freezer budget or the process's (min/max) budget here. + * The freeze sysctl is meant to force-freeze a process. + * + * We also don't update any global or process stats on this path, so that the jetsam/ freeze + * logic remains unaffected. The tasks we're performing here are: freeze the process, set the + * P_MEMSTAT_FROZEN bit, and elevate the process to a higher band (if the freezer is active). 
*/ - avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; - - max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max); + max_pages = memorystatus_freeze_pages_max; } else { /* @@ -1202,16 +1292,87 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS max_pages = UINT32_MAX - 1; } - error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE); - proc_rele(p); + proc_list_lock(); + state = p->p_memstat_state; + proc_list_unlock(); + + /* + * The jetsam path also verifies that the process is a suspended App. We don't care about that here. + * We simply ensure that jetsam is not already working on the process and that the process has not + * explicitly disabled freezing. + */ + if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED)) { + printf("sysctl_freeze: p_memstat_state check failed, process is%s%s%s\n", + (state & P_MEMSTAT_TERMINATED) ? " terminated" : "", + (state & P_MEMSTAT_LOCKED) ? " locked" : "", + (state & P_MEMSTAT_FREEZE_DISABLED) ? " unfreezable" : ""); + + proc_rele(p); + lck_mtx_unlock(&freezer_mutex); + return EPERM; + } + + error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + + if (error) { + char reason[128]; + if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + strlcpy(reason, "too much shared memory", 128); + } + + if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + strlcpy(reason, "low private-shared pages ratio", 128); + } - if (error) - error = EIO; + if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + strlcpy(reason, "no compressor space", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + strlcpy(reason, "no swap space", 128); + } + + printf("sysctl_freeze: task_freeze failed: %s\n", reason); + + if (error == KERN_NO_SPACE) { + /* Make it easy to distinguish between failures due to low compressor/ swap space and other failures. 
*/ + error = ENOSPC; + } else { + error = EIO; + } + } else { + proc_list_lock(); + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { + p->p_memstat_state |= P_MEMSTAT_FROZEN; + memorystatus_frozen_count++; + } + p->p_memstat_frozen_count++; + + + proc_list_unlock(); + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * We elevate only if we are going to swap out the data. + */ + error = memorystatus_update_inactive_jetsam_priority_band(pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, + memorystatus_freeze_jetsam_band, TRUE); + + if (error) { + printf("sysctl_freeze: Elevating frozen process to higher jetsam band failed with %d\n", error); + } + } + } + + proc_rele(p); lck_mtx_unlock(&freezer_mutex); return error; + } else { + printf("sysctl_freeze: Invalid process\n"); } + lck_mtx_unlock(&freezer_mutex); return EINVAL; } @@ -1242,10 +1403,23 @@ sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS p = proc_find(pid); if (p != NULL) { error = task_thaw(p->task); - proc_rele(p); - if (error) + if (error) { error = EIO; + } else { + /* + * task_thaw() succeeded. + * + * We increment memorystatus_frozen_count on the sysctl freeze path. + * And so we need the P_MEMSTAT_FROZEN to decrement the frozen count + * when this process exits. 
+ * + * proc_list_lock(); + * p->p_memstat_state &= ~P_MEMSTAT_FROZEN; + * proc_list_unlock(); + */ + } + proc_rele(p); return error; } } @@ -1256,6 +1430,194 @@ sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, 0, 0, &sysctl_memorystatus_available_pages_thaw, "I", ""); +typedef struct _global_freezable_status{ + boolean_t freeze_pages_threshold_crossed; + boolean_t freeze_eligible_procs_available; + boolean_t freeze_scheduled_in_future; +}global_freezable_status_t; + +typedef struct _proc_freezable_status{ + boolean_t freeze_has_memstat_state; + boolean_t freeze_has_pages_min; + int freeze_has_probability; + boolean_t freeze_attempted; + uint32_t p_memstat_state; + uint32_t p_pages; + int p_freeze_error_code; + int p_pid; + char p_name[MAXCOMLEN + 1]; +}proc_freezable_status_t; + +#define MAX_FREEZABLE_PROCESSES 100 + +static int +memorystatus_freezer_get_status(user_addr_t buffer, size_t buffer_size, int32_t *retval) +{ + uint32_t proc_count = 0, i = 0; + global_freezable_status_t *list_head; + proc_freezable_status_t *list_entry; + size_t list_size = 0; + proc_t p; + memstat_bucket_t *bucket; + uint32_t state = 0, pages = 0, entry_count = 0; + boolean_t try_freeze = TRUE; + int error = 0, probability_of_use = 0; + + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) { + return ENOTSUP; + } + + list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES); + + if (buffer_size < list_size) { + return EINVAL; + } + + list_head = (global_freezable_status_t*)kalloc(list_size); + if (list_head == NULL) { + return ENOMEM; + } + + memset(list_head, 0, list_size); + + list_size = sizeof(global_freezable_status_t); + + proc_list_lock(); + + uint64_t curr_time = mach_absolute_time(); + + list_head->freeze_pages_threshold_crossed = (memorystatus_available_pages < memorystatus_freeze_threshold); + 
list_head->freeze_eligible_procs_available = ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold); + list_head->freeze_scheduled_in_future = (curr_time < memorystatus_freezer_thread_next_run_ts); + + list_entry = (proc_freezable_status_t*) ((uintptr_t)list_head + sizeof(global_freezable_status_t)); + + bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE]; + + entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t)); + + p = memorystatus_get_first_proc_locked(&i, FALSE); + proc_count++; + + while ((proc_count <= MAX_FREEZABLE_PROCESSES) && + (p) && + (list_size < buffer_size)) { + + if (isApp(p) == FALSE) { + p = memorystatus_get_next_proc_locked(&i, p, FALSE); + proc_count++; + continue; + } + + strlcpy(list_entry->p_name, p->p_name, MAXCOMLEN + 1); + + list_entry->p_pid = p->p_pid; + + state = p->p_memstat_state; + + if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) || + !(state & P_MEMSTAT_SUSPENDED)) { + + try_freeze = list_entry->freeze_has_memstat_state = FALSE; + } else { + try_freeze = list_entry->freeze_has_memstat_state = TRUE; + } + + list_entry->p_memstat_state = state; + + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); + if (pages < memorystatus_freeze_pages_min) { + try_freeze = list_entry->freeze_has_pages_min = FALSE; + } else { + list_entry->freeze_has_pages_min = TRUE; + if (try_freeze != FALSE) { + try_freeze = TRUE; + } + } + + list_entry->p_pages = pages; + + if (entry_count) { + uint32_t j = 0; + for (j = 0; j < entry_count; j++ ) { + if (strncmp(memorystatus_global_probabilities_table[j].proc_name, + p->p_name, + MAXCOMLEN + 1) == 0) { + + probability_of_use = memorystatus_global_probabilities_table[j].use_probability; + break; + } + } + + list_entry->freeze_has_probability = probability_of_use; + + if (probability_of_use && try_freeze != FALSE) { + try_freeze = TRUE; + } else { 
+ try_freeze = FALSE; + } + } else { + if (try_freeze != FALSE) { + try_freeze = TRUE; + } + list_entry->freeze_has_probability = -1; + } + + if (try_freeze) { + + uint32_t purgeable, wired, clean, dirty, shared; + uint32_t max_pages = 0; + int freezer_error_code = 0; + + error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, TRUE /* eval only */); + + if (error) { + list_entry->p_freeze_error_code = freezer_error_code; + } + + list_entry->freeze_attempted = TRUE; + } + + list_entry++; + + list_size += sizeof(proc_freezable_status_t); + + p = memorystatus_get_next_proc_locked(&i, p, FALSE); + proc_count++; + } + + proc_list_unlock(); + + buffer_size = list_size; + + error = copyout(list_head, buffer, buffer_size); + if (error == 0) { + *retval = buffer_size; + } else { + *retval = 0; + } + + list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES); + kfree(list_head, list_size); + + MEMORYSTATUS_DEBUG(1, "memorystatus_freezer_get_status: returning %d (%lu - size)\n", error, (unsigned long)*list_size); + + return error; +} + +static int +memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval) +{ + int err = ENOTSUP; + + if (flags == FREEZER_CONTROL_GET_STATUS) { + err = memorystatus_freezer_get_status(buffer, buffer_size, retval); + } + + return err; +} + #endif /* CONFIG_FREEZE */ #endif /* DEVELOPMENT || DEBUG */ @@ -1390,7 +1752,7 @@ static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_inde p = TAILQ_FIRST(¤t_bucket->list); while (p) { - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL); + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); max_pages = pages; max_proc = p; prev_max_proc = p; @@ -1398,7 +1760,7 @@ static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_inde while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) { /* traversing 
list until we find next largest process */ p=next_p; - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL); + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); if (pages > max_pages) { max_pages = pages; max_proc = p; @@ -1459,16 +1821,65 @@ static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc return next_p; } +/* + * Structure to hold state for a jetsam thread. + * Typically there should be a single jetsam thread + * unless parallel jetsam is enabled. + */ +struct jetsam_thread_state { + boolean_t inited; /* if the thread is initialized */ + int memorystatus_wakeup; /* wake channel */ + int index; /* jetsam thread index */ + thread_t thread; /* jetsam thread pointer */ +} *jetsam_threads; + +/* Maximum number of jetsam threads allowed */ +#define JETSAM_THREADS_LIMIT 3 + +/* Number of active jetsam threads */ +_Atomic int active_jetsam_threads = 1; + +/* Number of maximum jetsam threads configured */ +int max_jetsam_threads = JETSAM_THREADS_LIMIT; + +/* + * Global switch for enabling fast jetsam. Fast jetsam is + * hooked up via the system_override() system call. 
It has the + * following effects: + * - Raise the jetsam threshold ("clear-the-deck") + * - Enabled parallel jetsam on eligible devices + */ +int fast_jetsam_enabled = 0; + +/* Routine to find the jetsam state structure for the current jetsam thread */ +static inline struct jetsam_thread_state * +jetsam_current_thread(void) +{ + for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) { + if (jetsam_threads[thr_id].thread == current_thread()) + return &(jetsam_threads[thr_id]); + } + panic("jetsam_current_thread() is being called from a non-jetsam thread\n"); + /* Contol should not reach here */ + return NULL; +} + + __private_extern__ void memorystatus_init(void) { - thread_t thread = THREAD_NULL; kern_return_t result; int i; #if CONFIG_FREEZE + memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT; + memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX; + memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */ + memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4); memorystatus_freeze_pages_min = FREEZE_PAGES_MIN; memorystatus_freeze_pages_max = FREEZE_PAGES_MAX; + memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS; + memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD; #endif #if DEVELOPMENT || DEBUG @@ -1591,13 +2002,22 @@ memorystatus_init(void) #endif /* CONFIG_JETSAM */ memorystatus_jetsam_snapshot_max = maxproc; + + memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + + (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max); + memorystatus_jetsam_snapshot = - (memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) + - sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max); + (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size); if (!memorystatus_jetsam_snapshot) { 
panic("Could not allocate memorystatus_jetsam_snapshot"); } + memorystatus_jetsam_snapshot_copy = + (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size); + if (!memorystatus_jetsam_snapshot_copy) { + panic("Could not allocate memorystatus_jetsam_snapshot_copy"); + } + nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout); memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t)); @@ -1606,11 +2026,41 @@ memorystatus_init(void) memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta; #endif - result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread); - if (result == KERN_SUCCESS) { - thread_deallocate(thread); - } else { - panic("Could not create memorystatus_thread"); + /* Check the boot-arg to see if fast jetsam is allowed */ + if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof (fast_jetsam_enabled))) { + fast_jetsam_enabled = 0; + } + + /* Check the boot-arg to configure the maximum number of jetsam threads */ + if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof (max_jetsam_threads))) { + max_jetsam_threads = JETSAM_THREADS_LIMIT; + } + + /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */ + if (max_jetsam_threads > JETSAM_THREADS_LIMIT) { + max_jetsam_threads = JETSAM_THREADS_LIMIT; + } + + /* For low CPU systems disable fast jetsam mechanism */ + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { + max_jetsam_threads = 1; + fast_jetsam_enabled = 0; + } + + /* Initialize the jetsam_threads state array */ + jetsam_threads = kalloc(sizeof(struct jetsam_thread_state) * max_jetsam_threads); + + /* Initialize all the jetsam threads */ + for (i = 0; i < max_jetsam_threads; i++) { + + result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, 
&jetsam_threads[i].thread); + if (result == KERN_SUCCESS) { + jetsam_threads[i].inited = FALSE; + jetsam_threads[i].index = i; + thread_deallocate(jetsam_threads[i].thread); + } else { + panic("Could not create memorystatus_thread %d", i); + } } } @@ -1658,15 +2108,20 @@ memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason) { (uint64_t)memorystatus_available_pages); } + /* + * The jetsam_reason (os_reason_t) has enough information about the kill cause. + * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped. + */ int jetsam_flags = P_LTERM_JETSAM; switch (cause) { - case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break; - case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break; - case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break; - case kMemorystatusKilledVMThrashing: jetsam_flags |= P_JETSAM_VMTHRASHING; break; - case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break; - case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break; - case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break; + case kMemorystatusKilledHiwat: jetsam_flags |= P_JETSAM_HIWAT; break; + case kMemorystatusKilledVnodes: jetsam_flags |= P_JETSAM_VNODE; break; + case kMemorystatusKilledVMPageShortage: jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break; + case kMemorystatusKilledVMCompressorThrashing: + case kMemorystatusKilledVMCompressorSpaceShortage: jetsam_flags |= P_JETSAM_VMTHRASHING; break; + case kMemorystatusKilledFCThrashing: jetsam_flags |= P_JETSAM_FCTHRASHING; break; + case kMemorystatusKilledPerProcessLimit: jetsam_flags |= P_JETSAM_PID; break; + case kMemorystatusKilledIdleExit: jetsam_flags |= P_JETSAM_IDLEEXIT; break; } error = jetsam_do_kill(p, jetsam_flags, jetsam_reason); @@ -1702,7 +2157,7 @@ memorystatus_check_levels_locked(void) { */ int 
-memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, boolean_t effective_now) +memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now) { int error = 0; boolean_t enable = FALSE; @@ -1734,7 +2189,7 @@ memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, memorystatus_invalidate_idle_demotion_locked(p, TRUE); if (effective_now) { - if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_ELEVATED_INACTIVE) { + if (p->p_memstat_effectivepriority < jetsam_prio) { if(memorystatus_highwater_enabled) { /* * Process is about to transition from @@ -1746,7 +2201,7 @@ memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal); task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal); } - memorystatus_update_priority_locked(p, JETSAM_PRIORITY_ELEVATED_INACTIVE, FALSE, FALSE); + memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE); } } else { if (isProcessInAgingBands(p)) { @@ -1759,7 +2214,7 @@ memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, memorystatus_invalidate_idle_demotion_locked(p, TRUE); if (effective_now) { - if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE) { + if (p->p_memstat_effectivepriority == jetsam_prio) { memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE); } } else { @@ -2122,21 +2577,48 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute. 
*/ - if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) { - priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; - + if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { + /* + * 2 types of processes can use the non-standard elevated inactive band: + * - Frozen processes that always land in memorystatus_freeze_jetsam_band + * OR + * - processes that specifically opt-in to the elevated inactive support e.g. docked processes. + */ +#if CONFIG_FREEZE + if (p->p_memstat_state & P_MEMSTAT_FROZEN) { + if (priority <= memorystatus_freeze_jetsam_band) { + priority = memorystatus_freeze_jetsam_band; + } + } else +#endif /* CONFIG_FREEZE */ + { + if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) { + priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; + } + } assert(! (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS)); } } else if (isApp(p)) { /* * Check to see if the application is being lowered in jetsam priority. If so, and: - * - it has an 'elevated inactive jetsam band' attribute, then put it in the JETSAM_PRIORITY_ELEVATED_INACTIVE band. + * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band. * - it is a normal application, then let it age in the aging band if that policy is in effect. 
*/ - if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) { - priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; + if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) { +#if CONFIG_FREEZE + if (p->p_memstat_state & P_MEMSTAT_FROZEN) { + if (priority <= memorystatus_freeze_jetsam_band) { + priority = memorystatus_freeze_jetsam_band; + } + } else +#endif /* CONFIG_FREEZE */ + { + if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) { + priority = JETSAM_PRIORITY_ELEVATED_INACTIVE; + } + } } else { if (applications_aging_band) { @@ -2259,6 +2741,15 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser if (now > p->p_memstat_idle_start) { p->p_memstat_idle_delta = now - p->p_memstat_idle_start; } + + /* + * About to become active and so memory footprint could change. + * So mark it eligible for freeze-considerations next time around. + */ + if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) { + p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE; + } + } else if (priority == JETSAM_PRIORITY_IDLE) { /* * Transitioning into the idle priority bucket. 
@@ -2267,6 +2758,8 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser p->p_memstat_idle_start = mach_absolute_time(); } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0); + p->p_memstat_effectivepriority = priority; #if CONFIG_SECLUDED_MEMORY @@ -2536,11 +3029,18 @@ memorystatus_remove(proc_t p, boolean_t locked) #if CONFIG_FREEZE if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) { + + if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { + p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count--; + } + memorystatus_frozen_count--; + memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages; + p->p_memstat_freeze_sharedanon_pages = 0; } if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) { - memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint; memorystatus_suspended_count--; } #endif @@ -2587,8 +3087,15 @@ memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) { return EINVAL; } - /* Deferral is only relevant if idle exit is specified */ + /* Only one type of DEFER behavior is allowed.*/ if ((pcontrol & PROC_DIRTY_DEFER) && + (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) { + return EINVAL; + } + + /* Deferral is only relevant if idle exit is specified */ + if (((pcontrol & PROC_DIRTY_DEFER) || + (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) && !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) { return EINVAL; } @@ -2714,12 +3221,18 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) { /* This can be set and cleared exactly once. 
*/ - if (pcontrol & PROC_DIRTY_DEFER) { + if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) { - if ( !(old_dirty & P_DIRTY_DEFER)) { + if ((pcontrol & (PROC_DIRTY_DEFER)) && + !(old_dirty & P_DIRTY_DEFER)) { p->p_memstat_dirty |= P_DIRTY_DEFER; } + if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) && + !(old_dirty & P_DIRTY_DEFER_ALWAYS)) { + p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS; + } + defer_now = TRUE; } @@ -2901,6 +3414,8 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { /* * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back * there once it's clean again. For the legacy case, this only applies if it has some protection window left. + * P_DIRTY_DEFER: one-time protection window given at launch + * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode. * * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over * in that band on it's way to IDLE. @@ -2924,9 +3439,11 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { */ if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) { - if (mach_absolute_time() >= p->p_memstat_idledeadline) { + if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) && + (mach_absolute_time() >= p->p_memstat_idledeadline)) { /* - * The process' deadline has expired. It currently + * The process' hasn't enrolled in the "always defer after dirty" + * mode and its deadline has expired. It currently * does not reside in any of the aging buckets. 
* * It's on its way to the JETSAM_PRIORITY_IDLE @@ -2942,12 +3459,16 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) { reschedule = TRUE; } else { /* - * It still has some protection window left and so + * Process enrolled in "always stop in deferral band after dirty" OR + * it still has some protection window left and so * we just re-arm the timer without modifying any * state on the process iff it still wants into that band. */ - if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) { + if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) { + memorystatus_schedule_idle_demotion_locked(p, TRUE); + reschedule = TRUE; + } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) { memorystatus_schedule_idle_demotion_locked(p, FALSE); reschedule = TRUE; } @@ -3077,7 +3598,7 @@ memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) { goto exit; } - if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) { + if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) { ret = EINVAL; goto exit; } @@ -3087,16 +3608,19 @@ memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) { } /* This can be set and cleared exactly once. 
*/ - if (pcontrol & PROC_DIRTY_DEFER) { + if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) { - if (p->p_memstat_dirty & P_DIRTY_DEFER) { + if (p->p_memstat_dirty & P_DIRTY_DEFER) { + p->p_memstat_dirty &= ~(P_DIRTY_DEFER); + } - p->p_memstat_dirty &= ~P_DIRTY_DEFER; + if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) { + p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS); + } - memorystatus_invalidate_idle_demotion_locked(p, TRUE); - memorystatus_update_idle_priority_locked(p); - memorystatus_reschedule_idle_demotion_locked(); - } + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + memorystatus_update_idle_priority_locked(p); + memorystatus_reschedule_idle_demotion_locked(); } ret = 0; @@ -3156,12 +3680,10 @@ memorystatus_on_suspend(proc_t p) { #if CONFIG_FREEZE uint32_t pages; - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL); + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); #endif proc_list_lock(); #if CONFIG_FREEZE - p->p_memstat_suspendedfootprint = pages; - memorystatus_suspended_footprint_total += pages; memorystatus_suspended_count++; #endif p->p_memstat_state |= P_MEMSTAT_SUSPENDED; @@ -3181,17 +3703,36 @@ memorystatus_on_resume(proc_t p) #if CONFIG_FREEZE frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN); if (frozen) { - memorystatus_frozen_count--; - p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW; + /* + * Now that we don't _thaw_ a process completely, + * resuming it (and having some on-demand swapins) + * shouldn't preclude it from being counted as frozen. + * + * memorystatus_frozen_count--; + * + * We preserve the P_MEMSTAT_FROZEN state since the process + * could have state on disk AND so will deserve some protection + * in the jetsam bands. 
+ */ + if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) { + p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count++; + } + p->p_memstat_thaw_count++; + + memorystatus_thaw_count++; } - memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint; memorystatus_suspended_count--; pid = p->p_pid; #endif - p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN); + /* + * P_MEMSTAT_FROZEN will remain unchanged. This used to be: + * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN); + */ + p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED; proc_list_unlock(); @@ -3227,7 +3768,7 @@ memorystatus_build_state(proc_t p) { if (p->p_memstat_state & P_MEMSTAT_FROZEN) { snapshot_state |= kMemorystatusFrozen; } - if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) { + if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { snapshot_state |= kMemorystatusWasThawed; } @@ -3296,19 +3837,49 @@ kill_idle_exit_proc(void) } static void -memorystatus_thread_wake(void) { - thread_wakeup((event_t)&memorystatus_wakeup); +memorystatus_thread_wake(void) +{ + int thr_id = 0; + int active_thr = atomic_load(&active_jetsam_threads); + + /* Wakeup all the jetsam threads */ + for (thr_id = 0; thr_id < active_thr; thr_id++) { + thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup); + } +} + +#if CONFIG_JETSAM + +static void +memorystatus_thread_pool_max() +{ + /* Increase the jetsam thread pool to max_jetsam_threads */ + int max_threads = max_jetsam_threads; + printf("Expanding memorystatus pool to %d!\n", max_threads); + atomic_store(&active_jetsam_threads, max_threads); +} + +static void +memorystatus_thread_pool_default() +{ + /* Restore the jetsam thread pool to a single thread */ + printf("Reverting memorystatus pool back to 1\n"); + atomic_store(&active_jetsam_threads, 1); } +#endif /* CONFIG_JETSAM */ + extern void vm_pressure_response(void); static int memorystatus_thread_block(uint32_t interval_ms, 
thread_continue_t continuation) { + struct jetsam_thread_state *jetsam_thread = jetsam_current_thread(); + if (interval_ms) { - assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC); + assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC); } else { - assert_wait(&memorystatus_wakeup, THREAD_UNINT); + assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT); } return thread_block(continuation); @@ -3380,6 +3951,192 @@ memorystatus_action_needed(void) #endif /* CONFIG_EMBEDDED */ } +#if CONFIG_FREEZE +extern void vm_swap_consider_defragmenting(int); + +/* + * This routine will _jetsam_ all frozen processes + * and reclaim the swap space immediately. + * + * So freeze has to be DISABLED when we call this routine. + */ + +void +memorystatus_disable_freeze(void) +{ + memstat_bucket_t *bucket; + int bucket_count = 0, retries = 0; + boolean_t retval = FALSE, killed = FALSE; + uint32_t errors = 0, errors_over_prev_iteration = 0; + os_reason_t jetsam_reason = 0; + unsigned int band = 0; + proc_t p = PROC_NULL, next_p = PROC_NULL; + + assert(memorystatus_freeze_enabled == FALSE); + + jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE); + if (jetsam_reason == OS_REASON_NULL) { + printf("memorystatus_disable_freeze: failed to allocate jetsam reason\n"); + } + + /* + * Let's relocate all frozen processes into band 8. Demoted frozen processes + * are sitting in band 0 currently and it's possible to have a frozen process + * in the FG band being actively used. We don't reset its frozen state when + * it is resumed because it has state on disk. + * + * We choose to do this relocation rather than implement a new 'kill frozen' + * process function for these reasons: + * - duplication of code: too many kill functions exist and we need to rework them better. 
+ * - disk-space-shortage kills are rare + * - not having the 'real' jetsam band at time of the this frozen kill won't preclude us + * from answering any imp. questions re. jetsam policy/effectiveness. + * + * This is essentially what memorystatus_update_inactive_jetsam_priority_band() does while + * avoiding the application of memory limits. + */ + +again: + proc_list_lock(); + + band = JETSAM_PRIORITY_IDLE; + p = PROC_NULL; + next_p = PROC_NULL; + + next_p = memorystatus_get_first_proc_locked(&band, TRUE); + while (next_p) { + + p = next_p; + next_p = memorystatus_get_next_proc_locked(&band, p, TRUE); + + if (p->p_memstat_effectivepriority > JETSAM_PRIORITY_FOREGROUND) { + break; + } + + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { + continue; + } + + if (p->p_memstat_state & P_MEMSTAT_ERROR) { + p->p_memstat_state &= ~P_MEMSTAT_ERROR; + } + + if (p->p_memstat_effectivepriority == memorystatus_freeze_jetsam_band) { + continue; + } + + /* + * We explicitly add this flag here so the process looks like a normal + * frozen process i.e. P_MEMSTAT_FROZEN and P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND. + * We don't bother with assigning the 'active' memory + * limits at this point because we are going to be killing it soon below. + */ + p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + + memorystatus_update_priority_locked(p, memorystatus_freeze_jetsam_band, FALSE, TRUE); + } + + bucket = &memstat_bucket[memorystatus_freeze_jetsam_band]; + bucket_count = bucket->count; + proc_list_unlock(); + + /* + * Bucket count is already stale at this point. But, we don't expect + * freezing to continue since we have already disabled the freeze functionality. + * However, an existing freeze might be in progress. So we might miss that process + * in the first go-around. We hope to catch it in the next. 
+ */ + + errors_over_prev_iteration = 0; + while (bucket_count) { + + bucket_count--; + + /* + * memorystatus_kill_elevated_process() drops a reference, + * so take another one so we can continue to use this exit reason + * even after it returns. + */ + + os_reason_ref(jetsam_reason); + retval = memorystatus_kill_elevated_process( + kMemorystatusKilledDiskSpaceShortage, + jetsam_reason, + memorystatus_freeze_jetsam_band, + 0, /* the iteration of aggressive jetsam..ignored here */ + &errors); + + if (errors > 0) { + printf("memorystatus_disable_freeze: memorystatus_kill_elevated_process returned %d error(s)\n", errors); + errors_over_prev_iteration += errors; + errors = 0; + } + + if (retval == 0) { + /* + * No frozen processes left to kill. + */ + break; + } + + killed = TRUE; + } + + proc_list_lock(); + + if (memorystatus_frozen_count) { + /* + * A frozen process snuck in and so + * go back around to kill it. That + * process may have been resumed and + * put into the FG band too. So we + * have to do the relocation again. 
+ */ + assert(memorystatus_freeze_enabled == FALSE); + + retries++; + if (retries < 3) { + proc_list_unlock(); + goto again; + } +#if DEVELOPMENT || DEBUG + panic("memorystatus_disable_freeze: Failed to kill all frozen processes, memorystatus_frozen_count = %d, errors = %d", + memorystatus_frozen_count, errors_over_prev_iteration); +#endif /* DEVELOPMENT || DEBUG */ + } + proc_list_unlock(); + + os_reason_free(jetsam_reason); + + if (killed) { + + vm_swap_consider_defragmenting(VM_SWAP_FLAGS_FORCE_DEFRAG | VM_SWAP_FLAGS_FORCE_RECLAIM); + + proc_list_lock(); + size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + + sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count); + uint64_t timestamp_now = mach_absolute_time(); + memorystatus_jetsam_snapshot->notification_time = timestamp_now; + memorystatus_jetsam_snapshot->js_gencount++; + if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 || + timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) { + proc_list_unlock(); + int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size)); + if (!ret) { + proc_list_lock(); + memorystatus_jetsam_snapshot_last_timestamp = timestamp_now; + proc_list_unlock(); + } + } else { + proc_list_unlock(); + } + } + + return; +} +#endif /* CONFIG_FREEZE */ + static boolean_t memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical) { @@ -3557,6 +4314,7 @@ memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_ killed = memorystatus_kill_elevated_process( cause, jetsam_reason, + JETSAM_PRIORITY_ELEVATED_INACTIVE, jld_eval_aggressive_count, &errors); @@ -3582,11 +4340,11 @@ memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_ /* * memorystatus_kill_top_process_aggressive() allocates its own - * 
jetsam_reason so the kMemorystatusKilledVMThrashing cause + * jetsam_reason so the kMemorystatusKilledProcThrashing cause * is consistent throughout the aggressive march. */ killed = memorystatus_kill_top_process_aggressive( - kMemorystatusKilledVMThrashing, + kMemorystatusKilledProcThrashing, jld_eval_aggressive_count, jld_priority_band_max, &errors); @@ -3609,26 +4367,31 @@ memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_ static void memorystatus_thread(void *param __unused, wait_result_t wr __unused) { - static boolean_t is_vm_privileged = FALSE; - boolean_t post_snapshot = FALSE; uint32_t errors = 0; uint32_t hwm_kill = 0; boolean_t sort_flag = TRUE; boolean_t corpse_list_purged = FALSE; int jld_idle_kills = 0; + struct jetsam_thread_state *jetsam_thread = jetsam_current_thread(); - if (is_vm_privileged == FALSE) { + if (jetsam_thread->inited == FALSE) { /* * It's the first time the thread has run, so just mark the thread as privileged and block. * This avoids a spurious pass with unset variables, as set out in . 
*/ + + char name[32]; thread_wire(host_priv_self(), current_thread(), TRUE); - is_vm_privileged = TRUE; - - if (vm_restricted_to_single_processor == TRUE) - thread_vm_bind_group_add(); - thread_set_thread_name(current_thread(), "VM_memorystatus"); + snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1); + + if (jetsam_thread->index == 0) { + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { + thread_vm_bind_group_add(); + } + } + thread_set_thread_name(current_thread(), name); + jetsam_thread->inited = TRUE; memorystatus_thread_block(0, memorystatus_thread); } @@ -3659,8 +4422,11 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused) case kMemorystatusKilledFCThrashing: jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING; break; - case kMemorystatusKilledVMThrashing: - jetsam_reason_code = JETSAM_REASON_MEMORY_VMTHRASHING; + case kMemorystatusKilledVMCompressorThrashing: + jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING; + break; + case kMemorystatusKilledVMCompressorSpaceShortage: + jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE; break; case kMemorystatusKilledZoneMapExhaustion: jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION; @@ -3863,7 +4629,7 @@ memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_ jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT); if (jetsam_reason == NULL) { printf("task_exceeded footprint: failed to allocate jetsam reason\n"); - } else if (corpse_for_fatal_memkill != 0) { + } else if (corpse_for_fatal_memkill != 0 && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) { /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */ jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT; } @@ -4254,7 +5020,6 @@ memorystatus_allowed_vm_map_fork(task_t task) #if CONFIG_EMBEDDED uint64_t footprint_in_bytes; - uint64_t purgeable_in_bytes; uint64_t max_allowed_bytes; if 
(max_task_footprint_mb == 0) { @@ -4262,17 +5027,12 @@ memorystatus_allowed_vm_map_fork(task_t task) return (is_allowed); } - purgeable_in_bytes = get_task_purgeable_size(task); footprint_in_bytes = get_task_phys_footprint(task); /* - * Maximum is half the system-wide task limit. + * Maximum is 1/4 of the system-wide task limit. */ - max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 1; - - if (footprint_in_bytes > purgeable_in_bytes) { - footprint_in_bytes -= purgeable_in_bytes; - } + max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2; if (footprint_in_bytes > max_allowed_bytes) { printf("memorystatus disallowed vm_map_fork %lld %lld\n", footprint_in_bytes, max_allowed_bytes); @@ -4287,7 +5047,7 @@ memorystatus_allowed_vm_map_fork(task_t task) } static void -memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages) +memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages) { assert(task); assert(footprint); @@ -4298,11 +5058,6 @@ memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *ma assert(((uint32_t)pages) == pages); *footprint = (uint32_t)pages; - if (max_footprint) { - pages = (get_task_phys_footprint_recent_max(task) / PAGE_SIZE_64); - assert(((uint32_t)pages) == pages); - *max_footprint = (uint32_t)pages; - } if (max_footprint_lifetime) { pages = (get_task_resident_max(task) / PAGE_SIZE_64); assert(((uint32_t)pages) == pages); @@ -4371,6 +5126,8 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, unsigned int i; + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + if (memorystatus_jetsam_snapshot_count == 0) { /* * No active snapshot. 
@@ -4411,6 +5168,11 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, entry->jse_killtime = killtime; entry->jse_gencount = snapshot->js_gencount; entry->jse_idle_delta = p->p_memstat_idle_delta; +#if CONFIG_FREEZE + entry->jse_thaw_count = p->p_memstat_thaw_count; +#else /* CONFIG_FREEZE */ + entry->jse_thaw_count = 0; +#endif /* CONFIG_FREEZE */ /* * If a process has moved between bands since snapshot was @@ -4430,13 +5192,11 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, */ uint32_t pages = 0; - uint32_t max_pages = 0; uint32_t max_pages_lifetime = 0; uint32_t purgeable_pages = 0; - memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages); + memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages); entry->pages = (uint64_t)pages; - entry->max_pages = (uint64_t)max_pages; entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; entry->purgeable_pages = (uint64_t)purgeable_pages; @@ -4546,6 +5306,25 @@ void memorystatus_pages_update(unsigned int pages_avail) memorystatus_thread_wake(); } } +#if CONFIG_FREEZE + /* + * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect + * the # of frozen processes and wakeup the freezer thread. Reason being that we come here into this + * code with (possibly) the page-queue locks held and preemption disabled. So trying to grab a mutex here + * will result in the "mutex with preemption disabled" panic. + */ + + if (memorystatus_freeze_thread_should_run() == TRUE) { + /* + * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process). + * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here. 
+ */ + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + thread_wakeup((event_t)&memorystatus_freeze_wakeup); + } + } +#endif /* CONFIG_FREEZE */ + #else /* VM_PRESSURE_EVENTS */ boolean_t critical, delta; @@ -4578,7 +5357,6 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna clock_sec_t tv_sec; clock_usec_t tv_usec; uint32_t pages = 0; - uint32_t max_pages = 0; uint32_t max_pages_lifetime = 0; uint32_t purgeable_pages = 0; uint64_t internal_pages = 0; @@ -4598,9 +5376,8 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna strlcpy(&entry->name[0], p->p_name, sizeof(entry->name)); entry->priority = p->p_memstat_effectivepriority; - memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages); + memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages); entry->pages = (uint64_t)pages; - entry->max_pages = (uint64_t)max_pages; entry->max_pages_lifetime = (uint64_t)max_pages_lifetime; entry->purgeable_pages = (uint64_t)purgeable_pages; @@ -4627,8 +5404,8 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna entry->fds = p->p_fd->fd_nfiles; absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec); - entry->cpu_time.tv_sec = tv_sec; - entry->cpu_time.tv_usec = tv_usec; + entry->cpu_time.tv_sec = (int64_t)tv_sec; + entry->cpu_time.tv_usec = (int64_t)tv_usec; assert(p->p_stats != NULL); entry->jse_starttime = p->p_stats->ps_start; /* abstime process started */ @@ -4638,6 +5415,12 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */ +#if CONFIG_FREEZE + entry->jse_thaw_count = p->p_memstat_thaw_count; +#else /* CONFIG_FREEZE */ + entry->jse_thaw_count = 0; +#endif /* CONFIG_FREEZE */ + proc_coalitionids(p, cids); entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM]; @@ 
-4699,6 +5482,8 @@ memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snap memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL; unsigned int snapshot_max = 0; + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + if (od_snapshot) { /* * This is an on_demand snapshot @@ -4815,7 +5600,7 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool pid_t aPid = 0; uint32_t aPid_ep = 0; - uint64_t killtime = 0; + uint64_t killtime = 0; clock_sec_t tv_sec; clock_usec_t tv_usec; uint32_t tv_msec; @@ -4891,7 +5676,11 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool } } - memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime); + proc_list_lock(); + /* This diagnostic code is going away soon. Ignore the kMemorystatusInvalid cause here. */ + memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusInvalid, killtime); + proc_list_unlock(); + p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED; if (p) { @@ -4901,7 +5690,9 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool } else #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */ { + proc_list_lock(); memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime); + proc_list_unlock(); char kill_reason_string[128]; @@ -4977,7 +5768,7 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause * * kMemorystatusKilledZoneMapExhaustion * AND - * kMemorystatusKilledVMThrashing + * kMemorystatusKilledVMCompressorSpaceShortage * * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider * any and all processes as eligible kill candidates since we need to avoid a panic. 
@@ -5047,7 +5838,7 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause #if CONFIG_FREEZE boolean_t skip; - boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM)); + boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED); if (any || reclaim_proc) { skip = FALSE; } else { @@ -5068,7 +5859,6 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause */ p->p_memstat_state |= P_MEMSTAT_TERMINATED; - proc_list_unlock(); } else { /* * We need to restart the search again because @@ -5095,6 +5885,8 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause new_snapshot = TRUE; } + proc_list_unlock(); + freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed); /* purged and/or killed 'p' */ /* Success? */ if (freed_mem) { @@ -5369,7 +6161,9 @@ memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, /* Clear snapshot if freshly captured and no target was found */ if (new_snapshot && (kill_count == 0)) { + proc_list_lock(); memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; + proc_list_unlock(); } KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END, @@ -5531,13 +6325,12 @@ memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged) * false -- no pinned process was jetsammed */ static boolean_t -memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors) +memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors) { pid_t aPid = 0; proc_t p = PROC_NULL, next_p = PROC_NULL; boolean_t new_snapshot = FALSE, killed = FALSE; int kill_count = 0; - unsigned int i = JETSAM_PRIORITY_ELEVATED_INACTIVE; uint32_t aPid_ep; uint64_t killtime = 0; clock_sec_t tv_sec; @@ -5548,13 +6341,21 @@ memorystatus_kill_elevated_process(uint32_t cause, 
os_reason_t jetsam_reason, in KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START, memorystatus_available_pages, 0, 0, 0, 0); +#if CONFIG_FREEZE + boolean_t consider_frozen_only = FALSE; + + if (band == (unsigned int) memorystatus_freeze_jetsam_band) { + consider_frozen_only = TRUE; + } +#endif /* CONFIG_FREEZE */ + proc_list_lock(); - next_p = memorystatus_get_first_proc_locked(&i, FALSE); + next_p = memorystatus_get_first_proc_locked(&band, FALSE); while (next_p) { p = next_p; - next_p = memorystatus_get_next_proc_locked(&i, p, FALSE); + next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); aPid = p->p_pid; aPid_ep = p->p_memstat_effectivepriority; @@ -5571,10 +6372,14 @@ memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, in } #if CONFIG_FREEZE + if (consider_frozen_only && ! (p->p_memstat_state & P_MEMSTAT_FROZEN)) { + continue; + } + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { continue; } -#endif +#endif /* CONFIG_FREEZE */ #if DEVELOPMENT || DEBUG MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n", @@ -5644,7 +6449,7 @@ memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, in * And, we hold the the proc_list_lock at this point. */ - next_p = memorystatus_get_first_proc_locked(&i, FALSE); + next_p = memorystatus_get_first_proc_locked(&band, FALSE); } proc_list_unlock(); @@ -5673,8 +6478,12 @@ memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) { * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to * add the appropriate exit reason code mapping. 
*/ - if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing && - cause != kMemorystatusKilledFCThrashing && cause != kMemorystatusKilledZoneMapExhaustion)) { + if ((victim_pid != -1) || + (cause != kMemorystatusKilledVMPageShortage && + cause != kMemorystatusKilledVMCompressorThrashing && + cause != kMemorystatusKilledVMCompressorSpaceShortage && + cause != kMemorystatusKilledFCThrashing && + cause != kMemorystatusKilledZoneMapExhaustion)) { return FALSE; } @@ -5684,20 +6493,34 @@ memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) { } boolean_t -memorystatus_kill_on_VM_thrashing(boolean_t async) { +memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async) { if (async) { - return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing); + return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage); } else { - os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMTHRASHING); + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE); if (jetsam_reason == OS_REASON_NULL) { - printf("memorystatus_kill_on_VM_thrashing -- sync: failed to allocate jetsam reason\n"); + printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n"); } - return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing, jetsam_reason); + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason); } } #if CONFIG_JETSAM +boolean_t +memorystatus_kill_on_VM_compressor_thrashing(boolean_t async) { + if (async) { + return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing); + } else { + os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING); + if (jetsam_reason == OS_REASON_NULL) { + 
printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n"); + } + + return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason); + } +} + boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async) { if (async) { @@ -5768,18 +6591,89 @@ memorystatus_freeze_init(void) freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr); lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL); - + + /* + * This is just the default value if the underlying + * storage device doesn't have any specific budget. + * We check with the storage layer in memorystatus_freeze_update_throttle() + * before we start our freezing the first time. + */ + memorystatus_freeze_budget_pages_remaining = (memorystatus_freeze_daily_mb_max * 1024 * 1024) / PAGE_SIZE; + result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread); if (result == KERN_SUCCESS) { + + proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2); + proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + thread_set_thread_name(thread, "VM_freezer"); + thread_deallocate(thread); } else { panic("Could not create memorystatus_freeze_thread"); } } +static boolean_t +memorystatus_is_process_eligible_for_freeze(proc_t p) +{ + /* + * Called with proc_list_lock held. 
+ */ + + LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED); + + boolean_t should_freeze = FALSE; + uint32_t state = 0, entry_count = 0, pages = 0, i = 0; + int probability_of_use = 0; + + if (isApp(p) == FALSE) { + goto out; + } + + state = p->p_memstat_state; + + if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) || + !(state & P_MEMSTAT_SUSPENDED)) { + goto out; + } + + /* Only freeze processes meeting our minimum resident page criteria */ + memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL); + if (pages < memorystatus_freeze_pages_min) { + goto out; + } + + entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t)); + + if (entry_count) { + + for (i=0; i < entry_count; i++ ) { + if (strncmp(memorystatus_global_probabilities_table[i].proc_name, + p->p_name, + MAXCOMLEN + 1) == 0) { + + probability_of_use = memorystatus_global_probabilities_table[i].use_probability; + break; + } + } + + if (probability_of_use == 0) { + goto out; + } + } + + should_freeze = TRUE; +out: + return should_freeze; +} + /* * Synchronously freeze the passed proc. Called with a reference to the proc held. * + * Doesn't deal with re-freezing because this is called on a specific process and + * not by the freezer thread. If that changes, we'll have to teach it about + * refreezing a frozen process. + * * Returns EINVAL or the value returned by task_freeze(). 
*/ int @@ -5788,69 +6682,49 @@ memorystatus_freeze_process_sync(proc_t p) int ret = EINVAL; pid_t aPid = 0; boolean_t memorystatus_freeze_swap_low = FALSE; - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, - memorystatus_available_pages, 0, 0, 0, 0); + int freezer_error_code = 0; lck_mtx_lock(&freezer_mutex); if (p == NULL) { + printf("memorystatus_freeze_process_sync: Invalid process\n"); goto exit; } if (memorystatus_freeze_enabled == FALSE) { + printf("memorystatus_freeze_process_sync: Freezing is DISABLED\n"); goto exit; } if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) { + printf("memorystatus_freeze_process_sync: Low compressor and/or low swap space...skipping freeze\n"); goto exit; } - if (memorystatus_freeze_update_throttle()) { - printf("memorystatus_freeze_process_sync: in throttle, ignorning freeze\n"); - memorystatus_freeze_throttle_count++; + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + if (!memorystatus_freeze_budget_pages_remaining) { + printf("memorystatus_freeze_process_sync: exit with NO available budget\n"); goto exit; } proc_list_lock(); if (p != NULL) { - uint32_t purgeable, wired, clean, dirty, state; - uint32_t max_pages, pages, i; - boolean_t shared; + uint32_t purgeable, wired, clean, dirty, shared; + uint32_t max_pages, i; aPid = p->p_pid; - state = p->p_memstat_state; /* Ensure the process is eligible for freezing */ - if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) { - proc_list_unlock(); - goto exit; - } - - /* Only freeze processes meeting our minimum resident page criteria */ - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL); - if (pages < memorystatus_freeze_pages_min) { + if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) { proc_list_unlock(); goto exit; } if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - unsigned int avail_swap_space = 0; /* in pages. 
*/ - - /* - * Freezer backed by the compressor and swap file(s) - * while will hold compressed data. - */ - avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; - - max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max); + max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining); - if (max_pages < memorystatus_freeze_pages_min) { - proc_list_unlock(); - goto exit; - } } else { /* * We only have the compressor without any swap. @@ -5862,7 +6736,13 @@ memorystatus_freeze_process_sync(proc_t p) p->p_memstat_state |= P_MEMSTAT_LOCKED; proc_list_unlock(); - ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); + + ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, + memorystatus_available_pages, aPid, 0, 0, 0); DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty); @@ -5872,99 +6752,188 @@ memorystatus_freeze_process_sync(proc_t p) memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); proc_list_lock(); - p->p_memstat_state &= ~P_MEMSTAT_LOCKED; if (ret == KERN_SUCCESS) { + + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...done", + aPid, (*p->p_name ? p->p_name : "unknown")); + memorystatus_freeze_entry_t data = { aPid, TRUE, dirty }; - memorystatus_frozen_count++; + p->p_memstat_freeze_sharedanon_pages += shared; - p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 
0: P_MEMSTAT_NORECLAIM)); + memorystatus_frozen_shared_mb += shared; - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - /* Update stats */ - for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { - throttle_intervals[i].pageouts += dirty; - } + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { + p->p_memstat_state |= P_MEMSTAT_FROZEN; + memorystatus_frozen_count++; } - memorystatus_freeze_pageouts += dirty; - memorystatus_freeze_count++; + p->p_memstat_frozen_count++; + /* + * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process + * to its higher jetsam band. + */ proc_list_unlock(); memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + + ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, + memorystatus_freeze_jetsam_band, TRUE); + + if (ret) { + printf("Elevating the frozen process failed with %d\n", ret); + /* not fatal */ + ret = 0; + } + + proc_list_lock(); + + /* Update stats */ + for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { + throttle_intervals[i].pageouts += dirty; + } + } else { + proc_list_lock(); + } + + memorystatus_freeze_pageouts += dirty; + + if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) { + /* + * Add some eviction logic here? At some point should we + * jetsam a process to get back its swap space so that we + * can freeze a more eligible process at this moment in time? 
+ */ + } } else { - proc_list_unlock(); + char reason[128]; + if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + strlcpy(reason, "too much shared memory", 128); + } + + if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + strlcpy(reason, "low private-shared pages ratio", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + strlcpy(reason, "no compressor space", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + strlcpy(reason, "no swap space", 128); + } + + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...skipped (%s)", + aPid, (*p->p_name ? p->p_name : "unknown"), reason); + p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE; } + + p->p_memstat_state &= ~P_MEMSTAT_LOCKED; + proc_list_unlock(); } exit: lck_mtx_unlock(&freezer_mutex); - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, - memorystatus_available_pages, aPid, 0, 0, 0); return ret; } static int -memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low) +memorystatus_freeze_top_process(void) { pid_t aPid = 0; int ret = -1; proc_t p = PROC_NULL, next_p = PROC_NULL; unsigned int i = 0; - - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, - memorystatus_available_pages, 0, 0, 0, 0); + unsigned int band = JETSAM_PRIORITY_IDLE; + boolean_t refreeze_processes = FALSE; proc_list_lock(); - - next_p = memorystatus_get_first_proc_locked(&i, TRUE); + + if (memorystatus_frozen_count >= memorystatus_frozen_processes_max) { + /* + * Freezer is already full but we are here and so let's + * try to refreeze any processes we might have thawed + * in the past and push out their compressed state out. 
+ */ + refreeze_processes = TRUE; + band = (unsigned int) memorystatus_freeze_jetsam_band; + } + + freeze_process: + + next_p = memorystatus_get_first_proc_locked(&band, FALSE); while (next_p) { kern_return_t kr; - uint32_t purgeable, wired, clean, dirty; - boolean_t shared; - uint32_t pages; + uint32_t purgeable, wired, clean, dirty, shared; uint32_t max_pages = 0; - uint32_t state; + int freezer_error_code = 0; p = next_p; - next_p = memorystatus_get_next_proc_locked(&i, p, TRUE); + next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); aPid = p->p_pid; - state = p->p_memstat_state; - /* Ensure the process is eligible for freezing */ - if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) { - continue; // with lock held + if (p->p_memstat_effectivepriority != (int32_t) band) { + /* + * We shouldn't be freezing processes outside the + * prescribed band. + */ + break; } - - /* Only freeze processes meeting our minimum resident page criteria */ - memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL); - if (pages < memorystatus_freeze_pages_min) { - continue; // with lock held - } - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* Ensure the process is eligible for (re-)freezing */ + if (refreeze_processes) { + /* + * Has to have been frozen once before. + */ + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { + continue; + } + + /* + * Has to have been resumed once before. + */ + if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == FALSE) { + continue; + } + + /* + * Not currently being looked at for something. + */ + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + continue; + } - /* Ensure there's enough free space to freeze this process. */ + /* + * We are going to try and refreeze and so re-evaluate + * the process. We don't want to double count the shared + * memory. So deduct the old snapshot here. 
+ */ + memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages; + p->p_memstat_freeze_sharedanon_pages = 0; - unsigned int avail_swap_space = 0; /* in pages. */ + p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count--; + } else { + if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) { + continue; // with lock held + } + } + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { /* * Freezer backed by the compressor and swap file(s) - * while will hold compressed data. + * will hold compressed data. */ - avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64; - max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max); + max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining); - if (max_pages < memorystatus_freeze_pages_min) { - *memorystatus_freeze_swap_low = TRUE; - proc_list_unlock(); - goto exit; - } } else { /* * We only have the compressor pool. @@ -5976,60 +6945,174 @@ memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low) p->p_memstat_state |= P_MEMSTAT_LOCKED; p = proc_ref_locked(p); - proc_list_unlock(); if (!p) { - goto exit; + break; } - - kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE); + + proc_list_unlock(); + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START, + memorystatus_available_pages, 0, 0, 0, 0); + + kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */); + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, + memorystatus_available_pages, aPid, 0, 0, 0); + MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - " "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? 
p->p_name : "(unknown)"), memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared); proc_list_lock(); - p->p_memstat_state &= ~P_MEMSTAT_LOCKED; /* Success? */ if (KERN_SUCCESS == kr) { + + if (refreeze_processes) { + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Refreezing (general) pid %d [%s]...done", + aPid, (*p->p_name ? p->p_name : "unknown")); + } else { + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...done", + aPid, (*p->p_name ? p->p_name : "unknown")); + } + memorystatus_freeze_entry_t data = { aPid, TRUE, dirty }; - memorystatus_frozen_count++; - - p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM)); + p->p_memstat_freeze_sharedanon_pages += shared; + + memorystatus_frozen_shared_mb += shared; + + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) { + p->p_memstat_state |= P_MEMSTAT_FROZEN; + memorystatus_frozen_count++; + } + + p->p_memstat_frozen_count++; + + /* + * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process + * to its higher jetsam band. 
+ */ + proc_list_unlock(); + + memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + + ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, memorystatus_freeze_jetsam_band, TRUE); + + if (ret) { + printf("Elevating the frozen process failed with %d\n", ret); + /* not fatal */ + ret = 0; + } + + proc_list_lock(); + /* Update stats */ for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { throttle_intervals[i].pageouts += dirty; } + } else { + proc_list_lock(); } memorystatus_freeze_pageouts += dirty; - memorystatus_freeze_count++; - - proc_list_unlock(); - memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data)); + if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) { + /* + * Add some eviction logic here? At some point should we + * jetsam a process to get back its swap space so that we + * can freeze a more eligible process at this moment in time? + */ + } - /* Return KERN_SUCESS */ + /* Return KERN_SUCCESS */ ret = kr; + p->p_memstat_state &= ~P_MEMSTAT_LOCKED; + proc_rele_locked(p); + + /* + * We froze a process successfully. We can stop now + * and see if that helped. + */ + + break; } else { - proc_list_unlock(); + + p->p_memstat_state &= ~P_MEMSTAT_LOCKED; + + if (refreeze_processes == TRUE) { + if ((freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) || + (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO)) { + /* + * Keeping this prior-frozen process in this high band when + * we failed to re-freeze it due to bad shared memory usage + * could cause excessive pressure on the lower bands. + * We need to demote it for now. It'll get re-evaluated next + * time because we don't set the P_MEMSTAT_FREEZE_IGNORE + * bit. 
+ */ + + p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE); + } + } else { + p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE; + } + + proc_rele_locked(p); + + char reason[128]; + if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) { + strlcpy(reason, "too much shared memory", 128); + } + + if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) { + strlcpy(reason, "low private-shared pages ratio", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) { + strlcpy(reason, "no compressor space", 128); + } + + if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) { + strlcpy(reason, "no swap space", 128); + } + + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...skipped (%s)", + aPid, (*p->p_name ? p->p_name : "unknown"), reason); + + if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { + break; + } } - - proc_rele(p); - goto exit; + } + + if ((ret == -1) && + (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD) && + (refreeze_processes == FALSE)) { + /* + * We failed to freeze a process from the IDLE + * band AND we have some thawed processes + * AND haven't tried refreezing as yet. + * Let's try and re-freeze processes in the + * frozen band that have been resumed in the past + * and so have brought in state from disk. 
+ */ + + band = (unsigned int) memorystatus_freeze_jetsam_band; + + refreeze_processes = TRUE; + + goto freeze_process; } proc_list_unlock(); -exit: - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END, - memorystatus_available_pages, aPid, 0, 0, 0); - return ret; } @@ -6041,22 +7124,8 @@ memorystatus_can_freeze_processes(void) proc_list_lock(); if (memorystatus_suspended_count) { - uint32_t average_resident_pages, estimated_processes; - - /* Estimate the number of suspended processes we can fit */ - average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count; - estimated_processes = memorystatus_suspended_count + - ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages); - - /* If it's predicted that no freeze will occur, lower the threshold temporarily */ - if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) { - memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW; - } else { - memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT; - } - MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n", - memorystatus_suspended_count, average_resident_pages, estimated_processes); + memorystatus_freeze_suspended_threshold = MIN(memorystatus_freeze_suspended_threshold, FREEZE_SUSPENDED_THRESHOLD_DEFAULT); if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) { ret = TRUE; @@ -6126,67 +7195,254 @@ memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low) return can_freeze; } +/* + * This function evaluates if the currently frozen processes deserve + * to stay in the higher jetsam band. If the # of thaws of a process + * is below our threshold, then we will demote that process into the IDLE + * band and put it at the head. 
We don't immediately kill the process here + * because it already has state on disk and so it might be worth giving + * it another shot at getting thawed/resumed and used. + */ static void -memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval) +memorystatus_demote_frozen_processes(void) { - unsigned int freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE); - if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) { - if (!interval->max_pageouts) { - interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / (24 * 60))); - } else { - printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins); + unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band; + unsigned int demoted_proc_count = 0; + proc_t p = PROC_NULL, next_p = PROC_NULL; + + proc_list_lock(); + + if (memorystatus_freeze_enabled == FALSE) { + /* + * Freeze has been disabled likely to + * reclaim swap space. So don't change + * any state on the frozen processes. 
+ */ + proc_list_unlock(); + return; + } + + next_p = memorystatus_get_first_proc_locked(&band, FALSE); + while (next_p) { + + p = next_p; + next_p = memorystatus_get_next_proc_locked(&band, p, FALSE); + + if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) { + continue; } - interval->ts.tv_sec = interval->mins * 60; - interval->ts.tv_nsec = 0; - ADD_MACH_TIMESPEC(&interval->ts, ts); - /* Since we update the throttle stats pre-freeze, adjust for overshoot here */ - if (interval->pageouts > interval->max_pageouts) { - interval->pageouts -= interval->max_pageouts; - } else { - interval->pageouts = 0; + + if (p->p_memstat_state & P_MEMSTAT_LOCKED) { + continue; } - interval->throttle = FALSE; - } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) { - printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins); - interval->throttle = TRUE; - } - MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n", - interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60, - interval->throttle ? "on" : "off"); + if (p->p_memstat_thaw_count < memorystatus_thaw_count_demotion_threshold) { + p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND; + memorystatus_invalidate_idle_demotion_locked(p, TRUE); + + memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE); +#if DEVELOPMENT || DEBUG + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus_demote_frozen_process pid %d [%s]", + p->p_pid, (*p->p_name ? p->p_name : "unknown")); +#endif /* DEVELOPMENT || DEBUG */ + + /* + * The freezer thread will consider this a normal app to be frozen + * because it is in the IDLE band. So we don't need the + * P_MEMSTAT_REFREEZE_ELIGIBLE state here. Also, if it gets resumed + * we'll correctly count it as eligible for re-freeze again. 
+ * + * We don't drop the frozen count because this process still has + * state on disk. So there's a chance it gets resumed and then it + * should land in the higher jetsam band. For that it needs to + * remain marked frozen. + */ + if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) { + p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE; + memorystatus_refreeze_eligible_count--; + } + + demoted_proc_count++; + } + + if (demoted_proc_count == memorystatus_max_frozen_demotions_daily) { + break; + } + } + + memorystatus_thaw_count = 0; + proc_list_unlock(); } -static boolean_t -memorystatus_freeze_update_throttle(void) + +/* + * This function will do 4 things: + * + * 1) check to see if we are currently in a degraded freezer mode, and if so: + * - check to see if our window has expired and we should exit this mode, OR, + * - return a budget based on the degraded throttle window's max. pageouts vs current pageouts. + * + * 2) check to see if we are in a NEW normal window and update the normal throttle window's params. + * + * 3) check what the current normal window allows for a budget. + * + * 4) calculate the current rate of pageouts for DEGRADED_WINDOW_MINS duration. If that rate is below + * what we would normally expect, then we are running low on our daily budget and need to enter + * degraded perf. mode. + */ + +static void +memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed) { clock_sec_t sec; clock_nsec_t nsec; mach_timespec_t ts; - uint32_t i; - boolean_t throttled = FALSE; + + unsigned int freeze_daily_pageouts_max = 0; #if DEVELOPMENT || DEBUG - if (!memorystatus_freeze_throttle_enabled) - return FALSE; + if (!memorystatus_freeze_throttle_enabled) { + /* + * No throttling...we can use the full budget everytime. 
+ */ + *budget_pages_allowed = UINT64_MAX; + return; + } #endif clock_get_system_nanotime(&sec, &nsec); ts.tv_sec = sec; ts.tv_nsec = nsec; - - /* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget. - * - * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has - * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in - * order to allow for bursts of activity. - */ - for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { - memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]); - if (throttle_intervals[i].throttle == TRUE) - throttled = TRUE; - } - return throttled; + struct throttle_interval_t *interval = NULL; + + if (memorystatus_freeze_degradation == TRUE) { + + interval = degraded_throttle_window; + + if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { + memorystatus_freeze_degradation = FALSE; + interval->pageouts = 0; + interval->max_pageouts = 0; + + } else { + *budget_pages_allowed = interval->max_pageouts - interval->pageouts; + } + } + + interval = normal_throttle_window; + + if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) { + /* + * New throttle window. + * Rollover any unused budget. + * Also ask the storage layer what the new budget needs to be. 
+ */ + uint64_t freeze_daily_budget = 0; + unsigned int daily_budget_pageouts = 0; + + if (vm_swap_max_budget(&freeze_daily_budget)) { + memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024)); + os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max); + } + + freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE); + + daily_budget_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS)); + interval->max_pageouts = (interval->max_pageouts - interval->pageouts) + daily_budget_pageouts; + + interval->ts.tv_sec = interval->mins * 60; + interval->ts.tv_nsec = 0; + ADD_MACH_TIMESPEC(&interval->ts, &ts); + /* Since we update the throttle stats pre-freeze, adjust for overshoot here */ + if (interval->pageouts > interval->max_pageouts) { + interval->pageouts -= interval->max_pageouts; + } else { + interval->pageouts = 0; + } + *budget_pages_allowed = interval->max_pageouts; + + memorystatus_demote_frozen_processes(); + + } else { + /* + * Current throttle window. + * Deny freezing if we have no budget left. + * Try graceful degradation if we are within 25% of: + * - the daily budget, and + * - the current budget left is below our normal budget expectations. + */ + +#if DEVELOPMENT || DEBUG + /* + * This can only happen in the INTERNAL configs because we allow modifying the daily budget for testing. + */ + + if (freeze_daily_pageouts_max > interval->max_pageouts) { + /* + * We just bumped the daily budget. Re-evaluate our normal window params. + */ + interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS)); + memorystatus_freeze_degradation = FALSE; //we'll re-evaluate this below... 
+ } +#endif /* DEVELOPMENT || DEBUG */ + + if (memorystatus_freeze_degradation == FALSE) { + + if (interval->pageouts >= interval->max_pageouts) { + + *budget_pages_allowed = 0; + + } else { + + int budget_left = interval->max_pageouts - interval->pageouts; + int budget_threshold = (freeze_daily_pageouts_max * FREEZE_DEGRADATION_BUDGET_THRESHOLD) / 100; + + mach_timespec_t time_left = {0,0}; + + time_left.tv_sec = interval->ts.tv_sec; + time_left.tv_nsec = 0; + + SUB_MACH_TIMESPEC(&time_left, &ts); + + if (budget_left <= budget_threshold) { + + /* + * For the current normal window, calculate how much we would pageout in a DEGRADED_WINDOW_MINS duration. + * And also calculate what we would pageout for the same DEGRADED_WINDOW_MINS duration if we had the full + * daily pageout budget. + */ + + unsigned int current_budget_rate_allowed = ((budget_left / time_left.tv_sec) / 60) * DEGRADED_WINDOW_MINS; + unsigned int normal_budget_rate_allowed = (freeze_daily_pageouts_max / NORMAL_WINDOW_MINS) * DEGRADED_WINDOW_MINS; + + /* + * The current rate of pageouts is below what we would expect for + * the normal rate i.e. we have below normal budget left and so... + */ + + if (current_budget_rate_allowed < normal_budget_rate_allowed) { + + memorystatus_freeze_degradation = TRUE; + degraded_throttle_window->max_pageouts = current_budget_rate_allowed; + degraded_throttle_window->pageouts = 0; + + /* + * Switch over to the degraded throttle window so the budget + * doled out is based on that window. + */ + interval = degraded_throttle_window; + } + } + + *budget_pages_allowed = interval->max_pageouts - interval->pageouts; + } + } + } + + MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n", + interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60, + interval->throttle ? 
"on" : "off"); } static void @@ -6195,23 +7451,77 @@ memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused) static boolean_t memorystatus_freeze_swap_low = FALSE; lck_mtx_lock(&freezer_mutex); + if (memorystatus_freeze_enabled) { - if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) { - /* Only freeze if we've not exceeded our pageout budgets.*/ - if (!memorystatus_freeze_update_throttle()) { - memorystatus_freeze_top_process(&memorystatus_freeze_swap_low); - } else { - printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n"); - memorystatus_freeze_throttle_count++; /* Throttled, update stats */ + + if ((memorystatus_frozen_count < memorystatus_frozen_processes_max) || + (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD)) { + + if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) { + + /* Only freeze if we've not exceeded our pageout budgets.*/ + memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining); + + if (memorystatus_freeze_budget_pages_remaining) { + memorystatus_freeze_top_process(); + } } } } - lck_mtx_unlock(&freezer_mutex); + + /* + * We use memorystatus_apps_idle_delay_time because if/when we adopt aging for applications, + * it'll tie neatly into running the freezer once we age an application. + * + * Till then, it serves as a good interval that can be tuned via a sysctl too. + */ + memorystatus_freezer_thread_next_run_ts = mach_absolute_time() + memorystatus_apps_idle_delay_time; assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT); + lck_mtx_unlock(&freezer_mutex); + thread_block((thread_continue_t) memorystatus_freeze_thread); } +static boolean_t +memorystatus_freeze_thread_should_run(void) +{ + /* + * No freezer_mutex held here...see why near call-site + * within memorystatus_pages_update(). 
+ */ + + boolean_t should_run = FALSE; + + if (memorystatus_freeze_enabled == FALSE) { + goto out; + } + + if (memorystatus_available_pages > memorystatus_freeze_threshold) { + goto out; + } + + if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max) && + (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD)) { + goto out; + } + + if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) { + goto out; + } + + uint64_t curr_time = mach_absolute_time(); + + if (curr_time < memorystatus_freezer_thread_next_run_ts) { + goto out; + } + + should_run = TRUE; + +out: + return should_run; +} + static int sysctl_memorystatus_do_fastwake_warmup_all SYSCTL_HANDLER_ARGS { @@ -6494,7 +7804,17 @@ memorystatus_bg_pressure_eligible(proc_t p) { if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) { eligible = TRUE; } - + + if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) { + /* + * IDLE and IDLE_DEFERRED bands contain processes + * that have dropped memory to be under their inactive + * memory limits. And so they can't really give back + * anything. + */ + eligible = FALSE; + } + proc_list_unlock(); return eligible; @@ -6510,14 +7830,18 @@ memorystatus_is_foreground_locked(proc_t p) { * This is meant for stackshot and kperf -- it does not take the proc_list_lock * to access the p_memstat_dirty field. 
*/ -boolean_t -memorystatus_proc_is_dirty_unsafe(void *v) +void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit) { if (!v) { - return FALSE; + *is_dirty = FALSE; + *is_dirty_tracked = FALSE; + *allow_idle_exit = FALSE; + } else { + proc_t p = (proc_t)v; + *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0; + *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0; + *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0; } - proc_t p = (proc_t)v; - return (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0; } #endif /* CONFIG_MEMORYSTATUS */ @@ -6539,14 +7863,6 @@ vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal; extern struct knote * vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t); -/* - * This value is the threshold that a process must meet to be considered for scavenging. - */ -#if CONFIG_EMBEDDED -#define VM_PRESSURE_MINIMUM_RSIZE 1 /* MB */ -#else /* CONFIG_EMBEDDED */ -#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */ -#endif /* CONFIG_EMBEDDED */ #define VM_PRESSURE_NOTIFY_WAIT_PERIOD 10000 /* milliseconds */ @@ -6788,7 +8104,7 @@ vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int */ resident_size = (get_task_phys_footprint(t))/(1024*1024ULL); /* MB */ - if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { + if (resident_size >= vm_pressure_task_footprint_min) { if (level > 0) { /* @@ -7179,8 +8495,6 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG #endif /* DEBUG || DEVELOPMENT */ -extern int memorystatus_purge_on_warning; -extern int memorystatus_purge_on_critical; static int sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS @@ -7229,12 +8543,12 @@ sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) { memorystatus_manual_testing_level = kVMPressureWarning; - force_purge = 
memorystatus_purge_on_warning; + force_purge = vm_pageout_state.memorystatus_purge_on_warning; } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) { memorystatus_manual_testing_level = kVMPressureCritical; - force_purge = memorystatus_purge_on_critical; + force_purge = vm_pageout_state.memorystatus_purge_on_critical; } memorystatus_vm_pressure_level = memorystatus_manual_testing_level; @@ -7284,14 +8598,13 @@ SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_ 0, 0, &sysctl_memorypressure_manual_trigger, "I", ""); -extern int memorystatus_purge_on_warning; -extern int memorystatus_purge_on_urgent; -extern int memorystatus_purge_on_critical; - -SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, ""); +#if DEBUG || DEVELOPMENT +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, ""); +#endif #endif /* VM_PRESSURE_EVENTS */ @@ -7506,6 +8819,32 @@ memorystatus_update_levels_locked(boolean_t critical_only) { #endif } +void +memorystatus_fast_jetsam_override(boolean_t enable_override) +{ + /* If fast jetsam is not enabled, simply return */ + if 
(!fast_jetsam_enabled) + return; + + if (enable_override) { + if ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree) + return; + proc_list_lock(); + memorystatus_jetsam_policy |= kPolicyMoreFree; + memorystatus_thread_pool_max(); + memorystatus_update_levels_locked(TRUE); + proc_list_unlock(); + } else { + if ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0) + return; + proc_list_lock(); + memorystatus_jetsam_policy &= ~kPolicyMoreFree; + memorystatus_thread_pool_default(); + memorystatus_update_levels_locked(TRUE); + proc_list_unlock(); + } +} + static int sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS @@ -7525,27 +8864,12 @@ sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS if (error || !req->newptr) return (error); - if ((more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree)) || - (!more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0))) { - - /* - * No change in state. - */ - return 0; - } - - proc_list_lock(); - if (more_free) { - memorystatus_jetsam_policy |= kPolicyMoreFree; + memorystatus_fast_jetsam_override(true); } else { - memorystatus_jetsam_policy &= ~kPolicyMoreFree; + memorystatus_fast_jetsam_override(false); } - memorystatus_update_levels_locked(TRUE); - - proc_list_unlock(); - return 0; } SYSCTL_PROC(_kern, OID_AUTO, memorystatus_policy_more_free, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED, @@ -7587,6 +8911,35 @@ memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, siz return 0; } +/* + * Get the previous fully populated snapshot + */ +static int +memorystatus_get_jetsam_snapshot_copy(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) { + size_t input_size = *snapshot_size; + + if (memorystatus_jetsam_snapshot_copy_count > 0) { + *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_copy_count)); + } else { + 
*snapshot_size = 0; + } + + if (size_only) { + return 0; + } + + if (input_size < *snapshot_size) { + return EINVAL; + } + + *snapshot = memorystatus_jetsam_snapshot_copy; + + MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot_copy: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n", + (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_copy_count); + + return 0; +} + static int memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) { size_t input_size = *snapshot_size; @@ -7680,17 +9033,16 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b is_default_snapshot = TRUE; error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only); } else { - if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) { + if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_SNAPSHOT_COPY)) { /* * Unsupported bit set in flag. */ return EINVAL; } - if ((flags & (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) == - (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) { + if (flags & (flags - 0x1)) { /* - * Can't have both set at the same time. + * Can't have multiple flags set at the same time. */ return EINVAL; } @@ -7706,6 +9058,8 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b } else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) { is_at_boot_snapshot = TRUE; error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only); + } else if (flags & MEMORYSTATUS_SNAPSHOT_COPY) { + error = memorystatus_get_jetsam_snapshot_copy(&snapshot, &buffer_size, size_only); } else { /* * Invalid flag setting. @@ -7726,14 +9080,20 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b * clearing the buffer means, free it. 
* If working with the at_boot snapshot * there is nothing to clear or update. + * If working with a copy of the snapshot + * there is nothing to clear or update. */ if (!size_only) { if ((error = copyout(snapshot, buffer, buffer_size)) == 0) { if (is_default_snapshot) { /* * The jetsam snapshot is never freed, its count is simply reset. + * However, we make a copy for any parties that might be interested + * in the previous fully populated snapshot. */ proc_list_lock(); + memcpy(memorystatus_jetsam_snapshot_copy, memorystatus_jetsam_snapshot, memorystatus_jetsam_snapshot_size); + memorystatus_jetsam_snapshot_copy_count = memorystatus_jetsam_snapshot_count; snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0; memorystatus_jetsam_snapshot_last_timestamp = 0; proc_list_unlock(); @@ -7759,10 +9119,9 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b } /* - * Routine: memorystatus_cmd_grp_set_properties - * Purpose: Update properties for a group of processes. + * Routine: memorystatus_cmd_grp_set_priorities + * Purpose: Update priorities for a group of processes. * - * Supported Properties: * [priority] * Move each process out of its effective priority * band and into a new priority band. 
@@ -7794,18 +9153,9 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b */ -/* This internal structure can expand when we add support for more properties */ -typedef struct memorystatus_internal_properties -{ - proc_t proc; - int32_t priority; /* see memorytstatus_priority_entry_t : priority */ -} memorystatus_internal_properties_t; - - static int -memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) { - -#pragma unused (flags) +memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size) +{ /* * We only handle setting priority @@ -7813,10 +9163,15 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu */ int error = 0; - memorystatus_priority_entry_t *entries = NULL; + memorystatus_properties_entry_v1_t *entries = NULL; uint32_t entry_count = 0; /* This will be the ordered proc list */ + typedef struct memorystatus_internal_properties { + proc_t proc; + int32_t priority; + } memorystatus_internal_properties_t; + memorystatus_internal_properties_t *table = NULL; size_t table_size = 0; uint32_t table_count = 0; @@ -7829,24 +9184,34 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu proc_t p; /* Verify inputs */ - if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) { + if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) { error = EINVAL; goto out; } - entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t)); - if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) { + entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t)); + if ((entries = (memorystatus_properties_entry_v1_t *)kalloc(buffer_size)) == NULL) { error = ENOMEM; goto out; } - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0); + 
KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, 0, 0, 0); if ((error = copyin(buffer, entries, buffer_size)) != 0) { goto out; } /* Verify sanity of input priorities */ + if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) { + if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) { + error = EINVAL; + goto out; + } + } else { + error = EINVAL; + goto out; + } + for (i=0; i < entry_count; i++) { if (entries[i].priority == -1) { /* Use as shorthand for default priority */ @@ -7938,9 +9303,9 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu * then some pids were not found in a jetsam band. * harmless but interesting... */ - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0); - out: + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count, 0, 0); + if (entries) kfree(entries, buffer_size); if (table) @@ -7949,6 +9314,123 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu return (error); } +static int +memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size) +{ + int error = 0; + memorystatus_properties_entry_v1_t *entries = NULL; + uint32_t entry_count = 0, i = 0; + memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL; + size_t tmp_table_new_size = 0, tmp_table_old_size = 0; + + /* Verify inputs */ + if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) { + error = EINVAL; + goto out; + } + + entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t)); + + if ((entries = (memorystatus_properties_entry_v1_t *) kalloc(buffer_size)) == NULL) { + error = ENOMEM; + goto out; + } + + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | 
DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, 0, 0, 0); + + if ((error = copyin(buffer, entries, buffer_size)) != 0) { + goto out; + } + + if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) { + if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) { + error = EINVAL; + goto out; + } + } else { + error = EINVAL; + goto out; + } + + /* Verify sanity of input priorities */ + for (i=0; i < entry_count; i++) { + /* + * 0 - low probability of use. + * 1 - high probability of use. + * + * Keeping this field an int (& not a bool) to allow + * us to experiment with different values/approaches + * later on. + */ + if (entries[i].use_probability > 1) { + error = EINVAL; + goto out; + } + } + + tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count; + + if ( (tmp_table_new = (memorystatus_internal_probabilities_t *) kalloc(tmp_table_new_size)) == NULL) { + error = ENOMEM; + goto out; + } + memset(tmp_table_new, 0, tmp_table_new_size); + + proc_list_lock(); + + if (memorystatus_global_probabilities_table) { + tmp_table_old = memorystatus_global_probabilities_table; + tmp_table_old_size = memorystatus_global_probabilities_size; + } + + memorystatus_global_probabilities_table = tmp_table_new; + memorystatus_global_probabilities_size = tmp_table_new_size; + tmp_table_new = NULL; + + for (i=0; i < entry_count; i++ ) { + /* Build the table data */ + strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1); + memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability; + } + + proc_list_unlock(); + +out: + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size, 0, 0); + + if (entries) { + kfree(entries, buffer_size); + entries = NULL; + } + + if (tmp_table_old) { + kfree(tmp_table_old, tmp_table_old_size); + tmp_table_old = NULL; + } + + return 
(error); + +} + +static int +memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) +{ + int error = 0; + + if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) { + + error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size); + + } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) { + + error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size); + + } else { + error = EINVAL; + } + + return error; +} /* * This routine is used to update a process's jetsam priority position and stored user_data. @@ -8313,9 +9795,131 @@ proc_get_memstat_priority(proc_t p, boolean_t effective_priority) return 0; } +static int +memorystatus_get_process_is_managed(pid_t pid, int *is_managed) +{ + proc_t p = NULL; + + /* Validate inputs */ + if (pid == 0) { + return EINVAL; + } + + p = proc_find(pid); + if (!p) { + return ESRCH; + } + + proc_list_lock(); + *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0); + proc_rele_locked(p); + proc_list_unlock(); + + return 0; +} + +static int +memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed) +{ + proc_t p = NULL; + + /* Validate inputs */ + if (pid == 0) { + return EINVAL; + } + + p = proc_find(pid); + if (!p) { + return ESRCH; + } + + proc_list_lock(); + if (set_managed == TRUE) { + p->p_memstat_state |= P_MEMSTAT_MANAGED; + } else { + p->p_memstat_state &= ~P_MEMSTAT_MANAGED; + } + proc_rele_locked(p); + proc_list_unlock(); + + return 0; +} + +static int +memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable) +{ + proc_t p = PROC_NULL; + + if (pid == 0) { + return EINVAL; + } + + p = proc_find(pid); + if (!p) { + return ESRCH; + } + + /* + * Only allow this on the current proc for now. + * We can check for privileges and allow targeting another process in the future. 
+ */ + if (p != current_proc()) { + proc_rele(p); + return EPERM; + } + + proc_list_lock(); + *is_freezable = ((p->p_memstat_state & P_MEMSTAT_FREEZE_DISABLED) ? 0 : 1); + proc_rele_locked(p); + proc_list_unlock(); + + return 0; +} + +static int +memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable) +{ + proc_t p = PROC_NULL; + + if (pid == 0) { + return EINVAL; + } + + p = proc_find(pid); + if (!p) { + return ESRCH; + } + + /* + * Only allow this on the current proc for now. + * We can check for privileges and allow targeting another process in the future. + */ + if (p != current_proc()) { + proc_rele(p); + return EPERM; + } + + proc_list_lock(); + if (is_freezable == FALSE) { + /* Freeze preference set to FALSE. Set the P_MEMSTAT_FREEZE_DISABLED bit. */ + p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED; + printf("memorystatus_set_process_is_freezable: disabling freeze for pid %d [%s]\n", + p->p_pid, (*p->p_name ? p->p_name : "unknown")); + } else { + p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED; + printf("memorystatus_set_process_is_freezable: enabling freeze for pid %d [%s]\n", + p->p_pid, (*p->p_name ? p->p_name : "unknown")); + } + proc_rele_locked(p); + proc_list_unlock(); + + return 0; +} + int memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) { int error = EINVAL; + boolean_t skip_auth_check = FALSE; os_reason_t jetsam_reason = OS_REASON_NULL; #if !CONFIG_JETSAM @@ -8323,8 +9927,13 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * #pragma unused(jetsam_reason) #endif - /* Need to be root or have entitlement */ - if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) { + /* We don't need entitlements if we're setting/ querying the freeze preference for a process. Skip the check below. 
*/ + if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE || args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE) { + skip_auth_check = TRUE; + } + + /* Need to be root or have entitlement. */ + if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) { error = EPERM; goto out; } @@ -8430,9 +10039,32 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args * case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE: case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE: - error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, args->flags ? TRUE : FALSE); + error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE); + break; + case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED: + error = memorystatus_set_process_is_managed(args->pid, args->flags); break; + case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED: + error = memorystatus_get_process_is_managed(args->pid, ret); + break; + + case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE: + error = memorystatus_set_process_is_freezable(args->pid, args->flags ? 
TRUE : FALSE); + break; + + case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE: + error = memorystatus_get_process_is_freezable(args->pid, ret); + break; + +#if CONFIG_FREEZE +#if DEVELOPMENT || DEBUG + case MEMORYSTATUS_CMD_FREEZER_CONTROL: + error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret); + break; +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_FREEZE */ + default: break; } @@ -8592,9 +10224,6 @@ filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev) } #endif /* !CONFIG_EMBEDDED */ - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; - /* * reset the output flags based on a * combination of the old events and @@ -8981,9 +10610,19 @@ int memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap) { #if !CONFIG_JETSAM - if (!p || (!isApp(p)) || (p->p_memstat_state & P_MEMSTAT_INTERNAL)) { + if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) { /* * Ineligible processes OR system processes e.g. launchd. + * + * We also skip processes that have the P_MEMSTAT_MANAGED bit set, i.e. + * they're managed by assertiond. These are iOS apps that have been ported + * to macOS. assertiond might be in the process of modifying the app's + * priority / memory limit - so it might have the proc_list lock, and then try + * to take the task lock. Meanwhile we've entered this function with the task lock + * held, and we need the proc_list lock below. So we'll deadlock with assertiond. + * + * It should be fine to read the P_MEMSTAT_MANAGED bit without the proc_list + * lock here, since assertiond only sets this bit on process launch. 
*/ return -1; } @@ -9075,6 +10714,8 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap) p->p_memstat_idle_start = mach_absolute_time(); } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0); + p->p_memstat_effectivepriority = priority; proc_list_unlock(); diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 35115e557..08cd860d6 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -503,7 +503,9 @@ int watchpoint_flag = -1; int breakpoint_flag = -1; int gNeon = -1; int gNeonHpfp = -1; +int gNeonFp16 = -1; int gARMv81Atomics = 0; +int gARMv8Crc32 = 0; #if defined (__arm__) int arm64_flag = 0; @@ -517,7 +519,9 @@ SYSCTL_INT(_hw_optional, OID_AUTO, watchpoint, CTLFLAG_RD | CTLFLAG_KERN | CTLFL SYSCTL_INT(_hw_optional, OID_AUTO, breakpoint, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &breakpoint_flag, 0, ""); SYSCTL_INT(_hw_optional, OID_AUTO, neon, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeon, 0, ""); SYSCTL_INT(_hw_optional, OID_AUTO, neon_hpfp, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeonHpfp, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, neon_fp16, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeonFp16, 0, ""); SYSCTL_INT(_hw_optional, OID_AUTO, armv8_1_atomics, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv81Atomics, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, armv8_crc32, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv8Crc32, 0, ""); /* * Without this little ifdef dance, the preprocessor replaces "arm64" with "1", @@ -600,6 +604,7 @@ sysctl_mib_init(void) arm_mvfp_info_t *mvfp_info = arm_mvfp_info(); gNeon = mvfp_info->neon; gNeonHpfp = mvfp_info->neon_hpfp; + gNeonFp16 = mvfp_info->neon_fp16; cacheconfig[0] = ml_get_max_cpus(); cacheconfig[1] = 1; diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index 4103009fe..0381325a9 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -84,6 
+84,7 @@ #include #endif + lck_grp_t * sysctl_lock_group = NULL; lck_rw_t * sysctl_geometry_lock = NULL; lck_mtx_t * sysctl_unlocked_node_lock = NULL; @@ -206,6 +207,7 @@ sysctl_register_oid(struct sysctl_oid *new_oidp) new_oidp->oid_number = oidp->oid_number; } + /* * Insert the oid into the parent's list in order. */ @@ -263,6 +265,7 @@ sysctl_unregister_oid(struct sysctl_oid *oidp) } } + /* * We've removed it from the list at this point, but we don't want * to return to the caller until all handler references have drained @@ -642,7 +645,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2, int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; - char tempbuf[10]; + char tempbuf[10] = {}; lck_rw_lock_shared(sysctl_geometry_lock); while (namelen) { @@ -834,7 +837,7 @@ sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2, int i, j, error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; - int newoid[CTL_MAXNAME]; + int newoid[CTL_MAXNAME] = {}; lck_rw_lock_shared(sysctl_geometry_lock); i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid); @@ -966,7 +969,7 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { char *p; - int error, oid[CTL_MAXNAME]; + int error, oid[CTL_MAXNAME] = {}; u_int len = 0; /* set by name2oid() */ if (req->newlen < 1) @@ -1327,6 +1330,7 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri int i; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; + sysctl_handler_t oid_handler = NULL; int error; boolean_t unlocked_node_found = FALSE; boolean_t namestring_started = FALSE; @@ -1464,7 +1468,12 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri (error = proc_suser(req->p))) goto err; - if (!oid->oid_handler) { + /* + * sysctl_unregister_oid() may change the handler 
value, so grab it + * under the lock. + */ + oid_handler = oid->oid_handler; + if (!oid_handler) { error = EINVAL; goto err; } @@ -1503,14 +1512,11 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri lck_mtx_lock(sysctl_unlocked_node_lock); } + if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { - i = (oid->oid_handler) (oid, - name + indx, namelen - indx, - req); + i = oid_handler(oid, name + indx, namelen - indx, req); } else { - i = (oid->oid_handler) (oid, - oid->oid_arg1, oid->oid_arg2, - req); + i = oid_handler(oid, oid->oid_arg1, oid->oid_arg2, req); } error = i; diff --git a/bsd/kern/kern_ntptime.c b/bsd/kern/kern_ntptime.c index 915f4c4b7..937f12d5d 100644 --- a/bsd/kern/kern_ntptime.c +++ b/bsd/kern/kern_ntptime.c @@ -293,11 +293,11 @@ ntp_gettime(struct proc *p, struct ntp_gettime_args *uap, __unused int32_t *retv } int -ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retval) +ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int32_t *retval) { - struct timex ntv; + struct timex ntv = {}; long freq; - int modes; + unsigned int modes; int error, ret = 0; clock_sec_t sec; clock_usec_t microsecs; @@ -334,7 +334,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv #if DEVELOPEMNT || DEBUG if (g_should_log_clock_adjustments) { - os_log(OS_LOG_DEFAULT, "%s:BEFORE modes %u offset %ld freq %ld status %d constant %ld time_adjtime %lld\n", + os_log(OS_LOG_DEFAULT, "%s: BEFORE modes %u offset %ld freq %ld status %d constant %ld time_adjtime %lld\n", __func__, ntv.modes, ntv.offset, ntv.freq, ntv.status, ntv.constant, time_adjtime); } #endif @@ -429,8 +429,8 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv #if DEVELOPEMNT || DEBUG if (g_should_log_clock_adjustments) { - os_log(OS_LOG_DEFAULT, "%s:AFTER offset %lld freq %lld status %d constant %ld time_adjtime %lld\n", - __func__, time_offset, time_freq, time_status, time_constant, 
time_adjtime); + os_log(OS_LOG_DEFAULT, "%s: AFTER modes %u offset %lld freq %lld status %d constant %ld time_adjtime %lld\n", + __func__, modes, time_offset, time_freq, time_status, time_constant, time_adjtime); } #endif @@ -441,6 +441,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv if (IS_64BIT_PROCESS(p)) { struct user64_timex user_ntv = {}; + user_ntv.modes = modes; if (time_status & STA_NANO) user_ntv.offset = L_GINT(time_offset); else @@ -465,6 +466,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv else{ struct user32_timex user_ntv = {}; + user_ntv.modes = modes; if (time_status & STA_NANO) user_ntv.offset = L_GINT(time_offset); else diff --git a/bsd/kern/kern_overrides.c b/bsd/kern/kern_overrides.c index 8e70d80f0..0d7ece73d 100644 --- a/bsd/kern/kern_overrides.c +++ b/bsd/kern/kern_overrides.c @@ -26,10 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * System Overrides syscall implementation - */ - #include #include #include @@ -52,6 +48,7 @@ #include #include #include +#include /* Mutex for global system override state */ static lck_mtx_t sys_override_lock; @@ -59,9 +56,33 @@ static lck_grp_t *sys_override_mtx_grp; static lck_attr_t *sys_override_mtx_attr; static lck_grp_attr_t *sys_override_mtx_grp_attr; -/* Assertion counts for system properties */ +/* + * Assertion counts for system properties (add new ones for each new mechanism) + * + * The assertion count management for system overrides is as follows: + * + * - All assertion counts are protected by the sys_override_lock. + * + * - Each caller of system_override() increments the assertion count for the + * mechanism it specified in the flags. The caller then blocks for the + * timeout specified in the system call. + * + * - At the end of the timeout, the caller thread wakes up and decrements the + * assertion count for the mechanism it originally took an assertion on. 
+ * + * - If another caller calls the system_override() to disable the override + * for a mechanism, it simply disables the mechanism without changing any + * assertion counts. That way, the assertion counts are properly balanced. + * + * One thing to note is that a SYS_OVERRIDE_DISABLE disables the overrides + * for a mechanism irrespective of how many clients requested that override. + * That makes the implementation simpler and avoids keeping a lot of process + * specific state in the kernel. + * + */ static int64_t io_throttle_assert_cnt; static int64_t cpu_throttle_assert_cnt; +static int64_t fast_jetsam_assert_cnt; /* Wait Channel for system override */ static uint64_t sys_override_wait; @@ -69,19 +90,13 @@ static uint64_t sys_override_wait; /* Global variable to indicate if system_override is enabled */ int sys_override_enabled; -/* Sysctl definition for sys_override_enabled */ -SYSCTL_INT(_debug, OID_AUTO, sys_override_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &sys_override_enabled, 0, ""); - -/* Forward Declarations */ -static void enable_system_override(uint64_t flags); -static void disable_system_override(uint64_t flags); +/* Helper routines */ +static void system_override_begin(uint64_t flags); +static void system_override_end(uint64_t flags); +static void system_override_abort(uint64_t flags); +static void system_override_callouts(uint64_t flags, boolean_t enable_override); static __attribute__((noinline)) void PROCESS_OVERRIDING_SYSTEM_DEFAULTS(uint64_t timeout); -/***************************** system_override ********************/ -/* - * int system_override(uint64_t timeout, uint64_t flags); - */ - void init_system_override() { @@ -89,7 +104,7 @@ init_system_override() sys_override_mtx_grp = lck_grp_alloc_init("system_override", sys_override_mtx_grp_attr); sys_override_mtx_attr = lck_attr_alloc_init(); lck_mtx_init(&sys_override_lock, sys_override_mtx_grp, sys_override_mtx_attr); - io_throttle_assert_cnt = cpu_throttle_assert_cnt = 0; + 
io_throttle_assert_cnt = cpu_throttle_assert_cnt = fast_jetsam_assert_cnt = 0; sys_override_enabled = 1; } @@ -106,37 +121,28 @@ system_override(__unused struct proc *p, struct system_override_args * uap, __un goto out; } - /* Check to see if some flags are specified. */ + /* Check to see if sane flags are specified. */ if ((flags & ~SYS_OVERRIDE_FLAGS_MASK) != 0) { error = EINVAL; goto out; } - if (flags == SYS_OVERRIDE_DISABLE) { - - printf("Process %s [%d] disabling system_override()\n", current_proc()->p_comm, current_proc()->p_pid); - - lck_mtx_lock(&sys_override_lock); - - if (io_throttle_assert_cnt > 0) - sys_override_io_throttle(THROTTLE_IO_ENABLE); - if (cpu_throttle_assert_cnt > 0) - sys_override_cpu_throttle(CPU_THROTTLE_ENABLE); - - sys_override_enabled = 0; - - lck_mtx_unlock(&sys_override_lock); - + /* Make sure that the system override syscall has been initialized */ + if (!sys_override_enabled) { + error = EINVAL; goto out; } lck_mtx_lock(&sys_override_lock); - enable_system_override(flags); - - PROCESS_OVERRIDING_SYSTEM_DEFAULTS(timeout); - - disable_system_override(flags); + if (flags & SYS_OVERRIDE_DISABLE) { + flags &= ~SYS_OVERRIDE_DISABLE; + system_override_abort(flags); + } else { + system_override_begin(flags); + PROCESS_OVERRIDING_SYSTEM_DEFAULTS(timeout); + system_override_end(flags); + } lck_mtx_unlock(&sys_override_lock); @@ -145,62 +151,164 @@ system_override(__unused struct proc *p, struct system_override_args * uap, __un } /* - * Call for enabling global system override. - * This should be called only with the sys_override_lock held. + * Helper routines for enabling/disabling system overrides for various mechanisms. + * These routines should be called with the sys_override_lock held. Each subsystem + * which is hooked into the override service provides two routines: + * + * - void sys_override_foo_init(void); + * Routine to initialize the subsystem or the data needed for the override to work. 
+ * This routine is optional and if a subsystem needs it, it should be invoked from + * init_system_override(). + * + * - void sys_override_foo(boolean_t enable_override); + * Routine to enable/disable the override mechanism for that subsystem. A value of + * true indicates that the mechanism should be overridden and the special behavior + * should begin. A false value indicates that the subsystem should return to default + * behavior. This routine is mandatory and should be invoked as part of the helper + * routines if the flags passed in the syscall match the subsystem. Also, this + * routine should preferably be idempotent. + */ + +static void +system_override_callouts(uint64_t flags, boolean_t enable_override) +{ + switch (flags) { + case SYS_OVERRIDE_IO_THROTTLE: + if (enable_override) { + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_START, + current_proc()->p_pid, 0, 0, 0, 0); + } else { + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_END, + current_proc()->p_pid, 0, 0, 0, 0); + } + sys_override_io_throttle(enable_override); + break; + + case SYS_OVERRIDE_CPU_THROTTLE: + if (enable_override) { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_START, + current_proc()->p_pid, 0, 0, 0, 0); + } else { + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_END, + current_proc()->p_pid, 0, 0, 0, 0); + } + sys_override_cpu_throttle(enable_override); + break; + + case SYS_OVERRIDE_FAST_JETSAM: + if (enable_override) { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FAST_JETSAM) | DBG_FUNC_START, + current_proc()->p_pid, 0, 0, 0, 0); + } else { + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FAST_JETSAM) | DBG_FUNC_END, + current_proc()->p_pid, 0, 0, 0, 0); + } +#if CONFIG_JETSAM + memorystatus_fast_jetsam_override(enable_override); +#endif /* CONFIG_JETSAM */ + break; + + default: + 
panic("Unknown option to system_override_callouts(): %llu\n", flags); + } +} + +/* + * system_override_begin(uint64_t flags) + * + * Routine to start a system override if the assertion count + * transitions from 0->1 for a specified mechanism. */ static void -enable_system_override(uint64_t flags) +system_override_begin(uint64_t flags) { + lck_mtx_assert(&sys_override_lock, LCK_MTX_ASSERT_OWNED); if (flags & SYS_OVERRIDE_IO_THROTTLE) { - if ((io_throttle_assert_cnt == 0) && sys_override_enabled) { - /* Disable I/O Throttling */ - printf("Process %s [%d] disabling system-wide I/O Throttling\n", current_proc()->p_comm, current_proc()->p_pid); - sys_override_io_throttle(THROTTLE_IO_DISABLE); + if (io_throttle_assert_cnt == 0) { + system_override_callouts(SYS_OVERRIDE_IO_THROTTLE, true); } - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_START, current_proc()->p_pid, 0, 0, 0, 0); io_throttle_assert_cnt++; } if (flags & SYS_OVERRIDE_CPU_THROTTLE) { - if ((cpu_throttle_assert_cnt == 0) && sys_override_enabled) { - /* Disable CPU Throttling */ - printf("Process %s [%d] disabling system-wide CPU Throttling\n", current_proc()->p_comm, current_proc()->p_pid); - sys_override_cpu_throttle(CPU_THROTTLE_DISABLE); + if (cpu_throttle_assert_cnt == 0) { + system_override_callouts(SYS_OVERRIDE_CPU_THROTTLE, true); } - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_START, current_proc()->p_pid, 0, 0, 0, 0); cpu_throttle_assert_cnt++; } + + if (flags & SYS_OVERRIDE_FAST_JETSAM) { + if (fast_jetsam_assert_cnt == 0) { + system_override_callouts(SYS_OVERRIDE_FAST_JETSAM, true); + } + fast_jetsam_assert_cnt++; + } } /* - * Call for disabling global system override. - * This should be called only with the sys_override_lock held. + * system_override_end(uint64_t flags) + * + * Routine to end a system override if the assertion count + * transitions from 1->0 for a specified mechanism. 
*/ static void -disable_system_override(uint64_t flags) +system_override_end(uint64_t flags) { + lck_mtx_assert(&sys_override_lock, LCK_MTX_ASSERT_OWNED); + if (flags & SYS_OVERRIDE_IO_THROTTLE) { assert(io_throttle_assert_cnt > 0); io_throttle_assert_cnt--; - KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_END, current_proc()->p_pid, 0, 0, 0, 0); - if ((io_throttle_assert_cnt == 0) && sys_override_enabled) { - /* Enable I/O Throttling */ - sys_override_io_throttle(THROTTLE_IO_ENABLE); + if (io_throttle_assert_cnt == 0) { + system_override_callouts(SYS_OVERRIDE_IO_THROTTLE, false); } } if (flags & SYS_OVERRIDE_CPU_THROTTLE) { assert(cpu_throttle_assert_cnt > 0); cpu_throttle_assert_cnt--; - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_END, current_proc()->p_pid, 0, 0, 0, 0); - if ((cpu_throttle_assert_cnt == 0) && sys_override_enabled) { - /* Enable CPU Throttling */ - sys_override_cpu_throttle(CPU_THROTTLE_ENABLE); + if (cpu_throttle_assert_cnt == 0) { + system_override_callouts(SYS_OVERRIDE_CPU_THROTTLE, false); } } + + if (flags & SYS_OVERRIDE_FAST_JETSAM) { + assert(fast_jetsam_assert_cnt > 0); + fast_jetsam_assert_cnt--; + if (fast_jetsam_assert_cnt == 0) { + system_override_callouts(SYS_OVERRIDE_FAST_JETSAM, false); + } + } + +} + +/* + * system_override_abort(uint64_t flags) + * + * Routine to abort a system override (if one was active) + * irrespective of the assertion counts and number of blocked + * requestors. 
+ */ +static void +system_override_abort(uint64_t flags) +{ + + lck_mtx_assert(&sys_override_lock, LCK_MTX_ASSERT_OWNED); + + if ((flags & SYS_OVERRIDE_IO_THROTTLE) && (io_throttle_assert_cnt > 0)) { + system_override_callouts(SYS_OVERRIDE_IO_THROTTLE, false); + } + + if ((flags & SYS_OVERRIDE_CPU_THROTTLE) && (cpu_throttle_assert_cnt > 0)) { + system_override_callouts(SYS_OVERRIDE_CPU_THROTTLE, false); + } + + if ((flags & SYS_OVERRIDE_FAST_JETSAM) && (fast_jetsam_assert_cnt > 0)) { + system_override_callouts(SYS_OVERRIDE_FAST_JETSAM, false); + } } static __attribute__((noinline)) void diff --git a/bsd/kern/kern_pcsamples.c b/bsd/kern/kern_pcsamples.c index 887029225..eaedcc705 100644 --- a/bsd/kern/kern_pcsamples.c +++ b/bsd/kern/kern_pcsamples.c @@ -228,7 +228,7 @@ pcsamples_control(int *name, __unused u_int namelen, user_addr_t where, size_t * int ret=0; size_t size=*sizep; int value = name[1]; - pcinfo_t pc_bufinfo; + pcinfo_t pc_bufinfo = {}; pid_t *pidcheck; pid_t curpid; diff --git a/bsd/kern/kern_persona.c b/bsd/kern/kern_persona.c index 37df0b5a0..e05e8d424 100644 --- a/bsd/kern/kern_persona.c +++ b/bsd/kern/kern_persona.c @@ -41,11 +41,9 @@ #include #include -#define pna_info(fmt, ...) \ - printf("%s: " fmt "\n", __func__, ## __VA_ARGS__) - +#include #define pna_err(fmt, ...) 
\ - printf("ERROR[%s]: " fmt "\n", __func__, ## __VA_ARGS__) + os_log_error(OS_LOG_DEFAULT, "ERROR: " fmt, ## __VA_ARGS__) #define MAX_PERSONAS 512 @@ -57,7 +55,10 @@ #define PERSONA_SYSTEM_UID ((uid_t)99) #define PERSONA_SYSTEM_LOGIN "system" +#define PERSONA_ALLOC_TOKEN (0x7a0000ae) +#define PERSONA_INIT_TOKEN (0x7500005e) #define PERSONA_MAGIC (0x0aa55aa0) +#define persona_initialized(p) ((p)->pna_valid == PERSONA_MAGIC || (p)->pna_valid == PERSONA_INIT_TOKEN) #define persona_valid(p) ((p)->pna_valid == PERSONA_MAGIC) #define persona_mkinvalid(p) ((p)->pna_valid = ~(PERSONA_MAGIC)) @@ -74,6 +75,8 @@ lck_attr_t *persona_lck_attr; lck_grp_t *persona_lck_grp; lck_grp_attr_t *persona_lck_grp_attr; +os_refgrp_decl(static, persona_refgrp, "persona", NULL); + static zone_t persona_zone; kauth_cred_t g_default_persona_cred; @@ -126,15 +129,18 @@ void personas_bootstrap(void) g_system_persona = persona_alloc(PERSONA_SYSTEM_UID, PERSONA_SYSTEM_LOGIN, PERSONA_SYSTEM, NULL); + int err = persona_init_begin(g_system_persona); + assert(err == 0); + + persona_init_end(g_system_persona, err); + assert(g_system_persona != NULL); } struct persona *persona_alloc(uid_t id, const char *login, int type, int *error) { - struct persona *persona, *tmp; + struct persona *persona; int err = 0; - kauth_cred_t tmp_cred; - gid_t new_group; if (!login) { pna_err("Must provide a login name for a new persona!"); @@ -167,10 +173,11 @@ struct persona *persona_alloc(uid_t id, const char *login, int type, int *error) } strncpy(persona->pna_login, login, sizeof(persona->pna_login)-1); + persona_dbg("Starting persona allocation for: '%s'", persona->pna_login); LIST_INIT(&persona->pna_members); lck_mtx_init(&persona->pna_lock, persona_lck_grp, persona_lck_attr); - persona->pna_refcount = 1; + os_ref_init(&persona->pna_refcount, &persona_refgrp); /* * Setup initial (temporary) kauth_cred structure @@ -184,18 +191,71 @@ struct persona *persona_alloc(uid_t id, const char *login, int type, int *error) goto 
out_error; } + persona->pna_type = type; + persona->pna_id = id; + persona->pna_valid = PERSONA_ALLOC_TOKEN; + + /* + * NOTE: this persona has not been fully initialized. A subsequent + * call to persona_init_begin() followed by persona_init_end() will make + * the persona visible to the rest of the system. + */ + if (error) { + *error = 0; + } + return persona; + +out_error: + (void)hw_atomic_add(&g_total_personas, -1); + zfree(persona_zone, persona); + if (error) { + *error = err; + } + return NULL; +} + +/** + * persona_init_begin + * + * This function begins initialization of a persona. It first acquires the + * global persona list lock via lock_personas(), then selects an appropriate + * persona ID and sets up the persona's credentials. This function *must* be + * followed by a call to persona_init_end() which will mark the persona + * structure as valid + * + * Conditions: + * persona has been allocated via persona_alloc() + * nothing locked + * + * Returns: + * global persona list is locked (even on error) + */ +int persona_init_begin(struct persona *persona) +{ + struct persona *tmp; + int err = 0; + kauth_cred_t tmp_cred; + gid_t new_group; + uid_t id; + + if (!persona || (persona->pna_valid != PERSONA_ALLOC_TOKEN)) { + return EINVAL; + } + + id = persona->pna_id; + lock_personas(); try_again: - if (id != PERSONA_ID_NONE) - persona->pna_id = id; - else + if (id == PERSONA_ID_NONE) persona->pna_id = g_next_persona_id; - persona_dbg("Adding %d (%s) to global list...", persona->pna_id, persona->pna_login); + persona_dbg("Beginning Initialization of %d:%d (%s)...", id, persona->pna_id, persona->pna_login); err = 0; LIST_FOREACH(tmp, &all_personas, pna_list) { - if (id == PERSONA_ID_NONE && tmp->pna_id == id) { + persona_lock(tmp); + if (id == PERSONA_ID_NONE && tmp->pna_id == persona->pna_id) { + persona_unlock(tmp); /* * someone else manually claimed this ID, and we're * trying to allocate an ID for the caller: try again @@ -203,8 +263,9 @@ struct persona 
*persona_alloc(uid_t id, const char *login, int type, int *error) g_next_persona_id += PERSONA_ID_STEP; goto try_again; } - if (strncmp(tmp->pna_login, login, sizeof(tmp->pna_login)) == 0 - || tmp->pna_id == id) { + if (strncmp(tmp->pna_login, persona->pna_login, sizeof(tmp->pna_login)) == 0 || + tmp->pna_id == persona->pna_id) { + persona_unlock(tmp); /* * Disallow use of identical login names and re-use * of previously allocated persona IDs @@ -212,9 +273,10 @@ struct persona *persona_alloc(uid_t id, const char *login, int type, int *error) err = EEXIST; break; } + persona_unlock(tmp); } if (err) - goto out_unlock; + goto out; /* ensure the cred has proper UID/GID defaults */ kauth_cred_ref(persona->pna_cred); @@ -227,7 +289,7 @@ struct persona *persona_alloc(uid_t id, const char *login, int type, int *error) if (!persona->pna_cred) { err = EACCES; - goto out_unlock; + goto out; } /* it should be a member of exactly 1 group (equal to its UID) */ @@ -243,54 +305,79 @@ struct persona *persona_alloc(uid_t id, const char *login, int type, int *error) if (!persona->pna_cred) { err = EACCES; - goto out_unlock; + goto out; } - persona->pna_type = type; - - /* insert the, now valid, persona into the global list! 
*/ - persona->pna_valid = PERSONA_MAGIC; - LIST_INSERT_HEAD(&all_personas, persona, pna_list); - /* if the kernel supplied the persona ID, increment for next time */ if (id == PERSONA_ID_NONE) g_next_persona_id += PERSONA_ID_STEP; -out_unlock: - unlock_personas(); + persona->pna_valid = PERSONA_INIT_TOKEN; - if (err) { - switch (err) { - case EEXIST: - persona_dbg("Login '%s' (%d) already exists", - login, persona->pna_id); - break; - case EACCES: - persona_dbg("kauth_error for persona:%d", persona->pna_id); - break; - default: - persona_dbg("Unknown error:%d", err); - } - goto out_error; +out: + if (err != 0) { + persona_dbg("ERROR:%d while initializing %d:%d (%s)...", err, id, persona->pna_id, persona->pna_login); + /* + * mark the persona with an error so that persona_init_end() + * will *not* add it to the global list. + */ + persona->pna_id = PERSONA_ID_NONE; } - return persona; + /* + * leave the global persona list locked: it will be + * unlocked in a call to persona_init_end() + */ + return err; +} -out_error: - (void)hw_atomic_add(&g_total_personas, -1); - zfree(persona_zone, persona); - if (error) - *error = err; - return NULL; +/** + * persona_init_end + * + * This function finalizes the persona initialization by marking it valid and + * adding it to the global list of personas. After unlocking the global list, + * the persona will be visible to the reset of the system. The function will + * only mark the persona valid if the input parameter 'error' is 0. + * + * Conditions: + * persona is initialized via persona_init_begin() + * global persona list is locked via lock_personas() + * + * Returns: + * global persona list is unlocked + */ +void persona_init_end(struct persona *persona, int error) +{ + if (persona == NULL) { + return; + } + + /* + * If the pna_valid member is set to the INIT_TOKEN value, then it has + * successfully gone through persona_init_begin(), and we can mark it + * valid and make it visible to the rest of the system. 
However, if + * there was an error either during initialization or otherwise, we + * need to decrement the global count of personas because this one + * will be disposed-of by the callers invocation of persona_put(). + */ + if (error != 0 || persona->pna_valid == PERSONA_ALLOC_TOKEN) { + persona_dbg("ERROR:%d after initialization of %d (%s)", error, persona->pna_id, persona->pna_login); + /* remove this persona from the global count */ + (void)hw_atomic_add(&g_total_personas, -1); + } else if (error == 0 && + persona->pna_valid == PERSONA_INIT_TOKEN) { + persona->pna_valid = PERSONA_MAGIC; + LIST_INSERT_HEAD(&all_personas, persona, pna_list); + persona_dbg("Initialization of %d (%s) Complete.", persona->pna_id, persona->pna_login); + } + + unlock_personas(); } static struct persona *persona_get_locked(struct persona *persona) { - if (persona->pna_refcount) { - persona->pna_refcount++; - return persona; - } - return NULL; + os_ref_retain_locked(&persona->pna_refcount); + return persona; } struct persona *persona_get(struct persona *persona) @@ -313,9 +400,8 @@ void persona_put(struct persona *persona) return; persona_lock(persona); - if (persona->pna_refcount >= 0) { - if (--(persona->pna_refcount) == 0) - destroy = 1; + if (os_ref_release_locked(&persona->pna_refcount) == 0) { + destroy = 1; } persona_unlock(persona); @@ -851,7 +937,7 @@ int persona_set_cred(struct persona *persona, kauth_cred_t cred) return EINVAL; persona_lock(persona); - if (!persona_valid(persona)) { + if (!persona_initialized(persona)) { ret = EINVAL; goto out_unlock; } @@ -888,7 +974,7 @@ int persona_set_cred_from_proc(struct persona *persona, proc_t proc) return EINVAL; persona_lock(persona); - if (!persona_valid(persona)) { + if (!persona_initialized(persona)) { ret = EINVAL; goto out_unlock; } @@ -969,7 +1055,7 @@ int persona_set_gid(struct persona *persona, gid_t gid) return EINVAL; persona_lock(persona); - if (!persona_valid(persona)) { + if (!persona_initialized(persona)) { ret = 
EINVAL; goto out_unlock; } @@ -1016,7 +1102,7 @@ int persona_set_groups(struct persona *persona, gid_t *groups, unsigned ngroups, return EINVAL; persona_lock(persona); - if (!persona_valid(persona)) { + if (!persona_initialized(persona)) { ret = EINVAL; goto out_unlock; } diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index f25390cb4..3107ae6d0 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -162,10 +162,6 @@ extern struct tty cons; extern int cs_debug; -#if DEVELOPMENT || DEBUG -extern int cs_enforcement_enable; -#endif - #if DEBUG #define __PROC_INTERNAL_DEBUG 1 #endif @@ -188,14 +184,10 @@ typedef uint64_t unaligned_u64 __attribute__((aligned(1))); static void orphanpg(struct pgrp * pg); void proc_name_kdp(task_t t, char * buf, int size); -void * proc_get_uthread_uu_threadlist(void * uthread_v); int proc_threadname_kdp(void * uth, char * buf, size_t size); void proc_starttime_kdp(void * p, unaligned_u64 *tv_sec, unaligned_u64 *tv_usec, unaligned_u64 *abstime); char * proc_name_address(void * p); -/* TODO: make a header that's exported and usable in osfmk */ -char* proc_best_name(proc_t p); - static void pgrp_add(struct pgrp * pgrp, proc_t parent, proc_t child); static void pgrp_remove(proc_t p); static void pgrp_replace(proc_t p, struct pgrp *pgrp); @@ -212,9 +204,6 @@ struct fixjob_iterargs { int fixjob_callback(proc_t, void *); -uint64_t get_current_unique_pid(void); - - uint64_t get_current_unique_pid(void) { @@ -1010,6 +999,17 @@ proc_exiting(proc_t p) return(retval? 1: 0); } +int +proc_in_teardown(proc_t p) +{ + int retval = 0; + + if (p) + retval = p->p_lflag & P_LPEXIT; + return(retval? 
1: 0); + +} + int proc_forcequota(proc_t p) { @@ -1079,6 +1079,13 @@ proc_is64bit(proc_t p) return(IS_64BIT_PROCESS(p)); } +int +proc_is64bit_data(proc_t p) +{ + assert(p->task); + return (int)task_get_64bit_data(p->task); +} + int proc_pidversion(proc_t p) { @@ -1950,6 +1957,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user case CS_OPS_ENTITLEMENTS_BLOB: case CS_OPS_IDENTITY: case CS_OPS_BLOB: + case CS_OPS_TEAMID: break; /* not restricted to root */ default: if (forself == 0 && kauth_cred_issuser(kauth_cred_get()) != TRUE) @@ -2000,12 +2008,16 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user proc_lock(pt); retflags = pt->p_csflags; - if (cs_enforcement(pt)) + if (cs_process_enforcement(pt)) retflags |= CS_ENFORCEMENT; if (csproc_get_platform_binary(pt)) retflags |= CS_PLATFORM_BINARY; if (csproc_get_platform_path(pt)) retflags |= CS_PLATFORM_PATH; + //Don't return CS_REQUIRE_LV if we turned it on with CS_FORCED_LV but still report CS_FORCED_LV + if ((pt->p_csflags & CS_FORCED_LV) == CS_FORCED_LV) { + retflags &= (~CS_REQUIRE_LV); + } proc_unlock(pt); if (uaddr != USER_ADDR_NULL) @@ -2154,7 +2166,8 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user error = csops_copy_token(start, length, usize, uaddr); break; } - case CS_OPS_IDENTITY: { + case CS_OPS_IDENTITY: + case CS_OPS_TEAMID: { const char *identity; uint8_t fakeheader[8]; uint32_t idlen; @@ -2178,7 +2191,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user break; } - identity = cs_identity_get(pt); + identity = ops == CS_OPS_TEAMID ? 
csproc_get_teamid(pt) : cs_identity_get(pt); proc_unlock(pt); if (identity == NULL) { error = ENOENT; @@ -2209,7 +2222,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user case CS_OPS_CLEARPLATFORM: #if DEVELOPMENT || DEBUG - if (cs_enforcement_enable) { + if (cs_process_global_enforcement()) { error = ENOTSUP; break; } @@ -2248,7 +2261,7 @@ proc_iterate( proc_iterate_fn_t filterfn, void *filterarg) { - pid_t *pid_list; + pid_t *pid_list = NULL; vm_size_t pid_list_size = 0; vm_size_t pid_list_size_needed = 0; int pid_count = 0; @@ -2260,7 +2273,7 @@ proc_iterate( for (;;) { proc_list_lock(); - pid_count_available = nprocs + 1; //kernel_task is not counted in nprocs + pid_count_available = nprocs + 1 /* kernel_task not counted in nprocs */; assert(pid_count_available > 0); pid_list_size_needed = pid_count_available * sizeof(pid_t); @@ -2278,6 +2291,7 @@ proc_iterate( } pid_list_size = pid_list_size_needed; } + assert(pid_list != NULL); /* filter pids into pid_list */ @@ -3229,7 +3243,7 @@ extern boolean_t kill_on_no_paging_space; #endif /* DEVELOPMENT || DEBUG */ #define MB_SIZE (1024 * 1024ULL) -boolean_t memorystatus_kill_on_VM_thrashing(boolean_t); +boolean_t memorystatus_kill_on_VM_compressor_space_shortage(boolean_t); extern int32_t max_kill_priority; extern int memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index); @@ -3305,7 +3319,7 @@ no_paging_space_action() if (memorystatus_get_proccnt_upto_priority(max_kill_priority) > 0) { last_no_space_action = now; - memorystatus_kill_on_VM_thrashing(TRUE /* async */); + memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */); return (1); } @@ -3432,11 +3446,17 @@ proc_chrooted(proc_t p) return retval; } -void * -proc_get_uthread_uu_threadlist(void * uthread_v) +boolean_t +proc_send_synchronous_EXC_RESOURCE(proc_t p) { - uthread_t uth = (uthread_t)uthread_v; - return (uth != NULL) ? 
uth->uu_threadlist : NULL; + if (p == PROC_NULL) + return FALSE; + + /* Send sync EXC_RESOURCE if the process is traced */ + if (ISSET(p->p_lflag, P_LTRACED)) { + return TRUE; + } + return FALSE; } #ifdef CONFIG_32BIT_TELEMETRY diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index 9d825afcb..36beb2737 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -669,6 +669,12 @@ setpgid(proc_t curp, struct setpgid_args *uap, __unused int32_t *retval) * real, effective, or saved user or group IDs since beginning * execution. */ +int +proc_issetugid (proc_t p) +{ + return (p->p_flag & P_SUGID) ? 1 : 0; +} + int issetugid(proc_t p, __unused struct issetugid_args *uap, int32_t *retval) { @@ -681,7 +687,7 @@ issetugid(proc_t p, __unused struct issetugid_args *uap, int32_t *retval) * that libc *might* have put in their data segment. */ - *retval = (p->p_flag & P_SUGID) ? 1 : 0; + *retval = proc_issetugid(p); return (0); } diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index 780159263..cf55f42d5 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1232,8 +1232,8 @@ int getrusage(struct proc *p, struct getrusage_args *uap, __unused int32_t *retval) { struct rusage *rup, rubuf; - struct user64_rusage rubuf64; - struct user32_rusage rubuf32; + struct user64_rusage rubuf64 = {}; + struct user32_rusage rubuf32 = {}; size_t retsize = sizeof(rubuf); /* default: 32 bits */ caddr_t retbuf = (caddr_t)&rubuf; /* default: 32 bits */ struct timeval utime; @@ -1421,6 +1421,13 @@ proc_limitreplace(proc_t p) return(0); } +static int +iopolicysys_disk(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); +static int +iopolicysys_vfs_hfs_case_sensitivity(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); +static int +iopolicysys_vfs_atime_updates(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); + /* * iopolicysys * @@ -1433,12 +1440,6 @@ proc_limitreplace(proc_t p) * EINVAL Invalid command or invalid policy arguments * */ - -static int -iopolicysys_disk(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); -static int -iopolicysys_vfs(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param); - int iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval) { @@ -1459,7 +1460,12 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval) goto out; break; case IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY: - error = iopolicysys_vfs(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); + error = iopolicysys_vfs_hfs_case_sensitivity(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); + if (error) + goto out; + break; + case IOPOL_TYPE_VFS_ATIME_UPDATES: + error = iopolicysys_vfs_atime_updates(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param); if (error) goto out; break; @@ -1600,7 +1606,7 @@ iopolicysys_disk(struct proc *p __unused, int cmd, int scope, int 
policy, struct } static int -iopolicysys_vfs(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param) +iopolicysys_vfs_hfs_case_sensitivity(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param) { int error = 0; @@ -1668,6 +1674,93 @@ iopolicysys_vfs(struct proc *p, int cmd, int scope, int policy, struct _iopol_pa return (error); } +static inline int +get_thread_atime_policy(struct uthread *ut) +{ + return (ut->uu_flag & UT_ATIME_UPDATE)? IOPOL_ATIME_UPDATES_OFF: IOPOL_ATIME_UPDATES_DEFAULT; +} + +static inline void +set_thread_atime_policy(struct uthread *ut, int policy) +{ + if (policy == IOPOL_ATIME_UPDATES_OFF) { + ut->uu_flag |= UT_ATIME_UPDATE; + } else { + ut->uu_flag &= ~UT_ATIME_UPDATE; + } +} + +static inline void +set_task_atime_policy(struct proc *p, int policy) +{ + if (policy == IOPOL_ATIME_UPDATES_OFF) { + OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_ATIME_UPDATES, &p->p_vfs_iopolicy); + } else { + OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_ATIME_UPDATES), &p->p_vfs_iopolicy); + } +} + +static inline int +get_task_atime_policy(struct proc *p) +{ + return (p->p_vfs_iopolicy & P_VFS_IOPOLICY_ATIME_UPDATES)? 
IOPOL_ATIME_UPDATES_OFF: IOPOL_ATIME_UPDATES_DEFAULT; +} + +static int +iopolicysys_vfs_atime_updates(struct proc *p __unused, int cmd, int scope, int policy, struct _iopol_param_t *iop_param) +{ + int error = 0; + thread_t thread; + + /* Validate scope */ + switch (scope) { + case IOPOL_SCOPE_THREAD: + thread = current_thread(); + break; + case IOPOL_SCOPE_PROCESS: + thread = THREAD_NULL; + break; + default: + error = EINVAL; + goto out; + } + + /* Validate policy */ + if (cmd == IOPOL_CMD_SET) { + switch (policy) { + case IOPOL_ATIME_UPDATES_DEFAULT: + case IOPOL_ATIME_UPDATES_OFF: + break; + default: + error = EINVAL; + goto out; + } + } + + /* Perform command */ + switch(cmd) { + case IOPOL_CMD_SET: + if (thread != THREAD_NULL) + set_thread_atime_policy(get_bsdthread_info(thread), policy); + else + set_task_atime_policy(p, policy); + break; + case IOPOL_CMD_GET: + if (thread != THREAD_NULL) + policy = get_thread_atime_policy(get_bsdthread_info(thread)); + else + policy = get_task_atime_policy(p); + iop_param->iop_policy = policy; + break; + default: + error = EINVAL; /* unknown command */ + break; + } + +out: + return (error); +} + /* BSD call back function for task_policy networking changes */ void proc_apply_task_networkbg(void * bsd_info, thread_t thread) @@ -1697,8 +1790,11 @@ gather_rusage_info(proc_t p, rusage_info_current *ru, int flavor) case RUSAGE_INFO_V4: ru->ri_logical_writes = get_task_logical_writes(p->task); ru->ri_lifetime_max_phys_footprint = get_task_phys_footprint_lifetime_max(p->task); +#if CONFIG_LEDGER_INTERVAL_MAX + ru->ri_interval_max_phys_footprint = get_task_phys_footprint_interval_max(p->task, FALSE); +#endif fill_task_monotonic_rusage(p->task, ru); - /* fall through */ + /* fall through */ case RUSAGE_INFO_V3: fill_task_qos_rusage(p->task, ru); @@ -1736,7 +1832,7 @@ gather_rusage_info(proc_t p, rusage_info_current *ru, int flavor) int proc_get_rusage(proc_t p, int flavor, user_addr_t buffer, __unused int is_zombie) { - 
rusage_info_current ri_current; + rusage_info_current ri_current = {}; int error = 0; size_t size = 0; @@ -1811,6 +1907,9 @@ mach_to_bsd_rv(int mach_rv) * uap->flavor available flavors: * * RLIMIT_WAKEUPS_MONITOR + * RLIMIT_CPU_USAGE_MONITOR + * RLIMIT_THREAD_CPULIMITS + * RLIMIT_FOOTPRINT_INTERVAL */ int proc_rlimit_control(__unused struct proc *p, struct proc_rlimit_control_args *uap, __unused int32_t *retval) @@ -1821,6 +1920,10 @@ proc_rlimit_control(__unused struct proc *p, struct proc_rlimit_control_args *ua uint32_t cpumon_flags; uint32_t cpulimits_flags; kauth_cred_t my_cred, target_cred; +#if CONFIG_LEDGER_INTERVAL_MAX + uint32_t footprint_interval_flags; + uint64_t interval_max_footprint; +#endif /* CONFIG_LEDGER_INTERVAL_MAX */ /* -1 implicitly means our own process (perhaps even the current thread for per-thread attributes) */ if (uap->pid == -1) { @@ -1883,6 +1986,20 @@ proc_rlimit_control(__unused struct proc *p, struct proc_rlimit_control_args *ua error = mach_to_bsd_rv(thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, percent, ns_refill)); break; + +#if CONFIG_LEDGER_INTERVAL_MAX + case RLIMIT_FOOTPRINT_INTERVAL: + footprint_interval_flags = uap->arg; // XXX temporarily stashing flags in argp (12592127) + /* + * There is currently only one option for this flavor. 
+ */ + if ((footprint_interval_flags & FOOTPRINT_INTERVAL_RESET) == 0) { + error = EINVAL; + break; + } + interval_max_footprint = get_task_phys_footprint_interval_max(targetp->task, TRUE); + break; +#endif /* CONFIG_LEDGER_INTERVAL_MAX */ default: error = EINVAL; break; diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index 849562cca..c444668bd 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -114,6 +114,7 @@ #include #include +#include #include #if CONFIG_MACF @@ -261,6 +262,11 @@ __sigaction_user32_to_kern(struct __user32_sigaction *in, struct __kern_sigactio out->sa_tramp = CAST_USER_ADDR_T(in->sa_tramp); out->sa_mask = in->sa_mask; out->sa_flags = in->sa_flags; + + kern_return_t kr; + kr = machine_thread_function_pointers_convert_from_user(current_thread(), + &out->sa_tramp, 1); + assert(kr == KERN_SUCCESS); } static void @@ -270,6 +276,11 @@ __sigaction_user64_to_kern(struct __user64_sigaction *in, struct __kern_sigactio out->sa_tramp = in->sa_tramp; out->sa_mask = in->sa_mask; out->sa_flags = in->sa_flags; + + kern_return_t kr; + kr = machine_thread_function_pointers_convert_from_user(current_thread(), + &out->sa_tramp, 1); + assert(kr == KERN_SUCCESS); } #if SIGNAL_DEBUG @@ -444,6 +455,7 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval) int signum; int bit, error=0; + uint32_t sigreturn_validation = PS_SIGRETURN_VALIDATION_DEFAULT; signum = uap->signum; if (signum <= 0 || signum >= NSIG || @@ -462,6 +474,9 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval) } if (error) return (error); + + sigreturn_validation = (__vec.sa_flags & SA_VALIDATE_SIGRETURN_FROM_SIGTRAMP) ? 
+ PS_SIGRETURN_VALIDATION_ENABLED : PS_SIGRETURN_VALIDATION_DISABLED; __vec.sa_flags &= SA_USERSPACE_MASK; /* Only pass on valid sa_flags */ if ((__vec.sa_flags & SA_SIGINFO) || __vec.sa_handler != SIG_DFL) { @@ -488,8 +503,6 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval) sa->sa_flags |= SA_SIGINFO; if (ps->ps_signodefer & bit) sa->sa_flags |= SA_NODEFER; - if (ps->ps_64regset & bit) - sa->sa_flags |= SA_64REGSET; if ((signum == SIGCHLD) && (p->p_flag & P_NOCLDSTOP)) sa->sa_flags |= SA_NOCLDSTOP; if ((signum == SIGCHLD) && (p->p_flag & P_NOCLDWAIT)) @@ -509,6 +522,13 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval) } if (uap->nsa) { + uint32_t old_sigreturn_validation = atomic_load_explicit( + &ps->ps_sigreturn_validation, memory_order_relaxed); + if (old_sigreturn_validation == PS_SIGRETURN_VALIDATION_DEFAULT) { + atomic_compare_exchange_strong_explicit(&ps->ps_sigreturn_validation, + &old_sigreturn_validation, sigreturn_validation, + memory_order_relaxed, memory_order_relaxed); + } error = setsigvec(p, current_thread(), signum, &__vec, FALSE); } @@ -673,10 +693,6 @@ setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigactio ps->ps_siginfo |= bit; else ps->ps_siginfo &= ~bit; - if (sa->sa_flags & SA_64REGSET) - ps->ps_64regset |= bit; - else - ps->ps_64regset &= ~bit; if ((sa->sa_flags & SA_RESTART) == 0) ps->ps_sigintr |= bit; else @@ -685,10 +701,6 @@ setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigactio ps->ps_sigonstack |= bit; else ps->ps_sigonstack &= ~bit; - if (sa->sa_flags & SA_USERTRAMP) - ps->ps_usertramp |= bit; - else - ps->ps_usertramp &= ~bit; if (sa->sa_flags & SA_RESETHAND) ps->ps_sigreset |= bit; else @@ -786,6 +798,11 @@ execsigs(proc_t p, thread_t thread) ps->ps_sigact[nc] = SIG_DFL; } + atomic_store_explicit(&ps->ps_sigreturn_validation, + PS_SIGRETURN_VALIDATION_DEFAULT, memory_order_relaxed); + /* Generate random token value used to 
validate sigreturn arguments */ + read_random(&ps->ps_sigreturn_token, sizeof(ps->ps_sigreturn_token)); + /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. @@ -1678,6 +1695,15 @@ terminate_with_payload_internal(struct proc *cur_proc, int target_pid, uint32_t return EPERM; } + if (target_pid != cur_proc->p_pid) { + /* + * FLAG_ABORT should only be set on terminate_with_reason(getpid()) that + * was a fallback from an unsuccessful abort_with_reason(). In that case + * caller's pid matches the target one. Otherwise remove the flag. + */ + reason_flags &= ~((typeof(reason_flags))OS_REASON_FLAG_ABORT); + } + KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, target_proc->p_pid, reason_namespace, reason_code, 0, 0); @@ -3063,6 +3089,7 @@ postsig_locked(int signum) uint32_t code; int mask, returnmask; struct uthread * ut; + os_reason_t ut_exit_reason = OS_REASON_NULL; #if DIAGNOSTIC if (signum == 0) @@ -3093,6 +3120,15 @@ postsig_locked(int signum) * the process. (Other cases were ignored above.) */ sig_lock_to_exit(p); + + /* + * exit_with_reason() below will consume a reference to the thread's exit reason, so we take another + * reference so the thread still has one even after we call exit_with_reason(). The thread's reference will + * ultimately be destroyed in uthread_cleanup(). + */ + ut_exit_reason = ut->uu_exit_reason; + os_reason_ref(ut_exit_reason); + p->p_acflag |= AXSIG; if (sigprop[signum] & SA_CORE) { p->p_sigacts->ps_sig = signum; @@ -3132,12 +3168,7 @@ postsig_locked(int signum) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE, p->p_pid, W_EXITCODE(0, signum), 3, 0, 0); - /* - * exit_with_reason() will consume a reference to the thread's exit reason, so we take another - * reference for the thread. This reference will be destroyed in uthread_cleanup(). 
- */ - os_reason_ref(ut->uu_exit_reason); - exit_with_reason(p, W_EXITCODE(0, signum), (int *)NULL, TRUE, TRUE, 0, ut->uu_exit_reason); + exit_with_reason(p, W_EXITCODE(0, signum), (int *)NULL, TRUE, TRUE, 0, ut_exit_reason); proc_lock(p); return; @@ -3266,11 +3297,8 @@ filt_signaltouch( proc_klist_lock(); - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; - /* - * No data to save - - * just capture if it is already fired + /* + * No data to save - just capture if it is already fired */ res = (kn->kn_data > 0); diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index 46018b2de..a88a51ca8 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -61,6 +62,9 @@ #include #include +#define HIBERNATE_MIN_PHYSICAL_LBA (34) +#define HIBERNATE_MIN_FILE_SIZE (1024*1024) + /* This function is called from kern_sysctl in the current process context; * it is exported with the System6.0.exports, but this appears to be a legacy * export, as there are no internal consumers. 
@@ -75,13 +79,15 @@ get_kernel_symfile(__unused proc_t p, __unused char const **symfile) struct kern_direct_file_io_ref_t { - vfs_context_t ctx; - struct vnode * vp; - dev_t device; - uint32_t blksize; - off_t filelength; - char cf; - char pinned; + vfs_context_t ctx; + struct vnode * vp; + dev_t device; + uint32_t blksize; + off_t filelength; + char cf; + char pinned; + char frozen; + char wbcranged; }; @@ -201,7 +207,7 @@ extern uint32_t freespace_mb(vnode_t vp); struct kern_direct_file_io_ref_t * kern_open_file_for_direct_io(const char * name, - boolean_t create_file, + uint32_t iflags, kern_get_file_extents_callback_t callback, void * callback_ref, off_t set_file_size, @@ -219,17 +225,18 @@ kern_open_file_for_direct_io(const char * name, proc_t p; struct vnode_attr va; + dk_apfs_wbc_range_t wbc_range; int error; off_t f_offset; uint64_t fileblk; size_t filechunk; - uint64_t physoffset; + uint64_t physoffset, minoffset; dev_t device; dev_t target = 0; int isssd = 0; uint32_t flags = 0; uint32_t blksize; - off_t maxiocount, count, segcount; + off_t maxiocount, count, segcount, wbctotal; boolean_t locked = FALSE; int fmode, cmode; struct nameidata nd; @@ -253,7 +260,7 @@ kern_open_file_for_direct_io(const char * name, p = kernproc; ref->ctx = vfs_context_kernel(); - fmode = (create_file) ? (O_CREAT | FWRITE) : FWRITE; + fmode = (kIOPolledFileCreate & iflags) ? 
(O_CREAT | FWRITE) : FWRITE; cmode = S_IRUSR | S_IWUSR; ndflags = NOFOLLOW; NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(name), ref->ctx); @@ -276,10 +283,10 @@ kern_open_file_for_direct_io(const char * name, if (write_file_addr && write_file_len) { - if ((error = kern_write_file(ref, write_file_offset, write_file_addr, write_file_len, IO_SKIP_ENCRYPTION))) { - kprintf("kern_write_file() failed with error: %d\n", error); - goto out; - } + if ((error = kern_write_file(ref, write_file_offset, write_file_addr, write_file_len, IO_SKIP_ENCRYPTION))) { + kprintf("kern_write_file() failed with error: %d\n", error); + goto out; + } } VATTR_INIT(&va); @@ -292,6 +299,7 @@ kern_open_file_for_direct_io(const char * name, error = EFAULT; if (vnode_getattr(ref->vp, &va, ref->ctx)) goto out; + wbctotal = 0; mpFree = freespace_mb(ref->vp); mpFree <<= 20; kprintf("kern_direct_file(%s): vp size %qd, alloc %qd, mp free %qd, keep free %qd\n", @@ -309,8 +317,31 @@ kern_open_file_for_direct_io(const char * name, p2 = p; do_ioctl = &file_ioctl; + if (kIOPolledFileHibernate & iflags) + { + error = do_ioctl(p1, p2, DKIOCAPFSGETWBCRANGE, (caddr_t) &wbc_range); + ref->wbcranged = (error == 0); + } + if (ref->wbcranged) + { + uint32_t idx; + assert(wbc_range.count <= (sizeof(wbc_range.extents) / sizeof(wbc_range.extents[0]))); + for (idx = 0; idx < wbc_range.count; idx++) wbctotal += wbc_range.extents[idx].length; + kprintf("kern_direct_file(%s): wbc %qd\n", name, wbctotal); + if (wbctotal) target = wbc_range.dev; + } + if (set_file_size) { + if (wbctotal) + { + if (wbctotal >= set_file_size) set_file_size = HIBERNATE_MIN_FILE_SIZE; + else + { + set_file_size -= wbctotal; + if (set_file_size < HIBERNATE_MIN_FILE_SIZE) set_file_size = HIBERNATE_MIN_FILE_SIZE; + } + } if (fs_free_size) { mpFree += va.va_data_alloc; @@ -354,6 +385,8 @@ kern_open_file_for_direct_io(const char * name, if (error) goto out; + minoffset = HIBERNATE_MIN_PHYSICAL_LBA * ref->blksize; + if 
(ref->vp->v_type != VREG) { error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &fileblk); @@ -361,12 +394,18 @@ kern_open_file_for_direct_io(const char * name, ref->filelength = fileblk * ref->blksize; } - // pin logical extents + // pin logical extents, CS version error = kern_ioctl_file_extents(ref, _DKIOCCSPINEXTENT, 0, ref->filelength); if (error && (ENOTTY != error)) goto out; ref->pinned = (error == 0); + // pin logical extents, apfs version + + error = VNOP_IOCTL(ref->vp, FSCTL_FREEZE_EXTENTS, NULL, 0, ref->ctx); + if (error && (ENOTTY != error)) goto out; + ref->frozen = (error == 0); + // generate the block list error = do_ioctl(p1, p2, DKIOCLOCKPHYSICALEXTENTS, NULL); @@ -412,6 +451,9 @@ kern_open_file_for_direct_io(const char * name, error = ENOTSUP; goto out; } + + assert(getphysreq.offset >= minoffset); + #if HIBFRAGMENT uint64_t rev; for (rev = 4096; rev <= getphysreq.length; rev += 4096) @@ -424,6 +466,15 @@ kern_open_file_for_direct_io(const char * name, physoffset += getphysreq.length; } } + if (ref->wbcranged) + { + uint32_t idx; + for (idx = 0; idx < wbc_range.count; idx++) + { + assert(wbc_range.extents[idx].offset >= minoffset); + callback(callback_ref, wbc_range.extents[idx].offset, wbc_range.extents[idx].length); + } + } callback(callback_ref, 0ULL, 0ULL); if (ref->vp->v_type == VREG) p1 = &target; @@ -529,15 +580,24 @@ kern_open_file_for_direct_io(const char * name, if (error && ref) { - if (ref->vp) - { - (void) kern_ioctl_file_extents(ref, _DKIOCCSUNPINEXTENT, 0, (ref->pinned && ref->cf) ? ref->filelength : 0); - vnode_close(ref->vp, FWRITE, ref->ctx); - ref->vp = NULLVP; - } - ref->ctx = NULL; - kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); - ref = NULL; + if (ref->vp) + { + (void) kern_ioctl_file_extents(ref, _DKIOCCSUNPINEXTENT, 0, (ref->pinned && ref->cf) ?
ref->filelength : 0); + + if (ref->frozen) + { + (void) VNOP_IOCTL(ref->vp, FSCTL_THAW_EXTENTS, NULL, 0, ref->ctx); + } + if (ref->wbcranged) + { + (void) do_ioctl(p1, p2, DKIOCAPFSRELEASEWBCRANGE, (caddr_t) NULL); + } + vnode_close(ref->vp, FWRITE, ref->ctx); + ref->vp = NULLVP; + } + ref->ctx = NULL; + kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); + ref = NULL; } return(ref); @@ -586,6 +646,9 @@ kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, void * p1; void * p2; + discard_offset = ((discard_offset + ref->blksize - 1) & ~(((off_t) ref->blksize) - 1)); + discard_end = ((discard_end) & ~(((off_t) ref->blksize) - 1)); + if (ref->vp->v_type == VREG) { p1 = &ref->device; @@ -616,6 +679,15 @@ kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, (void) kern_ioctl_file_extents(ref, DKIOCUNMAP, discard_offset, (ref->cf) ? ref->filelength : discard_end); } + if (ref->frozen) + { + (void) VNOP_IOCTL(ref->vp, FSCTL_THAW_EXTENTS, NULL, 0, ref->ctx); + } + if (ref->wbcranged) + { + (void) do_ioctl(p1, p2, DKIOCAPFSRELEASEWBCRANGE, (caddr_t) NULL); + } + if (addr && write_length) { (void) kern_write_file(ref, write_offset, addr, write_length, IO_SKIP_ENCRYPTION); diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index f6ed41035..d937e9e4f 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -122,6 +122,7 @@ #include #include #include +#include #include #include #include @@ -186,8 +187,6 @@ extern unsigned int vm_max_batch; extern unsigned int vm_page_free_min; extern unsigned int vm_page_free_target; extern unsigned int vm_page_free_reserved; -extern unsigned int vm_page_speculative_percentage; -extern unsigned int vm_page_speculative_q_age_ms; #if (DEVELOPMENT || DEBUG) extern uint32_t vm_page_creation_throttled_hard; @@ -305,6 +304,13 @@ STATIC int sysctl_singleuser(struct sysctl_oid *oidp, void *arg1, int arg2, stru STATIC int sysctl_minimalboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct 
sysctl_req *req); STATIC int sysctl_slide(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#ifdef CONFIG_XNUPOST +#include + +STATIC int sysctl_debug_test_oslog_ctl(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_debug_test_stackshot_mutex_owner(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_debug_test_stackshot_rwlck_owner(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif extern void IORegistrySetOSBuildVersion(char * build_version); @@ -1269,6 +1275,7 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where, caddr_t data; size_t argslen=0; int size; + vm_size_t alloc_size = 0; vm_offset_t copy_start, copy_end; kern_return_t ret; int pid; @@ -1383,12 +1390,13 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where, if (proc_map == NULL) return(EINVAL); - - ret = kmem_alloc(kernel_map, &copy_start, round_page(arg_size), VM_KERN_MEMORY_BSD); + alloc_size = round_page(arg_size); + ret = kmem_alloc(kernel_map, &copy_start, alloc_size, VM_KERN_MEMORY_BSD); if (ret != KERN_SUCCESS) { vm_map_deallocate(proc_map); return(ENOMEM); } + bzero((void *)copy_start, alloc_size); copy_end = round_page(copy_start + arg_size); @@ -1622,6 +1630,11 @@ SYSCTL_STRING(_kern, KERN_VERSION, version, SYSCTL_STRING(_kern, OID_AUTO, uuid, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &kernel_uuid_string[0], 0, ""); + +SYSCTL_STRING(_kern, OID_AUTO, osbuildconfig, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_MASKED, + &osbuild_config[0], 0, ""); + #if DEBUG #ifndef DKPR #define DKPR 1 @@ -1758,6 +1771,21 @@ SYSCTL_PROC(_kern, OID_AUTO, bootargs, NULL, 0, sysctl_sysctl_bootargs, "A", "bootargs"); +STATIC int +sysctl_kernelcacheuuid(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) +{ + int rval = ENOENT; + if (kernelcache_uuid_valid) { + rval = sysctl_handle_string(oidp, arg1, arg2, req); + } + return rval; +} +
+SYSCTL_PROC(_kern, OID_AUTO, kernelcacheuuid, + CTLFLAG_RD | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED, + kernelcache_uuid_string, sizeof(kernelcache_uuid_string), + sysctl_kernelcacheuuid, "A", ""); + SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &maxfiles, 0, ""); @@ -2017,11 +2045,11 @@ SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_reserved, SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_percentage, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, - &vm_page_speculative_percentage, 0, ""); + &vm_pageout_state.vm_page_speculative_percentage, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_q_age_ms, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, - &vm_page_speculative_q_age_ms, 0, ""); + &vm_pageout_state.vm_page_speculative_q_age_ms, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, vm_max_delayed_work_limit, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, @@ -2585,6 +2613,322 @@ sysctl_vm_toggle_address_reuse(__unused struct sysctl_oid *oidp, __unused void * SYSCTL_PROC(_debug, OID_AUTO, toggle_address_reuse, CTLFLAG_ANYBODY | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_vm_toggle_address_reuse,"I",""); +#ifdef CONFIG_XNUPOST + +extern int xnupost_export_testdata(void *outp, uint32_t size, uint32_t *lenp); +extern uint32_t xnupost_get_estimated_testdata_size(void); + +extern int xnupost_reset_all_tests(void); + +STATIC int +sysctl_handle_xnupost_get_tests SYSCTL_HANDLER_ARGS +{ + /* fixup unused arguments warnings */ + __unused int _oa2 = arg2; + __unused void * _oa1 = arg1; + __unused struct sysctl_oid * _oidp = oidp; + + int error = 0; + user_addr_t oldp = 0; + user_addr_t newp = 0; + uint32_t usedbytes = 0; + + oldp = req->oldptr; + newp = req->newptr; + + if (newp) + return ENOTSUP; + + if ((void *)oldp == NULL) { + /* return estimated size for second call where info can be placed */ + req->oldidx = xnupost_get_estimated_testdata_size(); + } else { + error = xnupost_export_testdata((void *)oldp, req->oldlen, 
&usedbytes); + req->oldidx = usedbytes; + } + + return error; +} + +SYSCTL_PROC(_debug, + OID_AUTO, + xnupost_get_tests, + CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, + 0, + sysctl_handle_xnupost_get_tests, + "-", + "read xnupost test data in kernel"); + +STATIC int +sysctl_debug_xnupost_ctl SYSCTL_HANDLER_ARGS +{ + /* fixup unused arguments warnings */ + __unused int _oa2 = arg2; + __unused void * _oa1 = arg1; + __unused struct sysctl_oid * _oidp = oidp; + +#define ARRCOUNT 4 + /* + * INPUT: ACTION, PARAM1, PARAM2, PARAM3 + * OUTPUT: RESULTCODE, ADDITIONAL DATA + */ + int32_t outval[ARRCOUNT] = {0}; + int32_t input[ARRCOUNT] = {0}; + int32_t out_size = sizeof(outval); + int32_t in_size = sizeof(input); + int error = 0; + + /* if this is NULL call to find out size, send out size info */ + if (!req->newptr) { + goto out; + } + + /* pull in provided value from userspace */ + error = SYSCTL_IN(req, &input[0], in_size); + if (error) + return error; + + if (input[0] == XTCTL_RESET_TESTDATA) { + outval[0] = xnupost_reset_all_tests(); + goto out; + } + +out: + error = SYSCTL_OUT(req, &outval[0], out_size); + return error; +} + +SYSCTL_PROC(_debug, + OID_AUTO, + xnupost_testctl, + CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, + 0, + sysctl_debug_xnupost_ctl, + "I", + "xnupost control for kernel testing"); + +extern void test_oslog_handleOSLogCtl(int32_t * in, int32_t * out, int32_t arraycount); + +STATIC int +sysctl_debug_test_oslog_ctl(__unused struct sysctl_oid * oidp, __unused void * arg1, __unused int arg2, struct sysctl_req * req) +{ +#define ARRCOUNT 4 + int32_t outval[ARRCOUNT] = {0}; + int32_t input[ARRCOUNT] = {0}; + int32_t size_outval = sizeof(outval); + int32_t size_inval = sizeof(input); + int32_t error; + + /* if this is NULL call to find out size, send out size info */ + if (!req->newptr) { + error = SYSCTL_OUT(req, &outval[0], size_outval); + return error; + } + + /* pull in 
provided value from userspace */ + error = SYSCTL_IN(req, &input[0], size_inval); + if (error) + return error; + + test_oslog_handleOSLogCtl(input, outval, ARRCOUNT); + + error = SYSCTL_OUT(req, &outval[0], size_outval); + + return error; +} + +SYSCTL_PROC(_debug, + OID_AUTO, + test_OSLogCtl, + CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, + 0, + sysctl_debug_test_oslog_ctl, + "I", + "testing oslog in kernel"); + +#include +#include + +extern lck_grp_t * sysctl_debug_test_stackshot_owner_grp; /* used for both mutexes and rwlocks */ +extern lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx; /* used to protect lck_*_init */ + +/* This is a sysctl for testing collection of owner info on a lock in kernel space. A multi-threaded + * test from userland sets this sysctl in such a way that a thread blocks in kernel mode, and a + * stackshot is taken to see if the owner of the lock can be identified. + * + * We can't return to userland with a kernel lock held, so be sure to unlock before we leave. + * the semaphores allow us to artificially create cases where the lock is being held and the + * thread is hanging / taking a long time to do something. 
*/ + +volatile char sysctl_debug_test_stackshot_mtx_inited = 0; +semaphore_t sysctl_debug_test_stackshot_mutex_sem; +lck_mtx_t sysctl_debug_test_stackshot_owner_lck; + +#define SYSCTL_DEBUG_MTX_ACQUIRE_WAIT 1 +#define SYSCTL_DEBUG_MTX_ACQUIRE_NOWAIT 2 +#define SYSCTL_DEBUG_MTX_SIGNAL 3 +#define SYSCTL_DEBUG_MTX_TEARDOWN 4 + +STATIC int +sysctl_debug_test_stackshot_mutex_owner(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + long long option = -1; + /* if the user tries to read the sysctl, we tell them what the address of the lock is (to test against stackshot's output) */ + long long mtx_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_lck); + int error = sysctl_io_number(req, mtx_unslid_addr, sizeof(long long), (void*)&option, NULL); + + lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + if (!sysctl_debug_test_stackshot_mtx_inited) { + lck_mtx_init(&sysctl_debug_test_stackshot_owner_lck, + sysctl_debug_test_stackshot_owner_grp, + LCK_ATTR_NULL); + semaphore_create(kernel_task, + &sysctl_debug_test_stackshot_mutex_sem, + SYNC_POLICY_FIFO, 0); + sysctl_debug_test_stackshot_mtx_inited = 1; + } + lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + + if (!error) { + switch(option) { + case SYSCTL_DEBUG_MTX_ACQUIRE_NOWAIT: + lck_mtx_lock(&sysctl_debug_test_stackshot_owner_lck); + lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_lck); + break; + case SYSCTL_DEBUG_MTX_ACQUIRE_WAIT: + lck_mtx_lock(&sysctl_debug_test_stackshot_owner_lck); + semaphore_wait(sysctl_debug_test_stackshot_mutex_sem); + lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_lck); + break; + case SYSCTL_DEBUG_MTX_SIGNAL: + semaphore_signal(sysctl_debug_test_stackshot_mutex_sem); + break; + case SYSCTL_DEBUG_MTX_TEARDOWN: + lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + + lck_mtx_destroy(&sysctl_debug_test_stackshot_owner_lck, + sysctl_debug_test_stackshot_owner_grp); + 
semaphore_destroy(kernel_task, + sysctl_debug_test_stackshot_mutex_sem); + sysctl_debug_test_stackshot_mtx_inited = 0; + + lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + break; + case -1: /* user just wanted to read the value, so do nothing */ + break; + default: + error = EINVAL; + break; + } + } + return error; +} + +/* we can't return to userland with a kernel rwlock held, so be sure to unlock before we leave. + * the semaphores allow us to artificially create cases where the lock is being held and the + * thread is hanging / taking a long time to do something. */ + +SYSCTL_PROC(_debug, + OID_AUTO, + test_MutexOwnerCtl, + CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, + 0, + sysctl_debug_test_stackshot_mutex_owner, + "-", + "Testing mutex owner in kernel"); + +volatile char sysctl_debug_test_stackshot_rwlck_inited = 0; +lck_rw_t sysctl_debug_test_stackshot_owner_rwlck; +semaphore_t sysctl_debug_test_stackshot_rwlck_sem; + +#define SYSCTL_DEBUG_KRWLCK_RACQUIRE_NOWAIT 1 +#define SYSCTL_DEBUG_KRWLCK_RACQUIRE_WAIT 2 +#define SYSCTL_DEBUG_KRWLCK_WACQUIRE_NOWAIT 3 +#define SYSCTL_DEBUG_KRWLCK_WACQUIRE_WAIT 4 +#define SYSCTL_DEBUG_KRWLCK_SIGNAL 5 +#define SYSCTL_DEBUG_KRWLCK_TEARDOWN 6 + +STATIC int +sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + long long option = -1; + /* if the user tries to read the sysctl, we tell them what the address of the lock is + * (to test against stackshot's output) */ + long long rwlck_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_rwlck); + int error = sysctl_io_number(req, rwlck_unslid_addr, sizeof(long long), (void*)&option, NULL); + + lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + if (!sysctl_debug_test_stackshot_rwlck_inited) { + lck_rw_init(&sysctl_debug_test_stackshot_owner_rwlck, + sysctl_debug_test_stackshot_owner_grp, + 
LCK_ATTR_NULL); + semaphore_create(kernel_task, + &sysctl_debug_test_stackshot_rwlck_sem, + SYNC_POLICY_FIFO, + 0); + sysctl_debug_test_stackshot_rwlck_inited = 1; + } + lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + + if (!error) { + switch(option) { + case SYSCTL_DEBUG_KRWLCK_RACQUIRE_NOWAIT: + lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED); + lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED); + break; + case SYSCTL_DEBUG_KRWLCK_RACQUIRE_WAIT: + lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED); + semaphore_wait(sysctl_debug_test_stackshot_rwlck_sem); + lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED); + break; + case SYSCTL_DEBUG_KRWLCK_WACQUIRE_NOWAIT: + lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE); + lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE); + break; + case SYSCTL_DEBUG_KRWLCK_WACQUIRE_WAIT: + lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE); + semaphore_wait(sysctl_debug_test_stackshot_rwlck_sem); + lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE); + break; + case SYSCTL_DEBUG_KRWLCK_SIGNAL: + semaphore_signal(sysctl_debug_test_stackshot_rwlck_sem); + break; + case SYSCTL_DEBUG_KRWLCK_TEARDOWN: + lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx); + + lck_rw_destroy(&sysctl_debug_test_stackshot_owner_rwlck, + sysctl_debug_test_stackshot_owner_grp); + semaphore_destroy(kernel_task, + sysctl_debug_test_stackshot_rwlck_sem); + sysctl_debug_test_stackshot_rwlck_inited = 0; + + lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx); + break; + case -1: /* user just wanted to read the value, so do nothing */ + break; + default: + error = EINVAL; + break; + } + } + return error; +} + + +SYSCTL_PROC(_debug, + OID_AUTO, + test_RWLockOwnerCtl, + CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_QUAD | CTLFLAG_RW | 
CTLFLAG_LOCKED, + 0, + 0, + sysctl_debug_test_stackshot_rwlck_owner, + "-", + "Testing rwlock owner in kernel"); +#endif /* CONFIG_XNUPOST */ STATIC int sysctl_swapusage @@ -2620,6 +2964,7 @@ SYSCTL_PROC(_vm, VM_SWAPUSAGE, swapusage, #if CONFIG_FREEZE extern void vm_page_reactivate_all_throttled(void); +extern void memorystatus_disable_freeze(void); static int sysctl_freeze_enabled SYSCTL_HANDLER_ARGS @@ -2632,7 +2977,7 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS if (error || !req->newptr) return (error); - if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) { + if (! VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { //assert(req->newptr); printf("Failed attempt to set vm.freeze_enabled sysctl\n"); return EINVAL; @@ -2647,14 +2992,62 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS if (disabled) { vm_page_reactivate_all_throttled(); + memorystatus_disable_freeze(); } return (0); } -SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW, &memorystatus_freeze_enabled, 0, sysctl_freeze_enabled, "I", ""); +SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, &memorystatus_freeze_enabled, 0, sysctl_freeze_enabled, "I", ""); #endif /* CONFIG_FREEZE */ +#if DEVELOPMENT || DEBUG +extern int vm_num_swap_files_config; +extern int vm_num_swap_files; +extern lck_mtx_t vm_swap_data_lock; +#define VM_MAX_SWAP_FILE_NUM 100 + +static int +sysctl_vm_config_num_swap_files SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error = 0, val = vm_num_swap_files_config; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) { + goto out; + } + + if (!VM_CONFIG_SWAP_IS_ACTIVE && !VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + printf("Swap is disabled\n"); + error = EINVAL; + goto out; + } + + lck_mtx_lock(&vm_swap_data_lock); + + if (val < vm_num_swap_files) { + printf("Cannot configure fewer swap files than already exist.\n"); + error = EINVAL; + lck_mtx_unlock(&vm_swap_data_lock); + goto out; + } + + if (val > VM_MAX_SWAP_FILE_NUM) { + printf("Capping
number of swap files to upper bound.\n"); + val = VM_MAX_SWAP_FILE_NUM; + } + + vm_num_swap_files_config = val; + lck_mtx_unlock(&vm_swap_data_lock); +out: + + return (error); /* propagate sysctl_handle_int()/validation failures; 0 only on success or a plain read */ +} + +SYSCTL_PROC(_debug, OID_AUTO, num_swap_files_configured, CTLFLAG_ANYBODY | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_vm_config_num_swap_files, "I", ""); +#endif /* DEVELOPMENT || DEBUG */ + /* this kernel does NOT implement shared_region_make_private_np() */ SYSCTL_INT(_kern, KERN_SHREG_PRIVATIZABLE, shreg_private, CTLFLAG_RD | CTLFLAG_LOCKED, @@ -2685,8 +3078,9 @@ fetch_process_cputype( } ret = cpu_type() & ~CPU_ARCH_MASK; - if (IS_64BIT_PROCESS(p)) + if (IS_64BIT_PROCESS(p)) { ret |= CPU_ARCH_ABI64; + } *cputype = ret; @@ -2823,10 +3217,16 @@ SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_large, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_ma extern uint32_t vm_page_external_count; -extern uint32_t vm_page_filecache_min; SYSCTL_INT(_vm, OID_AUTO, vm_page_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_external_count, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_filecache_min, 0, ""); + +SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_filecache_min, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_xpmapped_min, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_xpmapped_min, 0, ""); + +#if DEVELOPMENT || DEBUG +SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_filecache_min_divisor, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_page_xpmapped_min_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_xpmapped_min_divisor, 0, ""); +#endif extern int vm_compressor_mode; extern int vm_compressor_is_active; @@ -2841,15 +3241,95 @@ extern uint32_t compressor_sample_min_in_msecs; extern uint32_t compressor_sample_max_in_msecs; extern uint32_t compressor_thrashing_threshold_per_10msecs; extern uint32_t
compressor_thrashing_min_per_10msecs; +extern uint32_t vm_compressor_time_thread; + +#if DEVELOPMENT || DEBUG extern uint32_t vm_compressor_minorcompact_threshold_divisor; extern uint32_t vm_compressor_majorcompact_threshold_divisor; extern uint32_t vm_compressor_unthrottle_threshold_divisor; extern uint32_t vm_compressor_catchup_threshold_divisor; -extern uint32_t vm_compressor_time_thread; -#if DEVELOPMENT || DEBUG + +extern uint32_t vm_compressor_minorcompact_threshold_divisor_overridden; +extern uint32_t vm_compressor_majorcompact_threshold_divisor_overridden; +extern uint32_t vm_compressor_unthrottle_threshold_divisor_overridden; +extern uint32_t vm_compressor_catchup_threshold_divisor_overridden; + extern vmct_stats_t vmct_stats; + + +STATIC int +sysctl_minorcompact_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value, changed; + int error = sysctl_io_number(req, vm_compressor_minorcompact_threshold_divisor, sizeof(int), &new_value, &changed); + + if (changed) { + vm_compressor_minorcompact_threshold_divisor = new_value; + vm_compressor_minorcompact_threshold_divisor_overridden = 1; + } + return(error); +} + +SYSCTL_PROC(_vm, OID_AUTO, compressor_minorcompact_threshold_divisor, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, + 0, 0, sysctl_minorcompact_threshold_divisor, "I", ""); + + +STATIC int +sysctl_majorcompact_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value, changed; + int error = sysctl_io_number(req, vm_compressor_majorcompact_threshold_divisor, sizeof(int), &new_value, &changed); + + if (changed) { + vm_compressor_majorcompact_threshold_divisor = new_value; + vm_compressor_majorcompact_threshold_divisor_overridden = 1; + } + return(error); +} + +SYSCTL_PROC(_vm, OID_AUTO, compressor_majorcompact_threshold_divisor, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, + 0, 0, 
sysctl_majorcompact_threshold_divisor, "I", ""); + + +STATIC int +sysctl_unthrottle_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value, changed; + int error = sysctl_io_number(req, vm_compressor_unthrottle_threshold_divisor, sizeof(int), &new_value, &changed); + + if (changed) { + vm_compressor_unthrottle_threshold_divisor = new_value; + vm_compressor_unthrottle_threshold_divisor_overridden = 1; + } + return(error); +} + +SYSCTL_PROC(_vm, OID_AUTO, compressor_unthrottle_threshold_divisor, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, + 0, 0, sysctl_unthrottle_threshold_divisor, "I", ""); + + +STATIC int +sysctl_catchup_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value, changed; + int error = sysctl_io_number(req, vm_compressor_catchup_threshold_divisor, sizeof(int), &new_value, &changed); + + if (changed) { + vm_compressor_catchup_threshold_divisor = new_value; + vm_compressor_catchup_threshold_divisor_overridden = 1; + } + return(error); +} + +SYSCTL_PROC(_vm, OID_AUTO, compressor_catchup_threshold_divisor, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, + 0, 0, sysctl_catchup_threshold_divisor, "I", ""); #endif + SYSCTL_QUAD(_vm, OID_AUTO, compressor_input_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_input_bytes, ""); SYSCTL_QUAD(_vm, OID_AUTO, compressor_compressed_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_compressed_bytes, ""); SYSCTL_QUAD(_vm, OID_AUTO, compressor_bytes_used, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_bytes_used, ""); @@ -2866,10 +3346,6 @@ SYSCTL_INT(_vm, OID_AUTO, compressor_sample_min_in_msecs, CTLFLAG_RW | CTLFLAG_L SYSCTL_INT(_vm, OID_AUTO, compressor_sample_max_in_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_sample_max_in_msecs, 0, ""); SYSCTL_INT(_vm, OID_AUTO, compressor_thrashing_threshold_per_10msecs, CTLFLAG_RW | CTLFLAG_LOCKED, 
&compressor_thrashing_threshold_per_10msecs, 0, ""); SYSCTL_INT(_vm, OID_AUTO, compressor_thrashing_min_per_10msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_thrashing_min_per_10msecs, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, compressor_minorcompact_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_minorcompact_threshold_divisor, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, compressor_majorcompact_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_majorcompact_threshold_divisor, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, compressor_unthrottle_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_unthrottle_threshold_divisor, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, compressor_catchup_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_catchup_threshold_divisor, 0, ""); SYSCTL_STRING(_vm, OID_AUTO, swapfileprefix, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, swapfilename, sizeof(swapfilename) - SWAPFILENAME_INDEX_LEN, ""); @@ -2985,8 +3461,6 @@ extern uint32_t vm_page_background_external_count; extern uint32_t vm_page_background_mode; extern uint32_t vm_page_background_exclude_external; extern uint64_t vm_page_background_promoted_count; -extern uint64_t vm_pageout_considered_bq_internal; -extern uint64_t vm_pageout_considered_bq_external; extern uint64_t vm_pageout_rejected_bq_internal; extern uint64_t vm_pageout_rejected_bq_external; @@ -2998,12 +3472,38 @@ SYSCTL_INT(_vm, OID_AUTO, vm_page_background_internal_count, CTLFLAG_RD | CTLFLA SYSCTL_INT(_vm, OID_AUTO, vm_page_background_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_external_count, 0, ""); SYSCTL_QUAD(_vm, OID_AUTO, vm_page_background_promoted_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_promoted_count, ""); -SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_considered_bq_internal, ""); -SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, 
&vm_pageout_considered_bq_external, ""); +SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_considered_bq_internal, ""); +SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_considered_bq_external, ""); SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_rejected_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_rejected_bq_internal, ""); SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_rejected_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_rejected_bq_external, ""); -#endif +#endif /* CONFIG_BACKGROUND_QUEUE */ + +extern void vm_update_darkwake_mode(boolean_t); +extern boolean_t vm_darkwake_mode; + +STATIC int +sysctl_toggle_darkwake_mode(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value, changed; + int error = sysctl_io_number(req, vm_darkwake_mode, sizeof(int), &new_value, &changed); + + if ( !error && changed) { + + if (new_value != 0 && new_value != 1) { + printf("Error: Invalid value passed to darkwake sysctl. 
Acceptable: 0 or 1.\n"); + error = EINVAL; + } else { + vm_update_darkwake_mode((boolean_t) new_value); + } + } + + return(error); +} + +SYSCTL_PROC(_vm, OID_AUTO, darkwake_mode, + CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW, + 0, 0, sysctl_toggle_darkwake_mode, "I", ""); #if (DEVELOPMENT || DEBUG) @@ -3020,11 +3520,9 @@ extern uint32_t vm_pageout_memorystatus_fb_factor_dr; SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_nr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_nr, 0, ""); SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_dr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_dr, 0, ""); -extern uint32_t vm_grab_anon_overrides; -extern uint32_t vm_grab_anon_nops; -SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_overrides, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_grab_anon_overrides, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_nops, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_grab_anon_nops, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_overrides, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_overrides, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_nops, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_nops, 0, ""); /* log message counters for persistence mode */ extern uint32_t oslog_p_total_msgcount; @@ -3097,6 +3595,29 @@ SYSCTL_STRING(_kern, OID_AUTO, sched, sched_string, sizeof(sched_string), "Timeshare scheduler implementation"); +#if CONFIG_QUIESCE_COUNTER +static int +sysctl_cpu_quiescent_counter_interval SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error = sysctl_handle_int(oidp, &cpu_checkin_min_interval_us, 0, req); + if (error || !req->newptr) + return error; + + cpu_quiescent_counter_set_min_interval_us(cpu_checkin_min_interval_us); + + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, cpu_checkin_interval, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, + sysctl_cpu_quiescent_counter_interval, "I", + "Quiescent CPU checkin interval (microseconds)"); +#endif /* 
CONFIG_QUIESCE_COUNTER */ + + /* * Only support runtime modification on embedded platforms * with development config enabled @@ -3531,6 +4052,10 @@ SYSCTL_PROC(_debug, OID_AUTO, debugger_test, CTLTYPE_STRING | CTLFLAG_RW | CTLFL SYSCTL_PROC(_debug, OID_AUTO, spinlock_panic_test, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_MASKED, 0, 0, sysctl_spinlock_panic_test, "A", "spinlock panic test"); SYSCTL_PROC(_debug, OID_AUTO, simultaneous_panic_test, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_MASKED, 0, 0, sysctl_simultaneous_panic_test, "A", "simultaneous panic test"); +extern int exc_resource_threads_enabled; + +SYSCTL_INT(_kern, OID_AUTO, exc_resource_threads_enabled, CTLFLAG_RD | CTLFLAG_LOCKED, &exc_resource_threads_enabled, 0, "exc_resource thread limit enabled"); + #endif /* DEVELOPMENT || DEBUG */ @@ -3575,3 +4100,427 @@ SYSCTL_PROC(_kern, OID_AUTO, grade_cputype, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED|CTLTYPE_OPAQUE, 0, 0, &sysctl_grade_cputype, "S", "grade value of cpu_type_t+cpu_sub_type_t"); + + +#if DEVELOPMENT || DEBUG + +static atomic_int wedge_thread_should_wake = 0; + +static int +unwedge_thread SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + + atomic_store(&wedge_thread_should_wake, 1); + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, unwedge_thread, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, unwedge_thread, "I", "unwedge the thread wedged by kern.wedge_thread"); + +extern uintptr_t phys_carveout_pa; +SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_pa, CTLFLAG_RD | CTLFLAG_LOCKED, + &phys_carveout_pa, + "base physical address of the phys_carveout_mb boot-arg region"); +extern size_t phys_carveout_size; +SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_size, CTLFLAG_RD | CTLFLAG_LOCKED, + &phys_carveout_size, + "size in bytes of the phys_carveout_mb boot-arg region"); + +static int +wedge_thread 
SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + + uint64_t interval = 1; + nanoseconds_to_absolutetime(1000 * 1000 * 50, &interval); + + atomic_store(&wedge_thread_should_wake, 0); + while (!atomic_load(&wedge_thread_should_wake)) { + tsleep1(NULL, 0, "wedge_thread", mach_absolute_time()+interval, NULL); + } + + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, wedge_thread, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, wedge_thread, "I", "wedge this thread so it cannot be cleaned up"); + +static int +sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS; +static int +sysctl_turnstile_test_prim_unlock SYSCTL_HANDLER_ARGS; +int +tstile_test_prim_lock(boolean_t use_hashtable); +int +tstile_test_prim_unlock(boolean_t use_hashtable); + +#define SYSCTL_TURNSTILE_TEST_DEFAULT 1 +#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE 2 + +static int +sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? true : false; + return tstile_test_prim_lock(use_hashtable); +} + +static int +sysctl_turnstile_test_prim_unlock SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? 
true : false; + return tstile_test_prim_unlock(use_hashtable); +} + +SYSCTL_PROC(_kern, OID_AUTO, turnstiles_test_lock, CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_turnstile_test_prim_lock, "I", "turnstiles test lock"); + +SYSCTL_PROC(_kern, OID_AUTO, turnstiles_test_unlock, CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_turnstile_test_prim_unlock, "I", "turnstiles test unlock"); + +int +turnstile_get_boost_stats_sysctl(void *req); +int +turnstile_get_unboost_stats_sysctl(void *req); +static int +sysctl_turnstile_boost_stats SYSCTL_HANDLER_ARGS; +static int +sysctl_turnstile_unboost_stats SYSCTL_HANDLER_ARGS; +extern uint64_t thread_block_on_turnstile_count; +extern uint64_t thread_block_on_regular_waitq_count; + +static int +sysctl_turnstile_boost_stats SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + return turnstile_get_boost_stats_sysctl(req); +} + +static int +sysctl_turnstile_unboost_stats SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + return turnstile_get_unboost_stats_sysctl(req); +} + +SYSCTL_PROC(_kern, OID_AUTO, turnstile_boost_stats, CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLTYPE_STRUCT, + 0, 0, sysctl_turnstile_boost_stats, "S", "turnstiles boost stats"); +SYSCTL_PROC(_kern, OID_AUTO, turnstile_unboost_stats, CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLTYPE_STRUCT, + 0, 0, sysctl_turnstile_unboost_stats, "S", "turnstiles unboost stats"); +SYSCTL_QUAD(_kern, OID_AUTO, thread_block_count_on_turnstile, + CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, + &thread_block_on_turnstile_count, "thread blocked on turnstile count"); +SYSCTL_QUAD(_kern, OID_AUTO, thread_block_count_on_reg_waitq, + CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, + &thread_block_on_regular_waitq_count, "thread blocked on regular waitq count"); + +static int +sysctl_lck_mtx_test_lock SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, 
arg2) + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + + if (val == 1) { + lck_mtx_test_init(); + lck_mtx_test_lock(); + } + + return 0; +} + +static int +sysctl_lck_mtx_test_unlock SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + + if (val == 1) { + lck_mtx_test_init(); + lck_mtx_test_unlock(); + } + + return 0; +} + +static int +sysctl_erase_all_test_mtx_stats SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || val == 0) { + return error; + } + + if (val == 1) { + lck_mtx_test_init(); + erase_all_test_mtx_stats(); + } + + return 0; +} + +static int +sysctl_get_test_mtx_stats SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + char* buffer; + int size, buffer_size, error; + + buffer_size = 1000; + buffer = kalloc(buffer_size); + if (!buffer) + panic("Impossible to allocate memory for %s\n", __func__); + + lck_mtx_test_init(); + + size = get_test_mtx_stats_string(buffer, buffer_size); + + error = sysctl_io_string(req, buffer, size, 0, NULL); + + kfree(buffer, buffer_size); + + return error; +} + +static int +sysctl_test_mtx_uncontended SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + char* buffer; + int buffer_size, offset, error, iter = 0; /* stays 0 when sscanf() converts nothing, so the iter <= 0 check rejects the request */ + char input_val[40]; + + if (!req->newptr) { + return 0; + } + + if (!req->oldptr) { + return EINVAL; + } + + if (req->newlen >= sizeof(input_val)) { + return EINVAL; + } + + error = SYSCTL_IN(req, input_val, req->newlen); + if (error) { + return error; + } + input_val[req->newlen] = '\0'; + + sscanf(input_val, "%d", &iter); + + if (iter <= 0) { + printf("%s requested %d iterations, not starting the test\n", __func__, iter); + return EINVAL; + } + + lck_mtx_test_init(); + + buffer_size = 2000; + offset = 0; + buffer = kalloc(buffer_size); +
if (!buffer) + panic("Impossible to allocate memory for %s\n", __func__); + memset(buffer, 0, buffer_size); + + printf("%s starting uncontended mutex test with %d iterations\n", __func__, iter); + + offset = snprintf(buffer, buffer_size, "STATS INNER LOOP"); + offset += lck_mtx_test_mtx_uncontended(iter, &buffer[offset], buffer_size - offset); + + offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP"); + offset += lck_mtx_test_mtx_uncontended_loop_time(iter, &buffer[offset], buffer_size - offset); + + error = SYSCTL_OUT(req, buffer, offset); + + kfree(buffer, buffer_size); + return error; +} + +static int +sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + char* buffer; + int buffer_size, offset, error, iter = 0; /* stays 0 when sscanf() converts nothing, so the iter <= 0 check rejects the request */ + char input_val[40]; + + printf("%s called\n", __func__); + + if (!req->newptr) { + return 0; + } + + if (!req->oldptr) { + return EINVAL; + } + + if (req->newlen >= sizeof(input_val)) { + return EINVAL; + } + + error = SYSCTL_IN(req, input_val, req->newlen); + if (error) { + return error; + } + input_val[req->newlen] = '\0'; + + sscanf(input_val, "%d", &iter); + + if (iter <= 0) { + printf("%s requested %d iterations, not starting the test\n", __func__, iter); + return EINVAL; + } + + lck_mtx_test_init(); + + erase_all_test_mtx_stats(); + + buffer_size = 1000; + offset = 0; + buffer = kalloc(buffer_size); + if (!buffer) + panic("Impossible to allocate memory for %s\n", __func__); + memset(buffer, 0, buffer_size); + + printf("%s starting contended mutex test with %d iterations\n", __func__, iter); + + offset = snprintf(buffer, buffer_size, "STATS INNER LOOP"); + offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset); + + printf("%s starting contended mutex loop test with %d iterations\n", __func__, iter); + + offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP"); + offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size -
offset); + + error = SYSCTL_OUT(req, buffer, offset); + + kfree(buffer, buffer_size); + + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_lock, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_lck_mtx_test_lock, "I", "lck mtx test lock"); + +SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_unlock, CTLFLAG_WR | CTLFLAG_MASKED |CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_lck_mtx_test_unlock, "I", "lck mtx test unlock"); + +SYSCTL_PROC(_kern, OID_AUTO, erase_all_test_mtx_stats, CTLFLAG_WR | CTLFLAG_MASKED |CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_erase_all_test_mtx_stats, "I", "erase test_mtx statistics"); + +SYSCTL_PROC(_kern, OID_AUTO, get_test_mtx_stats, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED| CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_get_test_mtx_stats, "A", "get test_mtx statistics"); + +SYSCTL_PROC(_kern, OID_AUTO, test_mtx_contended, CTLTYPE_STRING | CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_test_mtx_contended, "A", "get statistics for contended mtx test"); + +SYSCTL_PROC(_kern, OID_AUTO, test_mtx_uncontended, CTLTYPE_STRING | CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_test_mtx_uncontended, "A", "get statistics for uncontended mtx test"); + +#if defined (__x86_64__) + +semaphore_t sysctl_test_panic_with_thread_sem; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Winfinite-recursion" /* rdar://38801963 */ +__attribute__((noreturn)) +static void +panic_thread_test_child_spin(void * arg, wait_result_t wres) +{ + static int panic_thread_recurse_count = 5; + + if (panic_thread_recurse_count > 0) { + panic_thread_recurse_count--; + panic_thread_test_child_spin(arg, wres); + } + + semaphore_signal(sysctl_test_panic_with_thread_sem); + while (1) { ; } +} +#pragma clang diagnostic pop + +static void +panic_thread_test_child_park(void * arg __unused, wait_result_t wres __unused) +{ 
+ int event; + + assert_wait(&event, THREAD_UNINT); + semaphore_signal(sysctl_test_panic_with_thread_sem); + thread_block(panic_thread_test_child_park); +} + +static int +sysctl_test_panic_with_thread SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int rval = 0; + char str[16] = { '\0' }; + thread_t child_thread = THREAD_NULL; + + rval = sysctl_handle_string(oidp, str, sizeof(str), req); + if (rval != 0 || !req->newptr) { + return EINVAL; + } + + semaphore_create(kernel_task, &sysctl_test_panic_with_thread_sem, SYNC_POLICY_FIFO, 0); + + /* Create thread to spin or park in continuation */ + if (strncmp("spin", str, strlen("spin")) == 0) { + if (kernel_thread_start(panic_thread_test_child_spin, NULL, &child_thread) != KERN_SUCCESS) { + semaphore_destroy(kernel_task, sysctl_test_panic_with_thread_sem); + return EBUSY; + } + } else if (strncmp("continuation", str, strlen("continuation")) == 0) { + if (kernel_thread_start(panic_thread_test_child_park, NULL, &child_thread) != KERN_SUCCESS) { + semaphore_destroy(kernel_task, sysctl_test_panic_with_thread_sem); + return EBUSY; + } + } else { + semaphore_destroy(kernel_task, sysctl_test_panic_with_thread_sem); + return EINVAL; + } + + semaphore_wait(sysctl_test_panic_with_thread_sem); + + panic_with_thread_context(0, NULL, 0, child_thread, "testing panic_with_thread_context for thread %p", child_thread); + + /* Not reached */ + return EINVAL; +} + +SYSCTL_PROC(_kern, OID_AUTO, test_panic_with_thread, CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_WR | CTLTYPE_STRING, + 0, 0, sysctl_test_panic_with_thread, "A", "test panic flow for backtracing a different thread"); +#endif /* defined (__x86_64__) */ +#endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index 5ce07288a..ba2e6c990 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -103,6 +103,7 @@ static void setthetime( struct timeval *tv); void time_zone_slock_init(void); +static boolean_t 
timeval_fixusec(struct timeval *t1); /* * Time of day and interval timer support. @@ -209,8 +210,10 @@ settimeofday(__unused struct proc *p, struct settimeofday_args *uap, __unused i if (uap->tzp && (error = copyin(uap->tzp, (caddr_t)&atz, sizeof(atz)))) return (error); if (uap->tv) { - timevalfix(&atv); - if (atv.tv_sec < 0 || (atv.tv_sec == 0 && atv.tv_usec < 0)) + /* only positive values of sec/usec are accepted */ + if (atv.tv_sec < 0 || atv.tv_usec < 0) + return (EPERM); + if (!timeval_fixusec(&atv)) return (EPERM); setthetime(&atv); } @@ -711,6 +714,22 @@ timevalfix( } } +static boolean_t +timeval_fixusec( + struct timeval *t1) +{ + assert(t1->tv_usec >= 0); + assert(t1->tv_sec >= 0); + + if (t1->tv_usec >= 1000000) { + if (os_add_overflow(t1->tv_sec, t1->tv_usec / 1000000, &t1->tv_sec)) + return FALSE; + t1->tv_usec = t1->tv_usec % 1000000; + } + + return TRUE; +} + /* * Return the best possible estimate of the time in the timeval * to which tvp points. diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index c922c1641..889ccd6c7 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -115,8 +115,10 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval) if (uap->opt & RB_COMMAND) return ENOSYS; - if (uap->opt & RB_PANIC) { - error = copyinstr(uap->command, (void *)message, sizeof(message), (size_t *)&dummy); + if (uap->opt & RB_PANIC && uap->msg != USER_ADDR_NULL) { + if (copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy)) { + strncpy(message, "user space RB_PANIC message copyin failed", sizeof(message)-1); + } } #if CONFIG_MACF @@ -139,7 +141,7 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval) OSBitOrAtomic(P_REBOOT, &p->p_flag); /* No more signals for this proc */ error = reboot_kernel(uap->opt, message); } - return(error); + return error; } int diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index 6cb79b10e..2d8e97be2 100644 --- a/bsd/kern/kpi_mbuf.c +++ 
b/bsd/kern/kpi_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2017 Apple Inc. All rights reserved. + * Copyright (c) 2004-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -386,7 +386,7 @@ mbuf_freem_list(mbuf_t mbuf) size_t mbuf_leadingspace(const mbuf_t mbuf) { - return (m_leadingspace(mbuf)); + return (M_LEADINGSPACE(mbuf)); } /* @@ -397,7 +397,7 @@ mbuf_leadingspace(const mbuf_t mbuf) size_t mbuf_trailingspace(const mbuf_t mbuf) { - return (m_trailingspace(mbuf)); + return (M_TRAILINGSPACE(mbuf)); } /* Manipulation */ @@ -1725,6 +1725,21 @@ get_tx_compl_callback_index(mbuf_tx_compl_func callback) return (i); } +mbuf_tx_compl_func +m_get_tx_compl_callback(u_int32_t idx) +{ + mbuf_tx_compl_func cb; + + if (idx >= MAX_MBUF_TX_COMPL_FUNC) { + ASSERT(0); + return (NULL); + } + lck_rw_lock_shared(mbuf_tx_compl_tbl_lock); + cb = mbuf_tx_compl_table[idx]; + lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock); + return (cb); +} + errno_t mbuf_register_tx_compl_callback(mbuf_tx_compl_func callback) { diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index c63a8d8cd..18d2239ba 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -121,7 +121,8 @@ static const load_result_t load_result_null = { .needs_dynlinker = 0, .validentry = 0, .using_lcmain = 0, - .is64bit = 0, + .is_64bit_addr = 0, + .is_64bit_data = 0, .csflags = 0, .has_pagezero = 0, .uuid = { 0 }, @@ -314,8 +315,8 @@ note_all_image_info_section(const struct segment_command_64 *scp, * in exchange for better binary compatibility for legacy apps built * before 16KB-alignment was enforced. 
*/ -int fourk_binary_compatibility_unsafe = TRUE; -int fourk_binary_compatibility_allow_wx = FALSE; +const int fourk_binary_compatibility_unsafe = TRUE; +const int fourk_binary_compatibility_allow_wx = FALSE; #endif /* __arm64__ */ load_return_t @@ -349,7 +350,8 @@ load_machfile( return(LOAD_BADMACHO); } - result->is64bit = ((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT); + result->is_64bit_addr = ((imgp->ip_flags & IMGPF_IS_64BIT_ADDR) == IMGPF_IS_64BIT_ADDR); + result->is_64bit_data = ((imgp->ip_flags & IMGPF_IS_64BIT_DATA) == IMGPF_IS_64BIT_DATA); task_t ledger_task; if (imgp->ip_new_thread) { @@ -359,14 +361,14 @@ load_machfile( } pmap = pmap_create(get_task_ledger(ledger_task), (vm_map_size_t) 0, - result->is64bit); + result->is_64bit_addr); map = vm_map_create(pmap, 0, - vm_compute_max_offset(result->is64bit), + vm_compute_max_offset(result->is_64bit_addr), TRUE); #if defined(__arm64__) - if (result->is64bit) { + if (result->is_64bit_addr) { /* enforce 16KB alignment of VM map entries */ vm_map_set_page_shift(map, SIXTEENK_PAGE_SHIFT); } else { @@ -383,8 +385,10 @@ load_machfile( * flag (CS_ENFORCEMENT) is not set yet, but we can use the * global flag. */ - if ( !cs_enforcement(NULL) && (header->flags & MH_ALLOW_STACK_EXECUTION) ) + if ( !cs_process_global_enforcement() && (header->flags & MH_ALLOW_STACK_EXECUTION) ) { vm_map_disable_NX(map); + // TODO: Message Trace or log that this is happening + } #endif /* Forcibly disallow execution from data pages on even if the arch @@ -418,7 +422,8 @@ load_machfile( /* * re-set the bitness on the load result since we cleared the load result above. 
*/ - result->is64bit = ((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT); + result->is_64bit_addr = ((imgp->ip_flags & IMGPF_IS_64BIT_ADDR) == IMGPF_IS_64BIT_ADDR); + result->is_64bit_data = ((imgp->ip_flags & IMGPF_IS_64BIT_DATA) == IMGPF_IS_64BIT_DATA); lret = parse_machfile(vp, map, thread, header, file_offset, macho_size, 0, aslr_page_offset, dyld_aslr_page_offset, result, @@ -433,7 +438,7 @@ load_machfile( /* * On x86, for compatibility, don't enforce the hard page-zero restriction for 32-bit binaries. */ - if (!result->is64bit) { + if (!result->is_64bit_addr) { enforce_hard_pagezero = FALSE; } @@ -443,7 +448,7 @@ load_machfile( */ #define VM_MAP_HIGH_START_BITS_COUNT 8 #define VM_MAP_HIGH_START_BITS_SHIFT 27 - if (result->is64bit && + if (result->is_64bit_addr && (imgp->ip_flags & IMGPF_HIGH_BITS_ASLR)) { int random_bits; vm_map_offset_t high_start; @@ -462,7 +467,7 @@ load_machfile( if (enforce_hard_pagezero && (vm_map_has_hard_pagezero(map, 0x1000) == FALSE)) { #if __arm64__ - if (!result->is64bit && /* not 64-bit */ + if (!result->is_64bit_addr && /* not 64-bit address space */ !(header->flags & MH_PIE) && /* not PIE */ (vm_map_page_shift(map) != FOURK_PAGE_SHIFT || PAGE_SHIFT != FOURK_PAGE_SHIFT) && /* page size != 4KB */ @@ -513,9 +518,9 @@ load_machfile( return (LOAD_FAILURE); } proc_transcommit(p, 0); - workqueue_mark_exiting(p); + workq_mark_exiting(p); task_complete_halt(task); - workqueue_exit(p); + workq_exit(p); /* * Roll up accounting info to new task. 
The roll up is done after @@ -527,7 +532,7 @@ load_machfile( *mapp = map; #ifdef CONFIG_32BIT_TELEMETRY - if (!result->is64bit) { + if (!result->is_64bit_data) { /* * This may not need to be an AST; we merely need to ensure that * we gather telemetry at the point where all of the information @@ -863,7 +868,6 @@ parse_machfile( switch(lcp->cmd) { case LC_SEGMENT: { struct segment_command *scp = (struct segment_command *) lcp; - if (pass == 0) { if (is_dyld && scp->vmaddr == 0 && scp->fileoff == 0) { dyld_no_load_addr = TRUE; @@ -926,7 +930,6 @@ parse_machfile( map, slide, result); - if (ret == LOAD_SUCCESS && scp->fileoff == 0 && scp->filesize > 0) { /* Enforce a single segment mapping offset zero, with R+X * protection. */ @@ -1052,7 +1055,7 @@ parse_machfile( /* * Allow injections to be ignored on devices w/o enforcement enabled */ - if (!cs_enforcement(NULL)) + if (!cs_process_global_enforcement()) ret = LOAD_SUCCESS; /* ignore error */ } else { @@ -1081,7 +1084,7 @@ parse_machfile( if (cs_debug) printf("CODE SIGNING: %s[%d]: invalid initial page at offset %lld validated:%d tainted:%d csflags:0x%x\n", vp->v_name, p->p_pid, (long long)(file_offset + off), valid, tainted, result->csflags); - if (cs_enforcement(NULL) || + if (cs_process_global_enforcement() || (result->csflags & (CS_HARD|CS_KILL|CS_ENFORCEMENT))) { ret = LOAD_FAILURE; } @@ -1133,6 +1136,22 @@ parse_machfile( } break; #endif +#if __arm64__ + case LC_VERSION_MIN_IPHONEOS: { + struct version_min_command *vmc; + + if (pass != 1) { + break; + } + vmc = (struct version_min_command *) lcp; + if (vmc->sdk < (12 << 16)) { + /* app built with a pre-iOS12 SDK: apply legacy footprint mitigation */ + result->legacy_footprint = TRUE; + } +// printf("FBDP %s:%d vp %p (%s) sdk %d.%d.%d -> legacy_footprint=%d\n", __FUNCTION__, __LINE__, vp, vp->v_name, (vmc->sdk >> 16), ((vmc->sdk & 0xFF00) >> 8), (vmc->sdk & 0xFF), result->legacy_footprint); + break; + } +#endif /* __arm64__ */ default: /* Other commands are 
ignored by the kernel */ ret = LOAD_SUCCESS; @@ -1146,7 +1165,7 @@ parse_machfile( } if (ret == LOAD_SUCCESS) { - if(!got_code_signatures && cs_enforcement(NULL)) { + if(!got_code_signatures && cs_process_global_enforcement()) { ret = LOAD_FAILURE; } @@ -1168,7 +1187,7 @@ parse_machfile( if (result->thread_count == 0) { ret = LOAD_FAILURE; } -#if CONFIG_EMBEDDED +#if CONFIG_ENFORCE_SIGNED_CODE if (result->needs_dynlinker && !(result->csflags & CS_DYLD_PLATFORM)) { ret = LOAD_FAILURE; } @@ -1308,7 +1327,8 @@ map_segment( vm_map_offset_t file_start, vm_map_offset_t file_end, vm_prot_t initprot, - vm_prot_t maxprot) + vm_prot_t maxprot, + load_result_t *result) { vm_map_offset_t cur_offset, cur_start, cur_end; kern_return_t ret; @@ -1410,6 +1430,23 @@ map_segment( /* regular mapping for the middle */ cur_vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; } + +#if CONFIG_EMBEDDED + (void) result; +#else /* CONFIG_EMBEDDED */ + /* + * This process doesn't have its new csflags (from + * the image being loaded) yet, so tell VM to override the + * current process's CS_ENFORCEMENT for this mapping. 
+ */ + if (result->csflags & CS_ENFORCEMENT) { + cur_vmk_flags.vmkf_cs_enforcement = TRUE; + } else { + cur_vmk_flags.vmkf_cs_enforcement = FALSE; + } + cur_vmk_flags.vmkf_cs_enforcement_override = TRUE; +#endif /* CONFIG_EMBEDDED */ + cur_end = vm_map_trunc_page(vm_start + (file_end - file_start), effective_page_mask); @@ -1785,7 +1822,8 @@ load_segment( file_start, file_end, initprot, - maxprot); + maxprot, + result); if (ret) { return LOAD_NOSPACE; } @@ -1843,7 +1881,8 @@ load_segment( 0, delta_size, scp->initprot, - scp->maxprot); + scp->maxprot, + result); if (kr != KERN_SUCCESS) { return(LOAD_NOSPACE); } @@ -1960,7 +1999,7 @@ load_main( } /* use default location for stack */ - ret = thread_userstackdefault(&addr, result->is64bit); + ret = thread_userstackdefault(&addr, result->is_64bit_addr); if (ret != KERN_SUCCESS) return(LOAD_FAILURE); @@ -2001,7 +2040,6 @@ load_unixthread( load_return_t ret; int customstack =0; mach_vm_offset_t addr; - if (tcp->cmdsize < sizeof(*tcp)) return (LOAD_BADMACHO); if (result->thread_count != 0) { @@ -2012,15 +2050,15 @@ load_unixthread( return (LOAD_SUCCESS); ret = load_threadstack(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &addr, &customstack, result); + (uint32_t *)(((vm_offset_t)tcp) + + sizeof(struct thread_command)), + tcp->cmdsize - sizeof(struct thread_command), + &addr, &customstack, result); if (ret != LOAD_SUCCESS) return(ret); /* LC_UNIXTHREAD optionally specifies stack size and location */ - + if (!customstack) { result->user_stack_alloc_size = MAXSSIZ; } @@ -2030,10 +2068,10 @@ load_unixthread( result->user_stack -= slide; ret = load_threadentry(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &addr); + (uint32_t *)(((vm_offset_t)tcp) + + sizeof(struct thread_command)), + tcp->cmdsize - sizeof(struct thread_command), + &addr); if (ret != LOAD_SUCCESS) 
return(ret); @@ -2046,9 +2084,9 @@ load_unixthread( result->entry_point += slide; ret = load_threadstate(thread, - (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - result); + (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), + tcp->cmdsize - sizeof(struct thread_command), + result); if (ret != LOAD_SUCCESS) return (ret); @@ -2148,7 +2186,7 @@ load_threadstack( * to the appropriate type in thread_userstack() based on * the value of flavor. */ - ret = thread_userstack(thread, flavor, (thread_state_t)ts, size, user_stack, customstack, result->is64bit); + ret = thread_userstack(thread, flavor, (thread_state_t)ts, size, user_stack, customstack, result->is_64bit_data); if (ret != KERN_SUCCESS) { return(LOAD_FAILURE); } @@ -2304,7 +2342,8 @@ load_dylinker( goto novp_out; *myresult = load_result_null; - myresult->is64bit = result->is64bit; + myresult->is_64bit_addr = result->is_64bit_addr; + myresult->is_64bit_data = result->is_64bit_data; ret = parse_machfile(vp, map, thread, header, file_offset, macho_size, depth, slide, 0, myresult, result, imgp); @@ -2373,23 +2412,44 @@ load_code_signature( } blob = ubc_cs_blob_get(vp, cputype, macho_offset); + if (blob != NULL) { /* we already have a blob for this vnode and cputype */ - if (blob->csb_cpu_type == cputype && - blob->csb_base_offset == macho_offset) { - /* it matches the blob we want here, lets verify the version */ - if(0 != ubc_cs_generation_check(vp)) { - if (0 != ubc_cs_blob_revalidate(vp, blob, imgp, 0)) { - ret = LOAD_FAILURE; /* set error same as from ubc_cs_blob_add */ - goto out; - } - } - ret = LOAD_SUCCESS; - } else { + if (blob->csb_cpu_type != cputype || + blob->csb_base_offset != macho_offset) { /* the blob has changed for this vnode: fail ! 
*/ ret = LOAD_BADMACHO; + goto out; } - goto out; + + /* It matches the blob we want here, let's verify the version */ + if (ubc_cs_generation_check(vp) == 0) { + /* No need to revalidate, we're good! */ + ret = LOAD_SUCCESS; + goto out; + } + + /* That blob may be stale, let's revalidate. */ + error = ubc_cs_blob_revalidate(vp, blob, imgp, 0); + if (error == 0) { + /* Revalidation succeeded, we're good! */ + ret = LOAD_SUCCESS; + goto out; + } + + if (error != EAGAIN) { + printf("load_code_signature: revalidation failed: %d\n", error); + ret = LOAD_FAILURE; + goto out; + } + + assert(error == EAGAIN); + + /* + * Revalidation was not possible for this blob. We just continue as if there was no blob, + * rereading the signature, and ubc_cs_blob_add will do the right thing. + */ + blob = NULL; } blob_size = lcp->datasize; diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index b564d1201..7870e8e84 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -67,13 +67,16 @@ typedef struct _load_result { int thread_count; unsigned int /* boolean_t */ unixproc :1, - needs_dynlinker : 1, - dynlinker :1, - validentry :1, - has_pagezero :1, - using_lcmain :1, - is64bit :1, - :0; + needs_dynlinker :1, + dynlinker :1, + validentry :1, + has_pagezero :1, + using_lcmain :1, +#if __arm64__ + legacy_footprint :1, +#endif /* __arm64__ */ + is_64bit_addr :1, + is_64bit_data :1; unsigned int csflags; unsigned char uuid[16]; mach_vm_address_t min_vm_addr; diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c index 52a91f33b..527c89c16 100644 --- a/bsd/kern/policy_check.c +++ b/bsd/kern/policy_check.c @@ -119,7 +119,7 @@ common_hook(void) return rv; } -#if (MAC_POLICY_OPS_VERSION != 53) +#if (MAC_POLICY_OPS_VERSION != 55) # error "struct mac_policy_ops doesn't match definition in mac_policy.h" #endif /* @@ -268,9 +268,9 @@ const static struct mac_policy_ops policy_ops = { CHECK_SET_HOOK(proc_check_inherit_ipc_ports) CHECK_SET_HOOK(vnode_check_rename) 
CHECK_SET_HOOK(kext_check_query) - CHECK_SET_HOOK(iokit_check_nvram_get) - CHECK_SET_HOOK(iokit_check_nvram_set) - CHECK_SET_HOOK(iokit_check_nvram_delete) + CHECK_SET_HOOK(proc_notify_exec_complete) + .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook, + .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook, CHECK_SET_HOOK(proc_check_expose_task) CHECK_SET_HOOK(proc_check_set_host_special_port) CHECK_SET_HOOK(proc_check_set_host_exception_port) diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index 52eaeda02..2de42fa30 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -154,9 +155,9 @@ int __attribute__ ((noinline)) proc_pidfdlist(proc_t p, user_addr_t buffer, uint int __attribute__ ((noinline)) proc_pidbsdinfo(proc_t p, struct proc_bsdinfo *pbsd, int zombie); int __attribute__ ((noinline)) proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo *pbsd_shortp, int zombie); int __attribute__ ((noinline)) proc_pidtaskinfo(proc_t p, struct proc_taskinfo *ptinfo); -int __attribute__ ((noinline)) proc_pidthreadinfo(proc_t p, uint64_t arg, int thuniqueid, struct proc_threadinfo *pthinfo); +int __attribute__ ((noinline)) proc_pidthreadinfo(proc_t p, uint64_t arg, bool thuniqueid, struct proc_threadinfo *pthinfo); int __attribute__ ((noinline)) proc_pidthreadpathinfo(proc_t p, uint64_t arg, struct proc_threadwithpathinfo *pinfo); -int __attribute__ ((noinline)) proc_pidlistthreads(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); +int __attribute__ ((noinline)) proc_pidlistthreads(proc_t p, bool thuniqueid, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int __attribute__ ((noinline)) proc_pidregioninfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int __attribute__ ((noinline)) proc_pidregionpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int __attribute__ ((noinline)) 
proc_pidregionpathinfo2(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); @@ -796,7 +797,7 @@ proc_pidtaskinfo(proc_t p, struct proc_taskinfo * ptinfo) int -proc_pidthreadinfo(proc_t p, uint64_t arg, int thuniqueid, struct proc_threadinfo *pthinfo) +proc_pidthreadinfo(proc_t p, uint64_t arg, bool thuniqueid, struct proc_threadinfo *pthinfo) { int error = 0; uint64_t threadaddr = (uint64_t)arg; @@ -926,7 +927,7 @@ proc_pidthreadpathinfo(proc_t p, uint64_t arg, struct proc_threadwithpathinfo * int -proc_pidlistthreads(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval) +proc_pidlistthreads(proc_t p, bool thuniqueid, user_addr_t buffer, uint32_t buffersize, int32_t *retval) { uint32_t count = 0; int ret = 0; @@ -950,7 +951,7 @@ proc_pidlistthreads(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t return(ENOMEM); bzero(kbuf, numthreads * sizeof(uint64_t)); - ret = fill_taskthreadlist(p->task, kbuf, numthreads); + ret = fill_taskthreadlist(p->task, kbuf, numthreads, thuniqueid); error = copyout(kbuf, buffer, ret); kfree(kbuf, numthreads * sizeof(uint64_t)); @@ -1357,7 +1358,7 @@ proc_pidoriginatorinfo(int pid, int flavor, user_addr_t buffer, uint32_t buffer switch (flavor) { case PROC_PIDORIGINATOR_UUID: { - uuid_t uuid; + uuid_t uuid = {}; error = proc_pidoriginatoruuid(uuid, sizeof(uuid)); if (error != 0) @@ -1385,7 +1386,7 @@ proc_pidoriginatorinfo(int pid, int flavor, user_addr_t buffer, uint32_t buffer break; case PROC_PIDORIGINATOR_BGSTATE: { - uint32_t is_backgrounded; + uint32_t is_backgrounded = 0; error = proc_get_originatorbgstate(&is_backgrounded); if (error) goto out; @@ -1684,7 +1685,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu int shortversion = 0; uint32_t size; int zombie = 0; - int thuniqueid = 0; + bool thuniqueid = false; int uniqidversion = 0; boolean_t check_same_user; @@ -1706,6 +1707,9 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t 
buffer, uint32_t bu case PROC_PIDTHREADINFO: size = PROC_PIDTHREADINFO_SIZE; break; + case PROC_PIDLISTTHREADIDS: + size = PROC_PIDLISTTHREADIDS_SIZE; + break; case PROC_PIDLISTTHREADS: size = PROC_PIDLISTTHREADS_SIZE; break; @@ -1788,6 +1792,12 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu size = 0; } break; + case PROC_PIDVMRTFAULTINFO: + size = sizeof(vm_rtfault_record_t); + if (buffer == USER_ADDR_NULL) { + size = 0; + } + break; default: return(EINVAL); } @@ -1917,7 +1927,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu break; case PROC_PIDTHREADID64INFO: - thuniqueid = 1; + thuniqueid = true; case PROC_PIDTHREADINFO:{ struct proc_threadinfo pthinfo; @@ -1930,8 +1940,10 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu } break; + case PROC_PIDLISTTHREADIDS: + thuniqueid = true; case PROC_PIDLISTTHREADS:{ - error = proc_pidlistthreads(p, buffer, buffersize, retval); + error = proc_pidlistthreads(p, thuniqueid, buffer, buffersize, retval); } break; @@ -2070,7 +2082,48 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu case PROC_PIDLISTDYNKQUEUES: error = kevent_copyout_proc_dynkqids(p, buffer, buffersize, retval); break; + case PROC_PIDVMRTFAULTINFO: { + /* This interface can only be employed on the current + * process. We will eventually enforce an entitlement. + */ + *retval = 0; + + if (p != current_proc()) { + error = EINVAL; + break; + } + + size_t kbufsz = MIN(buffersize, vmrtfaultinfo_bufsz()); + void *vmrtfbuf = kalloc(kbufsz); + + if (vmrtfbuf == NULL) { + error = ENOMEM; + break; + } + + bzero(vmrtfbuf, kbufsz); + + uint64_t effpid = get_current_unique_pid(); + /* The VM may choose to provide more comprehensive records + * for root-privileged users on internal configurations. 
+ */ + boolean_t isroot = (suser(kauth_cred_get(), (u_short *)0) == 0); + int vmf_residue = vmrtf_extract(effpid, isroot, kbufsz, vmrtfbuf, retval); + int vmfsz = *retval * sizeof(vm_rtfault_record_t); + + error = 0; + if (vmfsz) { + error = copyout(vmrtfbuf, buffer, vmfsz); + } + if (error == 0) { + if (vmf_residue) { + error = ENOMEM; + } + } + kfree(vmrtfbuf, kbufsz); + } + break; default: error = ENOTSUP; break; diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index cde2de0be..ff45d5c8a 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -161,6 +161,9 @@ int oslog_stream_open = 0; int oslog_stream_buf_size = OSLOG_STREAM_BUF_SIZE; int oslog_stream_num_entries = OSLOG_NUM_STREAM_ENTRIES; +uint8_t __firehose_buffer_kernel_chunk_count = FIREHOSE_BUFFER_KERNEL_DEFAULT_CHUNK_COUNT; +uint8_t __firehose_num_kernel_io_pages = FIREHOSE_BUFFER_KERNEL_DEFAULT_IO_PAGES; + /* oslogsoftc only valid while oslog_open=1 */ struct oslogsoftc { int sc_state; /* see above for possibilities */ @@ -784,7 +787,7 @@ int oslogioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unused struct proc *p) { int ret = 0; - mach_vm_size_t buffer_size = (FIREHOSE_BUFFER_KERNEL_CHUNK_COUNT * FIREHOSE_CHUNK_SIZE); + mach_vm_size_t buffer_size = (__firehose_buffer_kernel_chunk_count * FIREHOSE_CHUNK_SIZE); firehose_buffer_map_info_t map_info = {0, 0}; firehose_buffer_t kernel_firehose_buffer = NULL; mach_vm_address_t user_addr = 0; @@ -809,6 +812,7 @@ oslogioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __un buffer_size, 0, /* mask */ VM_FLAGS_ANYWHERE, + VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE, mem_entry_ptr, 0, /* offset */ @@ -876,7 +880,18 @@ void oslog_init(void) { kern_return_t kr; - vm_size_t size = FIREHOSE_BUFFER_KERNEL_CHUNK_COUNT * FIREHOSE_CHUNK_SIZE; + if (!PE_parse_boot_argn("firehose_chunk_count", &__firehose_buffer_kernel_chunk_count, sizeof(__firehose_buffer_kernel_chunk_count))) { + 
__firehose_buffer_kernel_chunk_count = FIREHOSE_BUFFER_KERNEL_DEFAULT_CHUNK_COUNT; + } + if (!PE_parse_boot_argn("firehose_io_pages", &__firehose_num_kernel_io_pages, sizeof(__firehose_num_kernel_io_pages))) { + __firehose_num_kernel_io_pages = FIREHOSE_BUFFER_KERNEL_DEFAULT_IO_PAGES; + } + if (!__firehose_kernel_configuration_valid(__firehose_buffer_kernel_chunk_count, __firehose_num_kernel_io_pages)) { + printf("illegal firehose configuration %u/%u, using defaults\n", __firehose_buffer_kernel_chunk_count, __firehose_num_kernel_io_pages); + __firehose_buffer_kernel_chunk_count = FIREHOSE_BUFFER_KERNEL_DEFAULT_CHUNK_COUNT; + __firehose_num_kernel_io_pages = FIREHOSE_BUFFER_KERNEL_DEFAULT_IO_PAGES; + } + vm_size_t size = __firehose_buffer_kernel_chunk_count * FIREHOSE_CHUNK_SIZE; oslog_lock_init(); @@ -891,7 +906,7 @@ oslog_init(void) /* register buffer with firehose */ kernel_firehose_addr = (vm_offset_t)__firehose_buffer_create((size_t *) &size); - kprintf("oslog_init completed\n"); + printf("oslog_init completed, %u chunks, %u io pages\n", __firehose_buffer_kernel_chunk_count, __firehose_num_kernel_io_pages); } /* @@ -1333,3 +1348,46 @@ log_dmesg(user_addr_t buffer, uint32_t buffersize, int32_t * retval) return (error); } +#ifdef CONFIG_XNUPOST + +uint32_t find_pattern_in_buffer(char * pattern, uint32_t len, int expected_count); + +/* + * returns count of pattern found in systemlog buffer. + * stops searching further if count reaches expected_count. 
+ */ +uint32_t +find_pattern_in_buffer(char * pattern, uint32_t len, int expected_count) +{ + int match_count = 0; + int i = 0; + int j = 0; + int no_match = 0; + int pos = 0; + char ch = 0; + + if (pattern == NULL || len == 0 || expected_count == 0) { + return 0; + } + + for (i = 0; i < msgbufp->msg_size; i++) { + no_match = 0; + for (j = 0; j < (int)len; j++) { + pos = (msgbufp->msg_bufx + i + j) % msgbufp->msg_size; + ch = msgbufp->msg_bufc[pos]; + if (ch != pattern[j]) { + no_match = 1; + break; + } + } + if (no_match == 0) { + match_count++; + if (match_count >= expected_count) { + break; + } + } + } + return match_count; +} + +#endif diff --git a/bsd/kern/sys_coalition.c b/bsd/kern/sys_coalition.c index fec4bfa44..4741bfc16 100644 --- a/bsd/kern/sys_coalition.c +++ b/bsd/kern/sys_coalition.c @@ -315,7 +315,7 @@ static int sysctl_coalition_get_ids SYSCTL_HANDLER_ARGS int error, pid; proc_t tproc; uint64_t value; - uint64_t ids[COALITION_NUM_TYPES]; + uint64_t ids[COALITION_NUM_TYPES] = {}; error = SYSCTL_IN(req, &value, sizeof(value)); @@ -349,7 +349,7 @@ static int sysctl_coalition_get_roles SYSCTL_HANDLER_ARGS int error, pid; proc_t tproc; int value; - int roles[COALITION_NUM_TYPES]; + int roles[COALITION_NUM_TYPES] = {}; error = SYSCTL_IN(req, &value, sizeof(value)); diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 9cfbe7e91..ad3d80a6b 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -1085,7 +1085,7 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo th_act = current_thread(); uth = get_bsdthread_info(th_act); sel = &uth->uu_select; - seldata = &uth->uu_kevent.ss_select_data; + seldata = &uth->uu_save.uus_select_data; *retval = 0; seldata->args = uap; @@ -1270,7 +1270,7 @@ selprocess(int error, int sel_pass) th_act = current_thread(); uth = get_bsdthread_info(th_act); sel = &uth->uu_select; - seldata = &uth->uu_kevent.ss_select_data; + seldata = &uth->uu_save.uus_select_data; uap = 
seldata->args; retval = seldata->retval; @@ -1483,16 +1483,7 @@ static uint64_t sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set } if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) { - /* - * The conflict queue requires disabling interrupts, so we - * need to explicitly reserve a link object to avoid a - * panic/assert in the waitq code. Hopefully this extra step - * can be avoided if we can split the waitq structure into - * blocking and linkage sub-structures. - */ - uint64_t reserved_link = waitq_link_reserve(&select_conflict_queue); - waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, &reserved_link); - waitq_link_release(reserved_link); + waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, NULL); } /* @@ -1610,6 +1601,8 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata, fp->f_flags |= FP_SELCONFLICT; else fp->f_flags |= FP_INSELECT; + + waitq_set_lazy_init_link(wqset); } context.vc_ucred = fp->f_cred; @@ -1731,6 +1724,8 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) OSBitOrAtomic(P_SELECT, &p->p_flag); for (i = 0; i < nfds; i++) { short events = fds[i].events; + KNOTE_LOCK_CTX(knlc); + __assert_only int rc; /* per spec, ignore fd values below zero */ if (fds[i].fd < 0) { @@ -1749,14 +1744,16 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) kev.filter = EVFILT_READ; if (events & ( POLLPRI | POLLRDBAND )) kev.flags |= EV_OOBAND; - kevent_register(kq, &kev, p); + rc = kevent_register(kq, &kev, &knlc); + assert((rc & FILTER_REGISTER_WAIT) == 0); } /* Handle output events */ if ((kev.flags & EV_ERROR) == 0 && (events & ( POLLOUT | POLLWRNORM | POLLWRBAND ))) { kev.filter = EVFILT_WRITE; - kevent_register(kq, &kev, p); + rc = kevent_register(kq, &kev, &knlc); + assert((rc & FILTER_REGISTER_WAIT) == 0); } /* Handle BSD extension vnode events */ @@ -1772,7 +1769,8 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t 
*retval) kev.fflags |= NOTE_LINK; if (events & POLLWRITE) kev.fflags |= NOTE_WRITE; - kevent_register(kq, &kev, p); + rc = kevent_register(kq, &kev, &knlc); + assert((rc & FILTER_REGISTER_WAIT) == 0); } if (kev.flags & EV_ERROR) { @@ -2028,7 +2026,7 @@ seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wak } nw = howmany(nfd, NFDBITS); - seldata = &uth->uu_kevent.ss_select_data; + seldata = &uth->uu_save.uus_select_data; nc = 0; for (msk = 0; msk < 3; msk++) { @@ -2741,7 +2739,7 @@ waitevent(proc_t p, struct waitevent_args *uap, int *retval) union { struct eventreq64 er64; struct eventreq32 er32; - } uer; + } uer = {}; interval = 0; @@ -3112,7 +3110,7 @@ gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retv kern_return_t kret; int error; mach_timespec_t mach_ts; /* for IOKit call */ - __darwin_uuid_t uuid_kern; /* for IOKit call */ + __darwin_uuid_t uuid_kern = {}; /* for IOKit call */ if (!uap->spi) { #if CONFIG_EMBEDDED @@ -3227,7 +3225,7 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval) } #endif case LEDGER_INFO: { - struct ledger_info info; + struct ledger_info info = {}; rval = ledger_info(task, &info); proc_rele(proc); @@ -3287,6 +3285,9 @@ telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t case TELEMETRY_CMD_TIMER_EVENT: error = telemetry_timer_event(args->deadline, args->interval, args->leeway); break; + case TELEMETRY_CMD_PMI_SETUP: + error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval); + break; #endif /* CONFIG_TELEMETRY */ case TELEMETRY_CMD_VOUCHER_NAME: if (thread_set_voucher_name((mach_port_name_t)args->deadline)) @@ -3681,6 +3682,26 @@ SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD | CTLFLAG_RW | C 0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set"); #endif /* CONFIG_WAITQ_DEBUG */ + +static int +sysctl_waitq_set_nelem SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, 
arg2) + int nelem; + + /* Read only */ + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + nelem = sysctl_helper_waitq_set_nelem(); + + return SYSCTL_OUT(req, &nelem, sizeof(nelem)); +} + +SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_waitq_set_nelem, "I", "ltable elementis currently used"); + + #endif /* DEVELOPMENT || DEBUG */ diff --git a/bsd/kern/sys_persona.c b/bsd/kern/sys_persona.c index 7f33f6769..d00584e97 100644 --- a/bsd/kern/sys_persona.c +++ b/bsd/kern/sys_persona.c @@ -105,10 +105,15 @@ static int kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp) if (!persona) return error; + error = persona_init_begin(persona); + if (error) { + goto out_persona_err; + } + if (kinfo.persona_gid) { error = persona_set_gid(persona, kinfo.persona_gid); if (error) - goto out_error; + goto out_persona_err; } if (kinfo.persona_ngroups > 0) { @@ -120,13 +125,21 @@ static int kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp) kinfo.persona_ngroups, kinfo.persona_gmuid); if (error) - goto out_error; + goto out_persona_err; } error = copyout(&persona->pna_id, idp, sizeof(persona->pna_id)); - if (error) - goto out_error; + if (error) { + goto out_persona_err; + } + + kinfo.persona_id = persona->pna_id; error = kpersona_copyout(&kinfo, infop); + if (error) { + goto out_persona_err; + } + + persona_init_end(persona, error); /* * On success, we have a persona structure in the global list with a @@ -135,8 +148,13 @@ static int kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp) */ return error; -out_error: +out_persona_err: + assert(error != 0); + persona_init_end(persona, error); + +#if PERSONA_DEBUG printf("%s: ERROR:%d\n", __func__, error); +#endif if (persona) persona_put(persona); return error; @@ -204,8 +222,8 @@ static int kpersona_info_syscall(user_addr_t idp, user_addr_t infop) if (!persona) return ESRCH; - persona_dbg("FOUND: persona:%p, id:%d, gid:%d, login:\"%s\"", - persona, persona->pna_id, 
persona_get_gid(persona), + persona_dbg("FOUND: persona: id:%d, gid:%d, login:\"%s\"", + persona->pna_id, persona_get_gid(persona), persona->pna_login); memset(&kinfo, 0, sizeof(kinfo)); diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c index 9e8b346e9..c6725337b 100644 --- a/bsd/kern/sys_pipe.c +++ b/bsd/kern/sys_pipe.c @@ -1419,7 +1419,7 @@ filt_piperead(struct knote *kn, long hint) return filt_piperead_common(kn, rpipe); } - + static int filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev) { @@ -1431,8 +1431,6 @@ filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev) /* accept new inputs (and save the low water threshold and flag) */ kn->kn_sdata = kev->data; kn->kn_sfflags = kev->fflags; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* identify if any events are now fired */ retval = filt_piperead_common(kn, rpipe); @@ -1515,8 +1513,6 @@ filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev) /* accept new kevent data (and save off lowat threshold and flag) */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* determine if any event is now deemed fired */ res = filt_pipewrite_common(kn, rpipe); diff --git a/bsd/kern/sys_ulock.c b/bsd/kern/sys_ulock.c index c8bf0da86..0d8664c7e 100644 --- a/bsd/kern/sys_ulock.c +++ b/bsd/kern/sys_ulock.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -87,14 +88,14 @@ */ static lck_grp_t *ull_lck_grp; -static lck_mtx_t ull_table_lock; -#define ull_global_lock() lck_mtx_lock(&ull_table_lock) -#define ull_global_unlock() lck_mtx_unlock(&ull_table_lock) - -#define ull_lock(ull) lck_mtx_lock(&ull->ull_lock) -#define ull_unlock(ull) lck_mtx_unlock(&ull->ull_lock) -#define ull_assert_owned(ull) LCK_MTX_ASSERT(&ull->ull_lock, LCK_MTX_ASSERT_OWNED) +typedef lck_spin_t ull_lock_t; +#define ull_lock_init(ull) lck_spin_init(&ull->ull_lock, ull_lck_grp, 
NULL) +#define ull_lock_destroy(ull) lck_spin_destroy(&ull->ull_lock, ull_lck_grp) +#define ull_lock(ull) lck_spin_lock(&ull->ull_lock) +#define ull_unlock(ull) lck_spin_unlock(&ull->ull_lock) +#define ull_assert_owned(ull) LCK_SPIN_ASSERT(&ull->ull_lock, LCK_ASSERT_OWNED) +#define ull_assert_notwned(ull) LCK_SPIN_ASSERT(&ull->ull_lock, LCK_ASSERT_NOTOWNED) #define ULOCK_TO_EVENT(ull) ((event_t)ull) #define EVENT_TO_ULOCK(event) ((ull_t *)event) @@ -119,25 +120,22 @@ typedef struct ull { thread_t ull_owner; /* holds +1 thread reference */ ulk_t ull_key; ulk_t ull_saved_key; - lck_mtx_t ull_lock; + ull_lock_t ull_lock; + uint ull_bucket_index; int32_t ull_nwaiters; int32_t ull_max_nwaiters; int32_t ull_refcount; - struct promote_token ull_promote_token; - queue_chain_t ull_hash_link; uint8_t ull_opcode; + struct turnstile *ull_turnstile; + queue_chain_t ull_hash_link; } ull_t; -static const bool ull_debug = false; - extern void ulock_initialize(void); #define ULL_MUST_EXIST 0x0001 -static ull_t *ull_get(ulk_t *, uint32_t); +static ull_t *ull_get(ulk_t *, uint32_t, ull_t **); static void ull_put(ull_t *); -static thread_t ull_promote_owner_locked(ull_t* ull, thread_t thread); - #if DEVELOPMENT || DEBUG static int ull_simulate_copyin_fault = 0; @@ -154,15 +152,23 @@ ull_dump(ull_t *ull) kprintf("ull_refcount\t%d\n", ull->ull_refcount); kprintf("ull_opcode\t%d\n\n", ull->ull_opcode); kprintf("ull_owner\t0x%llx\n\n", thread_tid(ull->ull_owner)); - kprintf("ull_promote_token\t%d, %d\n\n", ull->ull_promote_token.pt_basepri, ull->ull_promote_token.pt_qos); + kprintf("ull_turnstile\t%p\n\n", ull->ull_turnstile); } #endif +typedef struct ull_bucket { + queue_head_t ulb_head; + lck_spin_t ulb_lock; +} ull_bucket_t; + static int ull_hash_buckets; -static queue_head_t *ull_bucket; +static ull_bucket_t *ull_bucket; static uint32_t ull_nzalloc = 0; static zone_t ull_zone; +#define ull_bucket_lock(i) lck_spin_lock(&ull_bucket[i].ulb_lock) +#define ull_bucket_unlock(i) 
lck_spin_unlock(&ull_bucket[i].ulb_lock) + static __inline__ uint32_t ull_hash_index(char *key, size_t length) { @@ -185,7 +191,6 @@ void ulock_initialize(void) { ull_lck_grp = lck_grp_alloc_init("ulocks", NULL); - lck_mtx_init(&ull_table_lock, ull_lck_grp, NULL); assert(thread_max > 16); /* Size ull_hash_buckets based on thread_max. @@ -196,11 +201,12 @@ ulock_initialize(void) kprintf("%s>thread_max=%d, ull_hash_buckets=%d\n", __FUNCTION__, thread_max, ull_hash_buckets); assert(ull_hash_buckets >= thread_max/4); - ull_bucket = (queue_head_t *)kalloc(sizeof(queue_head_t) * ull_hash_buckets); + ull_bucket = (ull_bucket_t *)kalloc(sizeof(ull_bucket_t) * ull_hash_buckets); assert(ull_bucket != NULL); for (int i = 0; i < ull_hash_buckets; i++) { - queue_init(&ull_bucket[i]); + queue_init(&ull_bucket[i].ulb_head); + lck_spin_init(&ull_bucket[i].ulb_lock, ull_lck_grp, NULL); } ull_zone = zinit(sizeof(ull_t), @@ -218,30 +224,30 @@ static int ull_hash_dump(pid_t pid) { int count = 0; - ull_global_lock(); if (pid == 0) { kprintf("%s>total number of ull_t allocated %d\n", __FUNCTION__, ull_nzalloc); kprintf("%s>BEGIN\n", __FUNCTION__); } for (int i = 0; i < ull_hash_buckets; i++) { - if (!queue_empty(&ull_bucket[i])) { + ull_bucket_lock(i); + if (!queue_empty(&ull_bucket[i].ulb_head)) { ull_t *elem; if (pid == 0) { kprintf("%s>index %d:\n", __FUNCTION__, i); } - qe_foreach_element(elem, &ull_bucket[i], ull_hash_link) { + qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) { if ((pid == 0) || (pid == elem->ull_key.ulk_pid)) { ull_dump(elem); count++; } } } + ull_bucket_unlock(i); } if (pid == 0) { kprintf("%s>END\n", __FUNCTION__); ull_nzalloc = 0; } - ull_global_unlock(); return count; } #endif @@ -255,14 +261,15 @@ ull_alloc(ulk_t *key) ull->ull_refcount = 1; ull->ull_key = *key; ull->ull_saved_key = *key; + ull->ull_bucket_index = ULL_INDEX(key); ull->ull_nwaiters = 0; ull->ull_max_nwaiters = 0; ull->ull_opcode = 0; ull->ull_owner = THREAD_NULL; - 
ull->ull_promote_token = PROMOTE_TOKEN_INIT; + ull->ull_turnstile = TURNSTILE_NULL; - lck_mtx_init(&ull->ull_lock, ull_lck_grp, NULL); + ull_lock_init(ull); ull_nzalloc++; return ull; @@ -272,10 +279,11 @@ static void ull_free(ull_t *ull) { assert(ull->ull_owner == THREAD_NULL); + assert(ull->ull_turnstile == TURNSTILE_NULL); - LCK_MTX_ASSERT(&ull->ull_lock, LCK_ASSERT_NOTOWNED); + ull_assert_notwned(ull); - lck_mtx_destroy(&ull->ull_lock, ull_lck_grp); + ull_lock_destroy(ull); zfree(ull_zone, ull); } @@ -283,17 +291,17 @@ ull_free(ull_t *ull) /* Finds an existing ulock structure (ull_t), or creates a new one. * If MUST_EXIST flag is set, returns NULL instead of creating a new one. * The ulock structure is returned with ull_lock locked - * - * TODO: Per-bucket lock to reduce contention on global lock */ static ull_t * -ull_get(ulk_t *key, uint32_t flags) +ull_get(ulk_t *key, uint32_t flags, ull_t **unused_ull) { ull_t *ull = NULL; uint i = ULL_INDEX(key); + ull_t *new_ull = (flags & ULL_MUST_EXIST) ? NULL : ull_alloc(key); ull_t *elem; - ull_global_lock(); - qe_foreach_element(elem, &ull_bucket[i], ull_hash_link) { + + ull_bucket_lock(i); + qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) { ull_lock(elem); if (ull_key_match(&elem->ull_key, key)) { ull = elem; @@ -305,30 +313,31 @@ ull_get(ulk_t *key, uint32_t flags) if (ull == NULL) { if (flags & ULL_MUST_EXIST) { /* Must already exist (called from wake) */ - ull_global_unlock(); + ull_bucket_unlock(i); + assert(new_ull == NULL); + assert(unused_ull == NULL); return NULL; } - /* NRG maybe drop the ull_global_lock before the kalloc, - * then take the lock and check again for a key match - * and either use the new ull_t or free it. 
- */ - - ull = ull_alloc(key); - - if (ull == NULL) { - ull_global_unlock(); + if (new_ull == NULL) { + /* Alloc above failed */ + ull_bucket_unlock(i); return NULL; } + ull = new_ull; ull_lock(ull); - - enqueue(&ull_bucket[i], &ull->ull_hash_link); + enqueue(&ull_bucket[i].ulb_head, &ull->ull_hash_link); + } else if (!(flags & ULL_MUST_EXIST)) { + assert(new_ull); + assert(unused_ull); + assert(*unused_ull == NULL); + *unused_ull = new_ull; } ull->ull_refcount++; - ull_global_unlock(); + ull_bucket_unlock(i); return ull; /* still locked */ } @@ -348,38 +357,56 @@ ull_put(ull_t *ull) return; } - ull_global_lock(); + ull_bucket_lock(ull->ull_bucket_index); remqueue(&ull->ull_hash_link); - ull_global_unlock(); + ull_bucket_unlock(ull->ull_bucket_index); -#if DEVELOPMENT || DEBUG - if (ull_debug) { - kprintf("%s>", __FUNCTION__); - ull_dump(ull); - } -#endif ull_free(ull); } +static void ulock_wait_continue(void *, wait_result_t); +static void ulock_wait_cleanup(ull_t *, thread_t, thread_t, int32_t *); + +inline static int +wait_result_to_return_code(wait_result_t wr) +{ + int ret = 0; + + switch (wr) { + case THREAD_AWAKENED: + break; + case THREAD_TIMED_OUT: + ret = ETIMEDOUT; + break; + case THREAD_INTERRUPTED: + case THREAD_RESTART: + default: + ret = EINTR; + break; + } + + return ret; +} + int ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) { uint opcode = args->operation & UL_OPCODE_MASK; uint flags = args->operation & UL_FLAGS_MASK; + + if (flags & ULF_WAIT_CANCEL_POINT) { + __pthread_testcancel(1); + } + int ret = 0; thread_t self = current_thread(); - int id = thread_tid(self); ulk_t key; /* involved threads - each variable holds +1 ref if not null */ thread_t owner_thread = THREAD_NULL; thread_t old_owner = THREAD_NULL; - thread_t old_lingering_owner = THREAD_NULL; - sched_call_t workq_callback = NULL; - if (ull_debug) { - kprintf("[%d]%s>ENTER opcode %d addr %llx value %llx timeout %d flags %x\n", id, __FUNCTION__, opcode, 
(unsigned long long)(args->addr), args->value, args->timeout, flags); - } + ull_t *unused_ull = NULL; if ((flags & ULF_WAIT_MASK) != flags) { ret = EINVAL; @@ -395,11 +422,6 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) case UL_COMPARE_AND_WAIT: break; default: - if (ull_debug) { - kprintf("[%d]%s>EINVAL opcode %d addr 0x%llx flags 0x%x\n", - id, __FUNCTION__, opcode, - (unsigned long long)(args->addr), flags); - } ret = EINVAL; goto munge_retval; } @@ -415,12 +437,7 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) key.ulk_pid = p->p_pid; key.ulk_addr = args->addr; - if (flags & ULF_WAIT_WORKQ_DATA_CONTENTION) { - workq_callback = workqueue_get_sched_callback(); - workq_callback = thread_disable_sched_call(self, workq_callback); - } - - ull_t *ull = ull_get(&key, 0); + ull_t *ull = ull_get(&key, 0, &unused_ull); if (ull == NULL) { ret = ENOMEM; goto munge_retval; @@ -436,9 +453,8 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) if (ull->ull_opcode == 0) { ull->ull_opcode = opcode; } else if (ull->ull_opcode != opcode) { - ull_unlock(ull); ret = EDOM; - goto out; + goto out_locked; } /* @@ -446,14 +462,12 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) * but we have to read the userspace value under the ull lock for correctness. * * Until exists, - * fake it by disabling preemption across copyin, which forces any + * holding the ull spinlock across copyin forces any * vm_fault we encounter to fail. 
*/ uint64_t val64; /* copyin_word always zero-extends to 64-bits */ - disable_preemption(); int copy_ret = copyin_word(args->addr, &val64, sizeof(value)); - enable_preemption(); value = (uint32_t)val64; @@ -467,23 +481,16 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) } #endif if (copy_ret != 0) { - ull_unlock(ull); - /* copyin() will return an error if the access to the user addr would have faulted, * so just return and let the user level code fault it in. */ ret = copy_ret; - goto out; + goto out_locked; } if (value != args->value) { /* Lock value has changed from expected so bail out */ - ull_unlock(ull); - if (ull_debug) { - kprintf("[%d]%s>Lock value %d has changed from expected %d so bail out\n", - id, __FUNCTION__, value, (uint32_t)(args->value)); - } - goto out; + goto out_locked; } if (set_owner) { @@ -496,9 +503,8 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) * Translation failed - even though the lock value is up to date, * whatever was stored in the lock wasn't actually a thread port. */ - ull_unlock(ull); ret = EOWNERDEAD; - goto out; + goto out_locked; } /* owner_thread has a +1 reference */ @@ -511,54 +517,102 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) * and is heading toward the kernel to call ull_wake. * If so, it's going to have to wait for the ull mutex. * - * Therefore, I can promote its priority to match mine, and I can rely on it to - * come by later to issue the wakeup and lose its promotion. + * Therefore, I can ask the turnstile to promote its priority, and I can rely + * on it to come by later to issue the wakeup and lose its promotion. 
*/ - old_owner = ull_promote_owner_locked(ull, owner_thread); + /* Return the +1 ref from the ull_owner field */ + old_owner = ull->ull_owner; + ull->ull_owner = THREAD_NULL; + + if (owner_thread != THREAD_NULL) { + /* The ull_owner field now owns a +1 ref on owner_thread */ + thread_reference(owner_thread); + ull->ull_owner = owner_thread; + } } wait_result_t wr; uint32_t timeout = args->timeout; + uint64_t deadline = TIMEOUT_WAIT_FOREVER; + wait_interrupt_t interruptible = THREAD_ABORTSAFE; + struct turnstile *ts; + + ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile, + TURNSTILE_NULL, TURNSTILE_ULOCK); thread_set_pending_block_hint(self, kThreadWaitUserLock); + + if (flags & ULF_WAIT_WORKQ_DATA_CONTENTION) { + interruptible |= THREAD_WAIT_NOREPORT; + } + if (timeout) { - wr = assert_wait_timeout(ULOCK_TO_EVENT(ull), THREAD_ABORTSAFE, timeout, NSEC_PER_USEC); - } else { - wr = assert_wait(ULOCK_TO_EVENT(ull), THREAD_ABORTSAFE); + clock_interval_to_deadline(timeout, NSEC_PER_USEC, &deadline); } + turnstile_update_inheritor(ts, owner_thread, + (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + wr = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), + interruptible, deadline); + ull_unlock(ull); - if (ull_debug) { - kprintf("[%d]%s>after assert_wait() returned %d\n", id, __FUNCTION__, wr); + if (unused_ull) { + ull_free(unused_ull); + unused_ull = NULL; } - if (set_owner && owner_thread != THREAD_NULL && wr == THREAD_WAITING) { - wr = thread_handoff(owner_thread); - /* owner_thread ref is consumed */ - owner_thread = THREAD_NULL; - } else { - /* NRG At some point this should be a continuation based block, so that we can avoid saving the full kernel context. 
*/ - wr = thread_block(NULL); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + + if (wr == THREAD_WAITING) { + uthread_t uthread = (uthread_t)get_bsdthread_info(self); + uthread->uu_save.uus_ulock_wait_data.retval = retval; + uthread->uu_save.uus_ulock_wait_data.flags = flags; + uthread->uu_save.uus_ulock_wait_data.owner_thread = owner_thread; + uthread->uu_save.uus_ulock_wait_data.old_owner = old_owner; + if (set_owner && owner_thread != THREAD_NULL) { + thread_handoff_parameter(owner_thread, ulock_wait_continue, ull); + } else { + assert(owner_thread == THREAD_NULL); + thread_block_parameter(ulock_wait_continue, ull); + } + /* NOT REACHED */ } - if (ull_debug) { - kprintf("[%d]%s>thread_block() returned %d\n", id, __FUNCTION__, wr); + + ret = wait_result_to_return_code(wr); + + ull_lock(ull); + turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL); + +out_locked: + ulock_wait_cleanup(ull, owner_thread, old_owner, retval); + + if (unused_ull) { + ull_free(unused_ull); + unused_ull = NULL; } - switch (wr) { - case THREAD_AWAKENED: - break; - case THREAD_TIMED_OUT: - ret = ETIMEDOUT; - break; - case THREAD_INTERRUPTED: - case THREAD_RESTART: - default: - ret = EINTR; - break; + + assert(*retval >= 0); + +munge_retval: + if ((flags & ULF_NO_ERRNO) && (ret != 0)) { + *retval = -ret; + ret = 0; } + return ret; +} + +/* + * Must be called with ull_lock held + */ +static void +ulock_wait_cleanup(ull_t *ull, thread_t owner_thread, thread_t old_owner, int32_t *retval) +{ + ull_assert_owned(ull); + + thread_t old_lingering_owner = THREAD_NULL; -out: - ull_lock(ull); *retval = --ull->ull_nwaiters; if (ull->ull_nwaiters == 0) { /* @@ -566,11 +620,8 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) * clear out the lingering owner reference before * freeing the ull. 
*/ - if (ull->ull_owner != THREAD_NULL) { - old_lingering_owner = ull_promote_owner_locked(ull, THREAD_NULL); - } - - assert(ull->ull_owner == THREAD_NULL); + old_lingering_owner = ull->ull_owner; + ull->ull_owner = THREAD_NULL; ull->ull_key.ulk_pid = 0; ull->ull_key.ulk_addr = 0; @@ -579,6 +630,9 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) } ull_put(ull); + /* Need to be called after dropping the interlock */ + turnstile_cleanup(); + if (owner_thread != THREAD_NULL) { thread_deallocate(owner_thread); } @@ -592,17 +646,35 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval) } assert(*retval >= 0); +} -munge_retval: - if (workq_callback) { - thread_reenable_sched_call(self, workq_callback); - } +__attribute__((noreturn)) +static void +ulock_wait_continue(void * parameter, wait_result_t wr) +{ + thread_t self = current_thread(); + uthread_t uthread = (uthread_t)get_bsdthread_info(self); + int ret = 0; + + ull_t *ull = (ull_t *)parameter; + int32_t *retval = uthread->uu_save.uus_ulock_wait_data.retval; + uint flags = uthread->uu_save.uus_ulock_wait_data.flags; + thread_t owner_thread = uthread->uu_save.uus_ulock_wait_data.owner_thread; + thread_t old_owner = uthread->uu_save.uus_ulock_wait_data.old_owner; + + ret = wait_result_to_return_code(wr); + + ull_lock(ull); + turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL); + + ulock_wait_cleanup(ull, owner_thread, old_owner, retval); if ((flags & ULF_NO_ERRNO) && (ret != 0)) { *retval = -ret; ret = 0; } - return ret; + + unix_syscall_return(ret); } int @@ -611,18 +683,12 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva uint opcode = args->operation & UL_OPCODE_MASK; uint flags = args->operation & UL_FLAGS_MASK; int ret = 0; - int id = thread_tid(current_thread()); ulk_t key; /* involved threads - each variable holds +1 ref if not null */ thread_t wake_thread = THREAD_NULL; thread_t old_owner = THREAD_NULL; - if 
(ull_debug) { - kprintf("[%d]%s>ENTER opcode %d addr %llx flags %x\n", - id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags); - } - if ((flags & ULF_WAKE_MASK) != flags) { ret = EINVAL; goto munge_retval; @@ -662,7 +728,7 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva key.ulk_pid = p->p_pid; key.ulk_addr = args->addr; - ull_t *ull = ull_get(&key, ULL_MUST_EXIST); + ull_t *ull = ull_get(&key, ULL_MUST_EXIST, NULL); if (ull == NULL) { if (wake_thread != THREAD_NULL) { thread_deallocate(wake_thread); @@ -681,19 +747,11 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva case UL_COMPARE_AND_WAIT: break; default: - if (ull_debug) { - kprintf("[%d]%s>EINVAL opcode %d addr 0x%llx flags 0x%x\n", - id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags); - } ret = EINVAL; goto out_locked; } if (opcode != ull->ull_opcode) { - if (ull_debug) { - kprintf("[%d]%s>EDOM - opcode mismatch - opcode %d addr 0x%llx flags 0x%x\n", - id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags); - } ret = EDOM; goto out_locked; } @@ -702,10 +760,16 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva assert(ull->ull_owner == THREAD_NULL); } + struct turnstile *ts; + ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile, + TURNSTILE_NULL, TURNSTILE_ULOCK); + if (flags & ULF_WAKE_ALL) { - thread_wakeup(ULOCK_TO_EVENT(ull)); + waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), + THREAD_AWAKENED, 0); } else if (flags & ULF_WAKE_THREAD) { - kern_return_t kr = thread_wakeup_thread(ULOCK_TO_EVENT(ull), wake_thread); + kern_return_t kr = waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), + wake_thread, THREAD_AWAKENED); if (kr != KERN_SUCCESS) { assert(kr == KERN_NOT_WAITING); ret = EALREADY; @@ -718,7 +782,8 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva * TODO: 'owner is not 
current_thread (or null)' likely means we can avoid this wakeup * */ - thread_wakeup_one_with_pri(ULOCK_TO_EVENT(ull), WAITQ_SELECT_MAX_PRI); + waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)), + THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI); } /* @@ -732,12 +797,21 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva */ if (ull->ull_owner == current_thread()) { - old_owner = ull_promote_owner_locked(ull, THREAD_NULL); + turnstile_update_inheritor(ts, THREAD_NULL, + (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + old_owner = ull->ull_owner; + ull->ull_owner = THREAD_NULL; } + turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL); + out_locked: ull_put(ull); + /* Need to be called after dropping the interlock */ + turnstile_cleanup(); + if (wake_thread != THREAD_NULL) { thread_deallocate(wake_thread); } @@ -754,46 +828,6 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva return ret; } -/* - * Change ull_owner to be new_owner, and update it with the properties - * of the current thread. - * - * Records the highest current promotion value in ull_promote_token, and applies that - * to any new owner. - * - * Returns +1 ref to the old ull_owner if it is going away. 
- */ -static thread_t -ull_promote_owner_locked(ull_t* ull, - thread_t new_owner) -{ - if (new_owner != THREAD_NULL && ull->ull_owner == new_owner) { - thread_user_promotion_update(new_owner, current_thread(), &ull->ull_promote_token); - return THREAD_NULL; - } - - thread_t old_owner = ull->ull_owner; - ull->ull_owner = THREAD_NULL; - - if (new_owner != THREAD_NULL) { - /* The ull_owner field now owns a +1 ref on thread */ - thread_reference(new_owner); - ull->ull_owner = new_owner; - - thread_user_promotion_add(new_owner, current_thread(), &ull->ull_promote_token); - } else { - /* No new owner - clear the saturated promotion value */ - ull->ull_promote_token = PROMOTE_TOKEN_INIT; - } - - if (old_owner != THREAD_NULL) { - thread_user_promotion_drop(old_owner); - } - - /* Return the +1 ref from the ull_owner field */ - return old_owner; -} - void kdp_ulock_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo) { diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index e1b8a75c9..278dd224f 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -102,7 +102,7 @@ 52 AUE_SIGPENDING ALL { int sigpending(struct sigvec *osv); } 53 AUE_SIGALTSTACK ALL { int sigaltstack(struct sigaltstack *nss, struct sigaltstack *oss) NO_SYSCALL_STUB ; } 54 AUE_IOCTL ALL { int ioctl(int fd, u_long com, caddr_t data) NO_SYSCALL_STUB; } -55 AUE_REBOOT ALL { int reboot(int opt, char *command) NO_SYSCALL_STUB; } +55 AUE_REBOOT ALL { int reboot(int opt, char *msg) NO_SYSCALL_STUB; } 56 AUE_REVOKE ALL { int revoke(char *path); } 57 AUE_SYMLINK ALL { int symlink(char *path, char *link); } 58 AUE_READLINK ALL { int readlink(char *path, char *buf, int count); } @@ -269,7 +269,7 @@ 181 AUE_SETGID ALL { int setgid(gid_t gid); } 182 AUE_SETEGID ALL { int setegid(gid_t egid); } 183 AUE_SETEUID ALL { int seteuid(uid_t euid); } -184 AUE_SIGRETURN ALL { int sigreturn(struct ucontext *uctx, int infostyle) NO_SYSCALL_STUB; } +184 AUE_SIGRETURN 
ALL { int sigreturn(struct ucontext *uctx, int infostyle, user_addr_t token) NO_SYSCALL_STUB; } 185 AUE_NULL ALL { int enosys(void); } { old chud } 186 AUE_NULL ALL { int thread_selfcounts(int type, user_addr_t buf, user_size_t nbytes); } 187 AUE_FDATASYNC ALL { int fdatasync(int fd); } @@ -836,5 +836,9 @@ 527 AUE_NULL ALL { int ntp_adjtime(struct timex *tp); } 528 AUE_NULL ALL { int ntp_gettime(struct ntptimeval *ntvp); } 529 AUE_NULL ALL { int os_fault_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, uint64_t reason_flags); } +#if CONFIG_WORKQUEUE +530 AUE_WORKLOOPCTL ALL { int kqueue_workloop_ctl(user_addr_t cmd, uint64_t options, user_addr_t addr, size_t sz) NO_SYSCALL_STUB; } +#else 530 AUE_NULL ALL { int enosys(void); } +#endif // CONFIG_WORKQUEUE 531 AUE_NULL ALL { int enosys(void); } diff --git a/bsd/kern/sysv_shm.c b/bsd/kern/sysv_shm.c index a962e9ab1..2fb45c996 100644 --- a/bsd/kern/sysv_shm.c +++ b/bsd/kern/sysv_shm.c @@ -488,6 +488,7 @@ shmat(struct proc *p, struct shmat_args *uap, user_addr_t *retval) map_size, 0, vm_flags, + VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE, IPC_PORT_NULL, 0, diff --git a/bsd/kern/trace_codes b/bsd/kern/trace_codes index 3065de277..2e1361f4f 100644 --- a/bsd/kern/trace_codes +++ b/bsd/kern/trace_codes @@ -197,6 +197,7 @@ 0x1200024 MACH_IPC_voucher_destroy 0x1200028 MACH_IPC_kmsg_info 0x120002c MACH_IPC_kmsg_link +0x1200030 MACH_IPC_port_entry_modify 0x1250008 MACH_RMON_CPUUSAGE_VIOLATED 0x1250010 MACH_RMON_CPUUSAGE_VIOLATED_K32A 0x1250014 MACH_RMON_CPUUSAGE_VIOLATED_K32B @@ -235,14 +236,24 @@ 0x130043c MACH_vm_info5 0x1300440 MACH_vm_info6 0x1300444 MACH_vm_info7 +0x1300448 MACH_vm_info8 +0x130044c MACH_vm_info9 0x1300480 MACH_vm_upl_page_wait 0x1300484 MACH_vm_iopl_page_wait 0x1300488 MACH_vm_page_wait_block 0x130048C MACH_vm_page_sleep 0x1300490 MACH_vm_page_expedite 0x1300494 MACH_vm_page_expedite_no_memory +0x1300498 MACH_vm_page_grab 
+0x130049c MACH_vm_page_release 0x13004c0 MACH_vm_pressure_event -0x1300500 MACH_vm_data_write +0x13004c4 MACH_vm_execve +0x13004c8 MACH_vm_wakeup_compactor_swapper +0x13004cc MACH_vm_upl_request +0x13004d0 MACH_vm_iopl_request +0x13004d4 MACH_vm_kern_request +0x1300500 MACH_vm_data_write +0x1300504 vm_pressure_level_change 0x1320000 vm_disconnect_all_page_mappings 0x1320004 vm_disconnect_task_page_mappings 0x1320008 RealFaultAddressInternal @@ -298,6 +309,11 @@ 0x14000C4 MACH_EXEC_DEMOTE 0x14000C8 MACH_AMP_SIGNAL_SPILL 0x14000CC MACH_AMP_STEAL +0x14000D0 MACH_SCHED_LOAD_EFFECTIVE +0x14000D4 MACH_PROMOTED +0x14000D8 MACH_UNPROMOTED +0x14000DC MACH_PROMOTED_UPDATE +0x14000E0 MACH_QUIESCENT_COUNTER 0x1500000 MACH_MSGID_INVALID 0x1600000 MTX_SLEEP 0x1600004 MTX_SLEEP_DEADLINE @@ -345,6 +361,7 @@ 0x170003c PMAP_flush_TLBS_TO 0x1700040 PMAP_flush_EPT 0x1700044 PMAP_fast_fault +0x1700048 PMAP_switch 0x1800000 MACH_CLOCK_EPOCH_CHANGE 0x1800004 MACH_CLOCK_BRIDGE_RCV_TS 0x1800008 MACH_CLOCK_BRIDGE_REMOTE_TIME @@ -872,6 +889,11 @@ 0x3110004 OpenThrottleWindow 0x3110008 CauseIOThrottle 0x311000C IO_THROTTLE_DISABLE +0x3120000 DECMPFS_decmp_file +0x3120004 DECMPFS_fetch_cmp_header +0x3120008 DECMPFS_fetch_uncmp_data +0x3120010 DECMPFS_free_cmp_data +0x3120014 DECMPFS_file_is_cmp 0x3CF0000 CP_OFFSET_IO 0x4010004 proc_exit 0x4010008 force_exit @@ -891,6 +913,8 @@ 0x402002C MEMSTAT_dirty_clear 0x4020030 MEMSTAT_grp_set_properties 0x4020034 MEMSTAT_do_kill +0x4020038 MEMSTAT_change_priority +0x402003C MEMSTAT_fast_jetsam 0x4030004 KEVENT_kq_processing_begin 0x4030008 KEVENT_kq_processing_end 0x403000c KEVENT_kqwq_processing_begin @@ -912,6 +936,7 @@ 0x403004c KEVENT_kqwl_bind 0x4030050 KEVENT_kqwl_unbind 0x4030054 KEVENT_knote_enable +0x4030058 KEVENT_knote_vanished 0x40e0104 BSC_msync_extended_info 0x40e0264 BSC_pread_extended_info 0x40e0268 BSC_pwrite_extended_info @@ -1235,16 +1260,30 @@ 0x8010800 F_DLIL_Output 0x8010c00 F_DLIL_IfOut 0x8040000 USER_STOP -0x9000084 
wq_deallocate_stack -0x9000088 wq_allocate_stack -0x9008070 wq_run_item -0x9008074 wq_clean_thread -0x9008078 wq_post_done -0x900807c wq_stk_cleanup -0x9008080 wq_tsd_cleanup -0x9008084 wq_tsd_destructor -0x9008088 wq_pthread_exit -0x900808c wq_workqueue_exit +0x9010004 wq_pthread_exit +0x9010008 wq_workqueue_exit +0x901000c wq_runthread +0x9010014 wq_death_call +0x9010024 wq_thread_block +0x9010028 wq_thactive_update +0x901002c wq_add_timer +0x9010030 wq_start_add_timer +0x9010050 wq_override_dispatch +0x9010054 wq_override_reset +0x9010074 wq_thread_create_failed +0x9010078 wq_thread_terminate +0x901007c wq_thread_create +0x9010080 wq_select_threadreq +0x901008c wq_creator_select +0x9010090 wq_creator_yield +0x9010094 wq_constrained_admission +0x9010098 wq_wqops_reqthreads +0x9020004 wq_create +0x9020008 wq_destroy +0x902000c wq_thread_logical_run +0x9020014 wq_thread_request_initiate +0x9020018 wq_thread_request_modify +0x9100004 bsdthread_set_qos_self 0xa000100 P_CS_Read 0xa000110 P_CS_Write 0xa000104 P_CS_ReadDone @@ -1517,13 +1556,17 @@ 0x2506002c PERF_KPC_Thd_Sample 0x25070000 PERF_KDBG_Handler 0x25080000 PERF_TK_Snap_Sample -0x25080004 PERF_TK_Snap_Data1 -0x25080008 PERF_TK_Snap_Data2 -0x2508000c PERF_TK_Snap_Data1_32 -0x25080010 PERF_TK_Snap_Data2_32 +0x25080004 PERF_TK_Snap_Data +0x25080008 PERF_TK_Snap_Data1_32 +0x2508000c PERF_TK_Snap_Data2_32 +0x25080010 PERF_TK_Info_Data +0x25090000 PERF_LZ_MkRunnable +0x25090040 PERF_LZ_WaitSample +0x25090080 PERF_LZ_CPUSample 0x250a0000 PERF_MI_Sample 0x250a0004 PERF_MI_Data 0x250a0008 PERF_MI_SysMem_Data +0x250a000c PERF_MI_SysMem_Data_2 0x26100008 imp_assertion_hold 0x2610000c imp_assertion_hold_ext 0x26100020 imp_assertion_externalize @@ -1585,6 +1628,7 @@ 0x26350028 imp_thread_qos 0x26360028 imp_thread_qos_override 0x26380028 imp_thread_qos_and_relprio +0x263b0028 imp_thread_qos_workq_override 0x263c0028 imp_thread_qos_promote 0x263d0028 imp_thread_qos_ipc_override 0x27000000 PERF_PCEVENT @@ -1620,6 +1664,21 @@ 
0x2a200008 ATM_VALUE_ADDED 0x2a300004 ATM_VALUE_UNREGISTERED 0x2a300008 ATM_VALUE_DIFF_MAILBOX +0x35100004 TURNSTILE_thread_added_to_turnstile_waitq +0x35100008 TURNSTILE_thread_removed_from_turnstile_waitq +0x3510000c TURNSTILE_thread_moved_in_turnstile_waitq +0x35100010 TURNSTILE_turnstile_added_to_turnstile_heap +0x35100014 TURNSTILE_turnstile_removed_from_turnstile_heap +0x35100018 TURNSTILE_turnstile_moved_in_turnstile_heap +0x3510001c TURNSTILE_turnstile_added_to_thread_heap +0x35100020 TURNSTILE_turnstile_removed_from_thread_heap +0x35100024 TURNSTILE_turnstile_moved_in_thread_heap +0x35100028 TURNSTILE_update_stopped_by_limit +0x3510002c TURNSTILE_thread_not_waiting_on_turnstile +0x35200004 TURNSTILE_turnstile_priority_change +0x35200008 TURNSTILE_thread_user_promotion_change +0x35300004 TURNSTILE_turnstile_prepare +0x35300008 TURNSTILE_turnstile_complete 0xff000104 MSG_mach_notify_port_deleted 0xff000114 MSG_mach_notify_port_destroyed 0xff000118 MSG_mach_notify_no_senders diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c index 61d59ee62..ed7e79658 100644 --- a/bsd/kern/tty.c +++ b/bsd/kern/tty.c @@ -913,6 +913,20 @@ ttysetpgrphup(struct tty *tp) { TTY_LOCK_OWNED(tp); /* debug assert */ SET(tp->t_state, TS_PGRPHUP); + /* + * Also wake up sleeping readers which may or may not belong to the + * current foreground process group. + * + * This forces any non-fg readers (which entered read when + * that process group was in the fg) to return with EIO (if they're + * catching SIGTTIN or with SIGTTIN). The ones which do belong to the fg + * process group will promptly go back to sleep and get a SIGHUP shortly + * This would normally happen as part of the close in revoke but if + * there is a sleeping reader from a non-fg process group we never get + * to the close because the sleeping reader holds an iocount on the + * vnode of the terminal which is going to get revoked->reclaimed. 
+ */ + wakeup(TSA_HUP_OR_INPUT(tp)); } /* @@ -1454,6 +1468,9 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p) * case. */ if (ISSET(tp->t_state, TS_PGRPHUP)) { + if (sessp != SESSION_NULL) + session_rele(sessp); + pg_rele(pgrp); error = EPERM; goto out; } @@ -1462,6 +1479,23 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p) tp->t_pgrp = pgrp; sessp->s_ttypgrpid = pgrp->pg_id; proc_list_unlock(); + + /* + * Wakeup readers to recheck if they are still the foreground + * process group. + * + * ttwakeup() isn't called because the readers aren't getting + * woken up becuse there is something to read but to force + * the re-evaluation of their foreground process group status. + * + * Ordinarily leaving these readers waiting wouldn't be an issue + * as launchd would send them a termination signal eventually + * (if nobody else does). But if this terminal happens to be + * /dev/console, launchd itself could get blocked forever behind + * a revoke of /dev/console and leave the system deadlocked. 
+ */ + wakeup(TSA_HUP_OR_INPUT(tp)); + /* SAFE: All callers drop the lock on return */ tty_unlock(tp); if (oldpg != PGRP_NULL) @@ -3333,6 +3367,12 @@ tty_set_knote_hook(struct knote *kn) NULL); assert(kr == KERN_SUCCESS); + /* + * Lazy allocate the waitqset to avoid potential allocation under + * a spinlock; + */ + waitq_set_lazy_init_link(&tmp_wqs); + old_wqs = uth->uu_wqset; uth->uu_wqset = &tmp_wqs; /* @@ -3507,8 +3547,6 @@ filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev) kn->kn_sdata = kev->data; kn->kn_sfflags = kev->fflags; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; if (kn->kn_vnode_kqok) { res = filt_tty_common(kn, tp); diff --git a/bsd/kern/tty_ptmx.c b/bsd/kern/tty_ptmx.c index 8e81ab669..893b1d912 100644 --- a/bsd/kern/tty_ptmx.c +++ b/bsd/kern/tty_ptmx.c @@ -597,9 +597,6 @@ ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) /* accept new kevent state */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) { - kn->kn_udata = kev->udata; - } /* recapture fired state of knote */ ret = ptsd_kqops_common(kn, tp); @@ -832,6 +829,7 @@ ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp) /* there's data on the TTY and it's not stopped */ if (tp->t_outq.c_cc && !(tp->t_state & TS_TTSTOP)) { retval = tp->t_outq.c_cc; + kn->kn_data = retval; } else if (((pti->pt_flags & PF_PKT) && pti->pt_send) || ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)) { retval = 1; @@ -907,9 +905,6 @@ ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev) /* accept new kevent state */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) { - kn->kn_udata = kev->udata; - } /* recapture fired state of knote */ ret = ptmx_kqops_common(kn, pti, tp); diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index e05d13c3e..6f09debf7 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -81,9 
+81,10 @@ extern kern_return_t memory_object_pages_resident(memory_object_control_t, boolean_t *); extern kern_return_t memory_object_signed(memory_object_control_t control, boolean_t is_signed); -extern boolean_t memory_object_is_slid(memory_object_control_t control); extern boolean_t memory_object_is_signed(memory_object_control_t); +/* XXX Same for those. */ + extern void Debugger(const char *message); @@ -112,7 +113,7 @@ static int ubc_msync_internal(vnode_t, off_t, off_t, off_t *, int, int *); static void ubc_cs_free(struct ubc_info *uip); static boolean_t ubc_cs_supports_multilevel_hash(struct cs_blob *blob); -static void ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob); +static kern_return_t ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob); struct zone *ubc_info_zone; static uint32_t cs_blob_generation_count = 1; @@ -477,19 +478,18 @@ cs_validate_blob(const CS_GenericBlob *blob, size_t length) static int cs_validate_csblob( const uint8_t *addr, - size_t *blob_size_p, + const size_t blob_size, const CS_CodeDirectory **rcd, const CS_GenericBlob **rentitlements) { const CS_GenericBlob *blob; int error; - size_t length, blob_size; + size_t length; *rcd = NULL; *rentitlements = NULL; blob = (const CS_GenericBlob *)(const void *)addr; - blob_size = *blob_size_p; length = blob_size; error = cs_validate_blob(blob, length); @@ -617,8 +617,6 @@ cs_validate_csblob( if (*rcd == NULL) return EBADEXEC; - *blob_size_p = blob_size; - return 0; } @@ -1467,17 +1465,6 @@ ubc_getobject(struct vnode *vp, __unused int flags) return (MEMORY_OBJECT_CONTROL_NULL); } -boolean_t -ubc_strict_uncached_IO(struct vnode *vp) -{ - boolean_t result = FALSE; - - if (UBCINFOEXISTS(vp)) { - result = memory_object_is_slid(vp->v_ubcinfo->ui_control); - } - return result; -} - /* * ubc_blktooff * @@ -2835,14 +2822,18 @@ ubc_cs_blob_allocate( vm_offset_t *blob_addr_p, vm_size_t *blob_size_p) { - kern_return_t kr; + kern_return_t kr = KERN_FAILURE; - *blob_addr_p = (vm_offset_t) 
kalloc_tag(*blob_size_p, VM_KERN_MEMORY_SECURITY); - if (*blob_addr_p == 0) { - kr = KERN_NO_SPACE; - } else { - kr = KERN_SUCCESS; + { + *blob_addr_p = (vm_offset_t) kalloc_tag(*blob_size_p, VM_KERN_MEMORY_SECURITY); + + if (*blob_addr_p == 0) { + kr = KERN_NO_SPACE; + } else { + kr = KERN_SUCCESS; + } } + return kr; } @@ -2851,7 +2842,14 @@ ubc_cs_blob_deallocate( vm_offset_t blob_addr, vm_size_t blob_size) { - kfree((void *) blob_addr, blob_size); +#if PMAP_CS + if (blob_size > pmap_cs_blob_limit) { + kmem_free(kernel_map, blob_addr, blob_size); + } else +#endif + { + kfree((void *) blob_addr, blob_size); + } } /* @@ -2871,6 +2869,7 @@ ubc_cs_supports_multilevel_hash(struct cs_blob *blob) { const CS_CodeDirectory *cd; + /* * Only applies to binaries that ship as part of the OS, * primarily the shared cache. @@ -2935,11 +2934,25 @@ ubc_cs_supports_multilevel_hash(struct cs_blob *blob) } /* - * All state and preconditions were checked before, so this - * function cannot fail. + * Given a cs_blob with an already chosen best code directory, this + * function allocates memory and copies into it only the blobs that + * will be needed by the kernel, namely the single chosen code + * directory (and not any of its alternatives) and the entitlement + * blob. + * + * This saves significant memory with agile signatures, and additional + * memory for 3rd Party Code because we also omit the CMS blob. + * + * To support multilevel and other potential code directory rewriting, + * the size of a new code directory can be specified. Since that code + * directory will replace the existing code directory, + * ubc_cs_reconstitute_code_signature does not copy the original code + * directory when a size is given, and the caller must fill it in. 
*/ -static void -ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) +static int +ubc_cs_reconstitute_code_signature(struct cs_blob const *blob, vm_size_t optional_new_cd_size, + vm_address_t *new_blob_addr_p, vm_size_t *new_blob_size_p, + CS_CodeDirectory **new_cd_p, CS_GenericBlob const **new_entitlements_p) { const CS_CodeDirectory *old_cd, *cd; CS_CodeDirectory *new_cd; @@ -2949,20 +2962,10 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) vm_size_t new_cdsize; kern_return_t kr; int error; - size_t length; - - uint32_t hashes_per_new_hash_shift = (uint32_t)(PAGE_SHIFT - blob->csb_hash_pageshift); - - if (cs_debug > 1) { - printf("CODE SIGNING: Attempting to convert Code Directory for %lu -> %lu page shift\n", - (unsigned long)blob->csb_hash_pageshift, (unsigned long)PAGE_SHIFT); - } old_cd = blob->csb_cd; - /* Up to the hashes, we can copy all data */ - new_cdsize = ntohl(old_cd->hashOffset); - new_cdsize += (ntohl(old_cd->nCodeSlots) >> hashes_per_new_hash_shift) * old_cd->hashSize; + new_cdsize = optional_new_cd_size != 0 ? optional_new_cd_size : htonl(old_cd->length); new_blob_size = sizeof(CS_SuperBlob); new_blob_size += sizeof(CS_BlobIndex); @@ -2980,7 +2983,7 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) printf("CODE SIGNING: Failed to allocate memory for new Code Signing Blob: %d\n", kr); } - return; + return ENOMEM; } CS_SuperBlob *new_superblob; @@ -3004,15 +3007,69 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) new_cd = (CS_CodeDirectory *)(new_blob_addr + cd_offset); } else { - vm_size_t cd_offset; + // Blob is the code directory, directly. + new_cd = (CS_CodeDirectory *)new_blob_addr; + } - cd_offset = sizeof(CS_SuperBlob) + 1 * sizeof(CS_BlobIndex); + if (optional_new_cd_size == 0) { + // Copy code directory, and revalidate. 
+ memcpy(new_cd, old_cd, new_cdsize); - new_superblob->count = htonl(1); - new_superblob->index[0].type = htonl(CSSLOT_CODEDIRECTORY); - new_superblob->index[0].offset = htonl((uint32_t)cd_offset); + vm_size_t length = new_blob_size; - new_cd = (CS_CodeDirectory *)new_blob_addr; + error = cs_validate_csblob((const uint8_t *)new_blob_addr, length, &cd, &entitlements); + + if (error) { + printf("CODE SIGNING: Failed to validate new Code Signing Blob: %d\n", + error); + + ubc_cs_blob_deallocate(new_blob_addr, new_blob_size); + return error; + } + *new_entitlements_p = entitlements; + } else { + // Caller will fill out and validate code directory. + memset(new_cd, 0, new_cdsize); + *new_entitlements_p = NULL; + } + + *new_blob_addr_p = new_blob_addr; + *new_blob_size_p = new_blob_size; + *new_cd_p = new_cd; + + return 0; +} + +static int +ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) +{ + const CS_CodeDirectory *old_cd, *cd; + CS_CodeDirectory *new_cd; + const CS_GenericBlob *entitlements; + vm_offset_t new_blob_addr; + vm_size_t new_blob_size; + vm_size_t new_cdsize; + int error; + + uint32_t hashes_per_new_hash_shift = (uint32_t)(PAGE_SHIFT - blob->csb_hash_pageshift); + + if (cs_debug > 1) { + printf("CODE SIGNING: Attempting to convert Code Directory for %lu -> %lu page shift\n", + (unsigned long)blob->csb_hash_pageshift, (unsigned long)PAGE_SHIFT); + } + + old_cd = blob->csb_cd; + + /* Up to the hashes, we can copy all data */ + new_cdsize = ntohl(old_cd->hashOffset); + new_cdsize += (ntohl(old_cd->nCodeSlots) >> hashes_per_new_hash_shift) * old_cd->hashSize; + + error = ubc_cs_reconstitute_code_signature(blob, new_cdsize, + &new_blob_addr, &new_blob_size, &new_cd, + &entitlements); + if (error != 0) { + printf("CODE SIGNING: Failed to reconsitute code signature: %d\n", error); + return error; } memcpy(new_cd, old_cd, ntohl(old_cd->hashOffset)); @@ -3066,21 +3123,17 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) 
blob->csb_hashtype->cs_final(dst, &mdctx); } - length = new_blob_size; - error = cs_validate_csblob((const uint8_t *)new_blob_addr, &length, &cd, &entitlements); - assert(length == new_blob_size); - if (error) { + error = cs_validate_csblob((const uint8_t *)new_blob_addr, new_blob_size, &cd, &entitlements); + if (error != 0) { - if (cs_debug > 1) { - printf("CODE SIGNING: Failed to validate new Code Signing Blob: %d\n", - error); - } + printf("CODE SIGNING: Failed to validate new Code Signing Blob: %d\n", + error); ubc_cs_blob_deallocate(new_blob_addr, new_blob_size); - return; + return error; } - /* New Code Directory is ready for use, swap it out in the blob structure */ + /* New Code Directory is ready for use, swap it out in the blob structure */ ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); blob->csb_mem_size = new_blob_size; @@ -3103,31 +3156,33 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob) } else { blob->csb_start_offset = 0; } + + return 0; } +/* + * Validate the code signature blob, create a struct cs_blob wrapper + * and return it together with a pointer to the chosen code directory + * and entitlements blob. + * + * Note that this takes ownership of the memory as addr, mainly because + * this function can actually replace the passed in blob with another + * one, e.g. when performing multilevel hashing optimization. 
+ */ int -ubc_cs_blob_add( - struct vnode *vp, - cpu_type_t cputype, - off_t base_offset, - vm_address_t *addr, - vm_size_t size, - struct image_params *imgp, - __unused int flags, - struct cs_blob **ret_blob) +cs_blob_create_validated( + vm_address_t * const addr, + vm_size_t size, + struct cs_blob ** const ret_blob, + CS_CodeDirectory const ** const ret_cd) { - kern_return_t kr; - struct ubc_info *uip; - struct cs_blob *blob, *oblob; - int error; + struct cs_blob *blob; + int error = EINVAL; const CS_CodeDirectory *cd; const CS_GenericBlob *entitlements; - off_t blob_start_offset, blob_end_offset; union cs_hash_union mdctx; - boolean_t record_mtime; size_t length; - record_mtime = FALSE; if (ret_blob) *ret_blob = NULL; @@ -3137,8 +3192,6 @@ ubc_cs_blob_add( } /* fill in the new blob */ - blob->csb_cpu_type = cputype; - blob->csb_base_offset = base_offset; blob->csb_mem_size = size; blob->csb_mem_offset = 0; blob->csb_mem_kaddr = *addr; @@ -3149,7 +3202,8 @@ ubc_cs_blob_add( blob->csb_teamid = NULL; blob->csb_entitlements_blob = NULL; blob->csb_entitlements = NULL; - + blob->csb_reconstituted = false; + /* Transfer ownership. 
Even on error, this function will deallocate */ *addr = 0; @@ -3158,7 +3212,7 @@ ubc_cs_blob_add( */ length = (size_t) size; error = cs_validate_csblob((const uint8_t *)blob->csb_mem_kaddr, - &length, &cd, &entitlements); + length, &cd, &entitlements); if (error) { if (cs_debug) @@ -3173,49 +3227,6 @@ ubc_cs_blob_add( uint8_t hash[CS_HASH_MAX_SIZE]; int md_size; - size = (vm_size_t) length; - assert(size <= blob->csb_mem_size); - if (size < blob->csb_mem_size) { - vm_address_t new_blob_addr; - const CS_CodeDirectory *new_cd; - const CS_GenericBlob *new_entitlements; - - kr = ubc_cs_blob_allocate(&new_blob_addr, &size); - if (kr != KERN_SUCCESS) { - if (cs_debug > 1) { - printf("CODE SIGNING: failed to " - "re-allocate blob (size " - "0x%llx->0x%llx) error 0x%x\n", - (uint64_t)blob->csb_mem_size, - (uint64_t)size, - kr); - } - } else { - memcpy((void *)new_blob_addr, (void *)blob->csb_mem_kaddr, size); - if (cd == NULL) { - new_cd = NULL; - } else { - new_cd = (void *)(((uintptr_t)cd - - (uintptr_t)blob->csb_mem_kaddr - + (uintptr_t)new_blob_addr)); - } - if (entitlements == NULL) { - new_entitlements = NULL; - } else { - new_entitlements = (void *)(((uintptr_t)entitlements - - (uintptr_t)blob->csb_mem_kaddr - + (uintptr_t)new_blob_addr)); - } -// printf("CODE SIGNING: %s:%d kaddr 0x%llx cd %p ents %p -> blob 0x%llx cd %p ents %p\n", __FUNCTION__, __LINE__, (uint64_t)blob->csb_mem_kaddr, cd, entitlements, (uint64_t)new_blob_addr, new_cd, new_entitlements); - ubc_cs_blob_deallocate(blob->csb_mem_kaddr, - blob->csb_mem_size); - blob->csb_mem_kaddr = new_blob_addr; - blob->csb_mem_size = size; - cd = new_cd; - entitlements = new_entitlements; - } - } - blob->csb_cd = cd; blob->csb_entitlements_blob = entitlements; /* may be NULL, not yet validated */ blob->csb_hashtype = cs_find_md(cd->hashType); @@ -3246,7 +3257,81 @@ ubc_cs_blob_add( memcpy(blob->csb_cdhash, hash, CS_CDHASH_LEN); } - /* + error = 0; + +out: + if (error != 0) { + cs_blob_free(blob); + blob = NULL; + 
cd = NULL; + } + + if (ret_blob != NULL) { + *ret_blob = blob; + } + if (ret_cd != NULL) { + *ret_cd = cd; + } + + return error; +} + +/* + * Free a cs_blob previously created by cs_blob_create_validated. + */ +void +cs_blob_free( + struct cs_blob * const blob) +{ + if (blob != NULL) { + if (blob->csb_mem_kaddr) { + ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); + blob->csb_mem_kaddr = 0; + } + if (blob->csb_entitlements != NULL) { + osobject_release(blob->csb_entitlements); + blob->csb_entitlements = NULL; + } + kfree(blob, sizeof (*blob)); + } +} + +int +ubc_cs_blob_add( + struct vnode *vp, + cpu_type_t cputype, + off_t base_offset, + vm_address_t *addr, + vm_size_t size, + struct image_params *imgp, + __unused int flags, + struct cs_blob **ret_blob) +{ + kern_return_t kr; + struct ubc_info *uip; + struct cs_blob *blob, *oblob; + int error; + CS_CodeDirectory const *cd; + off_t blob_start_offset, blob_end_offset; + boolean_t record_mtime; + + record_mtime = FALSE; + if (ret_blob) + *ret_blob = NULL; + + /* Create the struct cs_blob wrapper that will be attached to the vnode. + * Validates the passed in blob in the process. */ + error = cs_blob_create_validated(addr, size, &blob, &cd); + + if (error != 0) { + printf("malform code signature blob: %d\n", error); + return error; + } + + blob->csb_cpu_type = cputype; + blob->csb_base_offset = base_offset; + + /* * Let policy module check whether the blob's signature is accepted. 
*/ #if CONFIG_MACF @@ -3269,6 +3354,38 @@ ubc_cs_blob_add( } #endif +#if CONFIG_ENFORCE_SIGNED_CODE + /* + * Reconstitute code signature + */ + { + vm_address_t new_mem_kaddr = 0; + vm_size_t new_mem_size = 0; + + CS_CodeDirectory *new_cd = NULL; + CS_GenericBlob const *new_entitlements = NULL; + + error = ubc_cs_reconstitute_code_signature(blob, 0, + &new_mem_kaddr, &new_mem_size, + &new_cd, &new_entitlements); + + if (error != 0) { + printf("failed code signature reconstitution: %d\n", error); + goto out; + } + + ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); + + blob->csb_mem_kaddr = new_mem_kaddr; + blob->csb_mem_size = new_mem_size; + blob->csb_cd = new_cd; + blob->csb_entitlements_blob = new_entitlements; + blob->csb_reconstituted = true; + } + +#endif + + if (blob->csb_flags & CS_PLATFORM_BINARY) { if (cs_debug > 1) printf("check_signature[pid: %d]: platform binary\n", current_proc()->p_pid); @@ -3301,7 +3418,12 @@ ubc_cs_blob_add( } if (ubc_cs_supports_multilevel_hash(blob)) { - ubc_cs_convert_to_multilevel_hash(blob); + error = ubc_cs_convert_to_multilevel_hash(blob); + if (error != 0) { + printf("failed multilevel hash conversion: %d\n", error); + goto out; + } + blob->csb_reconstituted = true; } vnode_lock(vp); @@ -3378,6 +3500,11 @@ ubc_cs_blob_add( */ oblob->csb_cpu_type = cputype; } + + /* The signature is still accepted, so update the + * generation count. 
*/ + uip->cs_add_gen = cs_blob_generation_count; + vnode_unlock(vp); if (ret_blob) *ret_blob = oblob; @@ -3464,19 +3591,7 @@ ubc_cs_blob_add( if (cs_debug) printf("check_signature[pid: %d]: error = %d\n", current_proc()->p_pid, error); - /* we failed; release what we allocated */ - if (blob) { - if (blob->csb_mem_kaddr) { - ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size); - blob->csb_mem_kaddr = 0; - } - if (blob->csb_entitlements != NULL) { - osobject_release(blob->csb_entitlements); - blob->csb_entitlements = NULL; - } - kfree(blob, sizeof (*blob)); - blob = NULL; - } + cs_blob_free(blob); } if (error == EAGAIN) { @@ -3577,18 +3692,9 @@ ubc_cs_free( blob != NULL; blob = next_blob) { next_blob = blob->csb_next; - if (blob->csb_mem_kaddr != 0) { - ubc_cs_blob_deallocate(blob->csb_mem_kaddr, - blob->csb_mem_size); - blob->csb_mem_kaddr = 0; - } - if (blob->csb_entitlements != NULL) { - osobject_release(blob->csb_entitlements); - blob->csb_entitlements = NULL; - } OSAddAtomic(-1, &cs_blob_count); OSAddAtomic((SInt32) -blob->csb_mem_size, &cs_blob_size); - kfree(blob, sizeof (*blob)); + cs_blob_free(blob); } #if CHECK_CS_VALIDATION_BITMAP ubc_cs_validation_bitmap_deallocate( uip->ui_vnode ); @@ -3634,17 +3740,45 @@ ubc_cs_blob_revalidate( size = blob->csb_mem_size; error = cs_validate_csblob((const uint8_t *)blob->csb_mem_kaddr, - &size, &cd, &entitlements); + size, &cd, &entitlements); if (error) { if (cs_debug) { printf("CODESIGNING: csblob invalid: %d\n", error); } goto out; } - assert(size == blob->csb_mem_size); unsigned int cs_flags = (ntohl(cd->flags) & CS_ALLOWED_MACHO) | CS_VALID; unsigned int signer_type = CS_SIGNER_TYPE_UNKNOWN; + + if (blob->csb_reconstituted) { + /* + * Code signatures that have been modified after validation + * cannot be revalidated inline from their in-memory blob. 
+ * + * That's okay, though, because the only path left that relies + * on revalidation of existing in-memory blobs is the legacy + * detached signature database path, which only exists on macOS, + * which does not do reconstitution of any kind. + */ + if (cs_debug) { + printf("CODESIGNING: revalidate: not inline revalidating reconstituted signature.\n"); + } + + /* + * EAGAIN tells the caller that they may reread the code + * signature and try attaching it again, which is the same + * thing they would do if there was no cs_blob yet in the + * first place. + * + * Conveniently, after ubc_cs_blob_add did a successful + * validation, it will detect that a matching cs_blob (cdhash, + * offset, arch etc.) already exists, and return success + * without re-adding a cs_blob to the vnode. + */ + return EAGAIN; + } + /* callout to mac_vnode_check_signature */ #if CONFIG_MACF error = mac_vnode_check_signature(vp, blob, imgp, &cs_flags, &signer_type, flags); @@ -4150,3 +4284,66 @@ void ubc_cs_validation_bitmap_deallocate(__unused vnode_t vp){ return; } #endif /* CHECK_CS_VALIDATION_BITMAP */ + +#if PMAP_CS +kern_return_t +cs_associate_blob_with_mapping( + void *pmap, + vm_map_offset_t start, + vm_map_size_t size, + vm_object_offset_t offset, + void *blobs_p) +{ + off_t blob_start_offset, blob_end_offset; + kern_return_t kr; + struct cs_blob *blobs, *blob; + vm_offset_t kaddr; + struct pmap_cs_code_directory *cd_entry = NULL; + + if (!pmap_cs) { + return KERN_NOT_SUPPORTED; + } + + blobs = (struct cs_blob *)blobs_p; + + for (blob = blobs; + blob != NULL; + blob = blob->csb_next) { + blob_start_offset = (blob->csb_base_offset + + blob->csb_start_offset); + blob_end_offset = (blob->csb_base_offset + + blob->csb_end_offset); + if ((off_t) offset < blob_start_offset || + (off_t) offset >= blob_end_offset || + (off_t) (offset + size) <= blob_start_offset || + (off_t) (offset + size) > blob_end_offset) { + continue; + } + kaddr = blob->csb_mem_kaddr; + if (kaddr == 0) { + /* blob 
data has been released */ + continue; + } + cd_entry = blob->csb_pmap_cs_entry; + if (cd_entry == NULL) { + continue; + } + + break; + } + + if (cd_entry != NULL) { + kr = pmap_cs_associate(pmap, + cd_entry, + start, + size); + } else { + kr = KERN_CODESIGN_ERROR; + } +#if 00 + printf("FBDP %d[%s] pmap_cs_associate(%p,%p,0x%llx,0x%llx) -> kr=0x%x\n", proc_selfpid(), &(current_proc()->p_comm[0]), pmap, cd_entry, (uint64_t)start, (uint64_t)size, kr); + kr = KERN_SUCCESS; +#endif + return kr; +} +#endif /* PMAP_CS */ diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index e03a08e6d..4433e81bd 100644 --- a/bsd/kern/uipc_domain.c +++ b/bsd/kern/uipc_domain.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2013 Apple Inc. All rights reserved. + * Copyright (c) 1998-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index c0b051922..f33335a38 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -344,6 +344,7 @@ static uint64_t mb_expand_16kcl_total; static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */ static uint32_t mbuf_worker_run_cnt; static uint64_t mbuf_worker_last_runtime; +static uint64_t mbuf_drain_last_runtime; static int mbuf_worker_ready; /* worker thread is runnable */ static int ncpu; /* number of CPUs */ static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */ @@ -708,7 +709,7 @@ static char *mbuf_dump_buf; * also toggeable via the kern.ipc.mb_watchdog sysctl. * Garbage collection is also enabled by default on embedded platforms. * mb_drain_maxint controls the amount of time to wait (in seconds) before - * consecutive calls to m_drain(). + * consecutive calls to mbuf_drain(). 
*/ #if CONFIG_EMBEDDED static unsigned int mb_watchdog = 1; @@ -801,6 +802,16 @@ static int m_copyback0(struct mbuf **, int, int, const void *, int, int); static struct mbuf *m_split0(struct mbuf *, int, int, int); __private_extern__ void mbuf_report_peak_usage(void); static boolean_t mbuf_report_usage(mbuf_class_t); +#if DEBUG || DEVELOPMENT +#define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__) +static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...); +static char *mbwdog_logging; +const unsigned mbwdog_logging_size = 4096; +static size_t mbwdog_logging_used; +#else +#define mbwdog_logger(fmt, ...) do { } while (0) +#endif +static void mbuf_drain_locked(boolean_t); /* flags for m_copyback0 */ #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */ @@ -1528,6 +1539,7 @@ mbinit(void) _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI); _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI); _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI); + _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI); _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO); _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO); @@ -2258,7 +2270,9 @@ mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) if ((w = mb_waiters) > 0) mb_waiters = 0; - + if (w) { + mbwdog_logger("waking up all threads"); + } lck_mtx_unlock(mbuf_mlock); if (w != 0) @@ -2332,6 +2346,9 @@ mbuf_slab_notify(void *arg, u_int32_t reason) m_notified(class)++; mb_waiters = 0; } + if (w) { + mbwdog_logger("waking up all threads"); + } lck_mtx_unlock(mbuf_mlock); if (w != 0) @@ -2755,6 +2772,9 @@ mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged) if ((w = mb_waiters) > 0) mb_waiters = 0; + if (w) { + mbwdog_logger("waking up all threads"); + } lck_mtx_unlock(mbuf_mlock); @@ -3144,6 +3164,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) * pool or if the number of free clusters is less than requested. 
*/ if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) { + mbwdog_logger("waking up the worker thread to to grow %s by %d", + m_cname(class), i); wakeup((caddr_t)&mbuf_worker_needs_wakeup); mbuf_worker_needs_wakeup = FALSE; } @@ -3317,8 +3339,10 @@ freelist_populate(mbuf_class_t class, unsigned int num, int wait) if ((i = mb_waiters) > 0) mb_waiters = 0; - if (i != 0) + if (i != 0) { + mbwdog_logger("waking up all threads"); wakeup(mb_waitchan); + } } return (count != 0); } @@ -4808,37 +4832,27 @@ m_freem(struct mbuf *m) /* * Mbuffer utility routines. */ - /* - * Compute the amount of space available before the current start - * of data in an mbuf. + * Set the m_data pointer of a newly allocated mbuf to place an object of the + * specified size at the end of the mbuf, longword aligned. + * + * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as + * separate macros, each asserting that it was called at the proper moment. + * This required callers to themselves test the storage type and call the + * right one. Rather than require callers to be aware of those layout + * decisions, we centralize here. */ -int -m_leadingspace(struct mbuf *m) +void +m_align(struct mbuf *m, int len) { - if (m->m_flags & M_EXT) { - if (MCLHASREFERENCE(m)) - return (0); - return (m->m_data - m->m_ext.ext_buf); - } - if (m->m_flags & M_PKTHDR) - return (m->m_data - m->m_pktdat); - return (m->m_data - m->m_dat); -} + int adjust = 0; -/* - * Compute the amount of space available after the end of data in an mbuf. 
- */ -int -m_trailingspace(struct mbuf *m) -{ - if (m->m_flags & M_EXT) { - if (MCLHASREFERENCE(m)) - return (0); - return (m->m_ext.ext_buf + m->m_ext.ext_size - - (m->m_data + m->m_len)); - } - return (&m->m_dat[MLEN] - (m->m_data + m->m_len)); + /* At this point data must point to start */ + VERIFY(m->m_data == M_START(m)); + VERIFY(len >= 0); + VERIFY(len <= M_SIZE(m)); + adjust = M_SIZE(m) - len; + m->m_data += adjust &~ (sizeof(long) - 1); } /* @@ -5321,6 +5335,17 @@ m_pullup(struct mbuf *n, int len) __func__, len); goto bad; } + if (len > MLEN) { + os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big", + __func__, len); + goto bad; + } + if ((n->m_flags & M_EXT) == 0 && + n->m_data >= &n->m_dat[MLEN]) { + os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds", + __func__); + goto bad; + } /* * If first mbuf has no cluster, and has room for len bytes @@ -5328,7 +5353,7 @@ m_pullup(struct mbuf *n, int len) * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && - n->m_data + len < &n->m_dat[MLEN] && n->m_next) { + len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) { if (n->m_len >= len) return (n); m = n; @@ -5355,11 +5380,11 @@ m_pullup(struct mbuf *n, int len) m->m_len += count; n->m_len -= count; space -= count; - if (n->m_len) + if (n->m_len != 0) n->m_data += count; else n = m_free(n); - } while (len > 0 && n); + } while (len > 0 && n != NULL); if (len > 0) { (void) m_free(m); goto bad; @@ -5439,18 +5464,52 @@ m_split0(struct mbuf *m0, int len0, int wait, int copyhdr) struct mbuf *m, *n; unsigned len = len0, remain; + /* + * First iterate to the mbuf which contains the first byte of + * data at offset len0 + */ for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); + /* + * len effectively is now the offset in the current + * mbuf where we have to perform split. + * + * remain becomes the tail length. 
+ * Note that len can also be == m->m_len + */ remain = m->m_len - len; - if (copyhdr && (m0->m_flags & M_PKTHDR)) { + + /* + * If current mbuf len contains the entire remaining offset len, + * just make the second mbuf chain pointing to next mbuf onwards + * and return after making necessary adjustments + */ + if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) { + _MGETHDR(n, wait, m0->m_type); + if (n == NULL) + return (NULL); + n->m_next = m->m_next; + m->m_next = NULL; + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + n->m_pkthdr.len = m0->m_pkthdr.len - len0; + m0->m_pkthdr.len = len0; + return (n); + } if (copyhdr && (m0->m_flags & M_PKTHDR)) { _MGETHDR(n, wait, m0->m_type); if (n == NULL) return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; + + /* + * If current points to external storage + * then it can be shared by making last mbuf + * of head chain and first mbuf of current chain + * pointing to different data offsets + */ if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { @@ -5472,7 +5531,11 @@ m_split0(struct mbuf *m0, int len0, int wait, int copyhdr) _MGET(n, wait, m->m_type); if (n == NULL) return (NULL); - M_ALIGN(n, remain); + + if ((m->m_flags & M_EXT) == 0) { + VERIFY(remain <= MLEN); + M_ALIGN(n, remain); + } } extpacket: if (m->m_flags & M_EXT) { @@ -5607,6 +5670,9 @@ m_howmany(int num, size_t bufsize) if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { + mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)", + sumclusters, nclusters, + (m_16kclusters << NCLPJCLSHIFT), njcl); return (0); } @@ -6550,8 +6616,9 @@ mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) wakeup((caddr_t)&mbuf_worker_needs_wakeup); mbuf_worker_needs_wakeup = FALSE; } - + mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class)); (void) msleep(mb_waitchan, 
mbuf_mlock, (PZERO-1), m_cname(class), NULL); + mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class)); /* We are now up; stop getting notified until next round */ mbuf_waiter_dec(class, (wait & MCR_COMP)); @@ -6576,8 +6643,31 @@ mbuf_worker_thread(void) while (1) { lck_mtx_lock(mbuf_mlock); + mbwdog_logger("worker thread running"); mbuf_worker_run_cnt++; mbuf_expand = 0; + /* + * Allocations are based on page size, so if we have depleted + * the reserved spaces, try to free mbufs from the major classes. + */ +#if PAGE_SIZE == 4096 + uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; + uint32_t m_clusters = m_total(MC_CL); + uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; + uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters; + if (sumclusters >= nclusters) { + mbwdog_logger("reclaiming bigcl"); + mbuf_drain_locked(TRUE); + m_reclaim(MC_BIGCL, 4, FALSE); + } +#else + uint32_t m_16kclusters = m_total(MC_16KCL); + if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) { + mbwdog_logger("reclaiming 16kcl"); + mbuf_drain_locked(TRUE); + m_reclaim(MC_16KCL, 4, FALSE); + } +#endif if (m_region_expand(MC_CL) > 0) { int n; mb_expand_cl_cnt++; @@ -6591,8 +6681,10 @@ mbuf_worker_thread(void) } m_region_expand(MC_CL) = 0; - if (n > 0) + if (n > 0) { + mbwdog_logger("expanding MC_CL by %d", n); freelist_populate(MC_CL, n, M_WAIT); + } } if (m_region_expand(MC_BIGCL) > 0) { int n; @@ -6607,8 +6699,10 @@ mbuf_worker_thread(void) } m_region_expand(MC_BIGCL) = 0; - if (n > 0) + if (n > 0) { + mbwdog_logger("expanding MC_BIGCL by %d", n); freelist_populate(MC_BIGCL, n, M_WAIT); + } } if (m_region_expand(MC_16KCL) > 0) { int n; @@ -6623,8 +6717,10 @@ mbuf_worker_thread(void) } m_region_expand(MC_16KCL) = 0; - if (n > 0) + if (n > 0) { + mbwdog_logger("expanding MC_16KCL by %d", n); (void) freelist_populate(MC_16KCL, n, M_WAIT); + } } /* @@ -6633,11 +6729,23 @@ mbuf_worker_thread(void) * mbufs -- otherwise we could have a large number of 
useless * clusters allocated. */ - while (m_total(MC_MBUF) < - (m_total(MC_BIGCL) + m_total(MC_CL) + m_total(MC_16KCL))) { + mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d", + m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL), + m_total(MC_16KCL)); + uint32_t total_mbufs = m_total(MC_MBUF); + uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) + + m_total(MC_16KCL); + if (total_mbufs < total_clusters) { + mbwdog_logger("expanding MC_MBUF by %d", + total_clusters - total_mbufs); + } + while (total_mbufs < total_clusters) { mb_expand_cnt++; if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) break; + total_mbufs = m_total(MC_MBUF); + total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) + + m_total(MC_16KCL); } mbuf_worker_needs_wakeup = TRUE; @@ -6650,6 +6758,7 @@ mbuf_worker_thread(void) mbuf_worker_last_runtime = net_uptime(); assert_wait((caddr_t)&mbuf_worker_needs_wakeup, THREAD_UNINT); + mbwdog_logger("worker thread sleeping"); lck_mtx_unlock(mbuf_mlock); (void) thread_block((thread_continue_t)mbuf_worker_thread); } @@ -7419,6 +7528,7 @@ mbuf_dump(void) mleak_trace_stat_t *mltr; char *c = mbuf_dump_buf; int i, j, k, clen = MBUF_DUMP_BUF_SIZE; + bool printed_banner = false; mbuf_dump_buf[0] = '\0'; @@ -7562,14 +7672,29 @@ mbuf_dump(void) net_uptime() - mbuf_worker_last_runtime); MBUF_DUMP_BUF_CHK(); } + if (mbuf_drain_last_runtime != 0) { + k = snprintf(c, clen, "drain routine last run time: " + "%llu (%llu seconds ago)\n", + mbuf_drain_last_runtime, + net_uptime() - mbuf_drain_last_runtime); + MBUF_DUMP_BUF_CHK(); + } - k = snprintf(c, clen, "\nlargest allocation failure backtraces:\n"); +#if DEBUG || DEVELOPMENT + k = snprintf(c, clen, "\nworker thread log:\n%s\n", mbwdog_logging); MBUF_DUMP_BUF_CHK(); +#endif for (j = 0; j < MTRACELARGE_NUM_TRACES; j++) { struct mtracelarge *trace = &mtracelarge_table[j]; if (trace->size == 0 || trace->depth == 0) continue; + if (printed_banner == false) { + k = snprintf(c, clen, + "\nlargest 
allocation failure backtraces:\n"); + MBUF_DUMP_BUF_CHK(); + printed_banner = true; + } k = snprintf(c, clen, "size %llu: < ", trace->size); MBUF_DUMP_BUF_CHK(); for (i = 0; i < trace->depth; i++) { @@ -8017,10 +8142,27 @@ mbuf_report_peak_usage(void) } /* - * Called by the VM when there's memory pressure. + * Simple routine to avoid taking the lock when we can't run the + * mbuf drain. */ -__private_extern__ void -m_drain(void) +static int +mbuf_drain_checks(boolean_t ignore_waiters) +{ + + if (mb_drain_maxint == 0) + return 0; + if (!ignore_waiters && mb_waiters != 0) + return 0; + + return 1; +} + +/* + * Called by the VM when there's memory pressure or when we exhausted + * the 4k/16k reserved space. + */ +static void +mbuf_drain_locked(boolean_t ignore_waiters) { mbuf_class_t mc; mcl_slab_t *sp, *sp_tmp, *nsp; @@ -8030,11 +8172,11 @@ m_drain(void) ppnum_t offset; mcache_obj_t *obj; unsigned long per; - static uint64_t last_drain = 0; static unsigned char scratch[32]; static ppnum_t scratch_pa = 0; - if (mb_drain_maxint == 0 || mb_waiters) + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + if (!mbuf_drain_checks(ignore_waiters)) return; if (scratch_pa == 0) { bzero(scratch, sizeof(scratch)); @@ -8053,20 +8195,15 @@ m_drain(void) * waiting times for mbufs. Purge caches if we were asked to drain * in the last 5 minutes. */ - lck_mtx_lock(mbuf_mlock); - if (last_drain == 0) { - last_drain = net_uptime(); - lck_mtx_unlock(mbuf_mlock); - return; - } - interval = net_uptime() - last_drain; - if (interval <= mb_drain_maxint) { - lck_mtx_unlock(mbuf_mlock); - return; + if (mbuf_drain_last_runtime != 0) { + interval = net_uptime() - mbuf_drain_last_runtime; + if (interval <= mb_drain_maxint) { + return; + } + if (interval <= mb_drain_maxint * 5) + purge_caches = TRUE; } - if (interval <= mb_drain_maxint * 5) - purge_caches = TRUE; - last_drain = net_uptime(); + mbuf_drain_last_runtime = net_uptime(); /* * Don't free any memory if we're using 60% or more. 
*/ @@ -8076,7 +8213,6 @@ m_drain(void) } per = (use_mem * 100) / total_mem; if (per >= 60) { - lck_mtx_unlock(mbuf_mlock); return; } /* @@ -8205,9 +8341,20 @@ m_drain(void) mbstat.m_mbufs = m_total(MC_MBUF); mbuf_stat_sync(); mbuf_mtypes_sync(TRUE); +} + +__private_extern__ void +mbuf_drain(boolean_t ignore_waiters) +{ + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED); + if (!mbuf_drain_checks(ignore_waiters)) + return; + lck_mtx_lock(mbuf_mlock); + mbuf_drain_locked(ignore_waiters); lck_mtx_unlock(mbuf_mlock); } + static int m_drain_force_sysctl SYSCTL_HANDLER_ARGS { @@ -8218,16 +8365,60 @@ m_drain_force_sysctl SYSCTL_HANDLER_ARGS if (err != 0 || req->newptr == USER_ADDR_NULL) return (err); if (val) { - lck_mtx_lock(mbuf_mlock); - printf("%s\n", mbuf_dump()); - lck_mtx_unlock(mbuf_mlock); - m_drain(); + mbuf_drain(TRUE); } return (err); } #if DEBUG || DEVELOPMENT +static void +_mbwdog_logger(const char *func, const int line, const char *fmt, ...) +{ + va_list ap; + struct timeval now; + char str[384], p[256]; + int len; + + LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + if (mbwdog_logging == NULL) { + mbwdog_logging = _MALLOC(mbwdog_logging_size, + M_TEMP, M_ZERO|M_NOWAIT); + if (mbwdog_logging == NULL) + return; + } + va_start(ap, fmt); + vsnprintf(p, sizeof(p), fmt, ap); + va_end(ap); + microuptime(&now); + len = snprintf(str, sizeof(str), + "\n%ld.%d (%d/%llx) %s:%d %s", + now.tv_sec, now.tv_usec, + current_proc()->p_pid, + (uint64_t)VM_KERNEL_ADDRPERM(current_thread()), + func, line, p); + if (len < 0) + return; + if (mbwdog_logging_used + len > mbwdog_logging_size) { + mbwdog_logging_used = mbwdog_logging_used / 2; + memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used, + mbwdog_logging_size - mbwdog_logging_used); + mbwdog_logging[mbwdog_logging_used] = 0; + } + strlcat(mbwdog_logging, str, mbwdog_logging_size); + mbwdog_logging_used += len; +} + +static int +sysctl_mbwdog_log SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + return 
SYSCTL_OUT(req, mbwdog_logging, mbwdog_logging_used); +} +SYSCTL_DECL(_kern_ipc); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mbwdog_log, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_mbwdog_log, "A", ""); static int mbtest_val; static int mbtest_running; @@ -8299,8 +8490,7 @@ mbtest SYSCTL_HANDLER_ARGS return (error); } -#endif - +#endif // DEBUG || DEVELOPMENT static void mtracelarge_register(size_t size) diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index bd6b3030f..348afd442 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2017 Apple Inc. All rights reserved. + * Copyright (c) 1998-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -141,7 +141,6 @@ /* TODO: this should be in a header file somewhere */ extern char *proc_name_address(void *p); -extern char *proc_best_name(proc_t); static u_int32_t so_cache_hw; /* High water mark for socache */ static u_int32_t so_cache_timeouts; /* number of timeouts */ @@ -2332,12 +2331,12 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, if ((m->m_flags & M_EXT)) mlen = m->m_ext.ext_size - - m_leadingspace(m); + M_LEADINGSPACE(m); else if ((m->m_flags & M_PKTHDR)) mlen = - MHLEN - m_leadingspace(m); + MHLEN - M_LEADINGSPACE(m); else - mlen = MLEN - m_leadingspace(m); + mlen = MLEN - M_LEADINGSPACE(m); len = imin(mlen, bytes_to_copy); chainlength += len; @@ -2431,8 +2430,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, * Content filter processing */ error = cfil_sock_data_out(so, addr, top, - control, (sendflags & MSG_OOB) ? 
- sock_data_filt_flag_oob : 0); + control, sendflags); if (error) { if (error == EJUSTRETURN) { error = 0; @@ -2500,6 +2498,51 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, return (error); } +int +sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags) +{ + struct mbuf *m0, *control_end; + + socket_lock_assert_owned(so); + + /* + * top must points to mbuf chain to be sent. + * If control is not NULL, top must be packet header + */ + VERIFY(top != NULL && + (control == NULL || top->m_flags & M_PKTHDR)); + + /* + * If control is not passed in, see if we can get it + * from top. + */ + if (control == NULL && (top->m_flags & M_PKTHDR) == 0) { + // Locate start of control if present and start of data + for (m0 = top; m0 != NULL; m0 = m0->m_next) { + if (m0->m_flags & M_PKTHDR) { + top = m0; + break; + } else if (m0->m_type == MT_CONTROL) { + if (control == NULL) { + // Found start of control + control = m0; + } + if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) { + // Found end of control + control_end = m0; + } + } + } + if (control_end != NULL) + control_end->m_next = NULL; + } + + int error = (*so->so_proto->pr_usrreqs->pru_send) + (so, sendflags, top, addr, control, current_proc()); + + return error; +} + /* * Supported only connected sockets (no address) without ancillary data * (control mbuf) for atomic protocols @@ -2684,12 +2727,12 @@ sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags) for (n = m; n != NULL; n = n->m_next) { if ((m->m_flags & M_EXT)) mlen = m->m_ext.ext_size - - m_leadingspace(m); + M_LEADINGSPACE(m); else if ((m->m_flags & M_PKTHDR)) mlen = - MHLEN - m_leadingspace(m); + MHLEN - M_LEADINGSPACE(m); else - mlen = MLEN - m_leadingspace(m); + mlen = MLEN - M_LEADINGSPACE(m); len = imin(mlen, bytes_to_copy); /* @@ -4798,6 +4841,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) case 
SO_OOBINLINE: case SO_TIMESTAMP: case SO_TIMESTAMP_MONOTONIC: + case SO_TIMESTAMP_CONTINUOUS: case SO_DONTTRUNC: case SO_WANTMORE: case SO_WANTOOBFLAG: @@ -5495,6 +5539,7 @@ sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock) case SO_OOBINLINE: case SO_TIMESTAMP: case SO_TIMESTAMP_MONOTONIC: + case SO_TIMESTAMP_CONTINUOUS: case SO_DONTTRUNC: case SO_WANTMORE: case SO_WANTOOBFLAG: @@ -6187,8 +6232,6 @@ filt_sortouch(struct knote *kn, struct kevent_internal_s *kev) /* save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* determine if changes result in fired events */ retval = filt_soread_common(kn, so); @@ -6341,8 +6384,6 @@ filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev) /*save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* determine if these changes result in a triggered event */ ret = filt_sowrite_common(kn, so); @@ -6547,8 +6588,6 @@ filt_socktouch( /* save off the new input fflags and data */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* restrict the current results to the (smaller?) 
set of new interest */ /* @@ -6838,23 +6877,29 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) if (so->so_flags & SOF_NODEFUNCT) { if (noforce) { err = EOPNOTSUPP; + if (p != PROC_NULL) { + SODEFUNCTLOG("%s[%d, %s]: (target pid %d " + "name %s level %d) so 0x%llx [%d,%d] " + "is not eligible for defunct " + "(%d)\n", __func__, proc_selfpid(), + proc_best_name(current_proc()), proc_pid(p), + proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + SOCK_DOM(so), SOCK_TYPE(so), err); + } + return (err); + } + so->so_flags &= ~SOF_NODEFUNCT; + if (p != PROC_NULL) { SODEFUNCTLOG("%s[%d, %s]: (target pid %d " "name %s level %d) so 0x%llx [%d,%d] " - "is not eligible for defunct " + "defunct by force " "(%d)\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so), err); - return (err); } - so->so_flags &= ~SOF_NODEFUNCT; - SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " - "so 0x%llx [%d,%d] defunct by force\n", __func__, - proc_selfpid(), proc_best_name(current_proc()), - proc_pid(p), proc_best_name(p), level, - (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - SOCK_DOM(so), SOCK_TYPE(so)); } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) { struct inpcb *inp = (struct inpcb *)so->so_pcb; struct ifnet *ifp = inp->inp_last_outifp; @@ -6865,7 +6910,7 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd); } else if (soextbkidlestat.so_xbkidle_time == 0) { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime); - } else if (noforce) { + } else if (noforce && p != PROC_NULL) { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active); so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG; @@ -6875,14 +6920,14 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY); err = 
EOPNOTSUPP; - SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s " - "level %d) extend bk idle so 0x%llx rcv hw %d " - "cc %d\n", - __func__, proc_selfpid(), + SODEFUNCTLOG("%s[%d, %s]: (target pid %d " + "name %s level %d) so 0x%llx [%d,%d] " + "extend bk idle " + "(%d)\n", __func__, proc_selfpid(), proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - so->so_rcv.sb_hiwat, so->so_rcv.sb_cc); + SOCK_DOM(so), SOCK_TYPE(so), err); return (err); } else { OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced); @@ -6908,13 +6953,16 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) } done: - SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " - "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(), - proc_best_name(current_proc()), proc_pid(p), proc_best_name(p), - level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), - SOCK_TYPE(so), defunct ? "is already" : "marked as", - (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : ""); - + if (p != PROC_NULL) { + SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " + "so 0x%llx [%d,%d] %s defunct%s\n", __func__, + proc_selfpid(), proc_best_name(current_proc()), + proc_pid(p), proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), + SOCK_TYPE(so), defunct ? "is already" : "marked as", + (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? + " extbkidle" : ""); + } return (err); } @@ -6938,23 +6986,29 @@ sodefunct(struct proc *p, struct socket *so, int level) char d[MAX_IPv6_STR_LEN]; struct inpcb *inp = sotoinpcb(so); - SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " - "so 0x%llx [%s %s:%d -> %s:%d] is now defunct " - "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", - __func__, proc_selfpid(), proc_best_name(current_proc()), - proc_pid(p), proc_best_name(p), level, - (uint64_t)DEBUG_KERNEL_ADDRPERM(so), - (SOCK_TYPE(so) == SOCK_STREAM) ? 
"TCP" : "UDP", - inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? - (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr), - s, sizeof (s)), ntohs(inp->in6p_lport), - inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ? - (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr, - d, sizeof (d)), ntohs(inp->in6p_fport), - (uint32_t)rcv->sb_sel.si_flags, - (uint32_t)snd->sb_sel.si_flags, - rcv->sb_flags, snd->sb_flags); - } else { + if (p != PROC_NULL) { + SODEFUNCTLOG( + "%s[%d, %s]: (target pid %d name %s level %d) " + "so 0x%llx [%s %s:%d -> %s:%d] is now defunct " + "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, " + " snd_fl 0x%x]\n", __func__, + proc_selfpid(), proc_best_name(current_proc()), + proc_pid(p), proc_best_name(p), level, + (uint64_t)DEBUG_KERNEL_ADDRPERM(so), + (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP", + inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? + (void *)&inp->inp_laddr.s_addr : + (void *)&inp->in6p_laddr), + s, sizeof (s)), ntohs(inp->in6p_lport), + inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ? 
+ (void *)&inp->inp_faddr.s_addr : + (void *)&inp->in6p_faddr, + d, sizeof (d)), ntohs(inp->in6p_fport), + (uint32_t)rcv->sb_sel.si_flags, + (uint32_t)snd->sb_sel.si_flags, + rcv->sb_flags, snd->sb_flags); + } + } else if (p != PROC_NULL) { SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) " "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, " "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index 87d06ad39..264819a7c 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -106,8 +106,6 @@ #define DBG_FNC_SBDROP NETDBG_CODE(DBG_NETSOCK, 4) #define DBG_FNC_SBAPPEND NETDBG_CODE(DBG_NETSOCK, 5) -extern char *proc_best_name(proc_t p); - SYSCTL_DECL(_kern_ipc); __private_extern__ u_int32_t net_io_policy_throttle_best_effort = 0; @@ -116,8 +114,6 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, throttle_best_effort, static inline void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *); static struct socket *sonewconn_internal(struct socket *, int); -static int sbappendaddr_internal(struct sockbuf *, struct sockaddr *, - struct mbuf *, struct mbuf *); static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *, struct mbuf *); static void soevent_ifdenied(struct socket *); @@ -1110,23 +1106,20 @@ sbinsertoob(struct sockbuf *sb, struct mbuf *m0) } /* - * Append address and data, and optionally, control (ancillary) data - * to the receive queue of a socket. If present, - * m0 must include a packet header with total length. - * Returns 0 if no space in sockbuf or insufficient mbufs. + * Concatenate address (optional), control (optional) and data into one + * single mbuf chain. If sockbuf *sb is passed in, space check will be + * performed. 
* - * Returns: 0 No space/out of mbufs - * 1 Success + * Returns: mbuf chain pointer if succeeded, NULL if failed */ -static int -sbappendaddr_internal(struct sockbuf *sb, struct sockaddr *asa, - struct mbuf *m0, struct mbuf *control) +struct mbuf * +sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { - struct mbuf *m, *n, *nlast; - int space = asa->sa_len; + struct mbuf *m = NULL, *n = NULL; + int space = 0; if (m0 && (m0->m_flags & M_PKTHDR) == 0) - panic("sbappendaddr"); + panic("sbconcat_mbufs"); if (m0) space += m0->m_pkthdr.len; @@ -1135,22 +1128,59 @@ sbappendaddr_internal(struct sockbuf *sb, struct sockaddr *asa, if (n->m_next == 0) /* keep pointer to last control buf */ break; } - if (space > sbspace(sb)) - return (0); - if (asa->sa_len > MLEN) - return (0); - MGET(m, M_DONTWAIT, MT_SONAME); - if (m == 0) - return (0); - m->m_len = asa->sa_len; - bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + + if (asa != NULL) { + if (asa->sa_len > MLEN) + return (NULL); + space += asa->sa_len; + } + + if (sb != NULL && space > sbspace(sb)) + return (NULL); + if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; - m->m_next = control; - SBLASTRECORDCHK(sb, "sbappendadddr 1"); + if (asa != NULL) { + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) { + if (n) { + /* unchain control and data if necessary */ + n->m_next = NULL; + } + return (NULL); + } + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + + m->m_next = control; + } else { + m = control; + } + + return (m); +} + +/* + * Queue mbuf chain to the receive queue of a socket. + * Parameter space is the total len of the mbuf chain. + * If passed in, sockbuf space will be checked. 
+ * + * Returns: 0 Invalid mbuf chain + * 1 Success + */ +int +sbappendchain(struct sockbuf *sb, struct mbuf *m, int space) +{ + struct mbuf *n, *nlast; + + if (m == NULL) + return (0); + + if (space != 0 && space > sbspace(sb)) + return (0); for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); @@ -1186,6 +1216,7 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, { int result = 0; boolean_t sb_unix = (sb->sb_flags & SB_UNIX); + struct mbuf *mbuf_chain = NULL; if (error_out) *error_out = 0; @@ -1230,7 +1261,9 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, m0->m_flags &= ~M_SKIPCFIL; } - result = sbappendaddr_internal(sb, asa, m0, control); + mbuf_chain = sbconcat_mbufs(sb, asa, m0, control); + SBLASTRECORDCHK(sb, "sbappendadddr 1"); + result = sbappendchain(sb, mbuf_chain, 0); if (result == 0) { if (m0) m_freem(m0); @@ -1359,6 +1392,9 @@ sbappendmsgstream_rcv(struct sockbuf *sb, struct mbuf *m, uint32_t seqnum, int ret = 0; struct socket *so = sb->sb_so; + if (m == NULL) + return (0); + VERIFY((m->m_flags & M_PKTHDR) && m_pktlen(m) > 0); VERIFY(so->so_msg_state != NULL); VERIFY(sb->sb_flags & SB_RECV); diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index b8b429c08..16a044f7f 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -2,7 +2,7 @@ * Copyright (c) 2000-2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. 
- * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -180,8 +180,8 @@ static int unp_listen(struct unpcb *, proc_t); static void unpcb_to_compat(struct unpcb *, struct unpcb_compat *); static void unp_get_locks_in_order(struct socket *so, struct socket *conn_so); -static void -unp_get_locks_in_order(struct socket *so, struct socket *conn_so) +static void +unp_get_locks_in_order(struct socket *so, struct socket *conn_so) { if (so < conn_so) { socket_lock(conn_so, 1); @@ -369,7 +369,7 @@ uipc_rcvd(struct socket *so, __unused int flags) #define snd (&so2->so_snd) if (unp->unp_conn == 0) break; - + so2 = unp->unp_conn->unp_socket; unp_get_locks_in_order(so, so2); /* @@ -485,7 +485,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, control = NULL; } - if (so != so2) + if (so != so2) socket_unlock(so2, 1); m = NULL; @@ -524,7 +524,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, so2 = unp->unp_conn->unp_socket; unp_get_locks_in_order(so, so2); - /* Check socket state again as we might have unlocked the socket + /* Check socket state again as we might have unlocked the socket * while trying to get the locks in order */ @@ -532,7 +532,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, error = EPIPE; socket_unlock(so2, 1); break; - } + } if (unp->unp_flags & UNP_TRACE_MDNS) { struct mdns_ipc_msg_hdr hdr; @@ -558,7 +558,7 @@ uipc_send(struct 
socket *so, int flags, struct mbuf *m, struct sockaddr *nam, snd->sb_mbmax -= rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; - if ((int32_t)snd->sb_hiwat >= + if ((int32_t)snd->sb_hiwat >= (int32_t)(rcv->sb_cc - unp->unp_conn->unp_cc)) { snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc; } else { @@ -844,7 +844,7 @@ unp_attach(struct socket *so) return (ENOBUFS); bzero(unp, sizeof (*unp)); - lck_mtx_init(&unp->unp_mtx, + lck_mtx_init(&unp->unp_mtx, unp_mtx_grp, unp_mtx_attr); lck_rw_lock_exclusive(unp_list_mtx); @@ -886,7 +886,7 @@ unp_detach(struct unpcb *unp) lck_rw_lock_exclusive(unp_list_mtx); LIST_REMOVE(unp, unp_link); - --unp_count; + --unp_count; ++unp_gencnt; lck_rw_done(unp_list_mtx); if (unp->unp_vnode) { @@ -915,7 +915,7 @@ unp_detach(struct unpcb *unp) /* This datagram socket is connected to one or more * sockets. In order to avoid a race condition between removing - * this reference and closing the connected socket, we need + * this reference and closing the connected socket, we need * to check disconnect_in_progress */ if (so_locked == 1) { @@ -935,12 +935,12 @@ unp_detach(struct unpcb *unp) unp2 = unp->unp_refs.lh_first; socket_lock(unp2->unp_socket, 1); } - + lck_mtx_lock(unp_disconnect_lock); disconnect_in_progress = 0; wakeup(&disconnect_in_progress); lck_mtx_unlock(unp_disconnect_lock); - + if (unp2 != NULL) { /* We already locked this socket and have a reference on it */ unp_drop(unp2, ECONNRESET); @@ -1005,10 +1005,11 @@ unp_bind( /* * Note: sun_path is not a zero terminated "C" string */ - ASSERT(namelen < SOCK_MAXADDRLEN); + if (namelen >= SOCK_MAXADDRLEN) + return (EINVAL); bcopy(soun->sun_path, buf, namelen); buf[namelen] = 0; - + socket_unlock(so, 0); NDINIT(&nd, CREATE, OP_MKFIFO, FOLLOW | LOCKPARENT, UIO_SYSSPACE, @@ -1119,7 +1120,8 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) /* * Note: sun_path is not a zero terminated "C" string */ - ASSERT(len < SOCK_MAXADDRLEN); + 
if (len >= SOCK_MAXADDRLEN) + return (EINVAL); bcopy(soun->sun_path, buf, len); buf[len] = 0; @@ -1298,7 +1300,7 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) unp2->unp_flags |= UNP_TRACE_MDNS; } } - + error = unp_connect2(so, so2); decref_out: @@ -1350,18 +1352,18 @@ unp_connect2(struct socket *so, struct socket *so2) return (EINVAL); unp->unp_conn = unp2; - so2->so_usecount++; - + so2->so_usecount++; + switch (so->so_type) { case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); - if (so != so2) { + if (so != so2) { /* Avoid lock order reversals due to drop/acquire in soisconnected. */ /* Keep an extra reference on so2 that will be dropped - * soon after getting the locks in order - */ + * soon after getting the locks in order + */ socket_unlock(so2, 0); soisconnected(so); unp_get_locks_in_order(so, so2); @@ -1461,7 +1463,7 @@ unp_disconnect(struct unpcb *unp) socket_lock(so2, 1); waitso = so2; } else { - if (so_locked == 1) { + if (so_locked == 1) { socket_unlock(so, 0); } socket_lock(so2, 1); @@ -1476,18 +1478,18 @@ unp_disconnect(struct unpcb *unp) /* Check for the UNP_DONTDISCONNECT flag, if it * is set, release both sockets and go to sleep */ - + if ((((struct unpcb *)waitso->so_pcb)->unp_flags & UNP_DONTDISCONNECT) != 0) { if (so != so2) { socket_unlock(so2, 1); } so_locked = 0; - (void)msleep(waitso->so_pcb, &unp->unp_mtx, + (void)msleep(waitso->so_pcb, &unp->unp_mtx, PSOCK | PDROP, "unpdisconnect", NULL); goto try_again; } - + if (unp->unp_conn == NULL) { panic("unp_conn became NULL after sleep"); } @@ -1739,7 +1741,7 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS if (req->oldptr == USER_ADDR_NULL) { n = unp_count; req->oldidx = 2 * sizeof (xug) + (n + n / 8) * - (sizeof (struct xunpcb64)); + (sizeof (struct xunpcb64)); lck_rw_done(unp_list_mtx); return (0); } @@ -1929,7 +1931,7 @@ unp_externalize(struct mbuf *rights) * now change each pointer to an fd in the global table to * an integer that is the index to the local fd 
table entry * that we set up to point to the global one we are transferring. - * XXX (1) this assumes a pointer and int are the same size, + * XXX (1) this assumes a pointer and int are the same size, * XXX or the mbuf can hold the expansion * XXX (2) allocation failures should be non-fatal */ @@ -1974,7 +1976,7 @@ unp_externalize(struct mbuf *rights) if (fileproc_l[i] != NULL) { VERIFY(fileproc_l[i]->f_fglob != NULL && (fileproc_l[i]->f_fglob->fg_lflags & FG_RMMSGQ)); - VERIFY(fds[i] > 0); + VERIFY(fds[i] >= 0); fg_removeuipc(fileproc_l[i]->f_fglob); /* Drop the iocount */ @@ -2079,7 +2081,7 @@ unp_internalize(struct mbuf *control, proc_t p) } rp = (struct fileglob **)(cm + 1); - /* On K64 we need to walk backwards because a fileglob * is twice the size of an fd + /* On K64 we need to walk backwards because a fileglob * is twice the size of an fd * and doing them in-order would result in stomping over unprocessed fd's */ for (i = (oldfds - 1); i >= 0; i--) { @@ -2227,7 +2229,7 @@ unp_gc(void) * message buffers. Follow those links and mark them * as accessible too. * - * In case a file is passed onto itself we need to + * In case a file is passed onto itself we need to * release the file lock. */ lck_mtx_unlock(&fg->fg_lock); @@ -2316,7 +2318,7 @@ unp_gc(void) so = (struct socket *)(tfg->fg_data); socket_lock(so, 0); - + sorflush(so); socket_unlock(so, 0); @@ -2435,7 +2437,7 @@ unp_lock(struct socket *so, int refcount, void * lr) if (so->so_pcb) { lck_mtx_lock(&((struct unpcb *)so->so_pcb)->unp_mtx); } else { - panic("unp_lock: so=%p NO PCB! lr=%p ref=0x%x\n", + panic("unp_lock: so=%p NO PCB! 
lr=%p ref=0x%x\n", so, lr_saved, so->so_usecount); } @@ -2482,7 +2484,7 @@ unp_unlock(struct socket *so, int refcount, void * lr) if (unp->unp_addr) FREE(unp->unp_addr, M_SONAME); - + lck_mtx_unlock(mutex_held); lck_mtx_destroy(&unp->unp_mtx, unp_mtx_grp); @@ -2511,4 +2513,3 @@ unp_getlock(struct socket *so, __unused int flags) return (so->so_proto->pr_domain->dom_mtx); } } - diff --git a/bsd/libkern/Makefile b/bsd/libkern/Makefile index 6b0060acd..5d65346c8 100644 --- a/bsd/libkern/Makefile +++ b/bsd/libkern/Makefile @@ -7,7 +7,7 @@ include $(MakeInc_cmd) include $(MakeInc_def) KERNELFILES = \ - libkern.h + libkern.h copyio.h EXPORT_MI_LIST = ${KERNELFILES} diff --git a/bsd/libkern/copyio.h b/bsd/libkern/copyio.h new file mode 100644 index 000000000..1bec805e4 --- /dev/null +++ b/bsd/libkern/copyio.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _LIBKERN_COPYIO_H_ +#define _LIBKERN_COPYIO_H_ + +#include + +__BEGIN_DECLS + +int copyin(const user_addr_t uaddr, void *kaddr, size_t len); +int copyout(const void *kaddr, user_addr_t udaddr, size_t len); + +#if defined (_FORTIFY_SOURCE) && _FORTIFY_SOURCE == 0 +/* FORTIFY_SOURCE disabled */ +#else +__attribute__((always_inline)) static inline int +__copyin_chk(const user_addr_t uaddr, void *kaddr, size_t len, size_t chk_size) +{ + if (chk_size < len) { + panic("__copyin_chk object size check failed: uaddr %p, kaddr %p, (%zu < %zu)", (void*)uaddr, kaddr, len, chk_size); + } + return copyin(uaddr, kaddr, len); +} + +__attribute__((always_inline)) static inline int +__copyout_chk(const void *kaddr, user_addr_t uaddr, size_t len, size_t chk_size) +{ + if (chk_size < len) { + panic("__copyout_chk object size check failed: uaddr %p, kaddr %p, (%zu < %zu)", (void*)uaddr, kaddr, len, chk_size); + } + return copyout(kaddr, uaddr, len); +} +#define copyin(uaddr, kaddr, len) __copyin_chk(uaddr, kaddr, len, __builtin_object_size(kaddr, 0)) +#define copyout(kaddr, uaddr, len) __copyout_chk(kaddr, uaddr, len, __builtin_object_size(kaddr, 0)) +#endif +__END_DECLS +#endif /* _LIBKERN_COPYIO_H_ */ diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index fa8317325..73545298d 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -77,6 +77,8 @@ #include #include #include +#include +#include #if defined(__arm__) || defined(__arm64__) 
#include /* for _ARM_ARCH_* */ @@ -191,8 +193,6 @@ __nosan_crc16(uint16_t crc, const void *bufp, size_t len) { return crc16(crc, bu int copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done); int copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done); int copyoutstr(const void *kaddr, user_addr_t udaddr, size_t len, size_t *done); -int copyin(const user_addr_t uaddr, void *kaddr, size_t len); -int copyout(const void *kaddr, user_addr_t udaddr, size_t len); #if XNU_KERNEL_PRIVATE extern int copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes); #endif diff --git a/bsd/man/man2/Makefile b/bsd/man/man2/Makefile index 331ca334e..0e724cc86 100644 --- a/bsd/man/man2/Makefile +++ b/bsd/man/man2/Makefile @@ -41,6 +41,7 @@ DATAFILES = \ disconnectx.2 \ dup.2 \ dup2.2 \ + errno.2 \ execve.2 \ exchangedata.2 \ faccessat.2 \ diff --git a/bsd/man/man2/exchangedata.2 b/bsd/man/man2/exchangedata.2 index 10a22ae34..48d05b6c8 100644 --- a/bsd/man/man2/exchangedata.2 +++ b/bsd/man/man2/exchangedata.2 @@ -73,6 +73,16 @@ Programs that reference the file via an object identifier will continue to reference the original file, but now it has the new data. .Pp . +WARNING: This system call is largely supported only by HFS and AFP file systems. Many other +file systems, including APFS, do not support it. Further, it is not supported on iOS, tvOS, or watchOS. +It is recommended that callers refer +instead to +.Fn rename +or +.Fn renamex_np +to conduct safe-save operations instead. +.Pp +. .\" path1 and path2 parameters . The @@ -115,6 +125,7 @@ is set to indicate the error. .Sh COMPATIBILITY Not all volumes support .Fn exchangedata . +This includes APFS volumes. 
You can test whether a volume supports .Fn exchangedata by using diff --git a/bsd/man/man2/fs_snapshot_create.2 b/bsd/man/man2/fs_snapshot_create.2 index d9b740f67..6407f428e 100644 --- a/bsd/man/man2/fs_snapshot_create.2 +++ b/bsd/man/man2/fs_snapshot_create.2 @@ -1,4 +1,4 @@ -.\" Copyright (c) 2017 Apple Computer, Inc. All rights reserved. +.\" Copyright (c) 2017-2018 Apple Computer, Inc. All rights reserved. .\" .\" The contents of this file constitute Original Code as defined in and .\" are subject to the Apple Public Source License Version 1.1 (the @@ -20,7 +20,7 @@ .Dt FS_SNAPSHOT_CREATE 2 .Os Darwin .Sh NAME -.Nm fs_snasphot_create +.Nm fs_snapshot_create .Nd create read only snapshot of a mounted filesystem .Sh SYNOPSIS .Fd #include diff --git a/bsd/man/man2/getsockname.2 b/bsd/man/man2/getsockname.2 index 58950e7df..fd2bd1d3b 100644 --- a/bsd/man/man2/getsockname.2 +++ b/bsd/man/man2/getsockname.2 @@ -63,6 +63,17 @@ On return it contains the actual size of the address returned (in bytes). .Pp The address is truncated if the buffer provided is too small. +.Pp +Note: For the UNIX domain, the address length returned is the +.Fa address_len +parameter passed to the previous +.Xr bind 2 +system call and not the +.Va sa_len +field of the +.Fa address +parameter passed to +.Xr bind 2 . .Sh RETURN VALUES .Rv -std getsockname .Sh ERRORS diff --git a/bsd/man/man2/searchfs.2 b/bsd/man/man2/searchfs.2 index f141a5437..b90765b34 100644 --- a/bsd/man/man2/searchfs.2 +++ b/bsd/man/man2/searchfs.2 @@ -16,7 +16,7 @@ .\" .\" @(#)searchfs.2 . 
-.Dd October 13, 2008 +.Dd November 16, 2017 .Dt SEARCHFS 2 .Os Darwin .Sh NAME @@ -26,7 +26,7 @@ .Fd #include .Fd #include .Ft int -.Fn searchfs "const char* path" "struct fssearchblock* searchBlock" "unsigned int* numMatches" "unsigned int scriptCode" "unsigned int options" "struct searchstate* state" +.Fn searchfs "const char* path" "struct fssearchblock* searchBlock" "unsigned long* numMatches" "unsigned int scriptCode" "unsigned int options" "struct searchstate* state" . .Sh DESCRIPTION The @@ -818,8 +818,8 @@ static int SearchFSDemo( SearchAttrBuf lower; SearchAttrBuf upper; static const unsigned char kAllOnes[4] = { 0xFF, 0xFF, 0xFF, 0xFF }; - unsigned int matchCount; - unsigned int matchIndex; + unsigned long matchCount; + unsigned long matchIndex; unsigned int options; searchstate_t state; ResultAttrBuf * thisEntry; diff --git a/bsd/man/man2/send.2 b/bsd/man/man2/send.2 index 17f3be243..acadd27b2 100644 --- a/bsd/man/man2/send.2 +++ b/bsd/man/man2/send.2 @@ -230,6 +230,9 @@ The socket is shut down for writing or the socket is connection-mode and is no longer connected. In the latter case, and if the socket is of type SOCK_STREAM, the SIGPIPE signal is generated to the calling thread. +.\" ========== +.It Bq Er EADDRNOTAVAIL +The specified address is not available or no longer available on this machine. .El .Pp The diff --git a/bsd/man/man2/setuid.2 b/bsd/man/man2/setuid.2 index a49ca1068..205e8cfdc 100644 --- a/bsd/man/man2/setuid.2 +++ b/bsd/man/man2/setuid.2 @@ -104,7 +104,7 @@ in this way, the effective user ID of a set-user-ID executable may be toggled by switching to the real user ID, then re-enabled by reverting to the set-user-ID value. Similarly, the effective group ID may be set to the value -of the real group ID or the saved set-user-ID. +of the real group ID or the saved set-group-ID. 
.Pp .Sh RETURN VALUES Upon success, these functions return 0; diff --git a/bsd/man/man3/getiopolicy_np.3 b/bsd/man/man3/getiopolicy_np.3 index 43bc02499..8c7e69743 100644 --- a/bsd/man/man3/getiopolicy_np.3 +++ b/bsd/man/man3/getiopolicy_np.3 @@ -23,20 +23,6 @@ or the current thread. The policy of the I/O of the given type can be get or set for the given .Fa scope . .Pp -The I/O type is specified in the argument -.Fa iotype . -The only currently supported I/O type is -.Dv IOPOL_TYPE_DISK , -which can mean either the I/O policy for I/Os to local disks or to -remote volumes. -I/Os to local disks are I/Os sent to the media without going through a network, -including I/Os to internal and external hard drives, optical media in internal -and external drives, flash drives, floppy disks, ram disks, and mounted disk -images which reside on these media. -I/Os to remote volumes are I/Os that require network activity to complete the -operation. -This is currently only supported for remote volumes mounted by SMB or AFP. -.Pp The scope that the I/O policy takes effect is specified in the argument .Fa scope as follows: @@ -55,8 +41,24 @@ the argument .Fa policy is an integer which contains the new I/O policy to be set for the given I/O type and scope. -.Fa Policy -can have the following values: +.Pp +The I/O type is specified in the argument +.Fa iotype . +The currently supported I/O types are as follows: +.Bl -tag -width F1 +.It IOPOL_TYPE_DISK +This can mean either the I/O policy for I/Os to local disks or to +remote volumes. +I/Os to local disks are I/Os sent to the media without going through a network, +including I/Os to internal and external hard drives, optical media in internal +and external drives, flash drives, floppy disks, ram disks, and mounted disk +images which reside on these media. +I/Os to remote volumes are I/Os that require network activity to complete the +operation. +This is currently only supported for remote volumes mounted by SMB or AFP. 
+.Pp +IOPOL_TYPE_DISK supports following values for +.Fa policy: .Bl -tag -width IOPOL_PASSIVEXXX .It IOPOL_IMPORTANT I/Os with the IMPORTANT policy are unrestricted. This policy should only be @@ -102,6 +104,28 @@ broken into smaller requests which are then issued serially. The I/O policy of a newly created process is inherited from its parent process. The I/O policy of an I/O request is the lowest priority policy of the current thread and the current process. +.It IOPOL_TYPE_VFS_ATIME_UPDATES +This +.Fa iotype +lets users change the access time updates policy for the files accessed +by the current thread or process. +.Pp +IOPOL_TYPE_VFS_ATIME_UPDATES supports following values for +.Fa policy: +.Bl -tag -width IOPOL_ATIME_UPDATES_DEFAULT +.It IOPOL_ATIME_UPDATES_OFF +The ATIME_UPDATES_OFF policy turns off access time updation for files accessed. +This policy is useful for applications which access a large number of files +to reduce the metadata I/O writes. +.It IOPOL_ATIME_UPDATES_DEFAULT +This is the default I/O policy for new threads. +.El +.El +.Pp +Like with IOPOL_TYPE_DISK, the I/O policy of a newly created process is +inherited from its parent process. Access time updates are turned off if the +I/O policy is set to IOPOL_ATIME_UPDATES_OFF for the current thread or current +process. 
.Sh RETURN VALUES The .Fn getiopolicy_np diff --git a/bsd/miscfs/devfs/devfs_tree.c b/bsd/miscfs/devfs/devfs_tree.c index adbc2e78f..6358c5c6b 100644 --- a/bsd/miscfs/devfs/devfs_tree.c +++ b/bsd/miscfs/devfs/devfs_tree.c @@ -967,7 +967,7 @@ static int dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp, struct devfsmount *dvm) { - devdirent_t * entry_p; + devdirent_t * entry_p = NULL; devdirent_t * newback; devdirent_t * newfront; int error; @@ -978,10 +978,14 @@ dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp, * go get the node made (if we need to) * use the back one as a prototype */ - if ((error = dev_add_entry(back->de_name, parent, type, - NULL, dnp, - parent?parent->dn_dvm:dvm, &entry_p)) != 0) { + error = dev_add_entry(back->de_name, parent, type, NULL, dnp, + parent?parent->dn_dvm:dvm, &entry_p); + if (!error && (entry_p == NULL)) { + error = ENOMEM; /* Really can't happen, but make static analyzer happy */ + } + if (error != 0) { printf("duplicating %s failed\n",back->de_name); + goto out; } /* @@ -1009,6 +1013,7 @@ dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp, } } } +out: *dnm_pp = entry_p; return error; } diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c index 389adb7e4..05e28abc1 100644 --- a/bsd/miscfs/nullfs/null_vnops.c +++ b/bsd/miscfs/nullfs/null_vnops.c @@ -1035,3 +1035,36 @@ static struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = { }; struct vnodeopv_desc nullfs_vnodeop_opv_desc = {&nullfs_vnodeop_p, nullfs_vnodeop_entries}; + +//NULLFS Specific helper function + +int +nullfs_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp) +{ + int result = EINVAL; + + if (out_vpp == NULL || in_vp == NULL) { + goto end; + } + + struct vfsstatfs * sp = NULL; + mount_t mp = vnode_mount(in_vp); + + sp = vfs_statfs(mp); + //If this isn't a nullfs vnode or it is but it's a special vnode + if (strcmp(sp->f_fstypename, "nullfs") != 0 || 
nullfs_checkspecialvp(in_vp)) { + *out_vpp = NULLVP; + result = ENOENT; + goto end; + } + + vnode_t lvp = NULLVPTOLOWERVP(in_vp); + if ((result = vnode_getwithvid(lvp, NULLVPTOLOWERVID(in_vp)))) { + goto end; + } + + *out_vpp = lvp; + +end: + return result; +} diff --git a/bsd/miscfs/nullfs/nullfs.h b/bsd/miscfs/nullfs/nullfs.h index e29b9e696..80d8f174c 100644 --- a/bsd/miscfs/nullfs/nullfs.h +++ b/bsd/miscfs/nullfs/nullfs.h @@ -142,6 +142,8 @@ int null_getnewvnode( struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root); void null_hashrem(struct null_node * xp); +int nullfs_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp); + #define NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp) #define NULLVPTOLOWERVID(vp) (VTONULL(vp)->null_lowervid) #define NULLVPTOMYVID(vp) (VTONULL(vp)->null_myvid) @@ -150,9 +152,6 @@ extern struct vnodeopv_desc nullfs_vnodeop_opv_desc; extern vop_t * nullfs_vnodeop_p; -// int nullfs_install_filesys(void); -// int nullfs_uninstall_filesys(void); - __END_DECLS #ifdef NULLFS_DEBUG diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 6e0c09d1c..702787a41 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -1138,7 +1138,7 @@ throttle_timer(struct _throttle_io_info_t *info) ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]); TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist); ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; - ut->uu_is_throttled = FALSE; + ut->uu_is_throttled = false; wake_address = (caddr_t)&ut->uu_on_throttlelist; } @@ -1156,7 +1156,7 @@ throttle_timer(struct _throttle_io_info_t *info) TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist); ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE; - ut->uu_is_throttled = FALSE; + ut->uu_is_throttled = false; wakeup(&ut->uu_on_throttlelist); } @@ -1335,13 +1335,12 @@ throttle_init(void) } void -sys_override_io_throttle(int 
flag) +sys_override_io_throttle(boolean_t enable_override) { - if (flag == THROTTLE_IO_ENABLE) - lowpri_throttle_enabled = 1; - - if (flag == THROTTLE_IO_DISABLE) + if (enable_override) lowpri_throttle_enabled = 0; + else + lowpri_throttle_enabled = 1; } int rethrottle_wakeups = 0; @@ -1382,19 +1381,19 @@ rethrottle_thread(uthread_t ut) boolean_t s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(&ut->uu_rethrottle_lock); - if (ut->uu_is_throttled == FALSE) - ut->uu_was_rethrottled = TRUE; + if (!ut->uu_is_throttled) + ut->uu_was_rethrottled = true; else { int my_new_level = throttle_get_thread_throttle_level(ut); if (my_new_level != ut->uu_on_throttlelist) { /* * ut is currently blocked (as indicated by - * ut->uu_is_throttled == TRUE) + * ut->uu_is_throttled == true) * and we're changing it's throttle level, so * we need to wake it up. */ - ut->uu_is_throttled = FALSE; + ut->uu_is_throttled = false; wakeup(&ut->uu_on_throttlelist); rethrottle_wakeups++; @@ -1622,7 +1621,7 @@ throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier) { assert(ut != NULL); /* Bootcache misses should always be throttled */ - if (ut->uu_throttle_bc == TRUE) + if (ut->uu_throttle_bc) thread_throttle_level = THROTTLE_LEVEL_TIER3; /* @@ -1781,7 +1780,7 @@ throttle_lowpri_io(int sleep_amount) info = ut->uu_throttle_info; if (info == NULL) { - ut->uu_throttle_bc = FALSE; + ut->uu_throttle_bc = false; ut->uu_lowpri_window = 0; return (0); } @@ -1791,12 +1790,12 @@ throttle_lowpri_io(int sleep_amount) if (sleep_amount == 0) goto done; - if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE) + if (sleep_amount == 1 && !ut->uu_throttle_bc) sleep_amount = 0; throttle_io_period_num = info->throttle_io_period_num; - ut->uu_was_rethrottled = FALSE; + ut->uu_was_rethrottled = false; while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) { @@ -1836,7 +1835,7 @@ throttle_lowpri_io(int sleep_amount) * this is the critical section w/r to our 
interaction * with "rethrottle_thread" */ - if (ut->uu_was_rethrottled == TRUE) { + if (ut->uu_was_rethrottled) { lck_spin_unlock(&ut->uu_rethrottle_lock); ml_set_interrupts_enabled(s); @@ -1844,7 +1843,7 @@ throttle_lowpri_io(int sleep_amount) KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, 0, 0, 0); - ut->uu_was_rethrottled = FALSE; + ut->uu_was_rethrottled = false; continue; } KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE, @@ -1859,7 +1858,7 @@ throttle_lowpri_io(int sleep_amount) assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT); - ut->uu_is_throttled = TRUE; + ut->uu_is_throttled = true; lck_spin_unlock(&ut->uu_rethrottle_lock); ml_set_interrupts_enabled(s); @@ -1869,8 +1868,8 @@ throttle_lowpri_io(int sleep_amount) ut->uu_wmesg = NULL; - ut->uu_is_throttled = FALSE; - ut->uu_was_rethrottled = FALSE; + ut->uu_is_throttled = false; + ut->uu_was_rethrottled = false; lck_mtx_lock(&info->throttle_lock); @@ -1904,7 +1903,7 @@ throttle_lowpri_io(int sleep_amount) } ut->uu_throttle_info = NULL; - ut->uu_throttle_bc = FALSE; + ut->uu_throttle_bc = false; ut->uu_lowpri_window = 0; throttle_info_rel(info); @@ -1942,7 +1941,7 @@ void throttle_info_reset_window(uthread_t ut) ut->uu_throttle_info = NULL; ut->uu_lowpri_window = 0; - ut->uu_throttle_bc = FALSE; + ut->uu_throttle_bc = false; } } @@ -2349,7 +2348,7 @@ spec_strategy(struct vnop_strategy_args *ap) if (kdebug_enable) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, - buf_kernel_addrperm_addr(bp), bdev, (int)buf_blkno(bp), buf_count(bp), 0); + buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0); } thread_update_io_stats(current_thread(), buf_count(bp), code); @@ -2650,7 +2649,7 @@ static void filt_specdetach(struct knote *kn); static int filt_specevent(struct knote *kn, long hint); static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev); 
static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -static unsigned filt_specpeek(struct knote *kn); +static int filt_specpeek(struct knote *kn); SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = { .f_isfd = 1, @@ -2719,6 +2718,14 @@ spec_knote_select_and_link(struct knote *kn) */ old_wqs = uth->uu_wqset; uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs); + + /* + * Be sure that the waitq set is linked + * before calling select to avoid possible + * allocation under spinlocks. + */ + waitq_set_lazy_init_link(uth->uu_wqset); + /* * Now these are the laws of VNOP_SELECT, as old and as true as the sky, * And the device that shall keep it may prosper, but the device that shall @@ -2877,8 +2884,6 @@ filt_spectouch(struct knote *kn, struct kevent_internal_s *kev) { kn->kn_sdata = kev->data; kn->kn_sfflags = kev->fflags; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; if (kev->flags & EV_ENABLE) { return spec_knote_select_and_link(kn); @@ -2902,8 +2907,6 @@ filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in ctx = vfs_context_current(); vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; - /* FIXME JMM - locking against touches? 
*/ - error = vnode_getwithvid(vp, kn->kn_hookid); if (error != 0) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); @@ -2930,7 +2933,7 @@ filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in return res; } -static unsigned +static int filt_specpeek(struct knote *kn) { int selres = 0; @@ -2938,6 +2941,6 @@ filt_specpeek(struct knote *kn) selres = spec_knote_select_and_link(kn); filt_spec_common(kn, selres); - return kn->kn_data; + return kn->kn_data != 0; } diff --git a/bsd/net/Makefile b/bsd/net/Makefile index a2e90264f..60c66a58b 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -69,7 +69,8 @@ PRIVATE_DATAFILES = \ raw_cb.h \ route.h \ net_perf.h \ - net_kev.h + net_kev.h \ + nat464_utils.h PRIVATE_KERNELFILES = $(filter-out radix.h,${KERNELFILES}) \ bpfdesc.h ppp_comp.h \ diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index 70b69823b..860b73848 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -62,7 +62,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)bpf.c 8.2 (Berkeley) 3/28/94 + * @(#)bpf.c 8.2 (Berkeley) 3/28/94 * * $FreeBSD: src/sys/net/bpf.c,v 1.59.2.5 2001/01/05 04:49:09 jdp Exp $ */ @@ -76,9 +76,9 @@ #include "bpf.h" #ifndef __GNUC__ -#define inline +#define inline #else -#define inline __inline +#define inline __inline #endif #include @@ -108,6 +108,8 @@ #include #include +#include +#include #include #include #include @@ -116,6 +118,8 @@ #include #include #include +#include +#include #include #include #include @@ -132,13 +136,17 @@ #include #endif /* MAC_NET */ +#include + extern int tvtohz(struct timeval *); -#define BPF_BUFSIZE 4096 -#define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio) +#define BPF_BUFSIZE 4096 +#define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio) +#define PRINET 26 /* interruptible */ -#define PRINET 26 /* interruptible */ +#define ISAKMP_HDR_SIZE (sizeof(struct isakmp) + sizeof(struct isakmp_gen)) +#define ESP_HDR_SIZE sizeof(struct newesp) typedef void (*pktcopyfunc_t)(const void *, void *, size_t); @@ -183,10 +191,11 @@ static struct bpf_if *bpf_iflist; * the bpf_d in a separate table indexed by minor device #. 
* * The value stored in bpf_dtab[n] represent three states: - * 0: device not opened - * 1: device opening or closing + * NULL: device not opened + * BPF_DEV_RESERVED: device opening or closing * other: device opened with pointer to storage */ +#define BPF_DEV_RESERVED ((struct bpf_d *)(uintptr_t)1) static struct bpf_d **bpf_dtab = NULL; static unsigned int bpf_dtab_size = 0; static unsigned int nbpfilter = 0; @@ -205,9 +214,10 @@ static int bpf_detachd(struct bpf_d *d, int); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct mbuf **, struct sockaddr *, int *); -static int bpf_setif(struct bpf_d *, ifnet_t ifp); +static int bpf_setif(struct bpf_d *, ifnet_t ifp, bool, bool); static void bpf_timed_out(void *, void *); static void bpf_wakeup(struct bpf_d *); +static u_int get_pkt_trunc_len(u_char *, u_int); static void catchpacket(struct bpf_d *, struct bpf_packet *, u_int, int); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, u_int, user_addr_t, u_long); @@ -219,7 +229,7 @@ static void bpf_set_packet_service_class(struct mbuf *, int); static void bpf_acquire_d(struct bpf_d *); static void bpf_release_d(struct bpf_d *); -static int bpf_devsw_installed; +static int bpf_devsw_installed; void bpf_init(void *unused); static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m); @@ -235,9 +245,8 @@ static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m); ioctl_fcn_t bpfioctl; select_fcn_t bpfselect; - /* Darwin's cdevsw struct differs slightly from BSDs */ -#define CDEV_MAJOR 23 +#define CDEV_MAJOR 23 static struct cdevsw bpf_cdevsw = { /* open */ bpfopen, /* close */ bpfclose, @@ -249,16 +258,17 @@ static struct cdevsw bpf_cdevsw = { /* tty */ NULL, /* select */ bpfselect, /* mmap */ eno_mmap, - /* strategy*/ eno_strat, + /* strategy */ eno_strat, /* getc */ eno_getc, /* putc */ eno_putc, /* type */ 0 }; -#define SOCKADDR_HDR_LEN offsetof(struct sockaddr, sa_data) +#define SOCKADDR_HDR_LEN 
offsetof(struct sockaddr, sa_data) static int -bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *sockp, int *datlen) +bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, + struct sockaddr *sockp, int *datlen) { struct mbuf *m; int error; @@ -267,40 +277,40 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc int hlen; switch (linktype) { - + #if SLIP case DLT_SLIP: sa_family = AF_INET; hlen = 0; break; #endif /* SLIP */ - + case DLT_EN10MB: sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = sizeof(struct ether_header); break; - + #if FDDI case DLT_FDDI: - #if defined(__FreeBSD__) || defined(__bsdi__) +#if defined(__FreeBSD__) || defined(__bsdi__) sa_family = AF_IMPLINK; hlen = 0; - #else +#else sa_family = AF_UNSPEC; /* XXX 4(FORMAC)+6(dst)+6(src)+3(LLC)+5(SNAP) */ hlen = 24; - #endif +#endif break; #endif /* FDDI */ - + case DLT_RAW: case DLT_NULL: sa_family = AF_UNSPEC; hlen = 0; break; - - #ifdef __FreeBSD__ + +#ifdef __FreeBSD__ case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. @@ -308,21 +318,21 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc * specified anyway. 
*/ sa_family = AF_UNSPEC; - hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ + hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; - #endif +#endif case DLT_PPP: sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; - + case DLT_APPLE_IP_OVER_IEEE1394: sa_family = AF_UNSPEC; hlen = sizeof(struct firewire_header); break; - case DLT_IEEE802_11: /* IEEE 802.11 wireless */ + case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sa_family = AF_IEEE80211; hlen = 0; break; @@ -365,7 +375,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc */ hlen = 0; } - + MGETHDR(m, M_WAIT, MT_DATA); if (m == 0) return (ENOBUFS); @@ -379,7 +389,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; *mp = m; - + /* * Make room for link header. */ @@ -394,24 +404,27 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc error = UIOMOVE(mtod(m, caddr_t), len - hlen, UIO_WRITE, uio); if (error) goto bad; - + /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: { - struct ether_header *eh = mtod(m, struct ether_header *); - + struct ether_header *eh; + + eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { - if (_ether_cmp(etherbroadcastaddr, eh->ether_dhost) == 0) + if (_ether_cmp(etherbroadcastaddr, + eh->ether_dhost) == 0) { m->m_flags |= M_BCAST; - else + } else { m->m_flags |= M_MCAST; + } } break; } } - - return 0; - bad: + + return (0); +bad: m_freem(m); return (error); } @@ -421,7 +434,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc /* * The dynamic addition of a new device node must block all processes that * are opening the last device so that no process will get an unexpected - * ENOENT + * ENOENT */ static void bpf_make_dev_t(int maj) @@ -434,32 +447,33 @@ bpf_make_dev_t(int maj) while (bpf_growing) { /* Wait until new 
device has been created */ - (void)tsleep((caddr_t)&bpf_growing, PZERO, "bpf_growing", 0); + (void) tsleep((caddr_t)&bpf_growing, PZERO, "bpf_growing", 0); } if (nbpfilter > cur_size) { /* other thread grew it already */ return; } bpf_growing = 1; - + /* need to grow bpf_dtab first */ if (nbpfilter == bpf_dtab_size) { int new_dtab_size; struct bpf_d **new_dtab = NULL; struct bpf_d **old_dtab = NULL; - - new_dtab_size = bpf_dtab_size + NBPFILTER; - new_dtab = (struct bpf_d **)_MALLOC(sizeof(struct bpf_d *) * new_dtab_size, M_DEVBUF, M_WAIT); + + new_dtab_size = bpf_dtab_size + NBPFILTER; + new_dtab = (struct bpf_d **)_MALLOC( + sizeof(struct bpf_d *) * new_dtab_size, M_DEVBUF, M_WAIT); if (new_dtab == 0) { printf("bpf_make_dev_t: malloc bpf_dtab failed\n"); goto done; } if (bpf_dtab) { - bcopy(bpf_dtab, new_dtab, - sizeof(struct bpf_d *) * bpf_dtab_size); + bcopy(bpf_dtab, new_dtab, + sizeof(struct bpf_d *) * bpf_dtab_size); } - bzero(new_dtab + bpf_dtab_size, - sizeof(struct bpf_d *) * NBPFILTER); + bzero(new_dtab + bpf_dtab_size, + sizeof(struct bpf_d *) * NBPFILTER); old_dtab = bpf_dtab; bpf_dtab = new_dtab; bpf_dtab_size = new_dtab_size; @@ -485,7 +499,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int first = bp->bif_dlist == NULL; int error = 0; - + /* * Point d at bp, and add d to the interface's list of listeners. 
* Finally, point the driver's bpf cookie at the interface so @@ -505,7 +519,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) /* Find the default bpf entry for this ifp */ if (bp->bif_ifp->if_bpf == NULL) { struct bpf_if *tmp, *primary = NULL; - + for (tmp = bpf_iflist; tmp; tmp = tmp->bif_next) { if (tmp->bif_ifp == bp->bif_ifp) { primary = tmp; @@ -516,10 +530,12 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) } /* Only call dlil_set_bpf_tap for primary dlt */ if (bp->bif_ifp->if_bpf == bp) - dlil_set_bpf_tap(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT, bpf_tap_callback); + dlil_set_bpf_tap(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT, + bpf_tap_callback); if (bp->bif_tap != NULL) - error = bp->bif_tap(bp->bif_ifp, bp->bif_dlt, BPF_TAP_INPUT_OUTPUT); + error = bp->bif_tap(bp->bif_ifp, bp->bif_dlt, + BPF_TAP_INPUT_OUTPUT); } /* @@ -532,7 +548,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp) } else { d->bd_flags &= ~BPF_FINALIZE_PKTAP; } - return error; + return (error); } /* @@ -583,7 +599,7 @@ bpf_detachd(struct bpf_d *d, int closing) dlil_set_bpf_tap(ifp, BPF_TAP_DISABLE, NULL); if (bp->bif_tap) bp->bif_tap(ifp, bp->bif_dlt, BPF_TAP_DISABLE); - + for (bp = bpf_iflist; bp; bp = bp->bif_next) if (bp->bif_ifp == ifp && bp->bif_dlist != 0) break; @@ -641,7 +657,6 @@ bpf_detachd(struct bpf_d *d, int closing) return (0); } - /* * Start asynchronous timer, if necessary. * Must be called with bpf_mlock held. @@ -660,7 +675,7 @@ bpf_start_timer(struct bpf_d *d) (uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec, NSEC_PER_USEC, &deadline); /* - * The state is BPF_IDLE, so the timer hasn't + * The state is BPF_IDLE, so the timer hasn't * been started yet, and hasn't gone off yet; * there is no thread call scheduled, so this * won't change the schedule. @@ -684,10 +699,10 @@ bpf_stop_timer(struct bpf_d *d) * If the timer has already gone off, this does nothing. * Our caller is expected to set d->bd_state to BPF_IDLE, * with the bpf_mlock, after we are called. 
bpf_timed_out() - * also grabs bpf_mlock, so, if the timer has gone off and + * also grabs bpf_mlock, so, if the timer has gone off and * bpf_timed_out() hasn't finished, it's waiting for the - * lock; when this thread releases the lock, it will - * find the state is BPF_IDLE, and just release the + * lock; when this thread releases the lock, it will + * find the state is BPF_IDLE, and just release the * lock and return. */ return (thread_call_cancel(d->bd_thread_call)); @@ -737,7 +752,7 @@ bpf_release_d(struct bpf_d *d) /* ARGSUSED */ int bpfopen(dev_t dev, int flags, __unused int fmt, - __unused struct proc *p) + struct proc *p) { struct bpf_d *d; @@ -746,28 +761,30 @@ bpfopen(dev_t dev, int flags, __unused int fmt, lck_mtx_unlock(bpf_mlock); return (ENXIO); } - /* - * New device nodes are created on demand when opening the last one. - * The programming model is for processes to loop on the minor starting at 0 - * as long as EBUSY is returned. The loop stops when either the open succeeds or - * an error other that EBUSY is returned. That means that bpf_make_dev_t() must - * block all processes that are opening the last node. If not all - * processes are blocked, they could unexpectedly get ENOENT and abort their - * opening loop. + /* + * New device nodes are created on demand when opening the last one. + * The programming model is for processes to loop on the minor starting + * at 0 as long as EBUSY is returned. The loop stops when either the + * open succeeds or an error other that EBUSY is returned. That means + * that bpf_make_dev_t() must block all processes that are opening the + * last node. If not all processes are blocked, they could unexpectedly + * get ENOENT and abort their opening loop. */ if ((unsigned int) minor(dev) == (nbpfilter - 1)) bpf_make_dev_t(major(dev)); /* - * Each minor can be opened by only one process. If the requested + * Each minor can be opened by only one process. If the requested * minor is in use, return EBUSY. 
* - * Important: bpfopen() and bpfclose() have to check and set the status of a device - * in the same lockin context otherwise the device may be leaked because the vnode use count - * will be unpextectly greater than 1 when close() is called. + * Important: bpfopen() and bpfclose() have to check and set the status + * of a device in the same lockin context otherwise the device may be + * leaked because the vnode use count will be unpextectly greater than 1 + * when close() is called. */ - if (bpf_dtab[minor(dev)] == 0) { - bpf_dtab[minor(dev)] = (void *)1; /* Mark opening */ + if (bpf_dtab[minor(dev)] == NULL) { + /* Reserve while opening */ + bpf_dtab[minor(dev)] = BPF_DEV_RESERVED; } else { lck_mtx_unlock(bpf_mlock); return (EBUSY); @@ -779,7 +796,7 @@ bpfopen(dev_t dev, int flags, __unused int fmt, printf("bpfopen: malloc bpf_d failed\n"); bpf_dtab[minor(dev)] = NULL; lck_mtx_unlock(bpf_mlock); - return ENOMEM; + return (ENOMEM); } /* Mark "in use" and do most initialization. */ @@ -804,11 +821,14 @@ bpfopen(dev_t dev, int flags, __unused int fmt, return (ENOMEM); } + d->bd_opened_by = p; + uuid_generate(d->bd_uuid); + #if CONFIG_MACF_NET mac_bpfdesc_label_init(d); mac_bpfdesc_label_associate(kauth_cred_get(), d); #endif - bpf_dtab[minor(dev)] = d; /* Mark opened */ + bpf_dtab[minor(dev)] = d; /* Mark opened */ lck_mtx_unlock(bpf_mlock); return (0); @@ -821,7 +841,7 @@ bpfopen(dev_t dev, int flags, __unused int fmt, /* ARGSUSED */ int bpfclose(dev_t dev, __unused int flags, __unused int fmt, - __unused struct proc *p) + __unused struct proc *p) { struct bpf_d *d; @@ -829,7 +849,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, lck_mtx_lock(bpf_mlock); d = bpf_dtab[minor(dev)]; - if (d == 0 || d == (void *)1) { + if (d == NULL || d == BPF_DEV_RESERVED) { lck_mtx_unlock(bpf_mlock); return (ENXIO); } @@ -843,7 +863,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, printf("%s: %llx\n", __func__, (uint64_t)VM_KERNEL_ADDRPERM(d)); - 
bpf_dtab[minor(dev)] = (void *)1; /* Mark closing */ + bpf_dtab[minor(dev)] = BPF_DEV_RESERVED; /* Reserve while closing */ /* * Deal with any in-progress timeouts. @@ -866,9 +886,9 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, */ if (!bpf_stop_timer(d)) { /* - * There was no pending call, so the call must + * There was no pending call, so the call must * have been in progress. Wait for the call to - * complete; we have to drop the lock while + * complete; we have to drop the lock while * waiting. to let the in-progrss call complete */ d->bd_state = BPF_DRAINING; @@ -891,8 +911,8 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, * Another thread is blocked on a close waiting for * a timeout to finish. * This "shouldn't happen", as the first thread to enter - * bpfclose() will set bpf_dtab[minor(dev)] to 1, and - * all subsequent threads should see that and fail with + * bpfclose() will set bpf_dtab[minor(dev)] to BPF_DEV_RESERVED, and + * all subsequent threads should see that and fail with * ENXIO. 
*/ panic("Two threads blocked in a BPF close"); @@ -907,7 +927,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, #endif thread_call_free(d->bd_thread_call); - while (d->bd_hbuf_read) + while (d->bd_hbuf_read != 0) msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); bpf_freed(d); @@ -922,18 +942,38 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, return (0); } - -#define BPF_SLEEP bpf_sleep +#define BPF_SLEEP bpf_sleep static int bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo) { u_int64_t abstime = 0; - if(timo) + if (timo != 0) clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime); - - return msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime); + + return (msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime)); +} + +static void +bpf_finalize_pktap(struct bpf_hdr *hp, struct pktap_header *pktaphdr) +{ + if (pktaphdr->pth_flags & PTH_FLAG_V2_HDR) { + struct pktap_v2_hdr *pktap_v2_hdr; + + pktap_v2_hdr = (struct pktap_v2_hdr *)pktaphdr; + + if (pktap_v2_hdr->pth_flags & PTH_FLAG_DELAY_PKTAP) + pktap_v2_finalize_proc_info(pktap_v2_hdr); + } else { + if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP) + pktap_finalize_proc_info(pktaphdr); + + if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) { + hp->bh_tstamp.tv_sec = pktaphdr->pth_tstamp.tv_sec; + hp->bh_tstamp.tv_usec = pktaphdr->pth_tstamp.tv_usec; + } + } } /* @@ -941,8 +981,8 @@ bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo) * into the hold slot, and the free buffer into the store slot. * Zero the length of the new store buffer. 
*/ -#define ROTATE_BUFFERS(d) \ - if (d->bd_hbuf_read) \ +#define ROTATE_BUFFERS(d) \ + if (d->bd_hbuf_read != 0) \ panic("rotating bpf buffers during read"); \ (d)->bd_hbuf = (d)->bd_sbuf; \ (d)->bd_hlen = (d)->bd_slen; \ @@ -958,7 +998,7 @@ int bpfread(dev_t dev, struct uio *uio, int ioflag) { struct bpf_d *d; - caddr_t hbuf; + caddr_t hbuf; int timed_out, hbuf_len; int error; int flags; @@ -966,7 +1006,8 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) lck_mtx_lock(bpf_mlock); d = bpf_dtab[minor(dev)]; - if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) { + if (d == NULL || d == BPF_DEV_RESERVED || + (d->bd_flags & BPF_CLOSING) != 0) { lck_mtx_unlock(bpf_mlock); return (ENXIO); } @@ -982,14 +1023,14 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) lck_mtx_unlock(bpf_mlock); return (EINVAL); } - - if (d->bd_state == BPF_WAITING) + + if (d->bd_state == BPF_WAITING) bpf_stop_timer(d); - + timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; - while (d->bd_hbuf_read) + while (d->bd_hbuf_read != 0) msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); if ((d->bd_flags & BPF_CLOSING) != 0) { @@ -1003,8 +1044,8 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) * have arrived to fill the store buffer. 
*/ while (d->bd_hbuf == 0) { - if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) - && d->bd_slen != 0) { + if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) && + d->bd_slen != 0) { /* * We're in immediate mode, or are reading * in non-blocking mode, or a timer was @@ -1034,8 +1075,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) lck_mtx_unlock(bpf_mlock); return (EWOULDBLOCK); } - error = BPF_SLEEP(d, PRINET|PCATCH, "bpf", - d->bd_rtout); + error = BPF_SLEEP(d, PRINET|PCATCH, "bpf", d->bd_rtout); /* * Make sure device is still opened */ @@ -1045,8 +1085,9 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) return (ENXIO); } - while (d->bd_hbuf_read) - msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); + while (d->bd_hbuf_read != 0) + msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", + NULL); if ((d->bd_flags & BPF_CLOSING) != 0) { bpf_release_d(d); @@ -1111,7 +1152,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) */ /* - * Set the hold buffer read. So we do not + * Set the hold buffer read. So we do not * rotate the buffers until the hold buffer * read is complete. Also to avoid issues resulting * from page faults during disk sleep (). 
@@ -1138,7 +1179,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) int found = 0; ehp = (struct bpf_hdr_ext *)(void *)p; - if ((flowid = ehp->bh_flowid)) { + if ((flowid = ehp->bh_flowid) != 0) { if (ehp->bh_proto == IPPROTO_TCP) found = inp_findinpcb_procinfo(&tcbinfo, flowid, &soprocinfo); @@ -1147,26 +1188,20 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) flowid, &soprocinfo); if (found == 1) { ehp->bh_pid = soprocinfo.spi_pid; - proc_name(ehp->bh_pid, ehp->bh_comm, MAXCOMLEN); + proc_name(ehp->bh_pid, ehp->bh_comm, + MAXCOMLEN); } ehp->bh_flowid = 0; } if (flags & BPF_FINALIZE_PKTAP) { struct pktap_header *pktaphdr; - + pktaphdr = (struct pktap_header *)(void *) (p + BPF_WORDALIGN(ehp->bh_hdrlen)); - if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP) - pktap_finalize_proc_info(pktaphdr); - - if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) { - ehp->bh_tstamp.tv_sec = - pktaphdr->pth_tstamp.tv_sec; - ehp->bh_tstamp.tv_usec = - pktaphdr->pth_tstamp.tv_usec; - } + bpf_finalize_pktap((struct bpf_hdr *) ehp, + pktaphdr); } p += BPF_WORDALIGN(ehp->bh_hdrlen + ehp->bh_caplen); } @@ -1177,20 +1212,12 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) while (p < hbuf + hbuf_len) { struct bpf_hdr *hp; struct pktap_header *pktaphdr; - + hp = (struct bpf_hdr *)(void *)p; pktaphdr = (struct pktap_header *)(void *) (p + BPF_WORDALIGN(hp->bh_hdrlen)); - if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP) - pktap_finalize_proc_info(pktaphdr); - - if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) { - hp->bh_tstamp.tv_sec = - pktaphdr->pth_tstamp.tv_sec; - hp->bh_tstamp.tv_usec = - pktaphdr->pth_tstamp.tv_usec; - } + bpf_finalize_pktap(hp, pktaphdr); p += BPF_WORDALIGN(hp->bh_hdrlen + hp->bh_caplen); } @@ -1203,7 +1230,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) * we checked above that the read buffer is bpf_bufsize bytes. 
*/ error = UIOMOVE(hbuf, hbuf_len, UIO_READ, uio); - + lck_mtx_lock(bpf_mlock); /* * Make sure device is still opened @@ -1213,7 +1240,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) lck_mtx_unlock(bpf_mlock); return (ENXIO); } - + d->bd_hbuf_read = 0; d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; @@ -1227,7 +1254,6 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) } - /* * If there are processes sleeping on this descriptor, wake them up. */ @@ -1247,7 +1273,6 @@ bpf_wakeup(struct bpf_d *d) KNOTE(&d->bd_sel.si_note, 1); } - static void bpf_timed_out(void *arg, __unused void *dummy) { @@ -1256,7 +1281,7 @@ bpf_timed_out(void *arg, __unused void *dummy) lck_mtx_lock(bpf_mlock); if (d->bd_state == BPF_WAITING) { /* - * There's a select or kqueue waiting for this; if there's + * There's a select or kqueue waiting for this; if there's * now stuff to read, wake it up. */ d->bd_state = BPF_TIMED_OUT; @@ -1272,13 +1297,9 @@ bpf_timed_out(void *arg, __unused void *dummy) } lck_mtx_unlock(bpf_mlock); } - - - - /* keep in sync with bpf_movein above: */ -#define MAX_DATALINK_HDR_LEN (sizeof(struct firewire_header)) +#define MAX_DATALINK_HDR_LEN (sizeof(struct firewire_header)) int bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) @@ -1287,7 +1308,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) struct ifnet *ifp; struct mbuf *m = NULL; int error; - char dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN]; + char dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN]; int datlen = 0; int bif_dlt; int bd_hdrcmplt; @@ -1295,7 +1316,8 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) lck_mtx_lock(bpf_mlock); d = bpf_dtab[minor(dev)]; - if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) { + if (d == NULL || d == BPF_DEV_RESERVED || + (d->bd_flags & BPF_CLOSING) != 0) { lck_mtx_unlock(bpf_mlock); return (ENXIO); } @@ -1332,7 +1354,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) /* bpf_movein allocating mbufs; drop lock */ 
lck_mtx_unlock(bpf_mlock); - error = bpf_movein(uio, bif_dlt, &m, + error = bpf_movein(uio, bif_dlt, &m, bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf, &datlen); @@ -1366,7 +1388,6 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) return (EMSGSIZE); } - #if CONFIG_MACF_NET mac_mbuf_label_associate_bpfdesc(d, m); #endif @@ -1402,7 +1423,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) static void reset_d(struct bpf_d *d) { - if (d->bd_hbuf_read) + if (d->bd_hbuf_read != 0) panic("resetting buffers during read"); if (d->bd_hbuf) { @@ -1418,6 +1439,146 @@ reset_d(struct bpf_d *d) d->bd_dcount = 0; } +static struct bpf_d * +bpf_get_device_from_uuid(uuid_t uuid) +{ + unsigned int i; + + for (i = 0; i < nbpfilter; i++) { + struct bpf_d *d = bpf_dtab[i]; + + if (d == NULL || d == BPF_DEV_RESERVED || + (d->bd_flags & BPF_CLOSING) != 0) + continue; + if (uuid_compare(uuid, d->bd_uuid) == 0) + return (d); + } + + return (NULL); +} + +/* + * The BIOCSETUP command "atomically" attach to the interface and + * copy the buffer from another interface. 
This minimizes the risk + * of missing packet because this is done while holding + * the BPF global lock + */ +static int +bpf_setup(struct bpf_d *d_to, uuid_t uuid_from, ifnet_t ifp) +{ + struct bpf_d *d_from; + int error = 0; + + LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED); + + /* + * Sanity checks + */ + d_from = bpf_get_device_from_uuid(uuid_from); + if (d_from == NULL) { + error = ENOENT; + os_log_info(OS_LOG_DEFAULT, + "%s: uuids not found error %d", + __func__, error); + return (error); + } + if (d_from->bd_opened_by != d_to->bd_opened_by) { + error = EACCES; + os_log_info(OS_LOG_DEFAULT, + "%s: processes not matching error %d", + __func__, error); + return (error); + } + + /* + * Prevent any read while copying + */ + while (d_to->bd_hbuf_read != 0) + msleep((caddr_t)d_to, bpf_mlock, PRINET, __func__, NULL); + d_to->bd_hbuf_read = 1; + + while (d_from->bd_hbuf_read != 0) + msleep((caddr_t)d_from, bpf_mlock, PRINET, __func__, NULL); + d_from->bd_hbuf_read = 1; + + /* + * Verify the devices have not been closed + */ + if (d_to->bd_flags & BPF_CLOSING) { + error = ENXIO; + os_log_info(OS_LOG_DEFAULT, + "%s: d_to is closing error %d", + __func__, error); + goto done; + } + if (d_from->bd_flags & BPF_CLOSING) { + error = ENXIO; + os_log_info(OS_LOG_DEFAULT, + "%s: d_from is closing error %d", + __func__, error); + goto done; + } + + /* + * For now require the same buffer size + */ + if (d_from->bd_bufsize != d_to->bd_bufsize) { + error = EINVAL; + os_log_info(OS_LOG_DEFAULT, + "%s: bufsizes not matching error %d", + __func__, error); + goto done; + } + + /* + * Attach to the interface + */ + error = bpf_setif(d_to, ifp, false, true); + if (error != 0) { + os_log_info(OS_LOG_DEFAULT, + "%s: bpf_setif() failed error %d", + __func__, error); + goto done; + } + + /* + * Make sure the buffers are setup as expected by bpf_setif() + */ + ASSERT(d_to->bd_hbuf == NULL); + ASSERT(d_to->bd_sbuf != NULL); + ASSERT(d_to->bd_fbuf != NULL); + + /* + * Copy the buffers and 
update the pointers and counts + */ + memcpy(d_to->bd_sbuf, d_from->bd_sbuf, d_from->bd_slen); + d_to->bd_slen = d_from->bd_slen; + d_to->bd_scnt = d_from->bd_scnt; + + if (d_from->bd_hbuf != NULL) { + d_to->bd_hbuf = d_to->bd_fbuf; + d_to->bd_fbuf = NULL; + memcpy(d_to->bd_hbuf, d_from->bd_hbuf, d_from->bd_hlen); + } + d_to->bd_hlen = d_from->bd_hlen; + d_to->bd_hcnt = d_from->bd_hcnt; + + if (bpf_debug > 0) { + os_log_info(OS_LOG_DEFAULT, + "%s: done slen %u scnt %u hlen %u hcnt %u", + __func__, d_to->bd_slen, d_to->bd_scnt, + d_to->bd_hlen, d_to->bd_hcnt); + } +done: + d_from->bd_hbuf_read = 0; + wakeup((caddr_t)d_from); + + d_to->bd_hbuf_read = 0; + wakeup((caddr_t)d_to); + + return (error); +} + /* * FIONREAD Check for read packet available. * SIOCGIFADDR Get interface address - convenient hook to driver. @@ -1456,7 +1617,8 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, lck_mtx_lock(bpf_mlock); d = bpf_dtab[minor(dev)]; - if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) { + if (d == NULL || d == BPF_DEV_RESERVED || + (d->bd_flags & BPF_CLOSING) != 0) { lck_mtx_unlock(bpf_mlock); return (ENXIO); } @@ -1511,30 +1673,48 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, /* * Set buffer length. 
 */ - case BIOCSBLEN: /* u_int */ - if (d->bd_bif != 0 || (d->bd_flags & BPF_DETACHING)) - error = EINVAL; - else { - u_int size; + case BIOCSBLEN: { /* u_int */ + u_int size; + unsigned int maxbufsize = bpf_maxbufsize; - bcopy(addr, &size, sizeof (size)); + /* + * Allow larger buffer in head drop mode with the + * assumption the reading process may be low priority but + * is interested in the most recent traffic + */ + if (d->bd_headdrop != 0) { + maxbufsize = 2 * bpf_maxbufsize; + } + if (d->bd_bif != 0 || (d->bd_flags & BPF_DETACHING)) { /* - * Allow larger buffer in head drop mode with the - * assumption the capture is in standby mode to - * keep a cache of recent traffic + * Interface already attached, unable to change buffers */ - if (d->bd_headdrop != 0 && size > 2 * bpf_maxbufsize) - size = 2 * bpf_maxbufsize; - else if (size > bpf_maxbufsize) - size = bpf_maxbufsize; - else if (size < BPF_MINBUFSIZE) - size = BPF_MINBUFSIZE; - bcopy(&size, addr, sizeof (size)); + error = EINVAL; + break; + } + bcopy(addr, &size, sizeof (size)); + + if (size > maxbufsize) { + d->bd_bufsize = maxbufsize; + + os_log_info(OS_LOG_DEFAULT, + "%s bufsize capped to %u from %u", + __func__, d->bd_bufsize, size); + } else if (size < BPF_MINBUFSIZE) { + d->bd_bufsize = BPF_MINBUFSIZE; + + os_log_info(OS_LOG_DEFAULT, + "%s bufsize bumped to %u from %u", + __func__, d->bd_bufsize, size); + } else { d->bd_bufsize = size; } - break; + /* It's a read/write ioctl */ + bcopy(&d->bd_bufsize, addr, sizeof (u_int)); + break; + } /* * Set link layer read filter. */ @@ -1561,8 +1741,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, * Flush read packet buffer. 
*/ case BIOCFLUSH: - while (d->bd_hbuf_read) { - msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); + while (d->bd_hbuf_read != 0) { + msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", + NULL); } if ((d->bd_flags & BPF_CLOSING) != 0) { error = ENXIO; @@ -1622,9 +1803,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, u_int dlt; bcopy(addr, &dlt, sizeof (dlt)); - - if (dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) { - printf("BIOCSDLT downgrade DLT_PKTAP to DLT_RAW\n"); + + if (dlt == DLT_PKTAP && + !(d->bd_flags & BPF_WANT_PKTAP)) { dlt = DLT_RAW; } error = bpf_setdlt(d, dlt); @@ -1657,7 +1838,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, if (ifp == NULL) error = ENXIO; else - error = bpf_setif(d, ifp); + error = bpf_setif(d, ifp, true, false); break; } @@ -1846,7 +2027,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, d->bd_flags &= ~BPF_EXTENDED_HDR; break; - case BIOCGIFATTACHCOUNT: { /* struct ifreq */ + case BIOCGIFATTACHCOUNT: { /* struct ifreq */ ifnet_t ifp; struct bpf_if *bp; @@ -1860,10 +2041,11 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, ifr.ifr_intval = 0; for (bp = bpf_iflist; bp != 0; bp = bp->bif_next) { struct bpf_d *bpf_d; - + if (bp->bif_ifp == NULL || bp->bif_ifp != ifp) continue; - for (bpf_d = bp->bif_dlist; bpf_d; bpf_d = bpf_d->bd_next) { + for (bpf_d = bp->bif_dlist; bpf_d; + bpf_d = bpf_d->bd_next) { ifr.ifr_intval += 1; } } @@ -1876,11 +2058,11 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, break; case BIOCSWANTPKTAP: /* u_int */ - bcopy(addr, &int_arg, sizeof (int_arg)); - if (int_arg) - d->bd_flags |= BPF_WANT_PKTAP; - else - d->bd_flags &= ~BPF_WANT_PKTAP; + bcopy(addr, &int_arg, sizeof (int_arg)); + if (int_arg) + d->bd_flags |= BPF_WANT_PKTAP; + else + d->bd_flags &= ~BPF_WANT_PKTAP; break; #endif @@ -1892,6 +2074,49 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, case BIOCGHEADDROP: 
bcopy(&d->bd_headdrop, addr, sizeof (int)); break; + + case BIOCSTRUNCATE: + bcopy(addr, &int_arg, sizeof(int_arg)); + if (int_arg) + d->bd_flags |= BPF_TRUNCATE; + else + d->bd_flags &= ~BPF_TRUNCATE; + break; + + case BIOCGETUUID: + bcopy(&d->bd_uuid, addr, sizeof (uuid_t)); + break; + + case BIOCSETUP: { + struct bpf_setup_args bsa; + ifnet_t ifp; + + bcopy(addr, &bsa, sizeof (struct bpf_setup_args)); + bsa.bsa_ifname[IFNAMSIZ - 1] = 0; + ifp = ifunit(bsa.bsa_ifname); + if (ifp == NULL) { + error = ENXIO; + os_log_info(OS_LOG_DEFAULT, + "%s: ifnet not found for %s error %d", + __func__, bsa.bsa_ifname, error); + break; + } + + error = bpf_setup(d, bsa.bsa_uuid, ifp); + break; + } + case BIOCSPKTHDRV2: + bcopy(addr, &int_arg, sizeof(int_arg)); + if (int_arg != 0) + d->bd_flags |= BPF_PKTHDRV2; + else + d->bd_flags &= ~BPF_PKTHDRV2; + break; + + case BIOCGPKTHDRV2: + int_arg = d->bd_flags & BPF_PKTHDRV2 ? 1 : 0; + bcopy(&int_arg, addr, sizeof (int)); + break; } bpf_release_d(d); @@ -1911,12 +2136,12 @@ bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns, struct bpf_insn *fcode, *old; u_int flen, size; - while (d->bd_hbuf_read) + while (d->bd_hbuf_read != 0) msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); if ((d->bd_flags & BPF_CLOSING) != 0) return (ENXIO); - + old = d->bd_filter; if (bf_insns == USER_ADDR_NULL) { if (bf_len != 0) @@ -1940,10 +2165,10 @@ bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns, if (copyin(bf_insns, (caddr_t)fcode, size) == 0 && bpf_validate(fcode, (int)flen)) { d->bd_filter = fcode; - + if (cmd == BIOCSETF32 || cmd == BIOCSETF64) reset_d(d); - + if (old != 0) FREE((caddr_t)old, M_DEVBUF); @@ -1959,12 +2184,12 @@ bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns, * Return an errno or 0. 
*/ static int -bpf_setif(struct bpf_d *d, ifnet_t theywant) +bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read) { struct bpf_if *bp; int error; - while (d->bd_hbuf_read) + while (d->bd_hbuf_read != 0 && !has_hbuf_read) msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); if ((d->bd_flags & BPF_CLOSING) != 0) @@ -2006,15 +2231,15 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant) if (bpf_attachd(d, bp) != 0) return (ENXIO); } + if (do_reset) { reset_d(d); + } return (0); } /* Not found. */ return (ENXIO); } - - /* * Get a list of available data link type of the interface. */ @@ -2042,7 +2267,7 @@ bpf_getdltlist(struct bpf_d *d, caddr_t addr, struct proc *p) for (bp = bpf_iflist; bp; bp = bp->bif_next) { if (bp->bif_ifp != ifp) continue; - /* + /* * Do not use DLT_PKTAP, unless requested explicitly */ if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) @@ -2074,11 +2299,11 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt) int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; - + if (d->bd_bif->bif_dlt == dlt) return (0); - - while (d->bd_hbuf_read) + + while (d->bd_hbuf_read != 0) msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); if ((d->bd_flags & BPF_CLOSING) != 0) @@ -2090,7 +2315,8 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt) /* * Do not use DLT_PKTAP, unless requested explicitly */ - if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) { + if (bp->bif_dlt == DLT_PKTAP && + !(d->bd_flags & BPF_WANT_PKTAP)) { continue; } break; @@ -2103,8 +2329,9 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt) error = bpf_attachd(d, bp); if (error) { printf("bpf_setdlt: bpf_attachd %s%d failed (%d)\n", - ifnet_name(bp->bif_ifp), ifnet_unit(bp->bif_ifp), error); - return error; + ifnet_name(bp->bif_ifp), ifnet_unit(bp->bif_ifp), + error); + return (error); } reset_d(d); if (opromisc) { @@ -2161,7 +2388,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) lck_mtx_lock(bpf_mlock); d = bpf_dtab[minor(dev)]; - 
if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) { + if (d == NULL || d == BPF_DEV_RESERVED || + (d->bd_flags & BPF_CLOSING) != 0) { lck_mtx_unlock(bpf_mlock); return (ENXIO); } @@ -2174,7 +2402,7 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) return (ENXIO); } - while (d->bd_hbuf_read) + while (d->bd_hbuf_read != 0) msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL); if ((d->bd_flags & BPF_CLOSING) != 0) { @@ -2186,8 +2414,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) switch (which) { case FREAD: if (d->bd_hlen != 0 || - ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && - d->bd_slen != 0)) + ((d->bd_immediate || + d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0)) ret = 1; /* read has data to return */ else { /* @@ -2201,7 +2429,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) break; case FWRITE: - ret = 1; /* can't determine whether a write would block */ + /* can't determine whether a write would block */ + ret = 1; break; } @@ -2211,7 +2440,6 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p) return (ret); } - /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. 
@@ -2220,10 +2448,11 @@ int bpfkqfilter(dev_t dev, struct knote *kn); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static int filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); +static int filt_bpfprocess(struct knote *kn, struct filt_process_s *data, + struct kevent_internal_s *kev); SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = { - .f_isfd = 1, + .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, .f_touch = filt_bpftouch, @@ -2237,24 +2466,23 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d) if (d->bd_immediate) { /* - * If there's data in the hold buffer, it's the + * If there's data in the hold buffer, it's the * amount of data a read will return. * * If there's no data in the hold buffer, but * there's data in the store buffer, a read will - * immediately rotate the store buffer to the + * immediately rotate the store buffer to the * hold buffer, the amount of data in the store - * buffer is the amount of data a read will + * buffer is the amount of data a read will * return. * - * If there's no data in either buffer, we're not + * If there's no data in either buffer, we're not * ready to read. */ - kn->kn_data = ((d->bd_hlen == 0 || d->bd_hbuf_read) - ? d->bd_slen : d->bd_hlen); + kn->kn_data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ? + d->bd_slen : d->bd_hlen); int64_t lowwat = 1; - if (kn->kn_sfflags & NOTE_LOWAT) - { + if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_sdata > d->bd_bufsize) lowwat = d->bd_bufsize; else if (kn->kn_sdata > lowwat) @@ -2263,22 +2491,22 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d) ready = (kn->kn_data >= lowwat); } else { /* - * If there's data in the hold buffer, it's the + * If there's data in the hold buffer, it's the * amount of data a read will return. 
* - * If there's no data in the hold buffer, but - * there's data in the store buffer, if the + * If there's no data in the hold buffer, but + * there's data in the store buffer, if the * timer has expired a read will immediately * rotate the store buffer to the hold buffer, - * so the amount of data in the store buffer is + * so the amount of data in the store buffer is * the amount of data a read will return. * - * If there's no data in either buffer, or there's - * no data in the hold buffer and the timer hasn't + * If there's no data in either buffer, or there's + * no data in the hold buffer and the timer hasn't * expired, we're not ready to read. */ - kn->kn_data = ((d->bd_hlen == 0 || d->bd_hbuf_read) && d->bd_state == BPF_TIMED_OUT ? - d->bd_slen : d->bd_hlen); + kn->kn_data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) && + d->bd_state == BPF_TIMED_OUT ? d->bd_slen : d->bd_hlen); ready = (kn->kn_data > 0); } if (!ready) @@ -2300,21 +2528,20 @@ bpfkqfilter(dev_t dev, struct knote *kn) kn->kn_filter != EVFILT_READ) { kn->kn_flags = EV_ERROR; kn->kn_data = EINVAL; - return 0; + return (0); } lck_mtx_lock(bpf_mlock); d = bpf_dtab[minor(dev)]; - if (d == 0 || - d == (void *)1 || - d->bd_bif == NULL || - (d->bd_flags & BPF_CLOSING) != 0) { + if (d == NULL || d == BPF_DEV_RESERVED || + (d->bd_flags & BPF_CLOSING) != 0 || + d->bd_bif == NULL) { lck_mtx_unlock(bpf_mlock); kn->kn_flags = EV_ERROR; kn->kn_data = ENXIO; - return 0; + return (0); } kn->kn_hook = d; @@ -2349,7 +2576,7 @@ filt_bpfread(struct knote *kn, long hint) #pragma unused(hint) struct bpf_d *d = (struct bpf_d *)kn->kn_hook; - return filt_bpfread_common(kn, d); + return (filt_bpfread_common(kn, d)); } static int @@ -2363,19 +2590,18 @@ filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev) /* save off the lowat threshold and flag */ kn->kn_sdata = kev->data; kn->kn_sfflags = kev->fflags; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* output data will be 
re-generated here */ res = filt_bpfread_common(kn, d); lck_mtx_unlock(bpf_mlock); - return res; + return (res); } static int -filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev) +filt_bpfprocess(struct knote *kn, struct filt_process_s *data, + struct kevent_internal_s *kev) { #pragma unused(data) struct bpf_d *d = (struct bpf_d *)kn->kn_hook; @@ -2388,11 +2614,11 @@ filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_int } lck_mtx_unlock(bpf_mlock); - return res; + return (res); } /* - * Copy data from an mbuf chain into a buffer. This code is derived + * Copy data from an mbuf chain into a buffer. This code is derived * from m_copydata in kern/uipc_mbuf.c. */ static void @@ -2456,11 +2682,44 @@ bpf_tap_imp( goto done; } for (d = bp->bif_dlist; d; d = d->bd_next) { + struct bpf_packet *bpf_pkt_saved = bpf_pkt; + struct bpf_packet bpf_pkt_tmp; + struct pktap_header_buffer bpfp_header_tmp; + if (outbound && !d->bd_seesent) continue; + ++d->bd_rcount; slen = bpf_filter(d->bd_filter, (u_char *)bpf_pkt, - bpf_pkt->bpfp_total_length, 0); + bpf_pkt->bpfp_total_length, 0); + if (bp->bif_ifp->if_type == IFT_PKTAP && + bp->bif_dlt == DLT_PKTAP) { + /* + * Need to copy the bpf_pkt because the conversion + * to v2 pktap header modifies the content of the + * bpfp_header + */ + if ((d->bd_flags & BPF_PKTHDRV2) && + bpf_pkt->bpfp_header_length <= sizeof(bpfp_header_tmp)) { + bpf_pkt_tmp = *bpf_pkt; + + bpf_pkt = &bpf_pkt_tmp; + + memcpy(&bpfp_header_tmp, bpf_pkt->bpfp_header, + bpf_pkt->bpfp_header_length); + + bpf_pkt->bpfp_header = &bpfp_header_tmp; + + convert_to_pktap_header_to_v2(bpf_pkt, + !!(d->bd_flags & BPF_TRUNCATE)); + } + + if (d->bd_flags & BPF_TRUNCATE) { + slen = min(slen, + get_pkt_trunc_len((u_char *)bpf_pkt, + bpf_pkt->bpfp_total_length)); + } + } if (slen != 0) { #if CONFIG_MACF_NET if (mac_bpfdesc_check_receive(d, bp->bif_ifp) != 0) @@ -2468,9 +2727,10 @@ bpf_tap_imp( #endif catchpacket(d, 
bpf_pkt, slen, outbound); } + bpf_pkt = bpf_pkt_saved; } - done: +done: lck_mtx_unlock(bpf_mlock); } @@ -2531,11 +2791,24 @@ bpf_tap_in( static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m) { bpf_tap_mbuf(ifp, 0, m, NULL, 0, mbuf_pkthdr_rcvif(m) == NULL); - - return 0; + + return (0); } +static errno_t +bpf_copydata(struct bpf_packet *pkt, size_t off, size_t len, void* out_data) +{ + errno_t err = 0; + if (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF) { + err = mbuf_copydata(pkt->bpfp_mbuf, off, len, out_data); + } else { + err = EINVAL; + } + + return (err); +} + static void copy_bpf_packet(struct bpf_packet * pkt, void * dst, size_t len) { @@ -2560,6 +2833,347 @@ copy_bpf_packet(struct bpf_packet * pkt, void * dst, size_t len) } } +static uint16_t +get_esp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off, + const uint16_t remaining_caplen) +{ + /* + * For some reason tcpdump expects to have one byte beyond the ESP header + */ + uint16_t trunc_len = ESP_HDR_SIZE + 1; + + if (trunc_len > remaining_caplen) + return (remaining_caplen); + + return (trunc_len); +} + +static uint16_t +get_isakmp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off, + const uint16_t remaining_caplen) +{ + /* + * Include the payload generic header + */ + uint16_t trunc_len = ISAKMP_HDR_SIZE; + + if (trunc_len > remaining_caplen) + return (remaining_caplen); + + return (trunc_len); +} + +static uint16_t +get_isakmp_natt_trunc_len(struct bpf_packet *pkt, uint16_t off, + const uint16_t remaining_caplen) +{ + int err = 0; + uint16_t trunc_len = 0; + char payload[remaining_caplen]; + + err = bpf_copydata(pkt, off, remaining_caplen, payload); + if (err != 0) + return (remaining_caplen); + /* + * They are three cases: + * - IKE: payload start with 4 bytes header set to zero before ISAKMP header + * - keep alive: 1 byte payload + * - otherwise it's ESP + */ + if (remaining_caplen >= 4 && + payload[0] == 0 && payload[1] == 0 && + payload[2] == 0 && payload[3] == 0) 
{ + trunc_len = 4 + get_isakmp_trunc_len(pkt, off + 4, remaining_caplen - 4); + } else if (remaining_caplen == 1) { + trunc_len = 1; + } else { + trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen); + } + + if (trunc_len > remaining_caplen) + return (remaining_caplen); + + return (trunc_len); + +} + +static uint16_t +get_udp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen) +{ + int err = 0; + uint16_t trunc_len = sizeof(struct udphdr); /* By default no UDP payload */ + + if (trunc_len >= remaining_caplen) + return (remaining_caplen); + + struct udphdr udphdr; + err = bpf_copydata(pkt, off, sizeof(struct udphdr), &udphdr); + if (err != 0) + return (remaining_caplen); + + u_short sport, dport; + + sport = EXTRACT_SHORT(&udphdr.uh_sport); + dport = EXTRACT_SHORT(&udphdr.uh_dport); + + if (dport == PORT_DNS || sport == PORT_DNS) { + /* + * Full UDP payload for DNS + */ + trunc_len = remaining_caplen; + } else if ((sport == PORT_BOOTPS && dport == PORT_BOOTPC) || + (sport == PORT_BOOTPC && dport == PORT_BOOTPS)) { + /* + * Full UDP payload for BOOTP and DHCP + */ + trunc_len = remaining_caplen; + } else if (dport == PORT_ISAKMP && sport == PORT_ISAKMP) { + /* + * Return the ISAKMP header + */ + trunc_len += get_isakmp_trunc_len(pkt, off + sizeof(struct udphdr), + remaining_caplen - sizeof(struct udphdr)); + } else if (dport == PORT_ISAKMP_NATT && sport == PORT_ISAKMP_NATT) { + trunc_len += get_isakmp_natt_trunc_len(pkt, off + sizeof(struct udphdr), + remaining_caplen - sizeof(struct udphdr)); + } + if (trunc_len >= remaining_caplen) + return (remaining_caplen); + + return (trunc_len); +} + +static uint16_t +get_tcp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen) +{ + int err = 0; + uint16_t trunc_len = sizeof(struct tcphdr); /* By default no TCP payload */ + if (trunc_len >= remaining_caplen) + return (remaining_caplen); + + struct tcphdr tcphdr; + err = bpf_copydata(pkt, off, sizeof(struct tcphdr), 
&tcphdr); + if (err != 0) + return (remaining_caplen); + + u_short sport, dport; + sport = EXTRACT_SHORT(&tcphdr.th_sport); + dport = EXTRACT_SHORT(&tcphdr.th_dport); + + if (dport == PORT_DNS || sport == PORT_DNS) { + /* + * Full TCP payload for DNS + */ + trunc_len = remaining_caplen; + } else { + trunc_len = tcphdr.th_off << 2; + } + if (trunc_len >= remaining_caplen) + return (remaining_caplen); + + return (trunc_len); +} + +static uint16_t +get_proto_trunc_len(uint8_t proto, struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen) +{ + uint16_t trunc_len; + + switch (proto) { + case IPPROTO_ICMP: { + /* + * Full ICMP payload + */ + trunc_len = remaining_caplen; + break; + } + case IPPROTO_ICMPV6: { + /* + * Full ICMPV6 payload + */ + trunc_len = remaining_caplen; + break; + } + case IPPROTO_IGMP: { + /* + * Full IGMP payload + */ + trunc_len = remaining_caplen; + break; + } + case IPPROTO_UDP: { + trunc_len = get_udp_trunc_len(pkt, off, remaining_caplen); + break; + } + case IPPROTO_TCP: { + trunc_len = get_tcp_trunc_len(pkt, off, remaining_caplen); + break; + } + case IPPROTO_ESP: { + trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen); + break; + } + default: { + /* + * By default we only include the IP header + */ + trunc_len = 0; + break; + } + } + if (trunc_len >= remaining_caplen) + return (remaining_caplen); + + return (trunc_len); +} + +static uint16_t +get_ip_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen) +{ + int err = 0; + uint16_t iplen = sizeof(struct ip); + if (iplen >= remaining_caplen) + return (remaining_caplen); + + struct ip iphdr; + err = bpf_copydata(pkt, off, sizeof(struct ip), &iphdr); + if (err != 0) + return (remaining_caplen); + + uint8_t proto = 0; + + iplen = iphdr.ip_hl << 2; + if (iplen >= remaining_caplen) + return (remaining_caplen); + + proto = iphdr.ip_p; + iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen); + + if (iplen >= remaining_caplen) + 
return (remaining_caplen); + + return (iplen); +} + +static uint16_t +get_ip6_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen) +{ + int err = 0; + uint16_t iplen = sizeof(struct ip6_hdr); + if (iplen >= remaining_caplen) + return (remaining_caplen); + + struct ip6_hdr ip6hdr; + err = bpf_copydata(pkt, off, sizeof(struct ip6_hdr), &ip6hdr); + if (err != 0) + return (remaining_caplen); + + uint8_t proto = 0; + + /* + * TBD: process the extension headers + */ + proto = ip6hdr.ip6_nxt; + iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen); + + if (iplen >= remaining_caplen) + return (remaining_caplen); + + return (iplen); +} + +static uint16_t +get_ether_trunc_len(struct bpf_packet *pkt, int off, const uint16_t remaining_caplen) +{ + int err = 0; + uint16_t ethlen = sizeof(struct ether_header); + if (ethlen >= remaining_caplen) + return (remaining_caplen); + + struct ether_header eh; + u_short type; + err = bpf_copydata(pkt, off, sizeof(struct ether_header), &eh); + if (err != 0) + return (remaining_caplen); + + type = EXTRACT_SHORT(&eh.ether_type); + /* Include full ARP */ + if (type == ETHERTYPE_ARP) { + ethlen = remaining_caplen; + } else if (type != ETHERTYPE_IP && type != ETHERTYPE_IPV6) { + ethlen = min(BPF_MIN_PKT_SIZE, remaining_caplen); + } else { + if (type == ETHERTYPE_IP) { + ethlen += get_ip_trunc_len(pkt, sizeof(struct ether_header), + remaining_caplen); + } else if (type == ETHERTYPE_IPV6) { + ethlen += get_ip6_trunc_len(pkt, sizeof(struct ether_header), + remaining_caplen); + } + } + return (ethlen); +} + +static uint32_t +get_pkt_trunc_len(u_char *p, u_int len) +{ + struct bpf_packet *pkt = (struct bpf_packet *)(void *) p; + struct pktap_header *pktap = (struct pktap_header *) (pkt->bpfp_header); + uint32_t out_pkt_len = 0, tlen = 0; + /* + * pktap->pth_frame_pre_length is L2 header length and accounts + * for both pre and pre_adjust. 
+ * pktap->pth_length is sizeof(pktap_header) (excl the pre/pre_adjust) + * pkt->bpfp_header_length is (pktap->pth_length + pre_adjust) + * pre is the offset to the L3 header after the bpfp_header, or length + * of L2 header after bpfp_header, if present. + */ + uint32_t pre = pktap->pth_frame_pre_length - + (pkt->bpfp_header_length - pktap->pth_length); + + /* Length of the input packet starting from L3 header */ + uint32_t in_pkt_len = len - pkt->bpfp_header_length - pre; + if (pktap->pth_protocol_family == AF_INET || + pktap->pth_protocol_family == AF_INET6) { + /* Contains L2 header */ + if (pre > 0) { + if (pre < sizeof(struct ether_header)) + goto too_short; + + out_pkt_len = get_ether_trunc_len(pkt, 0, in_pkt_len); + } else if (pre == 0) { + if (pktap->pth_protocol_family == AF_INET) { + out_pkt_len = get_ip_trunc_len(pkt, pre, in_pkt_len); + } else if (pktap->pth_protocol_family == AF_INET6) { + out_pkt_len = get_ip6_trunc_len(pkt, pre, in_pkt_len); + } + } else { + /* Ideally pre should be >= 0. This is an exception */ + out_pkt_len = min(BPF_MIN_PKT_SIZE, in_pkt_len); + } + } else { + if (pktap->pth_iftype == IFT_ETHER) { + if (in_pkt_len < sizeof(struct ether_header)) { + goto too_short; + } + /* At most include the Ethernet header and 16 bytes */ + out_pkt_len = MIN(sizeof(struct ether_header) + 16, + in_pkt_len); + } else { + /* + * For unknown protocols include at most 16 bytes + */ + out_pkt_len = MIN(16, in_pkt_len); + } + } +done: + tlen = pkt->bpfp_header_length + out_pkt_len + pre; + return (tlen); +too_short: + out_pkt_len = in_pkt_len; + goto done; +} + /* * Move the packet data from interface memory (pkt) into the * store buffer. 
Return 1 if it's time to wakeup a listener (buffer full), @@ -2605,11 +3219,11 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt, * We cannot rotate buffers if a read is in progress * so drop the packet */ - if (d->bd_hbuf_read) { + if (d->bd_hbuf_read != 0) { ++d->bd_dcount; return; } - + if (d->bd_fbuf == NULL) { if (d->bd_headdrop == 0) { /* @@ -2630,11 +3244,10 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt, } do_wakeup = 1; curlen = 0; - } - else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) + } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* - * Immediate mode is set, or the read timeout has - * already expired during a select call. A packet + * Immediate mode is set, or the read timeout has + * already expired during a select call. A packet * arrived, so the reader should be woken up. */ do_wakeup = 1; @@ -2643,18 +3256,18 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt, * Append the bpf header. */ microtime(&tv); - if (d->bd_flags & BPF_EXTENDED_HDR) { + if (d->bd_flags & BPF_EXTENDED_HDR) { struct mbuf *m; m = (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF) ? 
pkt->bpfp_mbuf : NULL; - ehp = (struct bpf_hdr_ext *)(void *)(d->bd_sbuf + curlen); - memset(ehp, 0, sizeof(*ehp)); - ehp->bh_tstamp.tv_sec = tv.tv_sec; - ehp->bh_tstamp.tv_usec = tv.tv_usec; + ehp = (struct bpf_hdr_ext *)(void *)(d->bd_sbuf + curlen); + memset(ehp, 0, sizeof(*ehp)); + ehp->bh_tstamp.tv_sec = tv.tv_sec; + ehp->bh_tstamp.tv_usec = tv.tv_usec; ehp->bh_datalen = pkt->bpfp_total_length; - ehp->bh_hdrlen = hdrlen; + ehp->bh_hdrlen = hdrlen; caplen = ehp->bh_caplen = totlen - hdrlen; if (m == NULL) { if (outbound) { @@ -2688,16 +3301,16 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt, } } else ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN; - payload = (u_char *)ehp + hdrlen; - } else { - hp = (struct bpf_hdr *)(void *)(d->bd_sbuf + curlen); - hp->bh_tstamp.tv_sec = tv.tv_sec; - hp->bh_tstamp.tv_usec = tv.tv_usec; + payload = (u_char *)ehp + hdrlen; + } else { + hp = (struct bpf_hdr *)(void *)(d->bd_sbuf + curlen); + hp->bh_tstamp.tv_sec = tv.tv_sec; + hp->bh_tstamp.tv_usec = tv.tv_usec; hp->bh_datalen = pkt->bpfp_total_length; - hp->bh_hdrlen = hdrlen; + hp->bh_hdrlen = hdrlen; caplen = hp->bh_caplen = totlen - hdrlen; - payload = (u_char *)hp + hdrlen; - } + payload = (u_char *)hp + hdrlen; + } /* * Copy the packet data into the store buffer and update its length. */ @@ -2757,12 +3370,12 @@ bpf_freed(struct bpf_d *d) * been detached from its interface and it yet hasn't been marked * free. */ - if (d->bd_hbuf_read) + if (d->bd_hbuf_read != 0) panic("bpf buffer freed during read"); if (d->bd_sbuf != 0) { FREE(d->bd_sbuf, M_DEVBUF); - if (d->bd_hbuf != 0) + if (d->bd_hbuf != 0) FREE(d->bd_hbuf, M_DEVBUF); if (d->bd_fbuf != 0) FREE(d->bd_fbuf, M_DEVBUF); @@ -2772,7 +3385,7 @@ bpf_freed(struct bpf_d *d) } /* - * Attach an interface to bpf. driverp is a pointer to a (struct bpf_if *) + * Attach an interface to bpf. 
driverp is a pointer to a (struct bpf_if *) * in the driver's softc; dlt is the link layer type; hdrlen is the fixed * size of the link header (variable length headers not yet supported). */ @@ -2833,20 +3446,19 @@ bpf_attach( printf("bpfattach - %s with dlt %d is already attached\n", if_name(ifp), dlt); FREE(bp_new, M_DEVBUF); - return EEXIST; + return (EEXIST); } - + bp_new->bif_ifp = ifp; bp_new->bif_dlt = dlt; bp_new->bif_send = send; bp_new->bif_tap = tap; - + if (bp_first == NULL) { /* No other entries for this ifp */ bp_new->bif_next = bpf_iflist; bpf_iflist = bp_new; - } - else { + } else { if (ifnet_type(ifp) == IFT_ETHER && dlt == DLT_EN10MB) { /* Make this the first entry for this interface */ if (bp_before_first != NULL) { @@ -2863,7 +3475,7 @@ bpf_attach( bp_last->bif_next = bp_new; } } - + /* * Compute the length of the bpf header. This is not necessarily * equal to SIZEOF_BPF_HDR because we want to insert spacing such @@ -2873,7 +3485,7 @@ bpf_attach( bp_new->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; bp_new->bif_exthdrlen = BPF_WORDALIGN(hdrlen + sizeof(struct bpf_hdr_ext)) - hdrlen; - + /* Take a reference on the interface */ ifnet_reference(ifp); @@ -2884,7 +3496,7 @@ bpf_attach( printf("bpf: %s attached\n", if_name(ifp)); #endif - return 0; + return (0); } /* @@ -2945,7 +3557,7 @@ void bpf_init(__unused void *unused) { #ifdef __APPLE__ - int i; + int i; int maj; if (bpf_devsw_installed == 0) { @@ -2962,17 +3574,17 @@ bpf_init(__unused void *unused) lck_grp_free(bpf_mlock_grp); if (bpf_mlock_grp_attr) lck_grp_attr_free(bpf_mlock_grp_attr); - + bpf_mlock = NULL; bpf_mlock_attr = NULL; bpf_mlock_grp = NULL; bpf_mlock_grp_attr = NULL; bpf_devsw_installed = 0; - printf("bpf_init: failed to allocate a major number!\n"); + printf("bpf_init: failed to allocate a major number\n"); return; } - for (i = 0 ; i < NBPFILTER; i++) + for (i = 0; i < NBPFILTER; i++) bpf_make_dev_t(maj); } #else @@ -2981,7 +3593,7 @@ bpf_init(__unused void 
*unused) } #ifndef __APPLE__ -SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,bpf_drvinit,NULL) +SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR, bpf_drvinit, NULL) #endif #if CONFIG_MACF_NET diff --git a/bsd/net/bpf.h b/bsd/net/bpf.h index ff4eb1bff..0457a93ab 100644 --- a/bsd/net/bpf.h +++ b/bsd/net/bpf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,10 +83,42 @@ #include #include +#ifdef PRIVATE +#include +#include + +struct bpf_setup_args { + uuid_t bsa_uuid; + char bsa_ifname[IFNAMSIZ]; +}; +#endif /* PRIVATE */ + #ifdef KERNEL #include + +#if !defined(__i386__) && !defined(__x86_64__) +#define BPF_ALIGN 1 +#else /* defined(__i386__) || defined(__x86_64__) */ +#define BPF_ALIGN 0 +#endif /* defined(__i386__) || defined(__x86_64__) */ + +#if !BPF_ALIGN +#define EXTRACT_SHORT(p) ((u_int16_t)ntohs(*(u_int16_t *)(void *)p)) +#define EXTRACT_LONG(p) (ntohl(*(u_int32_t *)(void *)p)) +#else +#define EXTRACT_SHORT(p)\ + ((u_int16_t)\ + ((u_int16_t)*((u_char *)p+0)<<8|\ + (u_int16_t)*((u_char *)p+1)<<0)) +#define EXTRACT_LONG(p)\ + ((u_int32_t)*((u_char *)p+0)<<24|\ + (u_int32_t)*((u_char *)p+1)<<16|\ + (u_int32_t)*((u_char *)p+2)<<8|\ + (u_int32_t)*((u_char *)p+3)<<0) #endif +#endif /* KERNEL */ + /* BSD style release date */ #define BPF_RELEASE 199606 @@ -113,7 +145,8 @@ struct bpf_program { }; #ifdef KERNEL_PRIVATE -/* LP64 version of bpf_program. all pointers +/* + * LP64 version of bpf_program. all pointers * grow when we're dealing with a 64-bit process. 
* WARNING - keep in sync with bpf_program */ @@ -211,6 +244,11 @@ struct bpf_version { #define BIOCSWANTPKTAP _IOWR('B', 127, u_int) #define BIOCSHEADDROP _IOW('B', 128, int) #define BIOCGHEADDROP _IOR('B', 128, int) +#define BIOCSTRUNCATE _IOW('B', 129, u_int) +#define BIOCGETUUID _IOR('B', 130, uuid_t) +#define BIOCSETUP _IOW('B', 131, struct bpf_setup_args) +#define BIOCSPKTHDRV2 _IOW('B', 132, int) +#define BIOCGPKTHDRV2 _IOW('B', 133, int) #endif /* PRIVATE */ /* * Structure prepended to each packet. @@ -268,6 +306,7 @@ struct bpf_mtag { #define BPF_MTAG_DIR_IN 0 #define BPF_MTAG_DIR_OUT 1 }; + #endif /* PRIVATE */ /* @@ -1299,6 +1338,13 @@ struct bpf_dltlist { #pragma pack() #ifdef KERNEL_PRIVATE +#define BPF_MIN_PKT_SIZE 40 +#define PORT_DNS 53 +#define PORT_BOOTPS 67 +#define PORT_BOOTPC 68 +#define PORT_ISAKMP 500 +#define PORT_ISAKMP_NATT 4500 /* rfc3948 */ + /* Forward declerations */ struct ifnet; struct mbuf; diff --git a/bsd/net/bpf_filter.c b/bsd/net/bpf_filter.c index 80e31cd06..7fbafb3c0 100644 --- a/bsd/net/bpf_filter.c +++ b/bsd/net/bpf_filter.c @@ -74,27 +74,6 @@ #include #endif -#if !defined(__i386__) && !defined(__x86_64__) -#define BPF_ALIGN 1 -#else /* defined(__i386__) || defined(__x86_64__) */ -#define BPF_ALIGN 0 -#endif /* defined(__i386__) || defined(__x86_64__) */ - -#if !BPF_ALIGN -#define EXTRACT_SHORT(p) ((u_int16_t)ntohs(*(u_int16_t *)(void *)p)) -#define EXTRACT_LONG(p) (ntohl(*(u_int32_t *)(void *)p)) -#else -#define EXTRACT_SHORT(p)\ - ((u_int16_t)\ - ((u_int16_t)*((u_char *)p+0)<<8|\ - (u_int16_t)*((u_char *)p+1)<<0)) -#define EXTRACT_LONG(p)\ - ((u_int32_t)*((u_char *)p+0)<<24|\ - (u_int32_t)*((u_char *)p+1)<<16|\ - (u_int32_t)*((u_char *)p+2)<<8|\ - (u_int32_t)*((u_char *)p+3)<<0) -#endif - #ifdef KERNEL #include #endif diff --git a/bsd/net/bpfdesc.h b/bsd/net/bpfdesc.h index dcb9ac0af..8e18cb937 100644 --- a/bsd/net/bpfdesc.h +++ b/bsd/net/bpfdesc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2014 Apple Inc. 
All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,8 +75,10 @@ * The items in this header file should be wrapped in #ifdef KERNEL. */ +#include #include #include +#include /* * Descriptor associated with each open bpf file. @@ -145,6 +147,9 @@ struct bpf_d { void *bd_unref_lr[BPF_REF_HIST]; int bd_next_ref_lr; int bd_next_unref_lr; + + struct proc *bd_opened_by; + uuid_t bd_uuid; }; /* Values for bd_state */ @@ -159,13 +164,15 @@ struct bpf_d { (bd)->bd_slen != 0)) /* Values for bd_flags */ -#define BPF_EXTENDED_HDR 0x01 /* process req. the extended header */ -#define BPF_WANT_PKTAP 0x02 /* knows how to handle DLT_PKTAP */ -#define BPF_FINALIZE_PKTAP 0x04 /* finalize pktap header on read */ -#define BPF_KNOTE 0x08 /* kernel note attached */ -#define BPF_DETACHING 0x10 /* bpf_d is being detached */ -#define BPF_DETACHED 0x20 /* bpf_d is detached */ -#define BPF_CLOSING 0x40 /* bpf_d is being closed */ +#define BPF_EXTENDED_HDR 0x0001 /* process req. the extended header */ +#define BPF_WANT_PKTAP 0x0002 /* knows how to handle DLT_PKTAP */ +#define BPF_FINALIZE_PKTAP 0x0004 /* finalize pktap header on read */ +#define BPF_KNOTE 0x0008 /* kernel note attached */ +#define BPF_DETACHING 0x0010 /* bpf_d is being detached */ +#define BPF_DETACHED 0x0020 /* bpf_d is detached */ +#define BPF_CLOSING 0x0040 /* bpf_d is being closed */ +#define BPF_TRUNCATE 0x0080 /* truncate the packet payload */ +#define BPF_PKTHDRV2 0x0100 /* pktap header version 2 */ /* * Descriptor associated with each attached hardware interface. diff --git a/bsd/net/classq/classq_fq_codel.h b/bsd/net/classq/classq_fq_codel.h index 6db259bb7..7f9411802 100644 --- a/bsd/net/classq/classq_fq_codel.h +++ b/bsd/net/classq/classq_fq_codel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Apple Inc. All rights reserved. + * Copyright (c) 2016-2017 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/net/content_filter.c b/bsd/net/content_filter.c index 4a685a80a..ae7ff13ab 100644 --- a/bsd/net/content_filter.c +++ b/bsd/net/content_filter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Apple Inc. All rights reserved. + * Copyright (c) 2013-2018 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -303,6 +303,9 @@ #include #include #include +#include +#include +#include #include #include @@ -313,10 +316,12 @@ #include #include #include +#include +#include #include #include - +#include #define MAX_CONTENT_FILTER 2 @@ -344,6 +349,7 @@ struct content_filter { struct content_filter **content_filters = NULL; uint32_t cfil_active_count = 0; /* Number of active content filters */ uint32_t cfil_sock_attached_count = 0; /* Number of sockets attachements */ +uint32_t cfil_sock_udp_attached_count = 0; /* Number of UDP sockets attachements */ uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */ static kern_ctl_ref cfil_kctlref = NULL; @@ -430,6 +436,8 @@ struct cfil_entry { (cfil)->cfi_op_list_ctr ++; \ } +struct cfil_hash_entry; + /* * struct cfil_info * @@ -454,7 +462,9 @@ struct cfil_info { */ uint64_t cfi_pending_first; uint64_t cfi_pending_last; - int cfi_pending_mbcnt; + uint32_t cfi_pending_mbcnt; + uint32_t cfi_pending_mbnum; + uint32_t cfi_tail_drop_cnt; /* * cfi_pass_offset is the minimum of all the filters */ @@ -468,6 +478,7 @@ struct cfil_info { } cfi_snd, cfi_rcv; struct cfil_entry cfi_entries[MAX_CONTENT_FILTER]; + struct cfil_hash_entry *cfi_hash_entry; } __attribute__((aligned(8))); #define CFIF_DROP 0x0001 /* drop action applied */ @@ -488,6 +499,98 @@ TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head; #define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x) #define CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x) +/* + * UDP Socket Support + */ +LIST_HEAD(cfilhashhead, cfil_hash_entry); +#define CFILHASHSIZE 16 +#define CFIL_HASH(laddr, faddr, 
lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport)) +#define IS_UDP(so) (so && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP) +#define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \ + ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)))) +#define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \ + cfil_info->cfi_entries[kcunit - 1].cfe_filter != NULL) +#define IS_DNS(local, remote) (check_port(local, 53) || check_port(remote, 53) || check_port(local, 5353) || check_port(remote, 5353)) + +/* + * UDP Garbage Collection: + */ +static struct thread *cfil_udp_gc_thread; +#define UDP_FLOW_GC_IDLE_TO 30 // Flow Idle Timeout in seconds +#define UDP_FLOW_GC_ACTION_TO 10 // Flow Action Timeout (no action from user space) in seconds +#define UDP_FLOW_GC_MAX_COUNT 100 // Max UDP flows to be handled per run +#define UDP_FLOW_GC_RUN_INTERVAL_NSEC (10 * NSEC_PER_SEC) // GC wakes up every 10 seconds + +/* + * UDP flow queue thresholds + */ +#define UDP_FLOW_GC_MBUF_CNT_MAX (2 << MBSHIFT) // Max mbuf byte count in flow queue (2MB) +#define UDP_FLOW_GC_MBUF_NUM_MAX (UDP_FLOW_GC_MBUF_CNT_MAX >> MCLSHIFT) // Max mbuf count in flow queue (1K) +#define UDP_FLOW_GC_MBUF_SHIFT 5 // Shift to get 1/32 of platform limits +/* + * UDP flow queue threshold globals: + */ +static unsigned int cfil_udp_gc_mbuf_num_max = UDP_FLOW_GC_MBUF_NUM_MAX; +static unsigned int cfil_udp_gc_mbuf_cnt_max = UDP_FLOW_GC_MBUF_CNT_MAX; + +/* + * struct cfil_hash_entry + * + * Hash entry for cfil_info + */ +struct cfil_hash_entry { + LIST_ENTRY(cfil_hash_entry) cfentry_link; + struct cfil_info *cfentry_cfil; + u_short cfentry_fport; + u_short cfentry_lport; + sa_family_t cfentry_family; + u_int32_t cfentry_flowhash; + u_int32_t cfentry_lastused; + union { + /* foreign host table entry */ + struct in_addr_4in6 addr46; + struct in6_addr addr6; + } 
cfentry_faddr; + union { + /* local host table entry */ + struct in_addr_4in6 addr46; + struct in6_addr addr6; + } cfentry_laddr; +}; + +/* + * struct cfil_db + * + * For each UDP socket, this is a hash table maintaining all cfil_info structs + * keyed by the flow 4-tuples . + */ +struct cfil_db { + struct socket *cfdb_so; + uint32_t cfdb_count; /* Number of total content filters */ + struct cfilhashhead *cfdb_hashbase; + u_long cfdb_hashmask; + struct cfil_hash_entry *cfdb_only_entry; /* Optimization for connected UDP */ +}; + +/* + * CFIL specific mbuf tag: + * Save state of socket at the point of data entry into cfil. + * Use saved state for reinjection at protocol layer. + */ +struct cfil_tag { + union sockaddr_in_4_6 cfil_faddr; + uint32_t cfil_so_state_change_cnt; + short cfil_so_options; +}; + +#define CFIL_HASH_ENTRY_ZONE_NAME "cfil_entry_hash" +#define CFIL_HASH_ENTRY_ZONE_MAX 1024 +static struct zone *cfil_hash_entry_zone = NULL; + +#define CFIL_DB_ZONE_NAME "cfil_db" +#define CFIL_DB_ZONE_MAX 1024 +static struct zone *cfil_db_zone = NULL; + /* * Statistics */ @@ -500,6 +603,15 @@ struct cfil_stats cfil_stats; int cfil_log_level = LOG_ERR; int cfil_debug = 1; +// Debug controls added for selective debugging. +// Disabled for production. 
If enabled, +// these will have performance impact +#define LIFECYCLE_DEBUG 0 +#define VERDICT_DEBUG 0 +#define DATA_DEBUG 0 +#define SHOW_DEBUG 0 +#define GC_DEBUG 0 + /* * Sysctls for logs and statistics */ @@ -541,31 +653,32 @@ SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD|CTLFLAG_LOCKED, /* * Forward declaration to appease the compiler */ -static int cfil_action_data_pass(struct socket *, uint32_t, int, +static int cfil_action_data_pass(struct socket *, struct cfil_info *, uint32_t, int, uint64_t, uint64_t); -static int cfil_action_drop(struct socket *, uint32_t); +static int cfil_action_drop(struct socket *, struct cfil_info *, uint32_t); static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *); -static int cfil_dispatch_closed_event(struct socket *, int); -static int cfil_data_common(struct socket *, int, struct sockaddr *, +static int cfil_dispatch_closed_event(struct socket *, struct cfil_info *, int); +static int cfil_data_common(struct socket *, struct cfil_info *, int, struct sockaddr *, struct mbuf *, struct mbuf *, uint32_t); -static int cfil_data_filter(struct socket *, uint32_t, int, +static int cfil_data_filter(struct socket *, struct cfil_info *, uint32_t, int, struct mbuf *, uint64_t); static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *, struct in_addr, u_int16_t); static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *, struct in6_addr *, u_int16_t); -static int cfil_dispatch_attach_event(struct socket *, uint32_t); -static void cfil_info_free(struct socket *, struct cfil_info *); -static struct cfil_info * cfil_info_alloc(struct socket *); -static int cfil_info_attach_unit(struct socket *, uint32_t); -static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t); -static struct socket *cfil_socket_from_client_uuid(uuid_t, bool *); -static int cfil_service_pending_queue(struct socket *, uint32_t, int); -static int cfil_data_service_ctl_q(struct socket *, uint32_t, int); +; +static int cfil_dispatch_attach_event(struct 
socket *, struct cfil_info *, uint32_t); +static void cfil_info_free(struct cfil_info *); +static struct cfil_info * cfil_info_alloc(struct socket *, struct cfil_hash_entry *); +static int cfil_info_attach_unit(struct socket *, uint32_t, struct cfil_info *); +static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t, bool); +static struct socket * cfil_socket_from_client_uuid(uuid_t, bool *); +static int cfil_service_pending_queue(struct socket *, struct cfil_info *, uint32_t, int); +static int cfil_data_service_ctl_q(struct socket *, struct cfil_info *, uint32_t, int); static void cfil_info_verify(struct cfil_info *); -static int cfil_update_data_offsets(struct socket *, uint32_t, int, +static int cfil_update_data_offsets(struct socket *, struct cfil_info *, uint32_t, int, uint64_t, uint64_t); -static int cfil_acquire_sockbuf(struct socket *, int); +static int cfil_acquire_sockbuf(struct socket *, struct cfil_info *, int); static void cfil_release_sockbuf(struct socket *, int); static int cfil_filters_attached(struct socket *); @@ -576,7 +689,41 @@ static void cfil_rw_unlock_shared(lck_rw_t *); static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *); static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *); -static unsigned int cfil_data_length(struct mbuf *, int *); +static unsigned int cfil_data_length(struct mbuf *, int *, int *); +static errno_t cfil_db_init(struct socket *); +static void cfil_db_free(struct socket *so); +struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *); +struct cfil_hash_entry *cfil_db_lookup_entry_with_sockid(struct cfil_db *, u_int64_t); +struct cfil_hash_entry *cfil_db_add_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *); +void cfil_db_delete_entry(struct cfil_db *, struct cfil_hash_entry *); +struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *); +struct cfil_info *cfil_db_get_cfil_info(struct cfil_db 
*, cfil_sock_id_t); +static errno_t cfil_sock_udp_handle_data(bool, struct socket *, struct sockaddr *, struct sockaddr *, + struct mbuf *, struct mbuf *, uint32_t); +static int32_t cfil_sock_udp_data_pending(struct sockbuf *, bool); +static void cfil_sock_udp_is_closed(struct socket *); +static int cfil_sock_udp_notify_shutdown(struct socket *, int , int, int); +static int cfil_sock_udp_shutdown(struct socket *, int *); +static void cfil_sock_udp_close_wait(struct socket *); +static void cfil_sock_udp_buf_update(struct sockbuf *); +static int cfil_filters_udp_attached(struct socket *, bool); +static void cfil_get_flow_address_v6(struct cfil_hash_entry *, struct inpcb *, + struct in6_addr **, struct in6_addr **, + u_int16_t *, u_int16_t *); +static void cfil_get_flow_address(struct cfil_hash_entry *, struct inpcb *, + struct in_addr *, struct in_addr *, + u_int16_t *, u_int16_t *); +static void cfil_info_log(int, struct cfil_info *, const char *); +void cfil_filter_show(u_int32_t); +void cfil_info_show(void); +bool cfil_info_idle_timed_out(struct cfil_info *, int, u_int32_t); +bool cfil_info_action_timed_out(struct cfil_info *, int); +bool cfil_info_buffer_threshold_exceeded(struct cfil_info *); +struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *); +static void cfil_udp_gc_thread_func(void *, wait_result_t); +static void cfil_info_udp_expire(void *, wait_result_t); + +bool check_port(struct sockaddr *, u_short); /* * Content filter global read write lock @@ -676,29 +823,65 @@ cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive) /* * Return the number of bytes in the mbuf chain using the same * method as m_length() or sballoc() + * + * Returns data len - starting from PKT start + * - retmbcnt - optional param to get total mbuf bytes in chain + * - retmbnum - optional param to get number of mbufs in chain */ static unsigned int -cfil_data_length(struct mbuf *m, int *retmbcnt) +cfil_data_length(struct mbuf *m, int *retmbcnt, int *retmbnum) { 
struct mbuf *m0; - unsigned int pktlen; + unsigned int pktlen = 0; int mbcnt; + int mbnum; + + // Locate the start of data + for (m0 = m; m0 != NULL; m0 = m0->m_next) { + if (m0->m_flags & M_PKTHDR) + break; + } + if (m0 == NULL) { + CFIL_LOG(LOG_ERR, "cfil_data_length: no M_PKTHDR"); + return (0); + } + m = m0; - if (retmbcnt == NULL) + if (retmbcnt == NULL && retmbnum == NULL) return (m_length(m)); pktlen = 0; mbcnt = 0; + mbnum = 0; for (m0 = m; m0 != NULL; m0 = m0->m_next) { pktlen += m0->m_len; + mbnum++; mbcnt += MSIZE; if (m0->m_flags & M_EXT) mbcnt += m0->m_ext.ext_size; } - *retmbcnt = mbcnt; + if (retmbcnt) { + *retmbcnt = mbcnt; + } + if (retmbnum) { + *retmbnum = mbnum; + } return (pktlen); } +static struct mbuf * +cfil_data_start(struct mbuf *m) +{ + struct mbuf *m0; + + // Locate the start of data + for (m0 = m; m0 != NULL; m0 = m0->m_next) { + if (m0->m_flags & M_PKTHDR) + break; + } + return m0; +} + /* * Common mbuf queue utilities */ @@ -754,6 +937,7 @@ cfil_queue_len(struct cfil_queue *cfq) static void cfil_queue_verify(struct cfil_queue *cfq) { + mbuf_t chain; mbuf_t m; mbuf_t n; uint64_t queuesize = 0; @@ -769,11 +953,15 @@ cfil_queue_verify(struct cfil_queue *cfq) (!MBUFQ_EMPTY(&cfq->q_mq) && cfq->q_start != cfq->q_end)); - MBUFQ_FOREACH(m, &cfq->q_mq) { + MBUFQ_FOREACH(chain, &cfq->q_mq) { size_t chainsize = 0; - unsigned int mlen = m_length(m); + m = chain; + unsigned int mlen = cfil_data_length(m, NULL, NULL); + // skip the addr and control stuff if present + m = cfil_data_start(m); - if (m == (void *)M_TAG_FREE_PATTERN || + if (m == NULL || + m == (void *)M_TAG_FREE_PATTERN || m->m_next == (void *)M_TAG_FREE_PATTERN || m->m_nextpkt == (void *)M_TAG_FREE_PATTERN) panic("%s - mq %p is free at %p", __func__, @@ -812,7 +1000,7 @@ cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len) { CFIL_QUEUE_VERIFY(cfq); - VERIFY(m_length(m) == len); + VERIFY(cfil_data_length(m, NULL, NULL) == len); MBUFQ_REMOVE(&cfq->q_mq, m); MBUFQ_NEXT(m) = 
NULL; @@ -984,6 +1172,7 @@ cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo) errno_t error = 0; struct content_filter *cfc; struct cfil_entry *entry; + uint64_t sock_flow_id = 0; CFIL_LOG(LOG_NOTICE, ""); @@ -1028,6 +1217,7 @@ cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo) if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) { struct cfil_info *cfil_info = entry->cfe_cfil_info; struct socket *so = cfil_info->cfi_so; + sock_flow_id = cfil_info->cfi_sock_id; /* Need to let data flow immediately */ entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED | @@ -1044,37 +1234,43 @@ cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo) * When cfe_filter is NULL the filter is detached * and the entry has been removed from cf_sock_entries */ - if (so->so_cfil == NULL || entry->cfe_filter == NULL) { + if ((so->so_cfil == NULL && so->so_cfil_db == NULL) || entry->cfe_filter == NULL) { cfil_rw_lock_exclusive(&cfil_lck_rw); goto release; } - (void) cfil_action_data_pass(so, kcunit, 1, + + (void) cfil_action_data_pass(so, cfil_info, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET); - (void) cfil_action_data_pass(so, kcunit, 0, + (void) cfil_action_data_pass(so, cfil_info, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET); cfil_rw_lock_exclusive(&cfil_lck_rw); /* - * Check again as the socket may have been unlocked - * when when calling cfil_acquire_sockbuf() + * Check again to make sure if the cfil_info is still valid + * as the socket may have been unlocked when when calling + * cfil_acquire_sockbuf() */ - if (so->so_cfil == NULL || entry->cfe_filter == NULL) + if (entry->cfe_filter == NULL || + (so->so_cfil == NULL && cfil_db_get_cfil_info(so->so_cfil_db, sock_flow_id) == NULL)) { goto release; + } /* The filter is now detached */ entry->cfe_flags |= CFEF_CFIL_DETACHED; +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_DEBUG, cfil_info, "CFIL: LIFECYCLE: - FILTER DISCONNECTED"); +#endif CFIL_LOG(LOG_NOTICE, "so %llx detached %u", 
(uint64_t)VM_KERNEL_ADDRPERM(so), kcunit); - - if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) && + if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) && cfil_filters_attached(so) == 0) { CFIL_LOG(LOG_NOTICE, "so %llx waking", (uint64_t)VM_KERNEL_ADDRPERM(so)); - wakeup((caddr_t)&so->so_cfil); + wakeup((caddr_t)cfil_info); } /* @@ -1126,7 +1322,7 @@ cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo) * sblock(), sbunlock() or sodefunct() */ static int -cfil_acquire_sockbuf(struct socket *so, int outgoing) +cfil_acquire_sockbuf(struct socket *so, struct cfil_info *cfil_info, int outgoing) { thread_t tp = current_thread(); struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv; @@ -1168,11 +1364,11 @@ cfil_acquire_sockbuf(struct socket *so, int outgoing) sb->sb_cfil_refs++; /* We acquire the socket buffer when we need to cleanup */ - if (so->so_cfil == NULL) { + if (cfil_info == NULL) { CFIL_LOG(LOG_ERR, "so %llx cfil detached", (uint64_t)VM_KERNEL_ADDRPERM(so)); error = 0; - } else if (so->so_cfil->cfi_flags & CFIF_DROP) { + } else if (cfil_info->cfi_flags & CFIF_DROP) { CFIL_LOG(LOG_ERR, "so %llx drop set", (uint64_t)VM_KERNEL_ADDRPERM(so)); error = EPIPE; @@ -1221,15 +1417,36 @@ cfil_sock_id_from_socket(struct socket *so) return (CFIL_SOCK_ID_NONE); } +static bool +cfil_socket_safe_lock(struct inpcb *inp) +{ + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { + socket_lock(inp->inp_socket, 1); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) { + return true; + } + socket_unlock(inp->inp_socket, 1); + } + return false; +} + static struct socket * -cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id) +cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id, bool udp_only) { struct socket *so = NULL; u_int64_t gencnt = cfil_sock_id >> 32; u_int32_t flowhash = (u_int32_t)(cfil_sock_id & 0x0ffffffff); struct inpcb *inp = NULL; - struct inpcbinfo *pcbinfo = &tcbinfo; + struct inpcbinfo *pcbinfo = NULL; + +#if VERDICT_DEBUG + 
CFIL_LOG(LOG_ERR, "CFIL: VERDICT: search for socket: id %llu gencnt %llx flowhash %x", cfil_sock_id, gencnt, flowhash); +#endif + + if (udp_only) + goto find_udp; + pcbinfo = &tcbinfo; lck_rw_lock_shared(pcbinfo->ipi_lock); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { if (inp->inp_state != INPCB_STATE_DEAD && @@ -1237,12 +1454,33 @@ cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id) inp->inp_flowhash == flowhash && (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt && inp->inp_socket->so_cfil != NULL) { - so = inp->inp_socket; + if (cfil_socket_safe_lock(inp)) + so = inp->inp_socket; + break; + } + } + lck_rw_done(pcbinfo->ipi_lock); + if (so != NULL) { + goto done; + } + +find_udp: + + pcbinfo = &udbinfo; + lck_rw_lock_shared(pcbinfo->ipi_lock); + LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + if (inp->inp_state != INPCB_STATE_DEAD && + inp->inp_socket != NULL && + inp->inp_socket->so_cfil_db != NULL && + (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) { + if (cfil_socket_safe_lock(inp)) + so = inp->inp_socket; break; } } lck_rw_done(pcbinfo->ipi_lock); +done: if (so == NULL) { OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found); CFIL_LOG(LOG_DEBUG, @@ -1266,12 +1504,31 @@ cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached) inp->inp_socket != NULL && uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) { *cfil_attached = (inp->inp_socket->so_cfil != NULL); - so = inp->inp_socket; + if (cfil_socket_safe_lock(inp)) + so = inp->inp_socket; + break; + } + } + lck_rw_done(pcbinfo->ipi_lock); + if (so != NULL) { + goto done; + } + + pcbinfo = &udbinfo; + lck_rw_lock_shared(pcbinfo->ipi_lock); + LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + if (inp->inp_state != INPCB_STATE_DEAD && + inp->inp_socket != NULL && + uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) { + *cfil_attached = (inp->inp_socket->so_cfil_db != NULL); + if (cfil_socket_safe_lock(inp)) + so = inp->inp_socket; break; } } 
lck_rw_done(pcbinfo->ipi_lock); +done: return (so); } @@ -1286,6 +1543,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, struct socket *so; struct cfil_msg_action *action_msg; struct cfil_entry *entry; + struct cfil_info *cfil_info = NULL; CFIL_LOG(LOG_INFO, ""); @@ -1359,31 +1617,32 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, cfil_rw_unlock_shared(&cfil_lck_rw); goto done; } + cfil_rw_unlock_shared(&cfil_lck_rw); - so = cfil_socket_from_sock_id(msghdr->cfm_sock_id); + // Search for socket (TCP+UDP and lock so) + so = cfil_socket_from_sock_id(msghdr->cfm_sock_id, false); if (so == NULL) { CFIL_LOG(LOG_NOTICE, "bad sock_id %llx", msghdr->cfm_sock_id); error = EINVAL; - cfil_rw_unlock_shared(&cfil_lck_rw); goto done; } - cfil_rw_unlock_shared(&cfil_lck_rw); - socket_lock(so, 1); + cfil_info = so->so_cfil_db != NULL ? + cfil_db_get_cfil_info(so->so_cfil_db, msghdr->cfm_sock_id) : so->so_cfil; - if (so->so_cfil == NULL) { - CFIL_LOG(LOG_NOTICE, "so %llx not attached", - (uint64_t)VM_KERNEL_ADDRPERM(so)); + if (cfil_info == NULL) { + CFIL_LOG(LOG_NOTICE, "so %llx not attached", + (uint64_t)VM_KERNEL_ADDRPERM(so), msghdr->cfm_sock_id); error = EINVAL; goto unlock; - } else if (so->so_cfil->cfi_flags & CFIF_DROP) { + } else if (cfil_info->cfi_flags & CFIF_DROP) { CFIL_LOG(LOG_NOTICE, "so %llx drop set", (uint64_t)VM_KERNEL_ADDRPERM(so)); error = EINVAL; goto unlock; } - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (entry->cfe_filter == NULL) { CFIL_LOG(LOG_NOTICE, "so %llx no filter", (uint64_t)VM_KERNEL_ADDRPERM(so)); @@ -1402,15 +1661,22 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, } microuptime(&entry->cfe_last_action); - CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_action, &so->so_cfil->cfi_first_event, msghdr->cfm_op); + CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_action, &cfil_info->cfi_first_event, 
msghdr->cfm_op); action_msg = (struct cfil_msg_action *)msghdr; switch (msghdr->cfm_op) { case CFM_OP_DATA_UPDATE: +#if VERDICT_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: ", + (uint64_t)VM_KERNEL_ADDRPERM(so), + cfil_info->cfi_sock_id, + action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset, + action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset); +#endif if (action_msg->cfa_out_peek_offset != 0 || action_msg->cfa_out_pass_offset != 0) - error = cfil_action_data_pass(so, kcunit, 1, + error = cfil_action_data_pass(so, cfil_info, kcunit, 1, action_msg->cfa_out_pass_offset, action_msg->cfa_out_peek_offset); if (error == EJUSTRETURN) @@ -1419,7 +1685,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, break; if (action_msg->cfa_in_peek_offset != 0 || action_msg->cfa_in_pass_offset != 0) - error = cfil_action_data_pass(so, kcunit, 0, + error = cfil_action_data_pass(so, cfil_info, kcunit, 0, action_msg->cfa_in_pass_offset, action_msg->cfa_in_peek_offset); if (error == EJUSTRETURN) @@ -1427,7 +1693,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m, break; case CFM_OP_DROP: - error = cfil_action_drop(so, kcunit); + error = cfil_action_drop(so, cfil_info, kcunit); break; default: @@ -1452,6 +1718,7 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int opt, void *data, size_t *len) { #pragma unused(kctlref, opt) + struct cfil_info *cfil_info = NULL; errno_t error = 0; struct content_filter *cfc = (struct content_filter *)unitinfo; @@ -1501,14 +1768,6 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, struct cfil_opt_sock_info *sock_info = (struct cfil_opt_sock_info *) data; - struct socket *sock = - cfil_socket_from_sock_id(sock_info->cfs_sock_id); - if (sock == NULL) { - CFIL_LOG(LOG_NOTICE, "bad sock_id %llx", - sock_info->cfs_sock_id); - error = ENOENT; - goto done; - } // Unlock here so that we never hold both cfil_lck_rw and 
the // socket_lock at the same time. Otherwise, this can deadlock @@ -1521,11 +1780,26 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, // goto return_already_unlocked from this branch. cfil_rw_unlock_shared(&cfil_lck_rw); - socket_lock(sock, 1); + // Search (TCP+UDP) and lock socket + struct socket *sock = + cfil_socket_from_sock_id(sock_info->cfs_sock_id, false); + if (sock == NULL) { +#if LIFECYCLE_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: GET_SOCKET_INFO failed: bad sock_id %llu", + sock_info->cfs_sock_id); +#endif + error = ENOENT; + goto return_already_unlocked; + } + + cfil_info = (sock->so_cfil_db != NULL) ? + cfil_db_get_cfil_info(sock->so_cfil_db, sock_info->cfs_sock_id) : sock->so_cfil; - if (sock->so_cfil == NULL) { - CFIL_LOG(LOG_NOTICE, "so %llx not attached, cannot fetch info", + if (cfil_info == NULL) { +#if LIFECYCLE_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: GET_SOCKET_INFO failed: so %llx not attached, cannot fetch info", (uint64_t)VM_KERNEL_ADDRPERM(sock)); +#endif error = EINVAL; socket_unlock(sock, 1); goto return_already_unlocked; @@ -1539,15 +1813,21 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, // Source and destination addresses struct inpcb *inp = sotoinpcb(sock); if (inp->inp_vflag & INP_IPV6) { - fill_ip6_sockaddr_4_6(&sock_info->cfs_local, - &inp->in6p_laddr, inp->inp_lport); - fill_ip6_sockaddr_4_6(&sock_info->cfs_remote, - &inp->in6p_faddr, inp->inp_fport); + struct in6_addr *laddr = NULL, *faddr = NULL; + u_int16_t lport = 0, fport = 0; + + cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp, + &laddr, &faddr, &lport, &fport); + fill_ip6_sockaddr_4_6(&sock_info->cfs_local, laddr, lport); + fill_ip6_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport); } else if (inp->inp_vflag & INP_IPV4) { - fill_ip_sockaddr_4_6(&sock_info->cfs_local, - inp->inp_laddr, inp->inp_lport); - fill_ip_sockaddr_4_6(&sock_info->cfs_remote, - inp->inp_faddr, inp->inp_fport); + struct in_addr laddr = {0}, faddr = {0}; + 
u_int16_t lport = 0, fport = 0; + + cfil_get_flow_address(cfil_info->cfi_hash_entry, inp, + &laddr, &faddr, &lport, &fport); + fill_ip_sockaddr_4_6(&sock_info->cfs_local, laddr, lport); + fill_ip_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport); } // Set the pid info @@ -1644,6 +1924,7 @@ cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags) struct socket *so = NULL; int error; struct cfil_entry *entry; + struct cfil_info *cfil_info = NULL; CFIL_LOG(LOG_INFO, ""); @@ -1697,22 +1978,23 @@ cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags) OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_flow_lift); - so = entry->cfe_cfil_info->cfi_so; + cfil_info = entry->cfe_cfil_info; + so = cfil_info->cfi_so; cfil_rw_unlock_shared(&cfil_lck_rw); socket_lock(so, 1); do { - error = cfil_acquire_sockbuf(so, 1); + error = cfil_acquire_sockbuf(so, cfil_info, 1); if (error == 0) - error = cfil_data_service_ctl_q(so, kcunit, 1); + error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 1); cfil_release_sockbuf(so, 1); if (error != 0) break; - error = cfil_acquire_sockbuf(so, 0); + error = cfil_acquire_sockbuf(so, cfil_info, 0); if (error == 0) - error = cfil_data_service_ctl_q(so, kcunit, 0); + error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 0); cfil_release_sockbuf(so, 0); } while (0); @@ -1731,7 +2013,10 @@ cfil_init(void) struct kern_ctl_reg kern_ctl; errno_t error = 0; vm_size_t content_filter_size = 0; /* size of content_filter */ - vm_size_t cfil_info_size = 0; /* size of cfil_info */ + vm_size_t cfil_info_size = 0; /* size of cfil_info */ + vm_size_t cfil_hash_entry_size = 0; /* size of cfil_hash_entry */ + vm_size_t cfil_db_size = 0; /* size of cfil_db */ + unsigned int mbuf_limit = 0; CFIL_LOG(LOG_NOTICE, ""); @@ -1800,6 +2085,33 @@ cfil_init(void) zone_change(cfil_info_zone, Z_CALLERACCT, FALSE); zone_change(cfil_info_zone, Z_EXPAND, TRUE); + /* + * Zone for content filters cfil hash entries and db + */ + 
cfil_hash_entry_size = sizeof(struct cfil_hash_entry); + cfil_hash_entry_zone = zinit(cfil_hash_entry_size, + CFIL_HASH_ENTRY_ZONE_MAX * cfil_hash_entry_size, + 0, + CFIL_HASH_ENTRY_ZONE_NAME); + if (cfil_hash_entry_zone == NULL) { + panic("%s: zinit(%s) failed", __func__, CFIL_HASH_ENTRY_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(cfil_hash_entry_zone, Z_CALLERACCT, FALSE); + zone_change(cfil_hash_entry_zone, Z_EXPAND, TRUE); + + cfil_db_size = sizeof(struct cfil_db); + cfil_db_zone = zinit(cfil_db_size, + CFIL_DB_ZONE_MAX * cfil_db_size, + 0, + CFIL_DB_ZONE_NAME); + if (cfil_db_zone == NULL) { + panic("%s: zinit(%s) failed", __func__, CFIL_DB_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(cfil_db_zone, Z_CALLERACCT, FALSE); + zone_change(cfil_db_zone, Z_EXPAND, TRUE); + /* * Allocate locks */ @@ -1843,10 +2155,24 @@ cfil_init(void) CFIL_LOG(LOG_ERR, "ctl_register failed: %d", error); return; } + + // Spawn thread for gargage collection + if (kernel_thread_start(cfil_udp_gc_thread_func, NULL, + &cfil_udp_gc_thread) != KERN_SUCCESS) { + panic_plain("%s: Can't create UDP GC thread", __func__); + /* NOTREACHED */ + } + /* this must not fail */ + VERIFY(cfil_udp_gc_thread != NULL); + + // Set UDP per-flow mbuf thresholds to 1/32 of platform max + mbuf_limit = MAX(UDP_FLOW_GC_MBUF_CNT_MAX, (nmbclusters << MCLSHIFT) >> UDP_FLOW_GC_MBUF_SHIFT); + cfil_udp_gc_mbuf_num_max = (mbuf_limit >> MCLSHIFT); + cfil_udp_gc_mbuf_cnt_max = mbuf_limit; } struct cfil_info * -cfil_info_alloc(struct socket *so) +cfil_info_alloc(struct socket *so, struct cfil_hash_entry *hash_entry) { int kcunit; struct cfil_info *cfil_info = NULL; @@ -1880,6 +2206,11 @@ cfil_info_alloc(struct socket *so) entry->cfe_rcv.cfe_pass_offset = 0; entry->cfe_rcv.cfe_peek_offset = 0; entry->cfe_rcv.cfe_peeked = 0; + /* + * Timestamp the last action to avoid pre-maturely + * triggering garbage collection + */ + microuptime(&entry->cfe_last_action); cfil_queue_init(&entry->cfe_snd.cfe_pending_q); 
cfil_queue_init(&entry->cfe_rcv.cfe_pending_q); @@ -1888,16 +2219,36 @@ cfil_info_alloc(struct socket *so) } cfil_rw_lock_exclusive(&cfil_lck_rw); - - so->so_cfil = cfil_info; - cfil_info->cfi_so = so; + /* * Create a cfi_sock_id that's not the socket pointer! */ - if (inp->inp_flowhash == 0) - inp->inp_flowhash = inp_calc_flowhash(inp); - cfil_info->cfi_sock_id = - ((so->so_gencnt << 32) | inp->inp_flowhash); + + if (hash_entry == NULL) { + // This is the TCP case, cfil_info is tracked per socket + if (inp->inp_flowhash == 0) + inp->inp_flowhash = inp_calc_flowhash(inp); + + so->so_cfil = cfil_info; + cfil_info->cfi_so = so; + cfil_info->cfi_sock_id = + ((so->so_gencnt << 32) | inp->inp_flowhash); + } else { + // This is the UDP case, cfil_info is tracked in per-socket hash + cfil_info->cfi_so = so; + hash_entry->cfentry_cfil = cfil_info; + cfil_info->cfi_hash_entry = hash_entry; + cfil_info->cfi_sock_id = ((so->so_gencnt << 32) | (hash_entry->cfentry_flowhash & 0xffffffff)); + CFIL_LOG(LOG_DEBUG, "CFIL: UDP inp_flowhash %x so_gencnt %llx entry flowhash %x sockID %llx", + inp->inp_flowhash, so->so_gencnt, hash_entry->cfentry_flowhash, cfil_info->cfi_sock_id); + + // Wake up gc thread if this is first flow added + if (cfil_sock_udp_attached_count == 0) { + thread_wakeup((caddr_t)&cfil_sock_udp_attached_count); + } + + cfil_sock_udp_attached_count++; + } TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link); @@ -1915,10 +2266,9 @@ cfil_info_alloc(struct socket *so) } int -cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit) +cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit, struct cfil_info *cfil_info) { int kcunit; - struct cfil_info *cfil_info = so->so_cfil; int attached = 0; CFIL_LOG(LOG_INFO, ""); @@ -1956,19 +2306,12 @@ cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit) } static void -cfil_info_free(struct socket *so, struct cfil_info *cfil_info) +cfil_info_free(struct cfil_info *cfil_info) { int 
kcunit; uint64_t in_drain = 0; uint64_t out_drained = 0; - so->so_cfil = NULL; - - if (so->so_flags & SOF_CONTENT_FILTER) { - so->so_flags &= ~SOF_CONTENT_FILTER; - VERIFY(so->so_usecount > 0); - so->so_usecount--; - } if (cfil_info == NULL) return; @@ -1999,6 +2342,8 @@ cfil_info_free(struct socket *so, struct cfil_info *cfil_info) verify_content_filter(cfc); } + if (cfil_info->cfi_hash_entry != NULL) + cfil_sock_udp_attached_count--; cfil_sock_attached_count--; TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link); @@ -2061,20 +2406,20 @@ cfil_sock_attach(struct socket *so) OSIncrementAtomic(&cfil_stats.cfs_sock_attach_already); CFIL_LOG(LOG_ERR, "already attached"); } else { - cfil_info_alloc(so); + cfil_info_alloc(so, NULL); if (so->so_cfil == NULL) { error = ENOMEM; OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem); goto done; } } - if (cfil_info_attach_unit(so, filter_control_unit) == 0) { + if (cfil_info_attach_unit(so, filter_control_unit, so->so_cfil) == 0) { CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed", filter_control_unit); OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed); goto done; } - CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockid %llx", + CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockID %llx", (uint64_t)VM_KERNEL_ADDRPERM(so), filter_control_unit, so->so_cfil->cfi_sock_id); @@ -2084,7 +2429,7 @@ cfil_sock_attach(struct socket *so) /* Hold a reference on the socket */ so->so_usecount++; - error = cfil_dispatch_attach_event(so, filter_control_unit); + error = cfil_dispatch_attach_event(so, so->so_cfil, filter_control_unit); /* We can recover from flow control or out of memory errors */ if (error == ENOBUFS || error == ENOMEM) error = 0; @@ -2103,15 +2448,26 @@ cfil_sock_attach(struct socket *so) errno_t cfil_sock_detach(struct socket *so) { + if (IS_UDP(so)) { + cfil_db_free(so); + return (0); + } + if (so->so_cfil) { - cfil_info_free(so, so->so_cfil); + if (so->so_flags & SOF_CONTENT_FILTER) { + so->so_flags &= 
~SOF_CONTENT_FILTER; + VERIFY(so->so_usecount > 0); + so->so_usecount--; + } + cfil_info_free(so->so_cfil); + so->so_cfil = NULL; OSIncrementAtomic(&cfil_stats.cfs_sock_detached); } return (0); } static int -cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit) +cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint32_t filter_control_unit) { errno_t error = 0; struct cfil_entry *entry = NULL; @@ -2137,7 +2493,7 @@ cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit) continue; if (cfc->cf_necp_control_unit != filter_control_unit) continue; - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (entry->cfe_filter == NULL) continue; @@ -2180,6 +2536,12 @@ cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit) msg_attached.cfs_e_pid = so->last_pid; memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t)); } + +#if LIFECYCLE_DEBUG + CFIL_LOG(LOG_DEBUG, "CFIL: LIFECYCLE: SENDING ATTACH UP ", + entry->cfe_cfil_info->cfi_sock_id); +#endif + error = ctl_enqueuedata(entry->cfe_filter->cf_kcref, entry->cfe_filter->cf_kcunit, &msg_attached, @@ -2190,8 +2552,8 @@ cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit) goto done; } microuptime(&entry->cfe_last_event); - so->so_cfil->cfi_first_event.tv_sec = entry->cfe_last_event.tv_sec; - so->so_cfil->cfi_first_event.tv_usec = entry->cfe_last_event.tv_usec; + cfil_info->cfi_first_event.tv_sec = entry->cfe_last_event.tv_sec; + cfil_info->cfi_first_event.tv_usec = entry->cfe_last_event.tv_usec; entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED; OSIncrementAtomic(&cfil_stats.cfs_attach_event_ok); @@ -2218,7 +2580,7 @@ cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit) } static int -cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing) +cfil_dispatch_disconnect_event(struct socket *so, struct cfil_info *cfil_info, uint32_t 
kcunit, int outgoing) { errno_t error = 0; struct mbuf *msg = NULL; @@ -2231,7 +2593,7 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing) cfil_rw_lock_shared(&cfil_lck_rw); - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (outgoing) entrybuf = &entry->cfe_snd; else @@ -2242,7 +2604,7 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing) goto done; CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d", - (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing); + (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing); /* * Send the disconnection event once @@ -2270,6 +2632,12 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing) goto done; } +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, outgoing ? + "CFIL: LIFECYCLE: OUT - SENDING DISCONNECT UP": + "CFIL: LIFECYCLE: IN - SENDING DISCONNECT UP"); +#endif + bzero(&msg_disconnected, sizeof(struct cfil_msg_hdr)); msg_disconnected.cfm_len = sizeof(struct cfil_msg_hdr); msg_disconnected.cfm_version = CFM_VERSION_CURRENT; @@ -2288,7 +2656,7 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing) goto done; } microuptime(&entry->cfe_last_event); - CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_event, &so->so_cfil->cfi_first_event, msg_disconnected.cfm_op); + CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, msg_disconnected.cfm_op); /* Remember we have sent the disconnection message */ if (outgoing) { @@ -2321,7 +2689,7 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing) } int -cfil_dispatch_closed_event(struct socket *so, int kcunit) +cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int kcunit) { struct cfil_entry *entry; struct cfil_msg_sock_closed msg_closed; @@ -2332,13 +2700,13 @@ cfil_dispatch_closed_event(struct socket *so, int kcunit) 
cfil_rw_lock_shared(&cfil_lck_rw); - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; cfc = entry->cfe_filter; if (cfc == NULL) goto done; CFIL_LOG(LOG_INFO, "so %llx kcunit %d", - (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit); + (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit); /* Would be wasteful to try when flow controlled */ if (cfc->cf_flags & CFF_FLOW_CONTROLLED) { @@ -2354,7 +2722,7 @@ cfil_dispatch_closed_event(struct socket *so, int kcunit) goto done; microuptime(&entry->cfe_last_event); - CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_event, &so->so_cfil->cfi_first_event, CFM_OP_SOCKET_CLOSED); + CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, CFM_OP_SOCKET_CLOSED); bzero(&msg_closed, sizeof(struct cfil_msg_sock_closed)); msg_closed.cfc_msghdr.cfm_len = sizeof(struct cfil_msg_sock_closed); @@ -2362,13 +2730,15 @@ cfil_dispatch_closed_event(struct socket *so, int kcunit) msg_closed.cfc_msghdr.cfm_type = CFM_TYPE_EVENT; msg_closed.cfc_msghdr.cfm_op = CFM_OP_SOCKET_CLOSED; msg_closed.cfc_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id; - msg_closed.cfc_first_event.tv_sec = so->so_cfil->cfi_first_event.tv_sec; - msg_closed.cfc_first_event.tv_usec = so->so_cfil->cfi_first_event.tv_usec; - memcpy(msg_closed.cfc_op_time, so->so_cfil->cfi_op_time, sizeof(uint32_t)*CFI_MAX_TIME_LOG_ENTRY); - memcpy(msg_closed.cfc_op_list, so->so_cfil->cfi_op_list, sizeof(unsigned char)*CFI_MAX_TIME_LOG_ENTRY); - msg_closed.cfc_op_list_ctr = so->so_cfil->cfi_op_list_ctr; - - CFIL_LOG(LOG_INFO, "sock id %llu, op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, so->so_cfil->cfi_op_list_ctr, so->so_cfil->cfi_first_event.tv_sec, so->so_cfil->cfi_first_event.tv_usec); + msg_closed.cfc_first_event.tv_sec = cfil_info->cfi_first_event.tv_sec; + msg_closed.cfc_first_event.tv_usec = cfil_info->cfi_first_event.tv_usec; + memcpy(msg_closed.cfc_op_time, cfil_info->cfi_op_time, 
sizeof(uint32_t)*CFI_MAX_TIME_LOG_ENTRY); + memcpy(msg_closed.cfc_op_list, cfil_info->cfi_op_list, sizeof(unsigned char)*CFI_MAX_TIME_LOG_ENTRY); + msg_closed.cfc_op_list_ctr = cfil_info->cfi_op_list_ctr; + +#if LIFECYCLE_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec); +#endif /* for debugging if (msg_closed.cfc_op_list_ctr > CFI_MAX_TIME_LOG_ENTRY) { msg_closed.cfc_op_list_ctr = CFI_MAX_TIME_LOG_ENTRY; // just in case @@ -2441,9 +2811,45 @@ fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46, sin->sin_addr.s_addr = ip.s_addr; } +static void +cfil_get_flow_address_v6(struct cfil_hash_entry *entry, struct inpcb *inp, + struct in6_addr **laddr, struct in6_addr **faddr, + u_int16_t *lport, u_int16_t *fport) +{ + if (entry != NULL) { + *laddr = &entry->cfentry_laddr.addr6; + *faddr = &entry->cfentry_faddr.addr6; + *lport = entry->cfentry_lport; + *fport = entry->cfentry_fport; + } else { + *laddr = &inp->in6p_laddr; + *faddr = &inp->in6p_faddr; + *lport = inp->inp_lport; + *fport = inp->inp_fport; + } +} + +static void +cfil_get_flow_address(struct cfil_hash_entry *entry, struct inpcb *inp, + struct in_addr *laddr, struct in_addr *faddr, + u_int16_t *lport, u_int16_t *fport) +{ + if (entry != NULL) { + *laddr = entry->cfentry_laddr.addr46.ia46_addr4; + *faddr = entry->cfentry_faddr.addr46.ia46_addr4; + *lport = entry->cfentry_lport; + *fport = entry->cfentry_fport; + } else { + *laddr = inp->inp_laddr; + *faddr = inp->inp_faddr; + *lport = inp->inp_lport; + *fport = inp->inp_fport; + } +} + static int -cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing, - struct mbuf *data, unsigned int copyoffset, unsigned int copylen) +cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing, + struct mbuf *data, unsigned int copyoffset, 
unsigned int copylen) { errno_t error = 0; struct mbuf *copy = NULL; @@ -2459,7 +2865,7 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing, cfil_rw_lock_shared(&cfil_lck_rw); - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (outgoing) entrybuf = &entry->cfe_snd; else @@ -2469,6 +2875,12 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing, if (cfc == NULL) goto done; + data = cfil_data_start(data); + if (data == NULL || (data->m_flags & M_PKTHDR) == 0) { + CFIL_LOG(LOG_ERR, "NOT PKTHDR"); + goto done; + } + CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d", (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing); @@ -2522,33 +2934,36 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing, * parameters */ if (inp->inp_vflag & INP_IPV6) { + struct in6_addr *laddr = NULL, *faddr = NULL; + u_int16_t lport = 0, fport = 0; + + cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp, + &laddr, &faddr, &lport, &fport); if (outgoing) { - fill_ip6_sockaddr_4_6(&data_req->cfc_src, - &inp->in6p_laddr, inp->inp_lport); - fill_ip6_sockaddr_4_6(&data_req->cfc_dst, - &inp->in6p_faddr, inp->inp_fport); + fill_ip6_sockaddr_4_6(&data_req->cfc_src, laddr, lport); + fill_ip6_sockaddr_4_6(&data_req->cfc_dst, faddr, fport); } else { - fill_ip6_sockaddr_4_6(&data_req->cfc_src, - &inp->in6p_faddr, inp->inp_fport); - fill_ip6_sockaddr_4_6(&data_req->cfc_dst, - &inp->in6p_laddr, inp->inp_lport); + fill_ip6_sockaddr_4_6(&data_req->cfc_src, faddr, fport); + fill_ip6_sockaddr_4_6(&data_req->cfc_dst, laddr, lport); } } else if (inp->inp_vflag & INP_IPV4) { + struct in_addr laddr = {0}, faddr = {0}; + u_int16_t lport = 0, fport = 0; + + cfil_get_flow_address(cfil_info->cfi_hash_entry, inp, + &laddr, &faddr, &lport, &fport); + if (outgoing) { - fill_ip_sockaddr_4_6(&data_req->cfc_src, - inp->inp_laddr, inp->inp_lport); - fill_ip_sockaddr_4_6(&data_req->cfc_dst, - inp->inp_faddr, 
inp->inp_fport); + fill_ip_sockaddr_4_6(&data_req->cfc_src, laddr, lport); + fill_ip_sockaddr_4_6(&data_req->cfc_dst, faddr, fport); } else { - fill_ip_sockaddr_4_6(&data_req->cfc_src, - inp->inp_faddr, inp->inp_fport); - fill_ip_sockaddr_4_6(&data_req->cfc_dst, - inp->inp_laddr, inp->inp_lport); + fill_ip_sockaddr_4_6(&data_req->cfc_src, faddr, fport); + fill_ip_sockaddr_4_6(&data_req->cfc_dst, laddr, lport); } } microuptime(&tv); - CFI_ADD_TIME_LOG(so->so_cfil, &tv, &so->so_cfil->cfi_first_event, data_req->cfd_msghdr.cfm_op); + CFI_ADD_TIME_LOG(cfil_info, &tv, &cfil_info->cfi_first_event, data_req->cfd_msghdr.cfm_op); /* Pass the message to the content filter */ error = ctl_enqueuembuf(entry->cfe_filter->cf_kcref, @@ -2561,6 +2976,12 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing, } entry->cfe_flags &= ~CFEF_FLOW_CONTROLLED; OSIncrementAtomic(&cfil_stats.cfs_data_event_ok); + +#if VERDICT_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: VERDICT ACTION: so %llx sockID %llu outgoing %d: mbuf %llx copyoffset %u copylen %u", + (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen); +#endif + done: if (error == ENOBUFS) { entry->cfe_flags |= CFEF_FLOW_CONTROLLED; @@ -2586,7 +3007,7 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing, * Process the queue of data waiting to be delivered to content filter */ static int -cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) +cfil_data_service_ctl_q(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing) { errno_t error = 0; struct mbuf *data, *tmp = NULL; @@ -2595,7 +3016,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) struct cfe_buf *entrybuf; uint64_t currentoffset = 0; - if (so->so_cfil == NULL) + if (cfil_info == NULL) return (0); CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d", @@ -2603,7 +3024,7 @@ cfil_data_service_ctl_q(struct 
socket *so, uint32_t kcunit, int outgoing) socket_lock_assert_owned(so); - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (outgoing) entrybuf = &entry->cfe_snd; else @@ -2611,7 +3032,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) /* Send attached message if not yet done */ if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) { - error = cfil_dispatch_attach_event(so, kcunit); + error = cfil_dispatch_attach_event(so, cfil_info, kcunit); if (error != 0) { /* We can recover from flow control */ if (error == ENOBUFS || error == ENOMEM) @@ -2622,15 +3043,18 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) OSIncrementAtomic(&cfil_stats.cfs_ctl_q_not_started); goto done; } - CFIL_LOG(LOG_DEBUG, "pass_offset %llu peeked %llu peek_offset %llu", + +#if DATA_DEBUG + CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE CTL-Q: pass_offset %llu peeked %llu peek_offset %llu", entrybuf->cfe_pass_offset, entrybuf->cfe_peeked, entrybuf->cfe_peek_offset); +#endif /* Move all data that can pass */ while ((data = cfil_queue_first(&entrybuf->cfe_ctl_q)) != NULL && entrybuf->cfe_ctl_q.q_start < entrybuf->cfe_pass_offset) { - datalen = cfil_data_length(data, NULL); + datalen = cfil_data_length(data, NULL, NULL); tmp = data; if (entrybuf->cfe_ctl_q.q_start + datalen <= @@ -2648,15 +3072,17 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) } VERIFY(copylen <= datalen); +#if DATA_DEBUG CFIL_LOG(LOG_DEBUG, - "%llx first %llu peeked %llu pass %llu peek %llu" - "datalen %u copylen %u", - (uint64_t)VM_KERNEL_ADDRPERM(tmp), - entrybuf->cfe_ctl_q.q_start, - entrybuf->cfe_peeked, - entrybuf->cfe_pass_offset, - entrybuf->cfe_peek_offset, - datalen, copylen); + "CFIL: SERVICE CTL-Q PASSING: %llx first %llu peeked %llu pass %llu peek %llu" + "datalen %u copylen %u", + (uint64_t)VM_KERNEL_ADDRPERM(tmp), + entrybuf->cfe_ctl_q.q_start, + entrybuf->cfe_peeked, + 
entrybuf->cfe_pass_offset, + entrybuf->cfe_peek_offset, + datalen, copylen); +#endif /* * Data that passes has been peeked at explicitly or @@ -2683,7 +3109,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) OSAddAtomic64(datalen, &cfil_stats.cfs_pending_q_in_enqueued); } - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); if (tmp != NULL) CFIL_LOG(LOG_DEBUG, "%llx first %llu peeked %llu pass %llu peek %llu" @@ -2702,7 +3128,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) data != NULL && currentoffset < entrybuf->cfe_peek_offset; data = cfil_queue_next(&entrybuf->cfe_ctl_q, data), currentoffset += datalen) { - datalen = cfil_data_length(data, NULL); + datalen = cfil_data_length(data, NULL, NULL); tmp = data; /* We've already peeked at this mbuf */ @@ -2725,15 +3151,17 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) (currentoffset + copyoffset); } +#if DATA_DEBUG CFIL_LOG(LOG_DEBUG, - "%llx current %llu peeked %llu pass %llu peek %llu" - "datalen %u copylen %u copyoffset %u", - (uint64_t)VM_KERNEL_ADDRPERM(tmp), - currentoffset, - entrybuf->cfe_peeked, - entrybuf->cfe_pass_offset, - entrybuf->cfe_peek_offset, - datalen, copylen, copyoffset); + "CFIL: SERVICE CTL-Q PEEKING: %llx current %llu peeked %llu pass %llu peek %llu " + "datalen %u copylen %u copyoffset %u", + (uint64_t)VM_KERNEL_ADDRPERM(tmp), + currentoffset, + entrybuf->cfe_peeked, + entrybuf->cfe_pass_offset, + entrybuf->cfe_peek_offset, + datalen, copylen, copyoffset); +#endif /* * Stop if there is nothing more to peek at @@ -2743,7 +3171,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) /* * Let the filter get a peek at this span of data */ - error = cfil_dispatch_data_event(so, kcunit, + error = cfil_dispatch_data_event(so, cfil_info, kcunit, outgoing, data, copyoffset, copylen); if (error != 0) { /* On error, leave data in ctl_q */ @@ -2761,7 +3189,7 @@ 
cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) if (copylen + copyoffset < datalen) break; } - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); if (tmp != NULL) CFIL_LOG(LOG_DEBUG, "%llx first %llu peeked %llu pass %llu peek %llu" @@ -2776,7 +3204,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) /* * Process data that has passed the filter */ - error = cfil_service_pending_queue(so, kcunit, outgoing); + error = cfil_service_pending_queue(so, cfil_info, kcunit, outgoing); if (error != 0) { CFIL_LOG(LOG_ERR, "cfil_service_pending_queue() error %d", error); @@ -2786,16 +3214,16 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) /* * Dispatch disconnect events that could not be sent */ - if (so->so_cfil == NULL) + if (cfil_info == NULL) goto done; else if (outgoing) { - if ((so->so_cfil->cfi_flags & CFIF_SHUT_WR) && + if ((cfil_info->cfi_flags & CFIF_SHUT_WR) && !(entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT)) - cfil_dispatch_disconnect_event(so, kcunit, 1); + cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1); } else { - if ((so->so_cfil->cfi_flags & CFIF_SHUT_RD) && + if ((cfil_info->cfi_flags & CFIF_SHUT_RD) && !(entry->cfe_flags & CFEF_SENT_DISCONNECT_IN)) - cfil_dispatch_disconnect_event(so, kcunit, 0); + cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0); } done: @@ -2806,7 +3234,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) entrybuf->cfe_pass_offset, entrybuf->cfe_peek_offset); - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); return (error); } @@ -2816,7 +3244,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing) * Process data for a content filter installed on a socket */ int -cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing, +cfil_data_filter(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing, struct mbuf *data, uint64_t datalen) { errno_t 
error = 0; @@ -2828,7 +3256,7 @@ cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing, socket_lock_assert_owned(so); - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (outgoing) entrybuf = &entry->cfe_snd; else @@ -2849,7 +3277,7 @@ cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing, OSAddAtomic64(datalen, &cfil_stats.cfs_ctl_q_in_enqueued); - error = cfil_data_service_ctl_q(so, kcunit, outgoing); + error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing); if (error != 0) { CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d", error); @@ -2860,7 +3288,7 @@ cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing, */ error = EJUSTRETURN; done: - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); CFIL_LOG(LOG_INFO, "return %d", error); return (error); @@ -2871,103 +3299,84 @@ cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing, * content filters */ static int -cfil_service_inject_queue(struct socket *so, int outgoing) +cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int outgoing) { mbuf_t data; unsigned int datalen; - int mbcnt; - unsigned int copylen; + int mbcnt = 0; + int mbnum = 0; errno_t error = 0; - struct mbuf *copy = NULL; struct cfi_buf *cfi_buf; struct cfil_queue *inject_q; int need_rwakeup = 0; + int count = 0; - if (so->so_cfil == NULL) + if (cfil_info == NULL) return (0); - CFIL_LOG(LOG_INFO, "so %llx outgoing %d", - (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing); - socket_lock_assert_owned(so); if (outgoing) { - cfi_buf = &so->so_cfil->cfi_snd; - so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_OUT; + cfi_buf = &cfil_info->cfi_snd; + cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_OUT; } else { - cfi_buf = &so->so_cfil->cfi_rcv; - so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_IN; + cfi_buf = &cfil_info->cfi_rcv; + cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_IN; } inject_q = &cfi_buf->cfi_inject_q; - while ((data = 
cfil_queue_first(inject_q)) != NULL) { - datalen = cfil_data_length(data, &mbcnt); - - CFIL_LOG(LOG_INFO, "data %llx datalen %u", - (uint64_t)VM_KERNEL_ADDRPERM(data), datalen); - - /* Make a copy in case of injection error */ - copy = m_copym_mode(data, 0, M_COPYALL, M_DONTWAIT, - M_COPYM_COPY_HDR); - if (copy == NULL) { - CFIL_LOG(LOG_ERR, "m_copym_mode() failed"); - error = ENOMEM; - break; - } + if (cfil_queue_empty(inject_q)) + return (0); - if ((copylen = m_length(copy)) != datalen) - panic("%s so %p copylen %d != datalen %d", - __func__, so, copylen, datalen); +#if DATA_DEBUG | VERDICT_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: outgoing %d queue len %llu", + (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, cfil_queue_len(inject_q)); +#endif - if (outgoing) { - socket_unlock(so, 0); + while ((data = cfil_queue_first(inject_q)) != NULL) { + datalen = cfil_data_length(data, &mbcnt, &mbnum); - /* - * Set both DONTWAIT and NBIO flags are we really - * do not want to block - */ - error = sosend(so, NULL, NULL, - copy, NULL, - MSG_SKIPCFIL | MSG_DONTWAIT | MSG_NBIO); +#if DATA_DEBUG + CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE INJECT-Q: <%s>: data %llx datalen %u (mbcnt %u)", + remote_addr_ptr ? "UNCONNECTED" : "CONNECTED", + (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt); +#endif - socket_lock(so, 0); + /* Remove data from queue and adjust stats */ + cfil_queue_remove(inject_q, data, datalen); + cfi_buf->cfi_pending_first += datalen; + cfi_buf->cfi_pending_mbcnt -= mbcnt; + cfi_buf->cfi_pending_mbnum -= mbnum; + cfil_info_buf_verify(cfi_buf); + if (outgoing) { + error = sosend_reinject(so, NULL, data, NULL, 0); if (error != 0) { - CFIL_LOG(LOG_ERR, "sosend() failed %d", - error); +#if DATA_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: Error: sosend_reinject() failed"); + CFIL_LOG(LOG_ERR, "### sosend() failed %d", error); +#endif + break; } + // At least one injection succeeded, need to wake up pending threads. 
+ need_rwakeup = 1; } else { - copy->m_flags |= M_SKIPCFIL; + data->m_flags |= M_SKIPCFIL; /* - * NOTE: - * This work only because we support plain TCP - * For UDP, RAWIP, MPTCP and message TCP we'll + * NOTE: We currently only support TCP and UDP. + * For RAWIP, MPTCP and message TCP we'll * need to call the appropriate sbappendxxx() * of fix sock_inject_data_in() */ - if (sbappendstream(&so->so_rcv, copy)) - need_rwakeup = 1; - } - - /* Need to reassess if filter is still attached after unlock */ - if (so->so_cfil == NULL) { - CFIL_LOG(LOG_ERR, "so %llx cfil detached", - (uint64_t)VM_KERNEL_ADDRPERM(so)); - OSIncrementAtomic(&cfil_stats.cfs_inject_q_detached); - error = 0; - break; + if (IS_UDP(so) == TRUE) { + if (sbappendchain(&so->so_rcv, data, 0)) + need_rwakeup = 1; + } else { + if (sbappendstream(&so->so_rcv, data)) + need_rwakeup = 1; + } } - if (error != 0) - break; - - /* Injection successful */ - cfil_queue_remove(inject_q, data, datalen); - mbuf_freem(data); - - cfi_buf->cfi_pending_first += datalen; - cfi_buf->cfi_pending_mbcnt -= mbcnt; - cfil_info_buf_verify(cfi_buf); if (outgoing) OSAddAtomic64(datalen, @@ -2975,23 +3384,34 @@ cfil_service_inject_queue(struct socket *so, int outgoing) else OSAddAtomic64(datalen, &cfil_stats.cfs_inject_q_in_passed); + + count++; } +#if DATA_DEBUG | VERDICT_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: injected %d", + (uint64_t)VM_KERNEL_ADDRPERM(so), count); +#endif + /* A single wakeup is for several packets is more efficient */ - if (need_rwakeup) - sorwakeup(so); + if (need_rwakeup) { + if (outgoing == TRUE) + sowwakeup(so); + else + sorwakeup(so); + } - if (error != 0 && so->so_cfil) { + if (error != 0 && cfil_info) { if (error == ENOBUFS) OSIncrementAtomic(&cfil_stats.cfs_inject_q_nobufs); if (error == ENOMEM) OSIncrementAtomic(&cfil_stats.cfs_inject_q_nomem); if (outgoing) { - so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_OUT; + cfil_info->cfi_flags |= CFIF_RETRY_INJECT_OUT; 
OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_fail); } else { - so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_IN; + cfil_info->cfi_flags |= CFIF_RETRY_INJECT_IN; OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_fail); } } @@ -2999,26 +3419,26 @@ cfil_service_inject_queue(struct socket *so, int outgoing) /* * Notify */ - if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_SHUT_WR)) { + if (cfil_info && (cfil_info->cfi_flags & CFIF_SHUT_WR)) { cfil_sock_notify_shutdown(so, SHUT_WR); if (cfil_sock_data_pending(&so->so_snd) == 0) soshutdownlock_final(so, SHUT_WR); } - if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) { + if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) { if (cfil_filters_attached(so) == 0) { CFIL_LOG(LOG_INFO, "so %llx waking", (uint64_t)VM_KERNEL_ADDRPERM(so)); - wakeup((caddr_t)&so->so_cfil); + wakeup((caddr_t)cfil_info); } } - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); return (error); } static int -cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing) +cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing) { uint64_t passlen, curlen; mbuf_t data; @@ -3033,7 +3453,7 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing) socket_lock_assert_owned(so); - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (outgoing) entrybuf = &entry->cfe_snd; else @@ -3049,12 +3469,14 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing) */ curlen = 0; while ((data = cfil_queue_first(pending_q)) != NULL) { - datalen = cfil_data_length(data, NULL); + datalen = cfil_data_length(data, NULL, NULL); - CFIL_LOG(LOG_INFO, - "data %llx datalen %u passlen %llu curlen %llu", +#if DATA_DEBUG + CFIL_LOG(LOG_DEBUG, + "CFIL: SERVICE PENDING-Q: data %llx datalen %u passlen %llu curlen %llu", (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, passlen, curlen); +#endif if (curlen + datalen 
> passlen) break; @@ -3066,7 +3488,7 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing) for (kcunit += 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { - error = cfil_data_filter(so, kcunit, outgoing, + error = cfil_data_filter(so, cfil_info, kcunit, outgoing, data, datalen); /* 0 means passed so we can continue */ if (error != 0) @@ -3076,13 +3498,13 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing) if (error == 0) { if (outgoing) { cfil_queue_enqueue( - &so->so_cfil->cfi_snd.cfi_inject_q, + &cfil_info->cfi_snd.cfi_inject_q, data, datalen); OSAddAtomic64(datalen, &cfil_stats.cfs_inject_q_out_enqueued); } else { cfil_queue_enqueue( - &so->so_cfil->cfi_rcv.cfi_inject_q, + &cfil_info->cfi_rcv.cfi_inject_q, data, datalen); OSAddAtomic64(datalen, &cfil_stats.cfs_inject_q_in_enqueued); @@ -3090,13 +3512,13 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing) } } - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); return (error); } int -cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing, +cfil_update_data_offsets(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing, uint64_t pass_offset, uint64_t peek_offset) { errno_t error = 0; @@ -3108,19 +3530,19 @@ cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing, socket_lock_assert_owned(so); - if (so->so_cfil == NULL) { + if (cfil_info == NULL) { CFIL_LOG(LOG_ERR, "so %llx cfil detached", (uint64_t)VM_KERNEL_ADDRPERM(so)); error = 0; goto done; - } else if (so->so_cfil->cfi_flags & CFIF_DROP) { + } else if (cfil_info->cfi_flags & CFIF_DROP) { CFIL_LOG(LOG_ERR, "so %llx drop set", (uint64_t)VM_KERNEL_ADDRPERM(so)); error = EPIPE; goto done; } - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; if (outgoing) entrybuf = &entry->cfe_snd; else @@ -3148,7 +3570,7 @@ cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int 
outgoing, goto done; /* Move data held in control queue to pending queue if needed */ - error = cfil_data_service_ctl_q(so, kcunit, outgoing); + error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing); if (error != 0) { CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d", error); @@ -3165,20 +3587,28 @@ cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing, if (entry != NULL && ((entry->cfe_snd.cfe_pass_offset == CFM_MAX_OFFSET && entry->cfe_rcv.cfe_pass_offset == CFM_MAX_OFFSET) || - ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) && + ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) && cfil_queue_empty(&entry->cfe_snd.cfe_ctl_q) && cfil_queue_empty(&entry->cfe_rcv.cfe_ctl_q)))) { entry->cfe_flags |= CFEF_CFIL_DETACHED; +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, outgoing ? + "CFIL: LIFECYCLE: OUT - PASSED ALL - DETACH": + "CFIL: LIFECYCLE: IN - PASSED ALL - DETACH"); +#endif CFIL_LOG(LOG_INFO, "so %llx detached %u", (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit); - if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) && + if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) && cfil_filters_attached(so) == 0) { +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAKING"); +#endif CFIL_LOG(LOG_INFO, "so %llx waking", (uint64_t)VM_KERNEL_ADDRPERM(so)); - wakeup((caddr_t)&so->so_cfil); + wakeup((caddr_t)cfil_info); } } - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); CFIL_LOG(LOG_INFO, "return %d", error); return (error); } @@ -3187,7 +3617,7 @@ cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing, * Update pass offset for socket when no data is pending */ static int -cfil_set_socket_pass_offset(struct socket *so, int outgoing) +cfil_set_socket_pass_offset(struct socket *so, struct cfil_info *cfil_info, int outgoing) { struct cfi_buf *cfi_buf; struct cfil_entry *entry; @@ -3195,7 +3625,7 @@ cfil_set_socket_pass_offset(struct socket *so, int outgoing) uint32_t kcunit; uint64_t pass_offset = 0; - 
if (so->so_cfil == NULL) + if (cfil_info == NULL) return (0); CFIL_LOG(LOG_INFO, "so %llx outgoing %d", @@ -3204,13 +3634,17 @@ cfil_set_socket_pass_offset(struct socket *so, int outgoing) socket_lock_assert_owned(so); if (outgoing) - cfi_buf = &so->so_cfil->cfi_snd; + cfi_buf = &cfil_info->cfi_snd; else - cfi_buf = &so->so_cfil->cfi_rcv; + cfi_buf = &cfil_info->cfi_rcv; + + CFIL_LOG(LOG_DEBUG, "CFIL: outgoing %d cfi_pending_first %llu cfi_pending_last %llu", + (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, + cfi_buf->cfi_pending_first, cfi_buf->cfi_pending_last); if (cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first == 0) { for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; /* Are we attached to a filter? */ if (entry->cfe_filter == NULL) @@ -3228,11 +3662,14 @@ cfil_set_socket_pass_offset(struct socket *so, int outgoing) cfi_buf->cfi_pass_offset = pass_offset; } + CFIL_LOG(LOG_DEBUG, "CFIL: , cfi_pass_offset %llu", + (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, cfi_buf->cfi_pass_offset); + return (0); } int -cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing, +cfil_action_data_pass(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing, uint64_t pass_offset, uint64_t peek_offset) { errno_t error = 0; @@ -3241,7 +3678,7 @@ cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing, socket_lock_assert_owned(so); - error = cfil_acquire_sockbuf(so, outgoing); + error = cfil_acquire_sockbuf(so, cfil_info, outgoing); if (error != 0) { CFIL_LOG(LOG_INFO, "so %llx %s dropped", (uint64_t)VM_KERNEL_ADDRPERM(so), @@ -3249,14 +3686,14 @@ cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing, goto release; } - error = cfil_update_data_offsets(so, kcunit, outgoing, + error = cfil_update_data_offsets(so, cfil_info, kcunit, outgoing, pass_offset, peek_offset); - 
cfil_service_inject_queue(so, outgoing); + cfil_service_inject_queue(so, cfil_info, outgoing); - cfil_set_socket_pass_offset(so, outgoing); + cfil_set_socket_pass_offset(so, cfil_info, outgoing); release: - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); cfil_release_sockbuf(so, outgoing); return (error); @@ -3264,13 +3701,13 @@ cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing, static void -cfil_flush_queues(struct socket *so) +cfil_flush_queues(struct socket *so, struct cfil_info *cfil_info) { struct cfil_entry *entry; int kcunit; uint64_t drained; - if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL) goto done; socket_lock_assert_owned(so); @@ -3279,19 +3716,19 @@ cfil_flush_queues(struct socket *so) * Flush the output queues and ignore errors as long as * we are attached */ - (void) cfil_acquire_sockbuf(so, 1); - if (so->so_cfil != NULL) { + (void) cfil_acquire_sockbuf(so, cfil_info, 1); + if (cfil_info != NULL) { drained = 0; for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q); - drained += cfil_queue_drain( - &entry->cfe_snd.cfe_pending_q); + drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q); } - drained += cfil_queue_drain(&so->so_cfil->cfi_snd.cfi_inject_q); + drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q); + if (drained) { - if (so->so_cfil->cfi_flags & CFIF_DROP) + if (cfil_info->cfi_flags & CFIF_DROP) OSIncrementAtomic( &cfil_stats.cfs_flush_out_drop); else @@ -3304,20 +3741,21 @@ cfil_flush_queues(struct socket *so) /* * Flush the input queues */ - (void) cfil_acquire_sockbuf(so, 0); - if (so->so_cfil != NULL) { + (void) cfil_acquire_sockbuf(so, cfil_info, 0); + if (cfil_info != NULL) { drained = 0; for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) 
{ - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; drained += cfil_queue_drain( &entry->cfe_rcv.cfe_ctl_q); drained += cfil_queue_drain( &entry->cfe_rcv.cfe_pending_q); } - drained += cfil_queue_drain(&so->so_cfil->cfi_rcv.cfi_inject_q); + drained += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q); + if (drained) { - if (so->so_cfil->cfi_flags & CFIF_DROP) + if (cfil_info->cfi_flags & CFIF_DROP) OSIncrementAtomic( &cfil_stats.cfs_flush_in_drop); else @@ -3327,28 +3765,28 @@ cfil_flush_queues(struct socket *so) } cfil_release_sockbuf(so, 0); done: - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); } int -cfil_action_drop(struct socket *so, uint32_t kcunit) +cfil_action_drop(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit) { errno_t error = 0; struct cfil_entry *entry; struct proc *p; - if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL) goto done; socket_lock_assert_owned(so); - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; /* Are we attached to the filter? 
*/ if (entry->cfe_filter == NULL) goto done; - so->so_cfil->cfi_flags |= CFIF_DROP; + cfil_info->cfi_flags |= CFIF_DROP; p = current_proc(); @@ -3356,28 +3794,33 @@ cfil_action_drop(struct socket *so, uint32_t kcunit) * Force the socket to be marked defunct * (forcing fixed along with rdar://19391339) */ - error = sosetdefunct(p, so, - SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, - FALSE); + if (so->so_cfil_db == NULL) { + error = sosetdefunct(p, so, + SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, + FALSE); - /* Flush the socket buffer and disconnect */ - if (error == 0) - error = sodefunct(p, so, - SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); + /* Flush the socket buffer and disconnect */ + if (error == 0) + error = sodefunct(p, so, + SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL); + } /* The filter is done, mark as detached */ entry->cfe_flags |= CFEF_CFIL_DETACHED; +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: DROP - DETACH"); +#endif CFIL_LOG(LOG_INFO, "so %llx detached %u", (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit); /* Pending data needs to go */ - cfil_flush_queues(so); + cfil_flush_queues(so, cfil_info); - if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) { + if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) { if (cfil_filters_attached(so) == 0) { CFIL_LOG(LOG_INFO, "so %llx waking", (uint64_t)VM_KERNEL_ADDRPERM(so)); - wakeup((caddr_t)&so->so_cfil); + wakeup((caddr_t)cfil_info); } } done: @@ -3388,33 +3831,42 @@ int cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr) { errno_t error = 0; - - cfil_rw_lock_exclusive(&cfil_lck_rw); + struct cfil_info *cfil_info = NULL; bool cfil_attached = false; struct cfil_msg_bless_client *blessmsg = (struct cfil_msg_bless_client *)msghdr; + + // Search and lock socket struct socket *so = 
cfil_socket_from_client_uuid(blessmsg->cfb_client_uuid, &cfil_attached); if (so == NULL) { error = ENOENT; } else { // The client gets a pass automatically - socket_lock(so, 1); + cfil_info = (so->so_cfil_db != NULL) ? + cfil_db_get_cfil_info(so->so_cfil_db, msghdr->cfm_sock_id) : so->so_cfil; + if (cfil_attached) { - (void)cfil_action_data_pass(so, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET); - (void)cfil_action_data_pass(so, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET); +#if VERDICT_DEBUG + if (cfil_info != NULL) { + CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: BLESS %s ", + cfil_info->cfi_hash_entry ? "UDP" : "TCP", + (uint64_t)VM_KERNEL_ADDRPERM(so), + cfil_info->cfi_sock_id); + } +#endif + (void)cfil_action_data_pass(so, cfil_info, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET); + (void)cfil_action_data_pass(so, cfil_info, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET); } else { so->so_flags1 |= SOF1_CONTENT_FILTER_SKIP; } socket_unlock(so, 1); } - cfil_rw_unlock_exclusive(&cfil_lck_rw); - return (error); } static int -cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen) +cfil_update_entry_offsets(struct socket *so, struct cfil_info *cfil_info, int outgoing, unsigned int datalen) { struct cfil_entry *entry; struct cfe_buf *entrybuf; @@ -3424,7 +3876,7 @@ cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen) (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, datalen); for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { - entry = &so->so_cfil->cfi_entries[kcunit - 1]; + entry = &cfil_info->cfi_entries[kcunit - 1]; /* Are we attached to the filter? 
*/ if (entry->cfe_filter == NULL) @@ -3446,62 +3898,94 @@ cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen) entrybuf->cfe_pending_q.q_start += datalen; entrybuf->cfe_pending_q.q_end += datalen; } - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); return (0); } int -cfil_data_common(struct socket *so, int outgoing, struct sockaddr *to, +cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, struct sockaddr *to, struct mbuf *data, struct mbuf *control, uint32_t flags) { #pragma unused(to, control, flags) errno_t error = 0; unsigned int datalen; - int mbcnt; + int mbcnt = 0; + int mbnum = 0; int kcunit; struct cfi_buf *cfi_buf; + struct mbuf *chain = NULL; - if (so->so_cfil == NULL) { + if (cfil_info == NULL) { CFIL_LOG(LOG_ERR, "so %llx cfil detached", (uint64_t)VM_KERNEL_ADDRPERM(so)); error = 0; goto done; - } else if (so->so_cfil->cfi_flags & CFIF_DROP) { + } else if (cfil_info->cfi_flags & CFIF_DROP) { CFIL_LOG(LOG_ERR, "so %llx drop set", (uint64_t)VM_KERNEL_ADDRPERM(so)); error = EPIPE; goto done; } - datalen = cfil_data_length(data, &mbcnt); - - CFIL_LOG(LOG_INFO, "so %llx %s m %llx len %u flags 0x%x nextpkt %llx", - (uint64_t)VM_KERNEL_ADDRPERM(so), - outgoing ? 
"out" : "in", - (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags, - (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt)); + datalen = cfil_data_length(data, &mbcnt, &mbnum); if (outgoing) - cfi_buf = &so->so_cfil->cfi_snd; + cfi_buf = &cfil_info->cfi_snd; else - cfi_buf = &so->so_cfil->cfi_rcv; + cfi_buf = &cfil_info->cfi_rcv; cfi_buf->cfi_pending_last += datalen; cfi_buf->cfi_pending_mbcnt += mbcnt; + cfi_buf->cfi_pending_mbnum += mbnum; + + if (IS_UDP(so)) { + if (cfi_buf->cfi_pending_mbnum > cfil_udp_gc_mbuf_num_max || + cfi_buf->cfi_pending_mbcnt > cfil_udp_gc_mbuf_cnt_max) { + cfi_buf->cfi_tail_drop_cnt++; + cfi_buf->cfi_pending_mbcnt -= mbcnt; + cfi_buf->cfi_pending_mbnum -= mbnum; + return (EPIPE); + } + } + cfil_info_buf_verify(cfi_buf); - CFIL_LOG(LOG_INFO, "so %llx cfi_pending_last %llu cfi_pass_offset %llu", - (uint64_t)VM_KERNEL_ADDRPERM(so), - cfi_buf->cfi_pending_last, - cfi_buf->cfi_pass_offset); +#if DATA_DEBUG + CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: %s: data %llx len %u flags 0x%x nextpkt %llx - cfi_pending_last %llu cfi_pending_mbcnt %u cfi_pass_offset %llu", + (uint64_t)VM_KERNEL_ADDRPERM(so), + outgoing ? "OUT" : "IN", + (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags, + (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt), + cfi_buf->cfi_pending_last, + cfi_buf->cfi_pending_mbcnt, + cfi_buf->cfi_pass_offset); +#endif /* Fast path when below pass offset */ if (cfi_buf->cfi_pending_last <= cfi_buf->cfi_pass_offset) { - cfil_update_entry_offsets(so, outgoing, datalen); + cfil_update_entry_offsets(so, cfil_info, outgoing, datalen); +#if DATA_DEBUG + CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: FAST PATH"); +#endif } else { for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { - error = cfil_data_filter(so, kcunit, outgoing, data, - datalen); + // Is cfil attached to this filter? 
+ if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) { + if (IS_UDP(so)) { + /* UDP only: + * Chain addr (incoming only TDB), control (optional) and data into one chain. + * This full chain will be reinjected into socket after recieving verdict. + */ + (void) cfil_udp_save_socket_state(cfil_info, data); + chain = sbconcat_mbufs(NULL, outgoing ? NULL : to, data, control); + if (chain == NULL) { + return (ENOBUFS); + } + data = chain; + } + error = cfil_data_filter(so, cfil_info, kcunit, outgoing, data, + datalen); + } /* 0 means passed so continue with next filter */ if (error != 0) break; @@ -3512,10 +3996,11 @@ cfil_data_common(struct socket *so, int outgoing, struct sockaddr *to, if (error == 0) { cfi_buf->cfi_pending_first += datalen; cfi_buf->cfi_pending_mbcnt -= mbcnt; + cfi_buf->cfi_pending_mbnum -= mbnum; cfil_info_buf_verify(cfi_buf); } done: - CFIL_INFO_VERIFY(so->so_cfil); + CFIL_INFO_VERIFY(cfil_info); return (error); } @@ -3528,6 +4013,10 @@ cfil_sock_data_out(struct socket *so, struct sockaddr *to, struct mbuf *data, struct mbuf *control, uint32_t flags) { int error = 0; + + if (IS_UDP(so)) { + return (cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags)); + } if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) return (0); @@ -3556,7 +4045,7 @@ cfil_sock_data_out(struct socket *so, struct sockaddr *to, panic("%s sb_cfil_thread %p not NULL", __func__, so->so_snd.sb_cfil_thread); - error = cfil_data_common(so, 1, to, data, control, flags); + error = cfil_data_common(so, so->so_cfil, 1, to, data, control, flags); return (error); } @@ -3570,6 +4059,10 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from, { int error = 0; + if (IS_UDP(so)) { + return (cfil_sock_udp_handle_data(FALSE, so, NULL, from, data, control, flags)); + } + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) return (0); @@ -3590,7 +4083,7 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from, (uint64_t)VM_KERNEL_ADDRPERM(so)); 
OSIncrementAtomic(&cfil_stats.cfs_data_in_oob); } - error = cfil_data_common(so, 0, from, data, control, flags); + error = cfil_data_common(so, so->so_cfil, 0, from, data, control, flags); return (error); } @@ -3608,6 +4101,10 @@ cfil_sock_shutdown(struct socket *so, int *how) { int error = 0; + if (IS_UDP(so)) { + return (cfil_sock_udp_shutdown(so, how)); + } + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) goto done; @@ -3689,6 +4186,11 @@ cfil_sock_is_closed(struct socket *so) errno_t error = 0; int kcunit; + if (IS_UDP(so)) { + cfil_sock_udp_is_closed(so); + return; + } + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) return; @@ -3698,19 +4200,19 @@ cfil_sock_is_closed(struct socket *so) for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { /* Let the filters know of the closing */ - error = cfil_dispatch_closed_event(so, kcunit); + error = cfil_dispatch_closed_event(so, so->so_cfil, kcunit); } /* Last chance to push passed data out */ - error = cfil_acquire_sockbuf(so, 1); + error = cfil_acquire_sockbuf(so, so->so_cfil, 1); if (error == 0) - cfil_service_inject_queue(so, 1); + cfil_service_inject_queue(so, so->so_cfil, 1); cfil_release_sockbuf(so, 1); so->so_cfil->cfi_flags |= CFIF_SOCK_CLOSED; /* Pending data needs to go */ - cfil_flush_queues(so); + cfil_flush_queues(so, so->so_cfil); CFIL_INFO_VERIFY(so->so_cfil); } @@ -3727,6 +4229,11 @@ cfil_sock_notify_shutdown(struct socket *so, int how) errno_t error = 0; int kcunit; + if (IS_UDP(so)) { + cfil_sock_udp_notify_shutdown(so, how, 0, 0); + return; + } + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) return; @@ -3738,10 +4245,10 @@ cfil_sock_notify_shutdown(struct socket *so, int how) for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { /* Disconnect incoming side */ if (how != SHUT_WR) - error = cfil_dispatch_disconnect_event(so, kcunit, 0); + error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 0); /* Disconnect 
outgoing side */ if (how != SHUT_RD) - error = cfil_dispatch_disconnect_event(so, kcunit, 1); + error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 1); } } @@ -3752,6 +4259,10 @@ cfil_filters_attached(struct socket *so) uint32_t kcunit; int attached = 0; + if (IS_UDP(so)) { + return cfil_filters_udp_attached(so, FALSE); + } + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) return (0); @@ -3785,6 +4296,11 @@ cfil_sock_close_wait(struct socket *so) struct timespec ts; int error; + if (IS_UDP(so)) { + cfil_sock_udp_close_wait(so); + return; + } + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) return; @@ -3818,7 +4334,7 @@ cfil_sock_close_wait(struct socket *so) OSIncrementAtomic(&cfil_stats.cfs_close_wait); so->so_cfil->cfi_flags |= CFIF_CLOSE_WAIT; - error = msleep((caddr_t)&so->so_cfil, mutex_held, + error = msleep((caddr_t)so->so_cfil, mutex_held, PSOCK | PCATCH, "cfil_sock_close_wait", &ts); so->so_cfil->cfi_flags &= ~CFIF_CLOSE_WAIT; @@ -3845,6 +4361,10 @@ cfil_sock_data_pending(struct sockbuf *sb) struct socket *so = sb->sb_so; uint64_t pending = 0; + if (IS_UDP(so)) { + return (cfil_sock_udp_data_pending(sb, FALSE)); + } + if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL) { struct cfi_buf *cfi_buf; @@ -3881,6 +4401,10 @@ cfil_sock_data_space(struct sockbuf *sb) struct socket *so = sb->sb_so; uint64_t pending = 0; + if (IS_UDP(so)) { + return (cfil_sock_udp_data_pending(sb, TRUE)); + } + if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL && so->so_snd.sb_cfil_thread != current_thread()) { struct cfi_buf *cfi_buf; @@ -3920,6 +4444,11 @@ cfil_sock_buf_update(struct sockbuf *sb) int error; struct socket *so = sb->sb_so; + if (IS_UDP(so)) { + cfil_sock_udp_buf_update(sb); + return; + } + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL) return; @@ -3943,9 +4472,9 @@ cfil_sock_buf_update(struct sockbuf *sb) CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d", 
(uint64_t)VM_KERNEL_ADDRPERM(so), outgoing); - error = cfil_acquire_sockbuf(so, outgoing); + error = cfil_acquire_sockbuf(so, so->so_cfil, outgoing); if (error == 0) - cfil_service_inject_queue(so, outgoing); + cfil_service_inject_queue(so, so->so_cfil, outgoing); cfil_release_sockbuf(so, outgoing); } @@ -3995,6 +4524,14 @@ sysctl_cfil_filter_list(struct sysctl_oid *oidp, void *arg1, int arg2, cfil_rw_unlock_shared(&cfil_lck_rw); +#if SHOW_DEBUG + if (req->oldptr != USER_ADDR_NULL) { + for (i = 1; content_filters != NULL && i <= MAX_CONTENT_FILTER; i++) { + cfil_filter_show(i); + } + } +#endif + return (error); } @@ -4046,6 +4583,10 @@ static int sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2, memcpy(stat.cfs_e_uuid, so->last_uuid, sizeof(uuid_t)); } + + stat.cfs_sock_family = so->so_proto->pr_domain->dom_family; + stat.cfs_sock_type = so->so_proto->pr_type; + stat.cfs_sock_protocol = so->so_proto->pr_protocol; } stat.cfs_snd.cbs_pending_first = @@ -4128,5 +4669,1388 @@ static int sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2, done: cfil_rw_unlock_shared(&cfil_lck_rw); +#if SHOW_DEBUG + if (req->oldptr != USER_ADDR_NULL) { + cfil_info_show(); + } +#endif + + return (error); +} + +/* + * UDP Socket Support + */ +static void +cfil_hash_entry_log(int level, struct socket *so, struct cfil_hash_entry *entry, uint64_t sockId, const char* msg) +{ + char local[MAX_IPv6_STR_LEN+6]; + char remote[MAX_IPv6_STR_LEN+6]; + const void *addr; + + // No sock or not UDP, no-op + if (so == NULL || entry == NULL) { + return; + } + + local[0] = remote[0] = 0x0; + + switch (entry->cfentry_family) { + case AF_INET6: + addr = &entry->cfentry_laddr.addr6; + inet_ntop(AF_INET6, addr, local, sizeof(local)); + addr = &entry->cfentry_faddr.addr6; + inet_ntop(AF_INET6, addr, remote, sizeof(local)); + break; + case AF_INET: + addr = &entry->cfentry_laddr.addr46.ia46_addr4.s_addr; + inet_ntop(AF_INET, addr, local, sizeof(local)); + addr = 
&entry->cfentry_faddr.addr46.ia46_addr4.s_addr; + inet_ntop(AF_INET, addr, remote, sizeof(local)); + break; + default: + return; + } + + CFIL_LOG(level, "<%s>: lport %d fport %d laddr %s faddr %s", + msg, + (uint64_t)VM_KERNEL_ADDRPERM(so), entry, sockId, + ntohs(entry->cfentry_lport), ntohs(entry->cfentry_fport), local, remote); +} + +static void +cfil_inp_log(int level, struct socket *so, const char* msg) +{ + struct inpcb *inp = NULL; + char local[MAX_IPv6_STR_LEN+6]; + char remote[MAX_IPv6_STR_LEN+6]; + const void *addr; + + if (so == NULL) { + return; + } + + inp = sotoinpcb(so); + if (inp == NULL) { + return; + } + + local[0] = remote[0] = 0x0; + +#if INET6 + if (inp->inp_vflag & INP_IPV6) { + addr = &inp->in6p_laddr.s6_addr32; + inet_ntop(AF_INET6, addr, local, sizeof(local)); + addr = &inp->in6p_faddr.s6_addr32; + inet_ntop(AF_INET6, addr, remote, sizeof(local)); + } else +#endif /* INET6 */ + { + addr = &inp->inp_laddr.s_addr; + inet_ntop(AF_INET, addr, local, sizeof(local)); + addr = &inp->inp_faddr.s_addr; + inet_ntop(AF_INET, addr, remote, sizeof(local)); + } + + if (so->so_cfil != NULL) + CFIL_LOG(level, "<%s>: <%s so %llx - flags 0x%x 0x%x, sockID %llu> lport %d fport %d laddr %s faddr %s", + msg, IS_UDP(so) ? "UDP" : "TCP", + (uint64_t)VM_KERNEL_ADDRPERM(so), inp->inp_flags, inp->inp_socket->so_flags, so->so_cfil->cfi_sock_id, + ntohs(inp->inp_lport), ntohs(inp->inp_fport), local, remote); + else + CFIL_LOG(level, "<%s>: <%s so %llx - flags 0x%x 0x%x> lport %d fport %d laddr %s faddr %s", + msg, IS_UDP(so) ? 
"UDP" : "TCP", + (uint64_t)VM_KERNEL_ADDRPERM(so), inp->inp_flags, inp->inp_socket->so_flags, + ntohs(inp->inp_lport), ntohs(inp->inp_fport), local, remote); +} + +static void +cfil_info_log(int level, struct cfil_info *cfil_info, const char* msg) +{ + if (cfil_info == NULL) + return; + + if (cfil_info->cfi_hash_entry != NULL) + cfil_hash_entry_log(level, cfil_info->cfi_so, cfil_info->cfi_hash_entry, cfil_info->cfi_sock_id, msg); + else + cfil_inp_log(level, cfil_info->cfi_so, msg); +} + +errno_t +cfil_db_init(struct socket *so) +{ + errno_t error = 0; + struct cfil_db *db = NULL; + + CFIL_LOG(LOG_INFO, ""); + + db = zalloc(cfil_db_zone); + if (db == NULL) { + error = ENOMEM; + goto done; + } + bzero(db, sizeof(struct cfil_db)); + db->cfdb_so = so; + db->cfdb_hashbase = hashinit(CFILHASHSIZE, M_CFIL, &db->cfdb_hashmask); + if (db->cfdb_hashbase == NULL) { + zfree(cfil_db_zone, db); + db = NULL; + error = ENOMEM; + goto done; + } + + so->so_cfil_db = db; + +done: + return (error); +} + +void +cfil_db_free(struct socket *so) +{ + struct cfil_hash_entry *entry = NULL; + struct cfil_hash_entry *temp_entry = NULL; + struct cfilhashhead *cfilhash = NULL; + struct cfil_db *db = NULL; + + CFIL_LOG(LOG_INFO, ""); + + if (so == NULL || so->so_cfil_db == NULL) { + return; + } + db = so->so_cfil_db; + +#if LIFECYCLE_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: freeing db (count == %d)", + (uint64_t)VM_KERNEL_ADDRPERM(so), db, db->cfdb_count); +#endif + + for (int i = 0; i < CFILHASHSIZE; i++) { + cfilhash = &db->cfdb_hashbase[i]; + LIST_FOREACH_SAFE(entry, cfilhash, cfentry_link, temp_entry) { + if (entry->cfentry_cfil != NULL) { +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, entry->cfentry_cfil, "CFIL: LIFECYCLE: DB FREE CLEAN UP"); +#endif + cfil_info_free(entry->cfentry_cfil); + OSIncrementAtomic(&cfil_stats.cfs_sock_detached); + entry->cfentry_cfil = NULL; + } + + cfil_db_delete_entry(db, entry); + if (so->so_flags & SOF_CONTENT_FILTER) { + if (db->cfdb_count == 0) + 
so->so_flags &= ~SOF_CONTENT_FILTER; + VERIFY(so->so_usecount > 0); + so->so_usecount--; + } + } + } + + // Make sure all entries are cleaned up! + VERIFY(db->cfdb_count == 0); +#if LIFECYCLE_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: so usecount %d", so->so_usecount); +#endif + + FREE(db->cfdb_hashbase, M_CFIL); + zfree(cfil_db_zone, db); + so->so_cfil_db = NULL; +} + +static bool +fill_cfil_hash_entry_from_address(struct cfil_hash_entry *entry, bool isLocal, struct sockaddr *addr) +{ + struct sockaddr_in *sin = NULL; + struct sockaddr_in6 *sin6 = NULL; + + if (entry == NULL || addr == NULL) { + return FALSE; + } + + switch (addr->sa_family) { + case AF_INET: + sin = satosin(addr); + if (sin->sin_len != sizeof(*sin)) { + return FALSE; + } + if (isLocal == TRUE) { + entry->cfentry_lport = sin->sin_port; + entry->cfentry_laddr.addr46.ia46_addr4.s_addr = sin->sin_addr.s_addr; + } else { + entry->cfentry_fport = sin->sin_port; + entry->cfentry_faddr.addr46.ia46_addr4.s_addr = sin->sin_addr.s_addr; + } + entry->cfentry_family = AF_INET; + return TRUE; + case AF_INET6: + sin6 = satosin6(addr); + if (sin6->sin6_len != sizeof(*sin6)) { + return FALSE; + } + if (isLocal == TRUE) { + entry->cfentry_lport = sin6->sin6_port; + entry->cfentry_laddr.addr6 = sin6->sin6_addr; + } else { + entry->cfentry_fport = sin6->sin6_port; + entry->cfentry_faddr.addr6 = sin6->sin6_addr; + } + entry->cfentry_family = AF_INET6; + return TRUE; + default: + return FALSE; + } +} + +static bool +fill_cfil_hash_entry_from_inp(struct cfil_hash_entry *entry, bool isLocal, struct inpcb *inp) +{ + if (entry == NULL || inp == NULL) { + return FALSE; + } + + if (inp->inp_vflag & INP_IPV4) { + if (isLocal == TRUE) { + entry->cfentry_lport = inp->inp_lport; + entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr; + } else { + entry->cfentry_fport = inp->inp_fport; + entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr; + } + entry->cfentry_family = AF_INET; + return TRUE; 
+ } else if (inp->inp_vflag & INP_IPV6) { + if (isLocal == TRUE) { + entry->cfentry_lport = inp->inp_lport; + entry->cfentry_laddr.addr6 = inp->in6p_laddr; + } else { + entry->cfentry_fport = inp->inp_fport; + entry->cfentry_faddr.addr6 = inp->in6p_faddr; + } + entry->cfentry_family = AF_INET6; + return TRUE; + } + return FALSE; +} + +bool +check_port(struct sockaddr *addr, u_short port) +{ + struct sockaddr_in *sin = NULL; + struct sockaddr_in6 *sin6 = NULL; + + if (addr == NULL || port == 0) { + return FALSE; + } + + switch (addr->sa_family) { + case AF_INET: + sin = satosin(addr); + if (sin->sin_len != sizeof(*sin)) { + return FALSE; + } + if (port == ntohs(sin->sin_port)) { + return TRUE; + } + break; + case AF_INET6: + sin6 = satosin6(addr); + if (sin6->sin6_len != sizeof(*sin6)) { + return FALSE; + } + if (port == ntohs(sin6->sin6_port)) { + return TRUE; + } + break; + default: + break; + } + return FALSE; +} + +struct cfil_hash_entry * +cfil_db_lookup_entry_with_sockid(struct cfil_db *db, u_int64_t sock_id) +{ + struct cfilhashhead *cfilhash = NULL; + u_int32_t flowhash = (u_int32_t)(sock_id & 0x0ffffffff); + struct cfil_hash_entry *nextentry; + + if (db == NULL || db->cfdb_hashbase == NULL || sock_id == 0) { + return NULL; + } + + flowhash &= db->cfdb_hashmask; + cfilhash = &db->cfdb_hashbase[flowhash]; + + LIST_FOREACH(nextentry, cfilhash, cfentry_link) { + if (nextentry->cfentry_cfil != NULL && + nextentry->cfentry_cfil->cfi_sock_id == sock_id) { + CFIL_LOG(LOG_DEBUG, "CFIL: UDP matched ", + (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), nextentry->cfentry_cfil->cfi_sock_id, flowhash); + cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, nextentry, 0, "CFIL: UDP found entry"); + return nextentry; + } + } + + CFIL_LOG(LOG_DEBUG, "CFIL: UDP NOT matched ", + (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), sock_id, flowhash); + return NULL; +} + +struct cfil_hash_entry * +cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote) +{ + struct 
cfil_hash_entry matchentry; + struct cfil_hash_entry *nextentry = NULL; + struct inpcb *inp = sotoinpcb(db->cfdb_so); + u_int32_t hashkey_faddr = 0, hashkey_laddr = 0; + int inp_hash_element = 0; + struct cfilhashhead *cfilhash = NULL; + + CFIL_LOG(LOG_INFO, ""); + + if (inp == NULL) { + goto done; + } + + if (local != NULL) { + fill_cfil_hash_entry_from_address(&matchentry, TRUE, local); + } else { + fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp); + } + if (remote != NULL) { + fill_cfil_hash_entry_from_address(&matchentry, FALSE, remote); + } else { + fill_cfil_hash_entry_from_inp(&matchentry, FALSE, inp); + } + +#if INET6 + if (inp->inp_vflag & INP_IPV6) { + hashkey_faddr = matchentry.cfentry_faddr.addr6.s6_addr32[3]; + hashkey_laddr = matchentry.cfentry_laddr.addr6.s6_addr32[3]; + } else +#endif /* INET6 */ + { + hashkey_faddr = matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr; + hashkey_laddr = matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr; + } + + inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr, + matchentry.cfentry_lport, matchentry.cfentry_fport); + inp_hash_element &= db->cfdb_hashmask; + + cfilhash = &db->cfdb_hashbase[inp_hash_element]; + + LIST_FOREACH(nextentry, cfilhash, cfentry_link) { + +#if INET6 + if ((inp->inp_vflag & INP_IPV6) && + nextentry->cfentry_lport == matchentry.cfentry_lport && + nextentry->cfentry_fport == matchentry.cfentry_fport && + IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6) && + IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_faddr.addr6, &matchentry.cfentry_faddr.addr6)) { +#if DATA_DEBUG + cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V6 found entry"); +#endif + return nextentry; + } else +#endif /* INET6 */ + if (nextentry->cfentry_lport == matchentry.cfentry_lport && + nextentry->cfentry_fport == matchentry.cfentry_fport && + nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr && + 
nextentry->cfentry_faddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr) { +#if DATA_DEBUG + cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V4 found entry"); +#endif + return nextentry; + } + } + +done: +#if DATA_DEBUG + cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP no entry found"); +#endif + return NULL; +} + +void +cfil_db_delete_entry(struct cfil_db *db, struct cfil_hash_entry *hash_entry) +{ + if (hash_entry == NULL) + return; + + LIST_REMOVE(hash_entry, cfentry_link); + zfree(cfil_hash_entry_zone, hash_entry); + db->cfdb_count--; + if (db->cfdb_only_entry == hash_entry) + db->cfdb_only_entry = NULL; +} + +struct cfil_hash_entry * +cfil_db_add_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote) +{ + struct cfil_hash_entry *entry = NULL; + struct inpcb *inp = sotoinpcb(db->cfdb_so); + u_int32_t hashkey_faddr = 0, hashkey_laddr = 0; + int inp_hash_element = 0; + struct cfilhashhead *cfilhash = NULL; + + CFIL_LOG(LOG_INFO, ""); + + if (inp == NULL) { + goto done; + } + + entry = zalloc(cfil_hash_entry_zone); + if (entry == NULL) { + goto done; + } + bzero(entry, sizeof(struct cfil_hash_entry)); + + if (local != NULL) { + fill_cfil_hash_entry_from_address(entry, TRUE, local); + } else { + fill_cfil_hash_entry_from_inp(entry, TRUE, inp); + } + if (remote != NULL) { + fill_cfil_hash_entry_from_address(entry, FALSE, remote); + } else { + fill_cfil_hash_entry_from_inp(entry, FALSE, inp); + } + entry->cfentry_lastused = net_uptime(); + +#if INET6 + if (inp->inp_vflag & INP_IPV6) { + hashkey_faddr = entry->cfentry_faddr.addr6.s6_addr32[3]; + hashkey_laddr = entry->cfentry_laddr.addr6.s6_addr32[3]; + } else +#endif /* INET6 */ + { + hashkey_faddr = entry->cfentry_faddr.addr46.ia46_addr4.s_addr; + hashkey_laddr = entry->cfentry_laddr.addr46.ia46_addr4.s_addr; + } + entry->cfentry_flowhash = CFIL_HASH(hashkey_laddr, hashkey_faddr, + 
entry->cfentry_lport, entry->cfentry_fport); + inp_hash_element = entry->cfentry_flowhash & db->cfdb_hashmask; + + cfilhash = &db->cfdb_hashbase[inp_hash_element]; + + LIST_INSERT_HEAD(cfilhash, entry, cfentry_link); + db->cfdb_count++; + db->cfdb_only_entry = entry; + cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, entry, 0, "CFIL: cfil_db_add_entry: ADDED"); + +done: + CFIL_LOG(LOG_DEBUG, "CFIL: UDP total count %d", (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), db->cfdb_count); + return entry; +} + +struct cfil_info * +cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id) +{ + struct cfil_hash_entry *hash_entry = NULL; + + CFIL_LOG(LOG_INFO, ""); + + if (db == NULL || id == 0) { + CFIL_LOG(LOG_DEBUG, "CFIL: UDP NULL DB ", + (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), id); + return NULL; + } + + // This is an optimization for connected UDP socket which only has one flow. + // No need to do the hash lookup. + if (db->cfdb_count == 1) { + if (db->cfdb_only_entry && db->cfdb_only_entry->cfentry_cfil && + db->cfdb_only_entry->cfentry_cfil->cfi_sock_id == id) { + return (db->cfdb_only_entry->cfentry_cfil); + } + } + + hash_entry = cfil_db_lookup_entry_with_sockid(db, id); + return (hash_entry != NULL ? hash_entry->cfentry_cfil : NULL); +} + +struct cfil_hash_entry * +cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote) +{ +#pragma unused(so, filter_control_unit, outgoing, local, remote) + struct cfil_hash_entry *hash_entry = NULL; + + errno_t error = 0; + socket_lock_assert_owned(so); + + // If new socket, allocate cfil db + if (so->so_cfil_db == NULL) { + if (cfil_db_init(so) != 0) { + return (NULL); + } + } + + // See if flow already exists. 
+ hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote); + if (hash_entry != NULL) { + return (hash_entry); + } + + hash_entry = cfil_db_add_entry(so->so_cfil_db, local, remote); + if (hash_entry == NULL) { + OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem); + CFIL_LOG(LOG_ERR, "CFIL: UDP failed to add entry"); + return (NULL); + } + + if (cfil_info_alloc(so, hash_entry) == NULL || + hash_entry->cfentry_cfil == NULL) { + cfil_db_delete_entry(so->so_cfil_db, hash_entry); + CFIL_LOG(LOG_ERR, "CFIL: UDP failed to alloc cfil_info"); + OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem); + return (NULL); + } + +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED"); +#endif + + if (cfil_info_attach_unit(so, filter_control_unit, hash_entry->cfentry_cfil) == 0) { + CFIL_LOG(LOG_ERR, "CFIL: UDP cfil_info_attach_unit(%u) failed", + filter_control_unit); + OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed); + return (NULL); + } + CFIL_LOG(LOG_DEBUG, "CFIL: UDP filter_control_unit %u sockID %llu attached", + (uint64_t)VM_KERNEL_ADDRPERM(so), + filter_control_unit, hash_entry->cfentry_cfil->cfi_sock_id); + + so->so_flags |= SOF_CONTENT_FILTER; + OSIncrementAtomic(&cfil_stats.cfs_sock_attached); + + /* Hold a reference on the socket for each flow */ + so->so_usecount++; + + error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, filter_control_unit); + /* We can recover from flow control or out of memory errors */ + if (error != 0 && error != ENOBUFS && error != ENOMEM) + return (NULL); + + CFIL_INFO_VERIFY(hash_entry->cfentry_cfil); + return (hash_entry); +} + +errno_t +cfil_sock_udp_handle_data(bool outgoing, struct socket *so, + struct sockaddr *local, struct sockaddr *remote, + struct mbuf *data, struct mbuf *control, uint32_t flags) +{ +#pragma unused(outgoing, so, local, remote, data, control, flags) + errno_t error = 0; + uint32_t filter_control_unit; + struct cfil_hash_entry *hash_entry = NULL; + 
struct cfil_info *cfil_info = NULL; + + socket_lock_assert_owned(so); + + if (cfil_active_count == 0) { + CFIL_LOG(LOG_DEBUG, "CFIL: UDP no active filter"); + OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain); + return (error); + } + + filter_control_unit = necp_socket_get_content_filter_control_unit(so); + if (filter_control_unit == 0) { + CFIL_LOG(LOG_DEBUG, "CFIL: UDP failed to get control unit"); + return (error); + } + + if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) { + CFIL_LOG(LOG_DEBUG, "CFIL: UDP user space only"); + OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only); + return (error); + } + + hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote); + if (hash_entry == NULL || hash_entry->cfentry_cfil == NULL) { + CFIL_LOG(LOG_ERR, "CFIL: Falied to create UDP flow"); + return (EPIPE); + } + // Update last used timestamp, this is for flow Idle TO + hash_entry->cfentry_lastused = net_uptime(); + cfil_info = hash_entry->cfentry_cfil; + + if (cfil_info->cfi_flags & CFIF_DROP) { +#if DATA_DEBUG + cfil_hash_entry_log(LOG_DEBUG, so, hash_entry, 0, "CFIL: UDP DROP"); +#endif + return (EPIPE); + } + if (control != NULL) { + OSIncrementAtomic(&cfil_stats.cfs_data_in_control); + } + if (data->m_type == MT_OOBDATA) { + CFIL_LOG(LOG_ERR, "so %llx MSG_OOB", + (uint64_t)VM_KERNEL_ADDRPERM(so)); + OSIncrementAtomic(&cfil_stats.cfs_data_in_oob); + } + + error = cfil_data_common(so, cfil_info, outgoing, remote, data, control, flags); + + return (error); +} + +/* + * Go through all UDP flows for specified socket and returns TRUE if + * any flow is still attached. If need_wait is TRUE, wait on first + * attached flow. 
+ */ +static int +cfil_filters_udp_attached(struct socket *so, bool need_wait) +{ + struct timespec ts; + lck_mtx_t *mutex_held; + struct cfilhashhead *cfilhash = NULL; + struct cfil_db *db = NULL; + struct cfil_hash_entry *hash_entry = NULL; + struct cfil_hash_entry *temp_hash_entry = NULL; + struct cfil_info *cfil_info = NULL; + struct cfil_entry *entry = NULL; + errno_t error = 0; + int kcunit; + int attached = 0; + + socket_lock_assert_owned(so); + + if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) { + + if (so->so_proto->pr_getlock != NULL) + mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); + else + mutex_held = so->so_proto->pr_domain->dom_mtx; + LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); + + db = so->so_cfil_db; + + for (int i = 0; i < CFILHASHSIZE; i++) { + cfilhash = &db->cfdb_hashbase[i]; + + LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) { + + if (hash_entry->cfentry_cfil != NULL) { + + cfil_info = hash_entry->cfentry_cfil; + for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + entry = &cfil_info->cfi_entries[kcunit - 1]; + + /* Are we attached to the filter? 
*/ + if (entry->cfe_filter == NULL) { + continue; + } + + if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) + continue; + if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0) + continue; + + attached = 1; + + if (need_wait == TRUE) { +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW TO FINISH"); +#endif + + ts.tv_sec = cfil_close_wait_timeout / 1000; + ts.tv_nsec = (cfil_close_wait_timeout % 1000) * + NSEC_PER_USEC * 1000; + + OSIncrementAtomic(&cfil_stats.cfs_close_wait); + cfil_info->cfi_flags |= CFIF_CLOSE_WAIT; + error = msleep((caddr_t)cfil_info, mutex_held, + PSOCK | PCATCH, "cfil_filters_udp_attached", &ts); + cfil_info->cfi_flags &= ~CFIF_CLOSE_WAIT; + +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW DONE"); +#endif + + /* + * Force close in case of timeout + */ + if (error != 0) { + OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout); +#if LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW TIMED OUT, FORCE DETACH"); +#endif + entry->cfe_flags |= CFEF_CFIL_DETACHED; + break; + } + } + goto done; + } + } + } + } + } + +done: + return (attached); +} + +int32_t +cfil_sock_udp_data_pending(struct sockbuf *sb, bool check_thread) +{ + struct socket *so = sb->sb_so; + struct cfi_buf *cfi_buf; + uint64_t pending = 0; + uint64_t total_pending = 0; + struct cfilhashhead *cfilhash = NULL; + struct cfil_db *db = NULL; + struct cfil_hash_entry *hash_entry = NULL; + struct cfil_hash_entry *temp_hash_entry = NULL; + + socket_lock_assert_owned(so); + + if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL && + (check_thread == FALSE || so->so_snd.sb_cfil_thread != current_thread())) { + + db = so->so_cfil_db; + + for (int i = 0; i < CFILHASHSIZE; i++) { + cfilhash = &db->cfdb_hashbase[i]; + + LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) { + + if (hash_entry->cfentry_cfil != NULL) { + if ((sb->sb_flags & SB_RECV) == 
0) + cfi_buf = &hash_entry->cfentry_cfil->cfi_snd; + else + cfi_buf = &hash_entry->cfentry_cfil->cfi_rcv; + + pending = cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first; + /* + * If we are limited by the "chars of mbufs used" roughly + * adjust so we won't overcommit + */ + if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending) + pending = cfi_buf->cfi_pending_mbcnt; + + total_pending += pending; + } + } + } + + VERIFY(total_pending < INT32_MAX); +#if DATA_DEBUG + CFIL_LOG(LOG_DEBUG, "CFIL: total pending %llu ", + (uint64_t)VM_KERNEL_ADDRPERM(so), + total_pending, check_thread); +#endif + } + + return (int32_t)(total_pending); +} + +int +cfil_sock_udp_notify_shutdown(struct socket *so, int how, int drop_flag, int shut_flag) +{ + struct cfil_info *cfil_info = NULL; + struct cfilhashhead *cfilhash = NULL; + struct cfil_db *db = NULL; + struct cfil_hash_entry *hash_entry = NULL; + struct cfil_hash_entry *temp_hash_entry = NULL; + errno_t error = 0; + int done_count = 0; + int kcunit; + + socket_lock_assert_owned(so); + + if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) { + + db = so->so_cfil_db; + + for (int i = 0; i < CFILHASHSIZE; i++) { + cfilhash = &db->cfdb_hashbase[i]; + + LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) { + + if (hash_entry->cfentry_cfil != NULL) { + cfil_info = hash_entry->cfentry_cfil; + + // This flow is marked as DROP + if (cfil_info->cfi_flags & drop_flag) { + done_count++; + continue; + } + + // This flow has been shut already, skip + if (cfil_info->cfi_flags & shut_flag) { + continue; + } + // Mark flow as shut + cfil_info->cfi_flags |= shut_flag; + done_count++; + + for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + /* Disconnect incoming side */ + if (how != SHUT_WR) { + error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0); + } + /* Disconnect outgoing side */ + if (how != SHUT_RD) { + error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1); + } + } + } + } 
+ } + } + + if (done_count == 0) { + error = ENOTCONN; + } return (error); } + +int +cfil_sock_udp_shutdown(struct socket *so, int *how) +{ + int error = 0; + + if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || (so->so_cfil_db == NULL)) + goto done; + + socket_lock_assert_owned(so); + + CFIL_LOG(LOG_INFO, "so %llx how %d", + (uint64_t)VM_KERNEL_ADDRPERM(so), *how); + + /* + * Check the state of the socket before the content filter + */ + if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) { + /* read already shut down */ + error = ENOTCONN; + goto done; + } + if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) { + /* write already shut down */ + error = ENOTCONN; + goto done; + } + + /* + * shutdown read: SHUT_RD or SHUT_RDWR + */ + if (*how != SHUT_WR) { + error = cfil_sock_udp_notify_shutdown(so, SHUT_RD, CFIF_DROP, CFIF_SHUT_RD); + if (error != 0) + goto done; + } + /* + * shutdown write: SHUT_WR or SHUT_RDWR + */ + if (*how != SHUT_RD) { + error = cfil_sock_udp_notify_shutdown(so, SHUT_WR, CFIF_DROP, CFIF_SHUT_WR); + if (error != 0) + goto done; + + /* + * When outgoing data is pending, we delay the shutdown at the + * protocol level until the content filters give the final + * verdict on the pending data. + */ + if (cfil_sock_data_pending(&so->so_snd) != 0) { + /* + * When shutting down the read and write sides at once + * we can proceed to the final shutdown of the read + * side. Otherwise, we just return. 
+ */ + if (*how == SHUT_WR) { + error = EJUSTRETURN; + } else if (*how == SHUT_RDWR) { + *how = SHUT_RD; + } + } + } +done: + return (error); +} + +void +cfil_sock_udp_close_wait(struct socket *so) +{ + socket_lock_assert_owned(so); + + while (cfil_filters_udp_attached(so, FALSE)) { + /* + * Notify the filters we are going away so they can detach + */ + cfil_sock_udp_notify_shutdown(so, SHUT_RDWR, 0, 0); + + /* + * Make sure we need to wait after the filter are notified + * of the disconnection + */ + if (cfil_filters_udp_attached(so, TRUE) == 0) + break; + } +} + +void +cfil_sock_udp_is_closed(struct socket *so) +{ + struct cfil_info *cfil_info = NULL; + struct cfilhashhead *cfilhash = NULL; + struct cfil_db *db = NULL; + struct cfil_hash_entry *hash_entry = NULL; + struct cfil_hash_entry *temp_hash_entry = NULL; + errno_t error = 0; + int kcunit; + + socket_lock_assert_owned(so); + + if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) { + + db = so->so_cfil_db; + + for (int i = 0; i < CFILHASHSIZE; i++) { + cfilhash = &db->cfdb_hashbase[i]; + + LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) { + if (hash_entry->cfentry_cfil != NULL) { + + cfil_info = hash_entry->cfentry_cfil; + + for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + /* Let the filters know of the closing */ + error = cfil_dispatch_closed_event(so, cfil_info, kcunit); + } + + /* Last chance to push passed data out */ + error = cfil_acquire_sockbuf(so, cfil_info, 1); + if (error == 0) + cfil_service_inject_queue(so, cfil_info, 1); + cfil_release_sockbuf(so, 1); + + cfil_info->cfi_flags |= CFIF_SOCK_CLOSED; + + /* Pending data needs to go */ + cfil_flush_queues(so, cfil_info); + + CFIL_INFO_VERIFY(cfil_info); + } + } + } + } +} + +void +cfil_sock_udp_buf_update(struct sockbuf *sb) +{ + struct cfil_info *cfil_info = NULL; + struct cfilhashhead *cfilhash = NULL; + struct cfil_db *db = NULL; + struct cfil_hash_entry *hash_entry = NULL; + struct 
cfil_hash_entry *temp_hash_entry = NULL; + errno_t error = 0; + int outgoing; + struct socket *so = sb->sb_so; + + socket_lock_assert_owned(so); + + if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) { + + if (!cfil_sbtrim) + return; + + db = so->so_cfil_db; + + for (int i = 0; i < CFILHASHSIZE; i++) { + cfilhash = &db->cfdb_hashbase[i]; + + LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) { + if (hash_entry->cfentry_cfil != NULL) { + + cfil_info = hash_entry->cfentry_cfil; + + if ((sb->sb_flags & SB_RECV) == 0) { + if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0) + return; + outgoing = 1; + OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry); + } else { + if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN) == 0) + return; + outgoing = 0; + OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry); + } + + CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d", + (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing); + + error = cfil_acquire_sockbuf(so, cfil_info, outgoing); + if (error == 0) + cfil_service_inject_queue(so, cfil_info, outgoing); + cfil_release_sockbuf(so, outgoing); + } + } + } + } +} + +void +cfil_filter_show(u_int32_t kcunit) +{ + struct content_filter *cfc = NULL; + struct cfil_entry *entry; + int count = 0; + + if (content_filters == NULL) { + return; + } + if (kcunit > MAX_CONTENT_FILTER) { + return; + } + + cfil_rw_lock_shared(&cfil_lck_rw); + + if (content_filters[kcunit - 1] == NULL) { + cfil_rw_unlock_shared(&cfil_lck_rw); + return; + } + cfc = content_filters[kcunit - 1]; + + CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: Filter flags <%lx>:", + kcunit, cfc->cf_sock_count, (unsigned long)cfc->cf_flags); + if (cfc->cf_flags & CFF_DETACHING) + CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - DETACHING"); + if (cfc->cf_flags & CFF_ACTIVE) + CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - ACTIVE"); + if (cfc->cf_flags & CFF_FLOW_CONTROLLED) + CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - FLOW CONTROLLED"); + + TAILQ_FOREACH(entry, 
&cfc->cf_sock_entries, cfe_link) { + + if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) { + struct cfil_info *cfil_info = entry->cfe_cfil_info; + + count++; + + if (entry->cfe_flags & CFEF_CFIL_DETACHED) + cfil_info_log(LOG_ERR, cfil_info, "CFIL: FILTER SHOW: - DETACHED"); + else + cfil_info_log(LOG_ERR, cfil_info, "CFIL: FILTER SHOW: - ATTACHED"); + } + } + + CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: Filter - total entries shown: %d", count); + + cfil_rw_unlock_shared(&cfil_lck_rw); + +} + +void +cfil_info_show(void) +{ + struct cfil_info *cfil_info; + int count = 0; + + cfil_rw_lock_shared(&cfil_lck_rw); + + CFIL_LOG(LOG_ERR, "CFIL: INFO SHOW: count %d", cfil_sock_attached_count); + + TAILQ_FOREACH(cfil_info, &cfil_sock_head, cfi_link) { + + count++; + + cfil_info_log(LOG_ERR, cfil_info, "CFIL: INFO SHOW"); + + if (cfil_info->cfi_flags & CFIF_DROP) + CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - DROP"); + if (cfil_info->cfi_flags & CFIF_CLOSE_WAIT) + CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - CLOSE_WAIT"); + if (cfil_info->cfi_flags & CFIF_SOCK_CLOSED) + CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SOCK_CLOSED"); + if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN) + CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - RETRY_INJECT_IN"); + if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT) + CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - RETRY_INJECT_OUT"); + if (cfil_info->cfi_flags & CFIF_SHUT_WR) + CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SHUT_WR"); + if (cfil_info->cfi_flags & CFIF_SHUT_RD) + CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SHUT_RD"); + } + + CFIL_LOG(LOG_ERR, "CFIL: INFO SHOW: total cfil_info shown: %d", count); + + cfil_rw_unlock_shared(&cfil_lck_rw); +} + +bool +cfil_info_idle_timed_out(struct cfil_info *cfil_info, int timeout, u_int32_t current_time) +{ + if (cfil_info && cfil_info->cfi_hash_entry && + (current_time - cfil_info->cfi_hash_entry->cfentry_lastused >= (u_int32_t)timeout)) { +#if GC_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: flow IDLE timeout expired"); +#endif + return true; + } + 
return false; +} + +/* + * Return true when data is queued beyond the pass offset and a filter with + * a non-NULL cfe_filter has not issued an action for at least timeout seconds. + */ +bool +cfil_info_action_timed_out(struct cfil_info *cfil_info, int timeout) +{ + struct cfil_entry *entry; + struct timeval current_tv; + struct timeval diff_time; + + if (cfil_info == NULL) + return false; + + /* + * If we have queued up more data than passed offset and we haven't received + * an action from user space for a while (the user space filter might have crashed), + * return action timed out. + */ + if (cfil_info->cfi_snd.cfi_pending_last > cfil_info->cfi_snd.cfi_pass_offset || + cfil_info->cfi_rcv.cfi_pending_last > cfil_info->cfi_rcv.cfi_pass_offset) { + + microuptime(&current_tv); + + for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) { + entry = &cfil_info->cfi_entries[kcunit - 1]; + + if (entry->cfe_filter == NULL) + continue; + + if (cfil_info->cfi_snd.cfi_pending_last > entry->cfe_snd.cfe_pass_offset || + cfil_info->cfi_rcv.cfi_pending_last > entry->cfe_rcv.cfe_pass_offset) { + // haven't gotten an action from this filter, check timeout + timersub(&current_tv, &entry->cfe_last_action, &diff_time); + if (diff_time.tv_sec >= timeout) { +#if GC_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: flow ACTION timeout expired"); +#endif + return true; + } + } + } + } + return false; +} + +bool +cfil_info_buffer_threshold_exceeded(struct cfil_info *cfil_info) +{ + if (cfil_info == NULL) + return false; + + /* + * Clean up flow if it exceeded queue thresholds + */ + if (cfil_info->cfi_snd.cfi_tail_drop_cnt || + cfil_info->cfi_rcv.cfi_tail_drop_cnt) { +#if GC_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: queue threshold exceeded: mbuf max tail drop count ", + cfil_udp_gc_mbuf_num_max, + cfil_udp_gc_mbuf_cnt_max, + cfil_info->cfi_snd.cfi_tail_drop_cnt, + cfil_info->cfi_rcv.cfi_tail_drop_cnt); + cfil_info_log(LOG_ERR, cfil_info, "CFIL: queue threshold exceeded"); +#endif + return true; + } + + return false; +} + +static void +cfil_udp_gc_thread_sleep(bool forever) +{ + if (forever) { + (void) assert_wait((event_t) &cfil_sock_udp_attached_count, + 
THREAD_INTERRUPTIBLE); + } else { + uint64_t deadline = 0; + nanoseconds_to_absolutetime(UDP_FLOW_GC_RUN_INTERVAL_NSEC, &deadline); + clock_absolutetime_interval_to_deadline(deadline, &deadline); + + (void) assert_wait_deadline(&cfil_sock_udp_attached_count, + THREAD_INTERRUPTIBLE, deadline); + } +} + +static void +cfil_udp_gc_thread_func(void *v, wait_result_t w) +{ +#pragma unused(v, w) + + ASSERT(cfil_udp_gc_thread == current_thread()); + thread_set_thread_name(current_thread(), "CFIL_UPD_GC"); + + // Kick off gc shortly + cfil_udp_gc_thread_sleep(false); + thread_block_parameter((thread_continue_t) cfil_info_udp_expire, NULL); + /* NOTREACHED */ +} + +static void +cfil_info_udp_expire(void *v, wait_result_t w) +{ +#pragma unused(v, w) + + static uint64_t expired_array[UDP_FLOW_GC_MAX_COUNT]; + static uint32_t expired_count = 0; + + struct cfil_info *cfil_info; + struct cfil_hash_entry *hash_entry; + struct cfil_db *db; + struct socket *so; + u_int32_t current_time = 0; + + current_time = net_uptime(); + + // Get all expired UDP flow ids + cfil_rw_lock_shared(&cfil_lck_rw); + + if (cfil_sock_udp_attached_count == 0) { + cfil_rw_unlock_shared(&cfil_lck_rw); + goto go_sleep; + } + + TAILQ_FOREACH(cfil_info, &cfil_sock_head, cfi_link) { + if (expired_count >= UDP_FLOW_GC_MAX_COUNT) + break; + + if (IS_UDP(cfil_info->cfi_so)) { + if (cfil_info_idle_timed_out(cfil_info, UDP_FLOW_GC_IDLE_TO, current_time) || + cfil_info_action_timed_out(cfil_info, UDP_FLOW_GC_ACTION_TO) || + cfil_info_buffer_threshold_exceeded(cfil_info)) { + expired_array[expired_count] = cfil_info->cfi_sock_id; + expired_count++; + } + } + } + cfil_rw_unlock_shared(&cfil_lck_rw); + + if (expired_count == 0) + goto go_sleep; + + for (uint32_t i = 0; i < expired_count; i++) { + + // Search for socket (UDP only and lock so) + so = cfil_socket_from_sock_id(expired_array[i], true); + if (so == NULL) { + continue; + } + + cfil_info = cfil_db_get_cfil_info(so->so_cfil_db, expired_array[i]); + if (cfil_info 
== NULL) { + goto unlock; + } + + db = so->so_cfil_db; + hash_entry = cfil_info->cfi_hash_entry; + + if (db == NULL || hash_entry == NULL) { + goto unlock; + } + +#if GC_DEBUG || LIFECYCLE_DEBUG + cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: GC CLEAN UP"); +#endif + + cfil_db_delete_entry(db, hash_entry); + cfil_info_free(cfil_info); + OSIncrementAtomic(&cfil_stats.cfs_sock_detached); + + if (so->so_flags & SOF_CONTENT_FILTER) { + if (db->cfdb_count == 0) + so->so_flags &= ~SOF_CONTENT_FILTER; + VERIFY(so->so_usecount > 0); + so->so_usecount--; + } +unlock: + socket_unlock(so, 1); + } + +#if GC_DEBUG + CFIL_LOG(LOG_ERR, "CFIL: UDP flow idle timeout check: expired %d idle flows", expired_count); +#endif + expired_count = 0; + +go_sleep: + + // Sleep forever (until waken up) if no more UDP flow to clean + cfil_rw_lock_shared(&cfil_lck_rw); + cfil_udp_gc_thread_sleep(cfil_sock_udp_attached_count == 0 ? true : false); + cfil_rw_unlock_shared(&cfil_lck_rw); + thread_block_parameter((thread_continue_t)cfil_info_udp_expire, NULL); + /* NOTREACHED */ +} + +struct m_tag * +cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m) +{ + struct m_tag *tag = NULL; + struct cfil_tag *ctag = NULL; + struct cfil_hash_entry *hash_entry = NULL; + + if (cfil_info == NULL || cfil_info->cfi_so == NULL || + cfil_info->cfi_hash_entry == NULL || m == NULL || !(m->m_flags & M_PKTHDR)) { + return NULL; + } + + /* Allocate a tag */ + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, + sizeof(struct cfil_tag), M_DONTWAIT, m); + + if (tag) { + ctag = (struct cfil_tag*)(tag + 1); + ctag->cfil_so_state_change_cnt = cfil_info->cfi_so->so_state_change_cnt; + ctag->cfil_so_options = cfil_info->cfi_so->so_options; + + hash_entry = cfil_info->cfi_hash_entry; + if (hash_entry->cfentry_family == AF_INET6) { + fill_ip6_sockaddr_4_6(&ctag->cfil_faddr, + &hash_entry->cfentry_faddr.addr6, + hash_entry->cfentry_fport); + } else if (hash_entry->cfentry_family == AF_INET) 
{ + fill_ip_sockaddr_4_6(&ctag->cfil_faddr, + hash_entry->cfentry_faddr.addr46.ia46_addr4, + hash_entry->cfentry_fport); + } + m_tag_prepend(m, tag); + return (tag); + } + return NULL; +} + +struct m_tag * +cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options, + struct sockaddr **faddr) +{ + struct m_tag *tag = NULL; + struct cfil_tag *ctag = NULL; + + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, NULL); + if (tag) { + ctag = (struct cfil_tag *)(tag + 1); + if (state_change_cnt) + *state_change_cnt = ctag->cfil_so_state_change_cnt; + if (options) + *options = ctag->cfil_so_options; + if (faddr) + *faddr = (struct sockaddr *) &ctag->cfil_faddr; + + /* + * Unlink tag and hand it over to caller. + * Note that caller will be responsible to free it. + */ + m_tag_unlink(m, tag); + return tag; + } + return NULL; +} + + diff --git a/bsd/net/content_filter.h b/bsd/net/content_filter.h index e4d1ce5d4..55249920b 100644 --- a/bsd/net/content_filter.h +++ b/bsd/net/content_filter.h @@ -422,6 +422,8 @@ extern void cfil_sock_buf_update(struct sockbuf *sb); extern cfil_sock_id_t cfil_sock_id_from_socket(struct socket *so); +extern struct m_tag *cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, + short *options, struct sockaddr **faddr); #endif /* BSD_KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 0438c30a1..cd4d8d963 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -78,7 +78,7 @@ #include #include #include - +#include #if INET #include #include @@ -90,15 +90,21 @@ #include #include #include +#include +#include +#include #endif /* INET */ #if INET6 +#include #include #include #include #include +#include +#include #endif /* INET6 */ - +#include #include #include @@ -278,7 +284,7 @@ static unsigned int dlif_size; /* size of dlil_ifnet to allocate */ static unsigned int dlif_bufsize; /* size of dlif_size + headroom */ static struct zone *dlif_zone; /* zone for dlil_ifnet 
*/ -#define DLIF_ZONE_MAX 64 /* maximum elements in zone */ +#define DLIF_ZONE_MAX IFNETS_MAX /* maximum elements in zone */ #define DLIF_ZONE_NAME "ifnet" /* zone name */ static unsigned int dlif_filt_size; /* size of ifnet_filter */ @@ -397,7 +403,9 @@ static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *, u_int32_t, ifnet_model_t, boolean_t); static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *, boolean_t, boolean_t); - +static int dlil_is_clat_needed(protocol_family_t , mbuf_t ); +static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *); +static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *); #if DEBUG || DEVELOPMENT static void dlil_verify_sum16(void); #endif /* DEBUG || DEVELOPMENT */ @@ -1718,6 +1726,9 @@ dlil_init(void) /* Initialize the interface port list */ if_ports_used_init(); + /* Initialize the interface low power mode event handler */ + if_low_power_evhdlr_init(); + #if DEBUG || DEVELOPMENT /* Run self-tests */ dlil_verify_sum16(); @@ -3816,15 +3827,15 @@ static void dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, u_int32_t cnt, ifnet_model_t mode, boolean_t ext) { - int error = 0; - protocol_family_t protocol_family; - mbuf_t next_packet; - ifnet_t ifp = ifp_param; - char * frame_header; - struct if_proto * last_ifproto = NULL; - mbuf_t pkt_first = NULL; - mbuf_t * pkt_next = NULL; - u_int32_t poll_thresh = 0, poll_ival = 0; + int error = 0; + protocol_family_t protocol_family; + mbuf_t next_packet; + ifnet_t ifp = ifp_param; + char *frame_header = NULL; + struct if_proto *last_ifproto = NULL; + mbuf_t pkt_first = NULL; + mbuf_t *pkt_next = NULL; + u_int32_t poll_thresh = 0, poll_ival = 0; KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); @@ -3892,6 +3903,69 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, protocol_family = 0; } + pktap_input(ifp, protocol_family, m, 
frame_header); + + /* Drop v4 packets received on CLAT46 enabled interface */ + if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp)) { + m_freem(m); + ip6stat.ip6s_clat464_in_v4_drop++; + goto next; + } + + /* Translate the packet if it is received on CLAT interface */ + if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp) + && dlil_is_clat_needed(protocol_family, m)) { + char *data = NULL; + struct ether_header eh; + struct ether_header *ehp = NULL; + + if (ifp->if_type == IFT_ETHER) { + ehp = (struct ether_header *)(void *)frame_header; + /* Skip RX Ethernet packets if they are not IPV6 */ + if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) + goto skip_clat; + + /* Keep a copy of frame_header for Ethernet packets */ + bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN); + } + error = dlil_clat64(ifp, &protocol_family, &m); + data = (char *) mbuf_data(m); + if (error != 0) { + m_freem(m); + ip6stat.ip6s_clat464_in_drop++; + goto next; + } + /* Native v6 should be No-op */ + if (protocol_family != PF_INET) + goto skip_clat; + + /* Do this only for translated v4 packets. */ + switch (ifp->if_type) { + case IFT_CELLULAR: + frame_header = data; + break; + case IFT_ETHER: + /* + * Drop if the mbuf doesn't have enough + * space for Ethernet header + */ + if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) { + m_free(m); + ip6stat.ip6s_clat464_in_drop++; + goto next; + } + /* + * Set the frame_header ETHER_HDR_LEN bytes + * preceeding the data pointer. Change + * the ether_type too. 
+ */ + frame_header = data - ETHER_HDR_LEN; + eh.ether_type = htons(ETHERTYPE_IP); + bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN); + break; + } + } +skip_clat: if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) && !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) dlil_input_cksum_dbg(ifp, m, frame_header, @@ -3912,7 +3986,6 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, (CSUM_DATA_VALID | CSUM_PARTIAL)) == (CSUM_DATA_VALID | CSUM_PARTIAL)) { int adj; - if (frame_header == NULL || frame_header < (char *)mbuf_datastart(m) || frame_header > (char *)m->m_data || @@ -3926,7 +3999,8 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m, } } - pktap_input(ifp, protocol_family, m, frame_header); + if (clat_debug) + pktap_input(ifp, protocol_family, m, frame_header); if (m->m_flags & (M_BCAST|M_MCAST)) atomic_add_64(&ifp->if_imcasts, 1); @@ -4288,7 +4362,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; char dst_linkaddr_buffer[MAX_LINKADDR * 4]; struct if_proto *proto = NULL; - mbuf_t m; + mbuf_t m = NULL; mbuf_t send_head = NULL; mbuf_t *send_tail = &send_head; int iorefcnt = 0; @@ -4297,6 +4371,9 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, int32_t flen = 0; struct timespec now; u_int64_t now_nsec; + boolean_t did_clat46 = FALSE; + protocol_family_t old_proto_family = proto_family; + struct rtentry *rt = NULL; KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); @@ -4339,6 +4416,85 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, packetlist = packetlist->m_nextpkt; m->m_nextpkt = NULL; + /* + * Perform address family translation for the first + * packet outside the loop in order to perform address + * lookup for the translated proto family. 
+ */ + if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) && + (ifp->if_type == IFT_CELLULAR || + dlil_is_clat_needed(proto_family, m))) { + retval = dlil_clat46(ifp, &proto_family, &m); + /* + * Go to the next packet if translation fails + */ + if (retval != 0) { + m_freem(m); + m = NULL; + ip6stat.ip6s_clat464_out_drop++; + /* Make sure that the proto family is PF_INET */ + ASSERT(proto_family == PF_INET); + goto preout_again; + } + /* + * Free the old one and make it point to the IPv6 proto structure. + * + * Change proto for the first time we have successfully + * performed address family translation. + */ + if (!did_clat46 && proto_family == PF_INET6) { + struct sockaddr_in6 dest6; + did_clat46 = TRUE; + + if (proto != NULL) + if_proto_free(proto); + ifnet_lock_shared(ifp); + /* callee holds a proto refcnt upon success */ + proto = find_attached_proto(ifp, proto_family); + if (proto == NULL) { + ifnet_lock_done(ifp); + retval = ENXIO; + m_freem(m); + m = NULL; + goto cleanup; + } + ifnet_lock_done(ifp); + if (ifp->if_type == IFT_ETHER) { + /* Update the dest to translated v6 address */ + dest6.sin6_len = sizeof(struct sockaddr_in6); + dest6.sin6_family = AF_INET6; + dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst; + dest = (const struct sockaddr *)&dest6; + + /* + * Lookup route to the translated destination + * Free this route ref during cleanup + */ + rt = rtalloc1_scoped((struct sockaddr *)&dest6, + 0, 0, ifp->if_index); + + route = rt; + } + } + } + + /* + * This path gets packet chain going to the same destination. + * The pre output routine is used to either trigger resolution of + * the next hop or retreive the next hop's link layer addressing. + * For ex: ether_inet(6)_pre_output routine. + * + * If the routine returns EJUSTRETURN, it implies that packet has + * been queued, and therefore we have to call preout_again for the + * following packet in the chain. 
+ * + * For errors other than EJUSTRETURN, the current packet is freed + * and the rest of the chain (pointed by packetlist is freed as + * part of clean up. + * + * Else if there is no error the retrieved information is used for + * all the packets in the chain. + */ if (raw == 0) { proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ? proto->kpi.v1.pre_output : proto->kpi.v2.pre_output); @@ -4351,6 +4507,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, if (retval == EJUSTRETURN) goto preout_again; m_freem(m); + m = NULL; goto cleanup; } } @@ -4366,6 +4523,30 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, #endif do { + /* + * Perform address family translation if needed. + * For now we only support stateless 4 to 6 translation + * on the out path. + * + * The routine below translates IP header, updates protocol + * checksum and also translates ICMP. + * + * We skip the first packet as it is already translated and + * the proto family is set to PF_INET6. 
+ */ + if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) && + (ifp->if_type == IFT_CELLULAR || + dlil_is_clat_needed(proto_family, m))) { + retval = dlil_clat46(ifp, &proto_family, &m); + /* Goto the next packet if the translation fails */ + if (retval != 0) { + m_freem(m); + m = NULL; + ip6stat.ip6s_clat464_out_drop++; + goto next; + } + } + #if CONFIG_DTRACE if (!raw && proto_family == PF_INET) { struct ip *ip = mtod(m, struct ip *); @@ -4557,6 +4738,9 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, packetlist = packetlist->m_nextpkt; m->m_nextpkt = NULL; } + /* Reset the proto family to old proto family for CLAT */ + if (did_clat46) + proto_family = old_proto_family; } while (m != NULL); if (send_head != NULL) { @@ -4631,10 +4815,323 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, retval = 0; if (iorefcnt == 1) ifnet_decr_iorefcnt(ifp); + if (rt != NULL) { + rtfree(rt); + rt = NULL; + } return (retval); } +/* + * This routine checks if the destination address is not a loopback, link-local, + * multicast or broadcast address. + */ +static int +dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m) +{ + int ret = 0; + switch(proto_family) { + case PF_INET: { + struct ip *iph = mtod(m, struct ip *); + if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) + ret = 1; + break; + } + case PF_INET6: { + struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *); + if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) && + CLAT64_NEEDED(&ip6h->ip6_dst)) + ret = 1; + break; + } + } + + return (ret); +} +/* + * @brief This routine translates IPv4 packet to IPv6 packet, + * updates protocol checksum and also translates ICMP for code + * along with inner header translation. + * + * @param ifp Pointer to the interface + * @param proto_family pointer to protocol family. It is updated if function + * performs the translation successfully. + * @param m Pointer to the pointer pointing to the packet. 
Needed because this + * routine can end up changing the mbuf to a different one. + * + * @return 0 on success or else a negative value. + */ +static errno_t +dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) +{ + VERIFY(*proto_family == PF_INET); + VERIFY(IS_INTF_CLAT46(ifp)); + + pbuf_t pbuf_store, *pbuf = NULL; + struct ip *iph = NULL; + struct in_addr osrc, odst; + uint8_t proto = 0; + struct in6_ifaddr *ia6_clat_src = NULL; + struct in6_addr *src = NULL; + struct in6_addr dst; + int error = 0; + uint32_t off = 0; + uint64_t tot_len = 0; + uint16_t ip_id_val = 0; + uint16_t ip_frag_off = 0; + + boolean_t is_frag = FALSE; + boolean_t is_first_frag = TRUE; + boolean_t is_last_frag = TRUE; + + pbuf_init_mbuf(&pbuf_store, *m, ifp); + pbuf = &pbuf_store; + iph = pbuf->pb_data; + + osrc = iph->ip_src; + odst = iph->ip_dst; + proto = iph->ip_p; + off = iph->ip_hl << 2; + ip_id_val = iph->ip_id; + ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK; + + tot_len = ntohs(iph->ip_len); + + /* + * For packets that are not first frags + * we only need to adjust CSUM. + * For 4 to 6, Fragmentation header gets appended + * after proto translation. + */ + if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) { + is_frag = TRUE; + + /* If the offset is not zero, it is not first frag */ + if (ip_frag_off != 0) + is_first_frag = FALSE; + + /* If IP_MF is set, then it is not last frag */ + if (ntohs(iph->ip_off) & IP_MF) + is_last_frag = FALSE; + } + + /* + * Retrive the local IPv6 CLAT46 address reserved for stateless + * translation. + */ + ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46); + if (ia6_clat_src == NULL) { + ip6stat.ip6s_clat464_out_nov6addr_drop++; + error = -1; + goto cleanup; + } + + src = &ia6_clat_src->ia_addr.sin6_addr; + + /* + * Translate IPv4 destination to IPv6 destination by using the + * prefixes learned through prior PLAT discovery. 
+ */ + if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) { + ip6stat.ip6s_clat464_out_v6synthfail_drop++; + goto cleanup; + } + + /* Translate the IP header part first */ + error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p, + iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1; + + iph = NULL; /* Invalidate iph as pbuf has been modified */ + + if (error != 0) { + ip6stat.ip6s_clat464_out_46transfail_drop++; + goto cleanup; + } + + /* + * Translate protocol header, update checksum, checksum flags + * and related fields. + */ + error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst, + proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1; + + if (error != 0) { + ip6stat.ip6s_clat464_out_46proto_transfail_drop++; + goto cleanup; + } + + /* Now insert the IPv6 fragment header */ + if (is_frag) { + error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag); + + if (error != 0) { + ip6stat.ip6s_clat464_out_46frag_transfail_drop++; + goto cleanup; + } + } + +cleanup: + if (ia6_clat_src != NULL) + IFA_REMREF(&ia6_clat_src->ia_ifa); + + if (pbuf_is_valid(pbuf)) { + *m = pbuf->pb_mbuf; + pbuf->pb_mbuf = NULL; + pbuf_destroy(pbuf); + } else { + error = -1; + ip6stat.ip6s_clat464_out_invalpbuf_drop++; + } + + if (error == 0) { + *proto_family = PF_INET6; + ip6stat.ip6s_clat464_out_success++; + } + + return (error); +} + +/* + * @brief This routine translates incoming IPv6 to IPv4 packet, + * updates protocol checksum and also translates ICMPv6 outer + * and inner headers + * + * @return 0 on success or else a negative value. 
+ */ +static errno_t +dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m) +{ + VERIFY(*proto_family == PF_INET6); + VERIFY(IS_INTF_CLAT46(ifp)); + + struct ip6_hdr *ip6h = NULL; + struct in6_addr osrc, odst; + uint8_t proto = 0; + struct in6_ifaddr *ia6_clat_dst = NULL; + struct in_ifaddr *ia4_clat_dst = NULL; + struct in_addr *dst = NULL; + struct in_addr src; + int error = 0; + uint32_t off = 0; + u_int64_t tot_len = 0; + uint8_t tos = 0; + boolean_t is_first_frag = TRUE; + + /* Incoming mbuf does not contain valid IP6 header */ + if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) || + ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) && + (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) { + ip6stat.ip6s_clat464_in_tooshort_drop++; + return (-1); + } + + ip6h = mtod(*m, struct ip6_hdr *); + /* Validate that mbuf contains IP payload equal to ip6_plen */ + if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) { + ip6stat.ip6s_clat464_in_tooshort_drop++; + return (-1); + } + + osrc = ip6h->ip6_src; + odst = ip6h->ip6_dst; + + /* + * Retrieve the local CLAT46 reserved IPv6 address. + * Let the packet pass if we don't find one, as the flag + * may get set before IPv6 configuration has taken place. + */ + ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46); + if (ia6_clat_dst == NULL) + goto done; + + /* + * Check if the original dest in the packet is same as the reserved + * CLAT46 IPv6 address + */ + if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) { + pbuf_t pbuf_store, *pbuf = NULL; + pbuf_init_mbuf(&pbuf_store, *m, ifp); + pbuf = &pbuf_store; + + /* + * Retrive the local CLAT46 IPv4 address reserved for stateless + * translation. 
+ */ + ia4_clat_dst = inifa_ifpclatv4(ifp); + if (ia4_clat_dst == NULL) { + IFA_REMREF(&ia6_clat_dst->ia_ifa); + ip6stat.ip6s_clat464_in_nov4addr_drop++; + error = -1; + goto cleanup; + } + IFA_REMREF(&ia6_clat_dst->ia_ifa); + + /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */ + dst = &ia4_clat_dst->ia_addr.sin_addr; + if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) { + ip6stat.ip6s_clat464_in_v4synthfail_drop++; + error = -1; + goto cleanup; + } + + ip6h = pbuf->pb_data; + off = sizeof(struct ip6_hdr); + proto = ip6h->ip6_nxt; + tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff; + tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr); + + /* + * Translate the IP header and update the fragmentation + * header if needed + */ + error = (nat464_translate_64(pbuf, off, tos, &proto, + ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ? + 0 : -1; + + ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */ + + if (error != 0) { + ip6stat.ip6s_clat464_in_64transfail_drop++; + goto cleanup; + } + + /* + * Translate protocol header, update checksum, checksum flags + * and related fields. + */ + error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, + (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET, + NT_IN, !is_first_frag) == NT_NAT64) ? 
0 : -1; + + if (error != 0) { + ip6stat.ip6s_clat464_in_64proto_transfail_drop++; + goto cleanup; + } + +cleanup: + if (ia4_clat_dst != NULL) + IFA_REMREF(&ia4_clat_dst->ia_ifa); + + if (pbuf_is_valid(pbuf)) { + *m = pbuf->pb_mbuf; + pbuf->pb_mbuf = NULL; + pbuf_destroy(pbuf); + } else { + error = -1; + ip6stat.ip6s_clat464_in_invalpbuf_drop++; + } + + if (error == 0) { + *proto_family = PF_INET; + ip6stat.ip6s_clat464_in_success++; + } + } /* CLAT traffic */ + +done: + return (error); +} + errno_t ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code, void *ioctl_arg) @@ -6169,6 +6666,9 @@ ifnet_detach(ifnet_t ifp) ifp->if_eflags &= ~IFEF_ECN_DISABLE; ifp->if_eflags &= ~IFEF_ECN_ENABLE; + /* Reset CLAT46 flag */ + ifp->if_eflags &= ~IFEF_CLAT46; + /* * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will * no longer be visible during lookups from this point. @@ -8246,6 +8746,9 @@ ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes) &prefixes[i].ipv6_prefix; if (prefix_len == 0) { + clat_log0((LOG_DEBUG, + "NAT64 prefixes purged from Interface %s\n", + if_name(ifp))); /* Allow clearing the signature */ IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0; bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix, @@ -8258,11 +8761,15 @@ ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes) prefix_len != NAT64_PREFIX_LEN_56 && prefix_len != NAT64_PREFIX_LEN_64 && prefix_len != NAT64_PREFIX_LEN_96) { + clat_log0((LOG_DEBUG, + "NAT64 prefixlen is incorrect %d\n", prefix_len)); error = EINVAL; goto out; } if (IN6_IS_SCOPE_EMBED(prefix)) { + clat_log0((LOG_DEBUG, + "NAT64 prefix has interface/link local scope.\n")); error = EINVAL; goto out; } @@ -8270,6 +8777,9 @@ ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes) IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len; bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix, sizeof(struct in6_addr)); + clat_log0((LOG_DEBUG, + "NAT64 
prefix set to %s with prefixlen: %d\n", + ip6_sprintf(prefix), prefix_len)); one_set = 1; } @@ -8643,7 +9153,8 @@ dlil_verify_sum16(void) kprintf("DLIL: running SUM16 self-tests ... "); m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR); - MH_ALIGN(m, sizeof (uint32_t)); /* 32-bit starting alignment */ + m_align(m, sizeof(sumdata) + (sizeof (uint64_t) * 2)); + buf = mtod(m, uint8_t *); /* base address */ for (n = 0; n < SUMTBL_MAX; n++) { diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index 3f61bc94f..23719456b 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -82,18 +82,18 @@ /* * Structure of a 10Mb/s Ethernet header. */ -struct ether_header { +typedef struct ether_header { u_char ether_dhost[ETHER_ADDR_LEN]; u_char ether_shost[ETHER_ADDR_LEN]; u_short ether_type; -}; +} ether_header_t; /* * Structure of a 48-bit Ethernet address. */ -struct ether_addr { +typedef struct ether_addr { u_char octet[ETHER_ADDR_LEN]; -}; +} ether_addr_t; #define ether_addr_octet octet diff --git a/bsd/net/if.c b/bsd/net/if.c index 0fe5872ea..7bf8dc604 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -112,6 +112,7 @@ #include #include #include +#include #include #include #include @@ -502,9 +503,12 @@ if_clone_create(char *name, int len, void *params) if (unit > ifc->ifc_maxunit) return (ENXIO); + lck_mtx_lock(&ifc->ifc_mutex); err = (*ifc->ifc_create)(ifc, unit, params); - if (err != 0) + if (err != 0) { + lck_mtx_unlock(&ifc->ifc_mutex); return (err); + } if (!wildcard) { bytoff = unit >> 3; @@ -533,6 +537,7 @@ if_clone_create(char *name, int len, void *params) } } + lck_mtx_unlock(&ifc->ifc_mutex); return (0); } @@ -543,36 +548,55 @@ if_clone_create(char *name, int len, void *params) static int if_clone_destroy(const char *name) { - struct if_clone *ifc; - struct ifnet *ifp; + struct if_clone *ifc = NULL; + struct ifnet *ifp = NULL; int bytoff, bitoff; u_int32_t unit; + int error = 0; ifc = if_clone_lookup(name, &unit); - if (ifc == NULL) - return (EINVAL); - if (unit < 
ifc->ifc_minifs) - return (EINVAL); + if (ifc == NULL) { + error = EINVAL; + goto done; + } - ifp = ifunit(name); - if (ifp == NULL) - return (ENXIO); + if (unit < ifc->ifc_minifs) { + error = EINVAL; + goto done; + } - if (ifc->ifc_destroy == NULL) - return (EOPNOTSUPP); + ifp = ifunit_ref(name); + if (ifp == NULL) { + error = ENXIO; + goto done; + } + + if (ifc->ifc_destroy == NULL) { + error = EOPNOTSUPP; + goto done; + } - (*ifc->ifc_destroy)(ifp); + lck_mtx_lock(&ifc->ifc_mutex); + error = (*ifc->ifc_destroy)(ifp); - /* - * Compute offset in the bitmap and deallocate the unit. - */ + if (error) { + lck_mtx_unlock(&ifc->ifc_mutex); + goto done; + } + + /* Compute offset in the bitmap and deallocate the unit. */ bytoff = unit >> 3; bitoff = unit - (bytoff << 3); KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0, ("%s: bit is already cleared", __func__)); ifc->ifc_units[bytoff] &= ~(1 << bitoff); - return (0); + lck_mtx_unlock(&ifc->ifc_mutex); + +done: + if (ifp != NULL) + ifnet_decr_iorefcnt(ifp); + return (error); } /* @@ -617,6 +641,28 @@ if_clone_lookup(const char *name, u_int32_t *unitp) return (ifc); } +void * +if_clone_softc_allocate(const struct if_clone *ifc) +{ + void *p_clone = NULL; + + VERIFY(ifc != NULL); + + p_clone = zalloc(ifc->ifc_zone); + if (p_clone != NULL) + bzero(p_clone, ifc->ifc_softc_size); + + return (p_clone); +} + +void +if_clone_softc_deallocate(const struct if_clone *ifc, void *p_softc) +{ + VERIFY(ifc != NULL && p_softc != NULL); + bzero(p_softc, ifc->ifc_softc_size); + zfree(ifc->ifc_zone, p_softc); +} + /* * Register a network interface cloner. 
*/ @@ -643,6 +689,18 @@ if_clone_attach(struct if_clone *ifc) if (ifc->ifc_units == NULL) return (ENOBUFS); ifc->ifc_bmlen = len; + lck_mtx_init(&ifc->ifc_mutex, ifnet_lock_group, ifnet_lock_attr); + + if (ifc->ifc_softc_size != 0) { + ifc->ifc_zone = zinit(ifc->ifc_softc_size, + ifc->ifc_zone_max_elem * ifc->ifc_softc_size, 0, ifc->ifc_name); + if (ifc->ifc_zone == NULL) { + FREE(ifc->ifc_units, M_CLONE); + return (ENOBUFS); + } + zone_change(ifc->ifc_zone, Z_EXPAND, TRUE); + zone_change(ifc->ifc_zone, Z_CALLERACCT, FALSE); + } LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list); if_cloners_count++; @@ -670,6 +728,10 @@ if_clone_detach(struct if_clone *ifc) { LIST_REMOVE(ifc, ifc_list); FREE(ifc->ifc_units, M_CLONE); + if (ifc->ifc_softc_size != 0) + zdestroy(ifc->ifc_zone); + + lck_mtx_destroy(&ifc->ifc_mutex, ifnet_lock_group); if_cloners_count--; } @@ -728,6 +790,8 @@ if_functional_type(struct ifnet *ifp, bool exclude_delegate) ret = IFRTYPE_FUNCTIONAL_INTCOPROC; } else if ((exclude_delegate && (ifp->if_family == IFNET_FAMILY_ETHERNET || + ifp->if_family == IFNET_FAMILY_BOND || + ifp->if_family == IFNET_FAMILY_VLAN || ifp->if_family == IFNET_FAMILY_FIREWIRE)) || (!exclude_delegate && IFNET_IS_WIRED(ifp))) { ret = IFRTYPE_FUNCTIONAL_WIRED; @@ -2235,50 +2299,6 @@ ifioctl_iforder(u_long cmd, caddr_t data) break; } - case SIOCGIFORDER: { /* struct if_order */ - struct if_order *ifo = (struct if_order *)(void *)data; - u_int32_t ordered_count = *((volatile u_int32_t *)&if_ordered_count); - - if (ifo->ifo_count == 0 || - ordered_count == 0) { - ifo->ifo_count = 0; - } else if (ifo->ifo_ordered_indices != USER_ADDR_NULL) { - u_int32_t count_to_copy = - MIN(ordered_count, ifo->ifo_count); - size_t length = (count_to_copy * sizeof(u_int32_t)); - struct ifnet *ifp = NULL; - u_int32_t cursor = 0; - - ordered_indices = _MALLOC(length, M_NECP, M_WAITOK | M_ZERO); - if (ordered_indices == NULL) { - error = ENOMEM; - break; - } - - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, 
&ifnet_ordered_head, if_ordered_link) { - if (cursor >= count_to_copy || - cursor >= if_ordered_count) { - break; - } - ordered_indices[cursor] = ifp->if_index; - cursor++; - } - ifnet_head_done(); - - /* We might have parsed less than the original length - * because the list could have changed. - */ - length = cursor * sizeof(u_int32_t); - ifo->ifo_count = cursor; - error = copyout(ordered_indices, - ifo->ifo_ordered_indices, length); - } else { - error = EINVAL; - } - break; - } - default: { VERIFY(0); /* NOTREACHED */ @@ -2342,10 +2362,14 @@ ifioctl_nat64prefix(struct ifnet *ifp, u_long cmd, caddr_t data) switch (cmd) { case SIOCSIFNAT64PREFIX: /* struct if_nat64req */ error = ifnet_set_nat64prefix(ifp, ifnat64->ifnat64_prefixes); + if (error != 0) + ip6stat.ip6s_clat464_plat64_pfx_setfail++; break; case SIOCGIFNAT64PREFIX: /* struct if_nat64req */ error = ifnet_get_nat64prefix(ifp, ifnat64->ifnat64_prefixes); + if (error != 0) + ip6stat.ip6s_clat464_plat64_pfx_getfail++; break; default: @@ -2355,6 +2379,36 @@ ifioctl_nat64prefix(struct ifnet *ifp, u_long cmd, caddr_t data) return (error); } + +static __attribute__((noinline)) int +ifioctl_clat46addr(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct if_clat46req *ifclat46 = (struct if_clat46req *)(void *)data; + struct in6_ifaddr *ia6_clat = NULL; + int error = 0; + + VERIFY(ifp != NULL); + + switch (cmd) { + case SIOCGIFCLAT46ADDR: + ia6_clat = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46); + if (ia6_clat == NULL) { + error = ENOENT; + break; + } + + bcopy(&ia6_clat->ia_addr.sin6_addr, &ifclat46->ifclat46_addr.v6_address, + sizeof(ifclat46->ifclat46_addr.v6_address)); + ifclat46->ifclat46_addr.v6_prefixlen = ia6_clat->ia_plen; + IFA_REMREF(&ia6_clat->ia_ifa); + break; + default: + VERIFY(0); + /* NOTREACHED */ + } + + return (error); +} #endif @@ -2380,7 +2434,7 @@ ifioctl_get_protolist(struct ifnet *ifp, u_int32_t * ret_count, if (count == 0) { goto done; } - list = _MALLOC(count * sizeof(*list), M_TEMP, 
M_WAITOK); + list = _MALLOC(count * sizeof(*list), M_TEMP, M_WAITOK | M_ZERO); if (list == NULL) { error = ENOMEM; goto done; @@ -2567,7 +2621,6 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) goto done; case SIOCSIFORDER: /* struct if_order */ - case SIOCGIFORDER: /* struct if_order */ error = ifioctl_iforder(cmd, data); goto done; @@ -2652,6 +2705,8 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGQOSMARKINGENABLED: /* struct ifreq */ case SIOCSIFLOWINTERNET: /* struct ifreq */ case SIOCGIFLOWINTERNET: /* struct ifreq */ + case SIOCGIFLOWPOWER: /* struct ifreq */ + case SIOCSIFLOWPOWER: /* struct ifreq */ { /* struct ifreq */ struct ifreq ifr; bcopy(data, &ifr, sizeof (ifr)); @@ -2851,10 +2906,14 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) break; #if INET6 - case SIOCSIFNAT64PREFIX: /* struct if_nsreq */ - case SIOCGIFNAT64PREFIX: /* struct if_nsreq */ + case SIOCSIFNAT64PREFIX: /* struct if_nat64req */ + case SIOCGIFNAT64PREFIX: /* struct if_nat64req */ error = ifioctl_nat64prefix(ifp, cmd, data); break; + + case SIOCGIFCLAT46ADDR: /* struct if_clat46req */ + error = ifioctl_clat46addr(ifp, cmd, data); + break; #endif case SIOCGIFPROTOLIST32: /* struct if_protolistreq32 */ @@ -3582,6 +3641,17 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p) IFRTYPE_LOW_INTERNET_ENABLE_DL; ifnet_lock_done(ifp); break; + case SIOCGIFLOWPOWER: + ifr->ifr_low_power_mode = + !!(ifp->if_xflags & IFXF_LOW_POWER); + break; + case SIOCSIFLOWPOWER: +#if (DEVELOPMENT || DEBUG) + error = if_set_low_power(ifp, !!(ifr->ifr_low_power_mode)); +#else /* DEVELOPMENT || DEBUG */ + error = EOPNOTSUPP; +#endif /* DEVELOPMENT || DEBUG */ + break; default: VERIFY(0); /* NOTREACHED */ @@ -4603,8 +4673,15 @@ if_rtmtu(struct radix_node *rn, void *arg) * has not been locked (RTV_MTU is not set) and * if it was non-zero to begin with. 
*/ - if (!(rt->rt_rmx.rmx_locks & RTV_MTU) && rt->rt_rmx.rmx_mtu) + if (!(rt->rt_rmx.rmx_locks & RTV_MTU) && rt->rt_rmx.rmx_mtu) { rt->rt_rmx.rmx_mtu = ifp->if_mtu; + if (rt_key(rt)->sa_family == AF_INET && + INTF_ADJUST_MTU_FOR_CLAT46(ifp)) { + rt->rt_rmx.rmx_mtu = IN6_LINKMTU(ifp); + /* Further adjust the size for CLAT46 expansion */ + rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + } } RT_UNLOCK(rt); @@ -5134,9 +5211,6 @@ ifioctl_cassert(void) case SIOCGIFAGENTIDS64: case SIOCGIFAGENTDATA32: case SIOCGIFAGENTDATA64: - case SIOCGIFAGENTLIST32: - case SIOCGIFAGENTLIST64: - case SIOCSIFINTERFACESTATE: case SIOCGIFINTERFACESTATE: @@ -5150,13 +5224,37 @@ ifioctl_cassert(void) case SIOCGECNMODE: case SIOCSECNMODE: + case SIOCSIFORDER: + case SIOCSQOSMARKINGMODE: case SIOCSQOSMARKINGENABLED: case SIOCGQOSMARKINGMODE: case SIOCGQOSMARKINGENABLED: + case SIOCSIFTIMESTAMPENABLE: + case SIOCSIFTIMESTAMPDISABLE: + case SIOCGIFTIMESTAMPENABLED: + + case SIOCSIFDISABLEOUTPUT: + + case SIOCGIFAGENTLIST32: + case SIOCGIFAGENTLIST64: + + case SIOCSIFLOWINTERNET: + case SIOCGIFLOWINTERNET: + +#if INET6 + case SIOCGIFNAT64PREFIX: + case SIOCSIFNAT64PREFIX: + + case SIOCGIFCLAT46ADDR: +#endif /* INET6 */ + case SIOCGIFPROTOLIST32: case SIOCGIFPROTOLIST64: + + case SIOCGIFLOWPOWER: + case SIOCSIFLOWPOWER: ; } } diff --git a/bsd/net/if.h b/bsd/net/if.h index 2583ef5e7..1cf0a5f14 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -137,6 +137,13 @@ struct if_clonereq32 { #define IFEF_VLAN 0x00000200 /* interface has one or more vlans */ #define IFEF_BOND 0x00000400 /* interface is part of bond */ #define IFEF_ARPLL 0x00000800 /* ARP for IPv4LL addresses */ +#define IFEF_CLAT46 0x00001000 /* CLAT46 RFC 6877 */ + +#define IS_INTF_CLAT46(ifp) ((ifp) != NULL && ((ifp)->if_eflags & IFEF_CLAT46)) +#define INTF_ADJUST_MTU_FOR_CLAT46(intf) \ + (IS_INTF_CLAT46((intf)) || \ + IS_INTF_CLAT46((intf)->if_delegated.ifp)) \ + /* * XXX IFEF_NOAUTOIPV6LL is deprecated and should be done away with. * Configd pretty much manages the interface configuration. @@ -175,6 +182,7 @@ struct if_clonereq32 { #define IFXF_LOW_INTERNET_UL 0x00000010 /* Uplink Low Internet is confirmed */ #define IFXF_LOW_INTERNET_DL 0x00000020 /* Downlink Low Internet is confirmed */ #define IFXF_ALLOC_KPI 0x00000040 /* Allocated via the ifnet_alloc KPI */ +#define IFXF_LOW_POWER 0x00000080 /* Low Power Mode */ /* * Current requirements for an AWDL interface. Setting/clearing IFEF_AWDL @@ -499,7 +507,7 @@ struct ifreq { #define IFRTYPE_LOW_INTERNET_DISABLE_UL_DL 0x0000 #define IFRTYPE_LOW_INTERNET_ENABLE_UL 0x0001 #define IFRTYPE_LOW_INTERNET_ENABLE_DL 0x0002 - + int ifru_low_power_mode; #endif /* PRIVATE */ } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ @@ -549,6 +557,7 @@ struct ifreq { #define ifr_fastlane_enabled ifr_qosmarking_enabled #define ifr_disable_output ifr_ifru.ifru_disable_output #define ifr_low_internet ifr_ifru.ifru_low_internet +#define ifr_low_power_mode ifr_ifru.ifru_low_power_mode #endif /* PRIVATE */ }; @@ -928,6 +937,14 @@ struct kev_dl_rrc_state { u_int32_t rrc_state; }; +/* + * KEV_DL_LOW_POWER_MODE_CHANGED + */ +struct kev_dl_low_power_mode { + struct net_event_data link_data; + int low_power_event; +}; + /* * Length of network signature/fingerprint blob. 
*/ @@ -960,14 +977,23 @@ struct ipv6_prefix { uint32_t prefix_len; }; -/* - * Structure for SIOC[S/G]IFNAT64PREFIX - */ +struct if_ipv6_address { + struct in6_addr v6_address; + uint32_t v6_prefixlen; +}; + +/* Structure for SIOC[S/G]IFNAT64PREFIX */ struct if_nat64req { char ifnat64_name[IFNAMSIZ]; struct ipv6_prefix ifnat64_prefixes[NAT64_MAX_NUM_PREFIXES]; }; +/* Structure for SIOCGIFCLAT46ADDR */ +struct if_clat46req { + char ifclat46_name[IFNAMSIZ]; + struct if_ipv6_address ifclat46_addr; +}; + /* * Structure for SIOC[S/G]IFORDER * diff --git a/bsd/net/if_bond.c b/bsd/net/if_bond.c index 204a04c0e..8682a9fe5 100644 --- a/bsd/net/if_bond.c +++ b/bsd/net/if_bond.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2017 Apple Inc. All rights reserved. + * Copyright (c) 2004-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,7 +49,6 @@ #include #include #include - #include #include #include @@ -68,6 +67,7 @@ #include #include +#include #include #include @@ -83,9 +83,14 @@ static struct ether_addr slow_proto_multicast = { IEEE8023AD_SLOW_PROTO_MULTICAST }; +typedef struct ifbond_s ifbond, * ifbond_ref; +typedef struct bondport_s bondport, * bondport_ref; + #define BOND_MAXUNIT 128 -#define BONDNAME "bond" -#define M_BOND M_DEVBUF +#define BOND_ZONE_MAX_ELEM MIN(IFNETS_MAX, BOND_MAXUNIT) +#define BONDNAME "bond" + +#define M_BOND M_DEVBUF #define EA_FORMAT "%x:%x:%x:%x:%x:%x" #define EA_CH(e, i) ((u_char)((u_char *)(e))[(i)]) @@ -617,24 +622,26 @@ bondport_collecting(bondport_ref p) static int bond_clone_create(struct if_clone *, u_int32_t, void *); static int bond_clone_destroy(struct ifnet *); static int bond_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t m, - char *frame_header); + char *frame_header); static int bond_output(struct ifnet *ifp, struct mbuf *m); static int bond_ioctl(struct ifnet *ifp, u_long cmd, void * addr); static int bond_set_bpf_tap(struct ifnet * ifp, bpf_tap_mode mode, - bpf_packet_func func); + 
bpf_packet_func func); static int bond_attach_protocol(struct ifnet *ifp); static int bond_detach_protocol(struct ifnet *ifp); static int bond_setmulti(struct ifnet *ifp); static int bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp); static int bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp); static void bond_if_free(struct ifnet * ifp); +static void interface_link_event(struct ifnet * ifp, u_int32_t event_code); static struct if_clone bond_cloner = IF_CLONE_INITIALIZER(BONDNAME, - bond_clone_create, - bond_clone_destroy, - 0, - BOND_MAXUNIT); -static void interface_link_event(struct ifnet * ifp, u_int32_t event_code); + bond_clone_create, + bond_clone_destroy, + 0, + BOND_MAXUNIT, + BOND_ZONE_MAX_ELEM, + sizeof(ifbond)); static int siocsifmtu(struct ifnet * ifp, int mtu) @@ -699,7 +706,7 @@ ifbond_release(ifbond_ref ifb) if (ifb->ifb_distributing_array != NULL) { FREE(ifb->ifb_distributing_array, M_BOND); } - FREE(ifb, M_BOND); + if_clone_softc_deallocate(&bond_cloner, ifb); break; default: break; @@ -1092,7 +1099,7 @@ bond_clone_create(struct if_clone * ifc, u_int32_t unit, __unused void *params) return (error); } - ifb = _MALLOC(sizeof(ifbond), M_BOND, M_WAITOK | M_ZERO); + ifb = if_clone_softc_allocate(&bond_cloner); if (ifb == NULL) { return (ENOMEM); } diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c index 133d9af30..33ae35c66 100644 --- a/bsd/net/if_bridge.c +++ b/bsd/net/if_bridge.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2017 Apple Inc. All rights reserved. + * Copyright (c) 2004-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -888,7 +888,13 @@ static LIST_HEAD(, bridge_softc) bridge_list = static lck_grp_t *bridge_lock_grp = NULL; static lck_attr_t *bridge_lock_attr = NULL; -static if_clone_t bridge_cloner = NULL; +#define BRIDGENAME "bridge" +#define BRIDGES_MAX IF_MAXUNIT +#define BRIDGE_ZONE_MAX_ELEM MIN(IFNETS_MAX, BRIDGES_MAX) + +static struct if_clone bridge_cloner = + IF_CLONE_INITIALIZER(BRIDGENAME, bridge_clone_create, bridge_clone_destroy, + 0, BRIDGES_MAX, BRIDGE_ZONE_MAX_ELEM, sizeof(struct bridge_softc)); static int if_bridge_txstart = 0; SYSCTL_INT(_net_link_bridge, OID_AUTO, txstart, CTLFLAG_RW | CTLFLAG_LOCKED, @@ -1126,7 +1132,6 @@ bridgeattach(int n) #pragma unused(n) int error; lck_grp_attr_t *lck_grp_attr = NULL; - struct ifnet_clone_params ifnet_clone_params; bridge_rtnode_pool = zinit(sizeof (struct bridge_rtnode), 1024 * sizeof (struct bridge_rtnode), 0, "bridge_rtnode"); @@ -1153,11 +1158,7 @@ bridgeattach(int n) bstp_sys_init(); #endif /* BRIDGESTP */ - ifnet_clone_params.ifc_name = "bridge"; - ifnet_clone_params.ifc_create = bridge_clone_create; - ifnet_clone_params.ifc_destroy = bridge_clone_destroy; - - error = ifnet_clone_attach(&ifnet_clone_params, &bridge_cloner); + error = if_clone_attach(&bridge_cloner); if (error != 0) printf("%s: ifnet_clone_attach failed %d\n", __func__, error); @@ -1243,13 +1244,18 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) { #pragma unused(params) struct ifnet *ifp = NULL; - struct bridge_softc *sc, *sc2; + struct bridge_softc *sc = NULL; + struct bridge_softc *sc2 = NULL; struct ifnet_init_eparams init_params; errno_t error = 0; uint8_t eth_hostid[ETHER_ADDR_LEN]; int fb, retry, has_hostid; - sc = _MALLOC(sizeof (*sc), M_DEVBUF, M_WAITOK | M_ZERO); + sc = if_clone_softc_allocate(&bridge_cloner); + if (sc == NULL) { + error = ENOMEM; + goto done; + } lck_mtx_init(&sc->sc_mtx, bridge_lock_grp, bridge_lock_attr); sc->sc_brtmax = BRIDGE_RTABLE_MAX; @@ 
-1422,7 +1428,7 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params) done: if (error != 0) { printf("%s failed error %d\n", __func__, error); - /* Cleanup TBD */ + /* TBD: Clean up: sc, sc_rthash etc */ } return (error); @@ -6033,8 +6039,7 @@ bridge_detach(ifnet_t ifp) ifnet_release(ifp); lck_mtx_destroy(&sc->sc_mtx, bridge_lock_grp); - - _FREE(sc, M_DEVBUF); + if_clone_softc_deallocate(&bridge_cloner, sc); } /* diff --git a/bsd/net/if_fake.c b/bsd/net/if_fake.c index 96f8a3e43..543a0cb81 100644 --- a/bsd/net/if_fake.c +++ b/bsd/net/if_fake.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017 Apple Inc. All rights reserved. + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,7 @@ #include #include +#include #ifdef INET #include @@ -106,14 +107,24 @@ static int if_fake_debug = 0; SYSCTL_INT(_net_link_fake, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &if_fake_debug, 0, "Fake interface debug logs"); +static int if_fake_wmm_mode = 0; +SYSCTL_INT(_net_link_fake, OID_AUTO, wmm_mode, CTLFLAG_RW | CTLFLAG_LOCKED, + &if_fake_wmm_mode, 0, "Fake interface in 802.11 WMM mode"); + /** ** virtual ethernet structures, types **/ +#define IFF_NUM_TX_RINGS_WMM_MODE 4 +#define IFF_NUM_RX_RINGS_WMM_MODE 1 +#define IFF_MAX_TX_RINGS IFF_NUM_TX_RINGS_WMM_MODE +#define IFF_MAX_RX_RINGS IFF_NUM_RX_RINGS_WMM_MODE + typedef uint16_t iff_flags_t; #define IFF_FLAGS_HWCSUM 0x0001 #define IFF_FLAGS_BSD_MODE 0x0002 #define IFF_FLAGS_DETACHING 0x0004 +#define IFF_FLAGS_WMM_MODE 0x0008 struct if_fake { @@ -169,6 +180,9 @@ feth_enable_dequeue_stall(ifnet_t ifp, uint32_t enable) return (error); } + +#define FETH_MAXUNIT IF_MAXUNIT +#define FETH_ZONE_MAX_ELEM MIN(IFNETS_MAX, FETH_MAXUNIT) #define M_FAKE M_DEVBUF static int feth_clone_create(struct if_clone *, u_int32_t, void *); @@ -183,10 +197,12 @@ static void feth_free(if_fake_ref fakeif); static struct if_clone feth_cloner = IF_CLONE_INITIALIZER(FAKE_ETHER_NAME, 
- feth_clone_create, - feth_clone_destroy, - 0, - IF_MAXUNIT); + feth_clone_create, + feth_clone_destroy, + 0, + FETH_MAXUNIT, + FETH_ZONE_MAX_ELEM, + sizeof(struct if_fake)); static void interface_link_event(ifnet_t ifp, u_int32_t event_code); /* some media words to pretend to be ethernet */ @@ -280,7 +296,7 @@ feth_free(if_fake_ref fakeif) } FETH_DPRINTF("%s\n", fakeif->iff_name); - FREE(fakeif, M_FAKE); + if_clone_softc_deallocate(&feth_cloner, fakeif); } static void @@ -363,7 +379,7 @@ feth_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) ifnet_t ifp; uint8_t mac_address[ETHER_ADDR_LEN]; - fakeif = _MALLOC(sizeof(struct if_fake), M_FAKE, M_WAITOK | M_ZERO); + fakeif = if_clone_softc_allocate(&feth_cloner); if (fakeif == NULL) { return ENOBUFS; } diff --git a/bsd/net/if_gif.c b/bsd/net/if_gif.c index bb9d156d0..e0caf004b 100644 --- a/bsd/net/if_gif.c +++ b/bsd/net/if_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -113,7 +114,9 @@ #define GIFNAME "gif" #define GIFDEV "if_gif" -#define GIF_MAXUNIT 0x7fff /* ifp->if_unit is only 15 bits */ + +#define GIF_MAXUNIT IF_MAXUNIT +#define GIF_ZONE_MAX_ELEM MIN(IFNETS_MAX, GIF_MAXUNIT) /* gif lock variables */ static lck_grp_t *gif_mtx_grp; @@ -155,12 +158,15 @@ static struct ip6protosw in6_gif_protosw = }; #endif -static if_clone_t gif_cloner = NULL; +static int gif_remove(struct ifnet *); static int gif_clone_create(struct if_clone *, uint32_t, void *); static int gif_clone_destroy(struct ifnet *); static void gif_delete_tunnel(struct gif_softc *); static void gif_detach(struct ifnet *); +static struct if_clone gif_cloner = + IF_CLONE_INITIALIZER(GIFNAME, gif_clone_create, gif_clone_destroy, + 0, GIF_MAXUNIT, GIF_ZONE_MAX_ELEM, sizeof(struct gif_softc)); /* * Theory of operation: initially, one gif interface is created. * Any time a gif interface is configured, if there are no other @@ -251,8 +257,6 @@ void gif_init(void) { errno_t result; - struct ifnet_clone_params ifnet_clone_params; - struct if_clone *ifc = NULL; /* Initialize the list of interfaces */ TAILQ_INIT(&gifs); @@ -276,17 +280,11 @@ gif_init(void) printf("proto_register_plumber failed for AF_INET6 error=%d\n", result); - ifnet_clone_params.ifc_name = "gif"; - ifnet_clone_params.ifc_create = gif_clone_create; - ifnet_clone_params.ifc_destroy = gif_clone_destroy; - - result = ifnet_clone_attach(&ifnet_clone_params, &gif_cloner); + result = if_clone_attach(&gif_cloner); if (result != 0) - printf("gifattach: ifnet_clone_attach failed %d\n", result); + panic("%s: if_clone_attach() failed, error %d\n", __func__, result); - /* Create first device */ - ifc = if_clone_lookup("gif", NULL); - gif_clone_create(ifc, 0, NULL); + gif_clone_create(&gif_cloner, 0, NULL); } static errno_t @@ -310,7 +308,7 @@ gif_detach(struct ifnet *ifp) { struct gif_softc *sc = 
ifp->if_softc; lck_mtx_destroy(&sc->gif_lock, gif_mtx_grp); - _FREE(ifp->if_softc, M_DEVBUF); + if_clone_softc_deallocate(&gif_cloner, sc); ifp->if_softc = NULL; (void) ifnet_release(ifp); } @@ -330,7 +328,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) goto done; } - sc = _MALLOC(sizeof (struct gif_softc), M_DEVBUF, M_WAITOK | M_ZERO); + sc = if_clone_softc_allocate(&gif_cloner); if (sc == NULL) { log(LOG_ERR, "gif_clone_create: failed to allocate gif%d\n", unit); @@ -366,7 +364,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) error = ifnet_allocate_extended(&gif_init_params, &sc->gif_if); if (error != 0) { printf("gif_clone_create, ifnet_allocate failed - %d\n", error); - _FREE(sc, M_DEVBUF); + if_clone_softc_deallocate(&gif_cloner, sc); error = ENOBUFS; goto done; } @@ -378,7 +376,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) if (sc->encap_cookie4 == NULL) { printf("%s: unable to attach encap4\n", if_name(sc->gif_if)); ifnet_release(sc->gif_if); - FREE(sc, M_DEVBUF); + if_clone_softc_deallocate(&gif_cloner, sc); error = ENOBUFS; goto done; } @@ -393,7 +391,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) } printf("%s: unable to attach encap6\n", if_name(sc->gif_if)); ifnet_release(sc->gif_if); - FREE(sc, M_DEVBUF); + if_clone_softc_deallocate(&gif_cloner, sc); error = ENOBUFS; goto done; } @@ -405,6 +403,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) /* turn off ingress filter */ sc->gif_if.if_flags |= IFF_LINK2; #endif + sc->gif_flags |= IFGIF_DETACHING; error = ifnet_attach(sc->gif_if, NULL); if (error != 0) { printf("gif_clone_create - ifnet_attach failed - %d\n", error); @@ -417,13 +416,14 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) encap_detach(sc->encap_cookie6); sc->encap_cookie6 = NULL; } - FREE(sc, M_DEVBUF); + if_clone_softc_deallocate(&gif_cloner, 
sc); goto done; } #if CONFIG_MACF_NET mac_ifnet_label_init(&sc->gif_if); #endif bpfattach(sc->gif_if, DLT_NULL, sizeof (u_int)); + sc->gif_flags &= ~IFGIF_DETACHING; TAILQ_INSERT_TAIL(&gifs, sc, gif_link); ngif++; done: @@ -433,33 +433,63 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) } static int -gif_clone_destroy(struct ifnet *ifp) +gif_remove(struct ifnet *ifp) { -#if defined(INET) || defined(INET6) int error = 0; -#endif - struct gif_softc *sc = ifp->if_softc; + struct gif_softc *sc = NULL; lck_mtx_lock(gif_mtx); + sc = ifp->if_softc; + + if (sc == NULL) { + error = EINVAL; + goto done; + } + + GIF_LOCK(sc); + if (sc->gif_flags & IFGIF_DETACHING) { + error = EINVAL; + goto done; + } + + sc->gif_flags |= IFGIF_DETACHING; TAILQ_REMOVE(&gifs, sc, gif_link); ngif--; - GIF_LOCK(sc); gif_delete_tunnel(sc); #ifdef INET6 if (sc->encap_cookie6 != NULL) { error = encap_detach(sc->encap_cookie6); - KASSERT(error == 0, ("gif_clone_destroy: Unexpected \ - error detaching encap_cookie6")); + KASSERT(error == 0, ("gif_clone_destroy: Unexpected " + "error detaching encap_cookie6")); } #endif #ifdef INET if (sc->encap_cookie4 != NULL) { error = encap_detach(sc->encap_cookie4); - KASSERT(error == 0, ("gif_clone_destroy: Unexpected \ - error detaching encap_cookie4")); + KASSERT(error == 0, ("gif_clone_destroy: Unexpected " + "error detaching encap_cookie4")); } #endif +done: + if (sc != NULL) + GIF_UNLOCK(sc); + lck_mtx_unlock(gif_mtx); + + return (error); +} + +static int +gif_clone_destroy(struct ifnet *ifp) +{ + int error = 0; + + error = gif_remove(ifp); + if (error != 0) { + printf("gif_clone_destroy: gif remove failed %d\n", error); + return (error); + } + error = ifnet_set_flags(ifp, 0, IFF_UP); if (error != 0) { printf("gif_clone_destroy: ifnet_set_flags failed %d\n", error); @@ -469,10 +499,6 @@ gif_clone_destroy(struct ifnet *ifp) if (error != 0) panic("gif_clone_destroy: ifnet_detach(%p) failed %d\n", ifp, error); - - GIF_UNLOCK(sc); 
- lck_mtx_unlock(gif_mtx); - return (0); } diff --git a/bsd/net/if_gif.h b/bsd/net/if_gif.h index 619653bee..296cdbf21 100644 --- a/bsd/net/if_gif.h +++ b/bsd/net/if_gif.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,6 +87,7 @@ struct gif_softc { #endif } gifsc_gifscr; int gif_flags; +#define IFGIF_DETACHING 0x1 int gif_called; const struct encaptab *encap_cookie4; const struct encaptab *encap_cookie6; diff --git a/bsd/net/if_ipsec.c b/bsd/net/if_ipsec.c index 098420596..22a1441dd 100644 --- a/bsd/net/if_ipsec.c +++ b/bsd/net/if_ipsec.c @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + #include #include #include @@ -370,6 +371,7 @@ ipsec_interface_isvalid (ifnet_t interface) return 1; } +#if IPSEC_NEXUS boolean_t ipsec_interface_needs_netagent(ifnet_t interface) { @@ -387,6 +389,7 @@ ipsec_interface_needs_netagent(ifnet_t interface) return (pcb->ipsec_needs_netagent == true); } +#endif // IPSEC_NEXUS static errno_t ipsec_ifnet_set_attrs(ifnet_t ifp) @@ -2072,6 +2075,12 @@ ipsec_enable_channel(struct ipsec_pcb *pcb, struct proc *proc) struct kern_pbufpool_init pp_init; errno_t result; + kauth_cred_t cred = kauth_cred_get(); + result = priv_check_cred(cred, PRIV_SKYWALK_REGISTER_KERNEL_PIPE, 0); + if (result) { + return result; + } + result = ipsec_register_kernel_pipe_nexus(); if (result) { return result; @@ -2703,7 +2712,7 @@ ipsec_ctl_setopt(__unused kern_ctl_ref kctlref, if (result == 0) { printf("%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n", __func__, pcb->ipsec_ifp->if_xname, - del_ifp->if_xname); + del_ifp ? 
del_ifp->if_xname : "NULL"); result = ifnet_set_delegate(pcb->ipsec_ifp, del_ifp); if (del_ifp) @@ -3317,12 +3326,14 @@ ipsec_ioctl(ifnet_t interface, u_long command, void *data) { +#if IPSEC_NEXUS + struct ipsec_pcb *pcb = ifnet_softc(interface); +#endif errno_t result = 0; switch(command) { case SIOCSIFMTU: { #if IPSEC_NEXUS - struct ipsec_pcb *pcb = ifnet_softc(interface); if (pcb->ipsec_use_netif) { // Make sure we can fit packets in the channel buffers if (((uint64_t)((struct ifreq*)data)->ifr_mtu) > pcb->ipsec_slot_size) { @@ -3430,7 +3441,7 @@ ipsec_attach_proto(ifnet_t interface, errno_t ipsec_inject_inbound_packet(ifnet_t interface, - mbuf_t packet) + mbuf_t packet) { #if IPSEC_NEXUS struct ipsec_pcb *pcb = ifnet_softc(interface); diff --git a/bsd/net/if_low_power_mode.c b/bsd/net/if_low_power_mode.c new file mode 100644 index 000000000..aac91d5c7 --- /dev/null +++ b/bsd/net/if_low_power_mode.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +typedef enum { + IF_LOW_POWER_EVENT_OFF = 0, + IF_LOW_POWER_EVENT_ON = 1 +} if_low_power_ev_code_t; + +struct if_low_power_ev_args { + struct ifnet *ifp; + if_low_power_ev_code_t event_code; +}; + +struct if_low_power_ev_nwk_wq_entry { + struct nwk_wq_entry nwk_wqe; + struct if_low_power_ev_args ev_args; +}; + + +typedef void (*if_low_power_event_fn) (struct eventhandler_entry_arg, + struct ifnet *, if_low_power_ev_code_t); +EVENTHANDLER_DECLARE(if_low_power_event, if_low_power_event_fn); + +struct eventhandler_lists_ctxt if_low_power_evhdlr_ctx; + +static void if_low_power_evhdlr_callback(__unused struct eventhandler_entry_arg arg, + struct ifnet *ifp, if_low_power_ev_code_t event_code); + +#if 0 +static void if_low_power_nwk_ev_callback(void *arg); +static void if_low_power_event_enqueue_nwk_wq_entry(struct ifnet *ifp, + if_low_power_ev_code_t event_code); +#endif + +extern void shutdown_sockets_on_interface(struct ifnet *ifp); + +SYSCTL_DECL(_net_link_generic_system); +SYSCTL_NODE(_net_link_generic_system, OID_AUTO, low_power, + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "low power mode"); + +int if_low_power_verbose = 0; +int if_low_power_restricted = 1; + +#if (DEVELOPMENT || DEBUG) +SYSCTL_INT(_net_link_generic_system_low_power, OID_AUTO, verbose, + CTLFLAG_RW | CTLFLAG_LOCKED, + &if_low_power_verbose, 0, ""); +SYSCTL_INT(_net_link_generic_system_low_power, 
OID_AUTO, restricted, + CTLFLAG_RW | CTLFLAG_LOCKED, + &if_low_power_restricted, 0, ""); +#endif /* (DEVELOPMENT || DEBUG) */ + + +static void +if_low_power_evhdlr_callback(__unused struct eventhandler_entry_arg arg, + struct ifnet *ifp, if_low_power_ev_code_t event_code) +{ + struct kev_dl_low_power_mode kev; + + if (!IF_FULLY_ATTACHED(ifp)) + return; + + if (if_low_power_verbose > 0) { + os_log_info(OS_LOG_DEFAULT, + "%s: ifp %s event_code %d", __func__, + if_name(ifp), event_code); + } + + ifnet_lock_exclusive(ifp); + if (event_code == IF_LOW_POWER_EVENT_OFF) { + ifp->if_xflags &= ~IFXF_LOW_POWER; + } else { + ifp->if_xflags |= IFXF_LOW_POWER; + } + ifnet_lock_done(ifp); + + if (event_code == IF_LOW_POWER_EVENT_ON) { + atomic_add_32(&ifp->if_low_power_gencnt, 1); + + if (if_low_power_restricted != 0) { + shutdown_sockets_on_interface(ifp); + intf_event_enqueue_nwk_wq_entry(ifp, NULL, + INTF_EVENT_CODE_LOW_POWER_UPDATE); + } + } + + bzero(&kev, sizeof(struct kev_dl_low_power_mode)); + kev.low_power_event = event_code; + dlil_post_msg(ifp, + KEV_DL_SUBCLASS, + KEV_DL_LOW_POWER_MODE_CHANGED, + (struct net_event_data *)&kev, + sizeof(struct kev_dl_low_power_mode)); +} + +void +if_low_power_evhdlr_init(void) +{ + eventhandler_lists_ctxt_init(&if_low_power_evhdlr_ctx); + + (void) EVENTHANDLER_REGISTER(&if_low_power_evhdlr_ctx, + if_low_power_event, + if_low_power_evhdlr_callback, + eventhandler_entry_dummy_arg, + EVENTHANDLER_PRI_ANY); +} + +#if 0 +static void +if_low_power_nwk_ev_callback(void *arg) +{ + struct if_low_power_ev_args *if_low_power_ev_args = + (struct if_low_power_ev_args *)arg; + + EVENTHANDLER_INVOKE(&if_low_power_evhdlr_ctx, + if_low_power_event, + if_low_power_ev_args->ifp, + if_low_power_ev_args->event_code); +} + +static void +if_low_power_event_enqueue_nwk_wq_entry(struct ifnet *ifp, + if_low_power_ev_code_t event_code) +{ + struct if_low_power_ev_nwk_wq_entry *event_nwk_wq_entry = NULL; + + MALLOC(event_nwk_wq_entry, struct 
if_low_power_ev_nwk_wq_entry *, + sizeof(struct if_low_power_ev_nwk_wq_entry), + M_NWKWQ, M_WAITOK | M_ZERO); + + event_nwk_wq_entry->ev_args.ifp = ifp; + event_nwk_wq_entry->ev_args.event_code = event_code; + + event_nwk_wq_entry->nwk_wqe.func = if_low_power_nwk_ev_callback; + event_nwk_wq_entry->nwk_wqe.is_arg_managed = TRUE; + event_nwk_wq_entry->nwk_wqe.arg = &event_nwk_wq_entry->ev_args; + + nwk_wq_enqueue((struct nwk_wq_entry*)event_nwk_wq_entry); +} +#endif + +int +if_set_low_power(ifnet_t ifp, bool on) +{ + int error = 0; + + if (ifp == NULL) + return (EINVAL); + + os_log(OS_LOG_DEFAULT, + "%s: ifp %s low_power mode %d", __func__, if_name(ifp), on); + + ifnet_lock_exclusive(ifp); + ifp->if_xflags = on ? (ifp->if_xflags | IFXF_LOW_POWER) : + (ifp->if_xflags & ~IFXF_LOW_POWER); + ifnet_lock_done(ifp); + + return (error); +} + diff --git a/bsd/net/if_pflog.c b/bsd/net/if_pflog.c index 1c9113d70..cbed433ab 100644 --- a/bsd/net/if_pflog.c +++ b/bsd/net/if_pflog.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,6 +70,8 @@ #include #include +#include + #include #include #include @@ -102,6 +104,7 @@ #define DPRINTF(x) #endif +static int pflog_remove(struct ifnet *); static int pflog_clone_create(struct if_clone *, u_int32_t, void *); static int pflog_clone_destroy(struct ifnet *); static errno_t pflogoutput(struct ifnet *, struct mbuf *); @@ -116,7 +119,7 @@ static void pflogfree(struct ifnet *); static LIST_HEAD(, pflog_softc) pflogif_list; static struct if_clone pflog_cloner = IF_CLONE_INITIALIZER(PFLOGNAME, pflog_clone_create, pflog_clone_destroy, - 0, (PFLOGIFS_MAX - 1)); + 0, (PFLOGIFS_MAX - 1), PFLOGIF_ZONE_MAX_ELEM, sizeof(struct pflog_softc)); struct ifnet *pflogifs[PFLOGIFS_MAX]; /* for fast access */ @@ -146,8 +149,7 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) /* NOTREACHED */ } - if ((pflogif = _MALLOC(sizeof (*pflogif), - M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) { + if ((pflogif = if_clone_softc_allocate(&pflog_cloner)) == NULL) { error = ENOMEM; goto done; } @@ -170,11 +172,12 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) bzero(pflogif, sizeof (*pflogif)); pflogif->sc_unit = unit; + pflogif->sc_flags |= IFPFLF_DETACHING; error = ifnet_allocate_extended(&pf_init, &pflogif->sc_if); if (error != 0) { printf("%s: ifnet_allocate failed - %d\n", __func__, error); - _FREE(pflogif, M_DEVBUF); + if_clone_softc_deallocate(&pflog_cloner, pflogif); goto done; } @@ -185,7 +188,7 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) if (error != 0) { printf("%s: ifnet_attach failed - %d\n", __func__, error); ifnet_release(pflogif->sc_if); - _FREE(pflogif, M_DEVBUF); + if_clone_softc_deallocate(&pflog_cloner, pflogif); goto done; } @@ -197,6 +200,7 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) lck_mtx_lock(pf_lock); LIST_INSERT_HEAD(&pflogif_list, pflogif, sc_list); pflogifs[unit] = 
pflogif->sc_if; + pflogif->sc_flags &= ~IFPFLF_DETACHING; lck_mtx_unlock(pf_lock); lck_rw_done(pf_perim_lock); @@ -205,21 +209,40 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) } static int -pflog_clone_destroy(struct ifnet *ifp) +pflog_remove(struct ifnet *ifp) { - struct pflog_softc *pflogif = ifp->if_softc; + int error = 0; + struct pflog_softc *pflogif = NULL; lck_rw_lock_shared(pf_perim_lock); lck_mtx_lock(pf_lock); - pflogifs[pflogif->sc_unit] = NULL; + pflogif = ifp->if_softc; + + if (pflogif == NULL || + (pflogif->sc_flags & IFPFLF_DETACHING) != 0) { + error = EINVAL; + goto done; + } + + pflogif->sc_flags |= IFPFLF_DETACHING; LIST_REMOVE(pflogif, sc_list); +done: lck_mtx_unlock(pf_lock); lck_rw_done(pf_perim_lock); + return error; +} - /* bpfdetach() is taken care of as part of interface detach */ - (void) ifnet_detach(ifp); +static int +pflog_clone_destroy(struct ifnet *ifp) +{ + int error = 0; - return 0; + if ((error = pflog_remove(ifp)) != 0) + goto done; + /* bpfdetach() is taken care of as part of interface detach */ + (void)ifnet_detach(ifp); +done: + return (error); } static errno_t @@ -278,7 +301,7 @@ pflogdelproto(struct ifnet *ifp, protocol_family_t pf) static void pflogfree(struct ifnet *ifp) { - _FREE(ifp->if_softc, M_DEVBUF); + if_clone_softc_deallocate(&pflog_cloner, ifp->if_softc); ifp->if_softc = NULL; (void) ifnet_release(ifp); } diff --git a/bsd/net/if_pflog.h b/bsd/net/if_pflog.h index 77f6fa5df..1ebfb6bb6 100644 --- a/bsd/net/if_pflog.h +++ b/bsd/net/if_pflog.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -62,11 +62,14 @@ extern "C" { #endif -#define PFLOGIFS_MAX 16 +#define PFLOGIFS_MAX 16 +#define PFLOGIF_ZONE_MAX_ELEM MIN(IFNETS_MAX, PFLOGIFS_MAX) #if KERNEL_PRIVATE struct pflog_softc { struct ifnet *sc_if; /* back ptr to interface */ + u_int32_t sc_flags; +#define IFPFLF_DETACHING 0x1 int sc_unit; LIST_ENTRY(pflog_softc) sc_list; }; diff --git a/bsd/net/if_ports_used.c b/bsd/net/if_ports_used.c index f5f7f9c11..643df3b2d 100644 --- a/bsd/net/if_ports_used.c +++ b/bsd/net/if_ports_used.c @@ -34,13 +34,13 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -461,13 +461,13 @@ sysctl_wakeuuid_not_set_last_time SYSCTL_HANDLER_ARGS #pragma unused(oidp, arg1, arg2) if (proc_is64bit(req->p)) { - struct user64_timeval tv; + struct user64_timeval tv = {}; tv.tv_sec = wakeuuid_not_set_last_time.tv_sec; tv.tv_usec = wakeuuid_not_set_last_time.tv_usec; return SYSCTL_OUT(req, &tv, sizeof(tv)); } else { - struct user32_timeval tv; + struct user32_timeval tv = {}; tv.tv_sec = wakeuuid_not_set_last_time.tv_sec; tv.tv_usec = wakeuuid_not_set_last_time.tv_usec; diff --git a/bsd/net/if_stf.c b/bsd/net/if_stf.c index c4fce0074..2d5f2e90b 100644 --- a/bsd/net/if_stf.c +++ b/bsd/net/if_stf.c @@ -317,7 +317,7 @@ stfattach(void) stfinit(); error = proto_register_plumber(PF_INET6, APPLE_IF_FAM_STF, - stf_attach_inet6, NULL); + stf_attach_inet6, NULL); if (error != 0) printf("proto_register_plumber failed for AF_INET6 error=%d\n", error); diff --git a/bsd/net/if_utun.c b/bsd/net/if_utun.c index 225d9c21c..b0e711694 100644 --- a/bsd/net/if_utun.c +++ b/bsd/net/if_utun.c @@ -1285,6 +1285,12 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc) struct kern_pbufpool_init pp_init; errno_t result; + kauth_cred_t cred = kauth_cred_get(); + result = priv_check_cred(cred, PRIV_SKYWALK_REGISTER_KERNEL_PIPE, 0); + if (result) { + return result; + } + result = 
utun_register_kernel_pipe_nexus(); if (result) { return result; @@ -2488,6 +2494,12 @@ utun_demux(__unused ifnet_t interface, __unused char *frame_header, protocol_family_t *protocol) { +#if UTUN_NEXUS + struct utun_pcb *pcb = ifnet_softc(interface); + struct ip *ip; + u_int ip_version; +#endif + while (data != NULL && mbuf_len(data) < 1) { data = mbuf_next(data); } @@ -2497,10 +2509,6 @@ utun_demux(__unused ifnet_t interface, } #if UTUN_NEXUS - struct utun_pcb *pcb = ifnet_softc(interface); - struct ip *ip; - u_int ip_version; - if (pcb->utun_use_netif) { ip = mtod(data, struct ip *); ip_version = ip->ip_v; @@ -2538,14 +2546,14 @@ utun_framer(ifnet_t interface, VERIFY(interface == pcb->utun_ifp); u_int32_t header_length = UTUN_HEADER_SIZE(pcb); - if (mbuf_prepend(packet, header_length, MBUF_DONTWAIT) != 0) { + if (mbuf_prepend(packet, header_length, MBUF_DONTWAIT) != 0) { printf("utun_framer - ifnet_output prepend failed\n"); ifnet_stat_increment_out(interface, 0, 0, 1); // just return, because the buffer was freed in mbuf_prepend - return EJUSTRETURN; - } + return EJUSTRETURN; + } if (prepend_len != NULL) { *prepend_len = header_length; } @@ -2553,8 +2561,8 @@ utun_framer(ifnet_t interface, *postpend_len = 0; } - // place protocol number at the beginning of the mbuf - *(protocol_family_t *)mbuf_data(*packet) = *(protocol_family_t *)(uintptr_t)(size_t)frame_type; + // place protocol number at the beginning of the mbuf + *(protocol_family_t *)mbuf_data(*packet) = *(protocol_family_t *)(uintptr_t)(size_t)frame_type; return 0; @@ -2590,12 +2598,14 @@ utun_ioctl(ifnet_t interface, u_long command, void *data) { +#if UTUN_NEXUS + struct utun_pcb *pcb = ifnet_softc(interface); +#endif errno_t result = 0; - + switch(command) { case SIOCSIFMTU: { #if UTUN_NEXUS - struct utun_pcb *pcb = ifnet_softc(interface); if (pcb->utun_use_netif) { // Make sure we can fit packets in the channel buffers // Allow for the headroom in the slot diff --git a/bsd/net/if_var.h 
b/bsd/net/if_var.h index 379b792ad..1c69489b5 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -75,7 +75,7 @@ #ifdef PRIVATE #include #endif -#ifdef BSD_KERN_PRIVATE +#ifdef BSD_KERNEL_PRIVATE #include #endif @@ -649,6 +649,8 @@ struct ifqueue { }; #ifdef BSD_KERNEL_PRIVATE +#define IFNETS_MAX 64 + /* * Internal storage of if_data. This is bound to change. Various places in the * stack will translate this data structure in to the externally visible @@ -996,6 +998,8 @@ struct ifnet { uuid_t *if_agentids; /* network agents attached to interface */ u_int32_t if_agentcount; + volatile uint32_t if_low_power_gencnt; + u_int32_t if_generation; /* generation to use with NECP clients */ u_int32_t if_fg_sendts; /* last send on a fg socket in seconds */ @@ -1036,6 +1040,7 @@ typedef enum { INTF_EVENT_CODE_IPADDR_DETACHED, INTF_EVENT_CODE_LLADDR_UPDATE, INTF_EVENT_CODE_MTU_CHANGED, + INTF_EVENT_CODE_LOW_POWER_UPDATE, } intf_event_code_t; typedef void (*ifnet_event_fn)(struct eventhandler_entry_arg, struct ifnet *, struct sockaddr *, intf_event_code_t); @@ -1072,20 +1077,34 @@ EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); */ struct if_clone { LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */ + decl_lck_mtx_data(, ifc_mutex); /* To serialize clone create/delete */ const char *ifc_name; /* name of device, e.g. 
`vlan' */ size_t ifc_namelen; /* length of name */ u_int32_t ifc_minifs; /* minimum number of interfaces */ u_int32_t ifc_maxunit; /* maximum unit number */ unsigned char *ifc_units; /* bitmap to handle units */ u_int32_t ifc_bmlen; /* bitmap length */ - + u_int32_t ifc_zone_max_elem; /* Max elements for this zone type */ + u_int32_t ifc_softc_size; /* size of softc for the device */ + struct zone *ifc_zone; /* if_clone allocation zone */ int (*ifc_create)(struct if_clone *, u_int32_t, void *); int (*ifc_destroy)(struct ifnet *); }; -#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit) { \ - { NULL, NULL }, name, (sizeof (name) - 1), minifs, maxunit, NULL, 0, \ - create, destroy \ +#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit, zone_max_elem, softc_size) { \ + .ifc_list = { NULL, NULL }, \ + .ifc_mutex = {}, \ + .ifc_name = name, \ + .ifc_namelen = (sizeof (name) - 1), \ + .ifc_minifs = minifs, \ + .ifc_maxunit = maxunit, \ + .ifc_units = NULL, \ + .ifc_bmlen = 0, \ + .ifc_zone_max_elem = zone_max_elem, \ + .ifc_softc_size = softc_size, \ + .ifc_zone = NULL, \ + .ifc_create = create, \ + .ifc_destroy = destroy \ } #define M_CLONE M_IFADDR @@ -1381,6 +1400,12 @@ struct ifmultiaddr { ((_ifp)->if_eflags & IFEF_EXPENSIVE || \ (_ifp)->if_delegated.expensive) +#define IFNET_IS_LOW_POWER(_ifp) \ + (if_low_power_restricted != 0 && \ + ((_ifp)->if_xflags & IFXF_LOW_POWER) || \ + ((_ifp)->if_delegated.ifp != NULL && \ + ((_ifp)->if_delegated.ifp->if_xflags & IFXF_LOW_POWER))) + /* * We don't support AWDL interface delegation. 
*/ @@ -1430,7 +1455,8 @@ extern void if_qflush_sc(struct ifnet *, mbuf_svc_class_t, u_int32_t, extern struct if_clone *if_clone_lookup(const char *, u_int32_t *); extern int if_clone_attach(struct if_clone *); extern void if_clone_detach(struct if_clone *); - +extern void *if_clone_softc_allocate(const struct if_clone *); +extern void if_clone_softc_deallocate(const struct if_clone *, void *); extern u_int32_t if_functional_type(struct ifnet *, bool); extern errno_t if_mcasts_update(struct ifnet *); @@ -1794,6 +1820,12 @@ __private_extern__ void ifnet_enqueue_multi_setup(struct ifnet *, uint16_t, uint16_t); __private_extern__ errno_t ifnet_enqueue_mbuf(struct ifnet *, struct mbuf *, boolean_t, boolean_t *); + +extern int if_low_power_verbose; +extern int if_low_power_restricted; +extern void if_low_power_evhdlr_init(void); +extern int if_set_low_power(struct ifnet *, bool); + #endif /* BSD_KERNEL_PRIVATE */ #ifdef XNU_KERNEL_PRIVATE /* for uuid.c */ diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index 5c3535e68..8d26fab65 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2017 Apple Inc. All rights reserved. + * Copyright (c) 2003-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,6 +97,7 @@ #include #include +#include #ifdef INET #include @@ -369,6 +370,8 @@ SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IEEE 802 SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "for consistency"); #endif +#define VLAN_UNITMAX IF_MAXUNIT +#define VLAN_ZONE_MAX_ELEM MIN(IFNETS_MAX, VLAN_UNITMAX) #define M_VLAN M_DEVBUF static int vlan_clone_create(struct if_clone *, u_int32_t, void *); @@ -386,10 +389,12 @@ static void vlan_if_free(struct ifnet * ifp); static int vlan_remove(ifvlan_ref ifv, int need_to_wait); static struct if_clone vlan_cloner = IF_CLONE_INITIALIZER(VLANNAME, - vlan_clone_create, - vlan_clone_destroy, - 0, - IF_MAXUNIT); + vlan_clone_create, + vlan_clone_destroy, + 0, + VLAN_UNITMAX, + VLAN_ZONE_MAX_ELEM, + sizeof(struct ifvlan)); static void interface_link_event(struct ifnet * ifp, u_int32_t event_code); static void vlan_parent_link_event(struct ifnet * p, u_int32_t event_code); @@ -429,7 +434,7 @@ ifvlan_release(ifvlan_ref ifv) printf("ifvlan_release(%s)\n", ifv->ifv_name); } ifv->ifv_signature = 0; - FREE(ifv, M_VLAN); + if_clone_softc_deallocate(&vlan_cloner, ifv); break; default: break; @@ -937,7 +942,7 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) if (error != 0) { return (error); } - ifv = _MALLOC(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); + ifv = if_clone_softc_allocate(&vlan_cloner); if (ifv == NULL) return ENOBUFS; ifv->ifv_retain_count = 1; diff --git a/bsd/net/iptap.c b/bsd/net/iptap.c index a4c2cabdb..06fed0e3a 100644 --- a/bsd/net/iptap.c +++ b/bsd/net/iptap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2017 Apple Inc. All rights reserved. + * Copyright (c) 1999-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -27,6 +27,7 @@ */ #include +#include #include #include @@ -107,12 +108,17 @@ static ipfilter_t iptap_ipf4, iptap_ipf6; void iptap_bpf_tap(struct mbuf *m, u_int32_t proto, int outgoing); +#define IPTAP_MAXUNIT IF_MAXUNIT +#define IPTAP_ZONE_MAX_ELEM MIN(IFNETS_MAX, IPTAP_MAXUNIT) + static struct if_clone iptap_cloner = IF_CLONE_INITIALIZER(IPTAP_IFNAME, iptap_clone_create, iptap_clone_destroy, 0, - IF_MAXUNIT); + IPTAP_MAXUNIT, + IPTAP_ZONE_MAX_ELEM, + sizeof(struct iptap_softc)); SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, iptap, CTLFLAG_RW|CTLFLAG_LOCKED, 0, @@ -189,7 +195,7 @@ iptap_clone_create(struct if_clone *ifc, u_int32_t unit, void *params) struct iptap_softc *iptap = NULL; struct ifnet_init_eparams if_init; - iptap = _MALLOC(sizeof(struct iptap_softc), M_DEVBUF, M_WAITOK | M_ZERO); + iptap = if_clone_softc_allocate(&iptap_cloner); if (iptap == NULL) { printf("%s: _MALLOC failed\n", __func__); error = ENOMEM; @@ -253,7 +259,7 @@ iptap_clone_create(struct if_clone *ifc, u_int32_t unit, void *params) done: if (error != 0) { if (iptap != NULL) - _FREE(iptap, M_DEVBUF); + if_clone_softc_deallocate(&iptap_cloner, iptap); } return (error); } @@ -445,7 +451,7 @@ iptap_ioctl(ifnet_t ifp, unsigned long cmd, void *data) __private_extern__ void iptap_detach(ifnet_t ifp) { - struct iptap_softc *iptap; + struct iptap_softc *iptap = NULL; iptap_lock_exclusive(); @@ -460,8 +466,7 @@ iptap_detach(ifnet_t ifp) /* Drop reference as it's no more on the global list */ ifnet_release(ifp); - - _FREE(iptap, M_DEVBUF); + if_clone_softc_deallocate(&iptap_cloner, iptap); /* This is for the reference taken by ifnet_attach() */ (void) ifnet_release(ifp); diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index 2109e94cb..b0c1e3531 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -2861,6 +2861,10 @@ ifnet_get_keepalive_offload_frames(ifnet_t ifp, if (frames_array_count == 0) return (0); + /* 
Keep-alive offload not required for CLAT interface */ + if (IS_INTF_CLAT46(ifp)) + return (0); + for (i = 0; i < frames_array_count; i++) { struct ifnet_keepalive_offload_frame *frame = frames_array + i; @@ -3128,3 +3132,24 @@ ifnet_normalise_unsent_data(void) } ifnet_head_done(); } + +errno_t +ifnet_set_low_power_mode(ifnet_t ifp, boolean_t on) +{ + errno_t error; + + error = if_set_low_power(ifp, on); + + return (error); +} + +errno_t +ifnet_get_low_power_mode(ifnet_t ifp, boolean_t *on) +{ + if (ifp == NULL || on == NULL) + return (EINVAL); + + *on = !!(ifp->if_xflags & IFXF_LOW_POWER); + + return (0); +} diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index 0dd25f44c..ba4736d59 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2017 Apple Inc. All rights reserved. + * Copyright (c) 2004-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -3133,7 +3133,7 @@ typedef errno_t (*ifnet_clone_destroy_func)(ifnet_t interface); @field ifc_destroy The function to destroy an interface. */ struct ifnet_clone_params { - const char *ifc_name; + const char *ifc_name; ifnet_clone_create_func ifc_create; ifnet_clone_destroy_func ifc_destroy; }; @@ -3556,6 +3556,27 @@ extern errno_t ifnet_get_buffer_status(const ifnet_t interface, */ extern void ifnet_normalise_unsent_data(void); +/*************************************************************************/ +/* Low Power Mode */ +/*************************************************************************/ + +/*! + @function ifnet_set_low_power_mode + @param interface The interface. + @param on Set the truth value that the interface is in low power mode. + @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_set_low_power_mode(ifnet_t interface, boolean_t on); + +/*! + @function ifnet_get_low_power_mode + @param interface The interface. 
+ @param on On output contains the truth value that the interface + is in low power mode. + @result Returns 0 on success, error number otherwise. + */ +extern errno_t ifnet_get_low_power_mode(ifnet_t interface, boolean_t *on); + /*! @function ifnet_touch_lastupdown @discussion Updates the lastupdown value to now. diff --git a/bsd/net/kpi_protocol.c b/bsd/net/kpi_protocol.c index c6314269d..6265a4b4b 100644 --- a/bsd/net/kpi_protocol.c +++ b/bsd/net/kpi_protocol.c @@ -266,6 +266,9 @@ proto_input(protocol_family_t protocol, mbuf_t packet_list) break; } + if (entry == NULL) + return (-1); + if (entry->domain && !(entry->domain->dom_flags & DOM_REENTRANT)) { lck_mtx_lock(entry->domain->dom_mtx); locked = 1; diff --git a/bsd/net/nat464_utils.c b/bsd/net/nat464_utils.c new file mode 100644 index 000000000..495ac6cfd --- /dev/null +++ b/bsd/net/nat464_utils.c @@ -0,0 +1,1212 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002 - 2013 Henning Brauer + * NAT64 - Copyright (c) 2010 Viagenie Inc. (http://www.viagenie.ca) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + */ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int clat_debug = 0; + +os_log_t nat_log_handle; + +static void +nat464_addr_cksum_fixup(uint16_t *, struct nat464_addr *, struct nat464_addr *, + protocol_family_t, protocol_family_t, uint8_t, boolean_t); + +/* Synthesize ipv6 from ipv4 */ +int +nat464_synthesize_ipv6(ifnet_t ifp, const struct in_addr *addrv4, struct in6_addr *addr) +{ + static const struct in6_addr well_known_prefix = { + .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}, + }; + + struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES]; + int error = 0, i = 0; + /* Below call is not optimized as it creates a copy of prefixes */ + if ((error = ifnet_get_nat64prefix(ifp, nat64prefixes)) != 0) + return (error); + + for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) { + if (nat64prefixes[i].prefix_len != 0) + break; + } + + VERIFY (i < NAT64_MAX_NUM_PREFIXES); + + struct in6_addr prefix = nat64prefixes[i].ipv6_prefix; + int prefix_len = 
nat64prefixes[i].prefix_len; + + char *ptrv4 = __DECONST(char *, addrv4); + char *ptr = __DECONST(char *, addr); + + if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network + IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback + IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local + IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite + IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast + IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast + INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast + return (-1); + } + + /* Check for the well-known prefix */ + if (prefix_len == NAT64_PREFIX_LEN_96 && + IN6_ARE_ADDR_EQUAL(&prefix, &well_known_prefix)) { // https://tools.ietf.org/html/rfc6052#section-3.1 + if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use + IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) // 100.64.0.0/10 Shared Address Space + return (-1); + } + + memcpy(ptr, (char *)&prefix, prefix_len); + + switch (prefix_len) { + case NAT64_PREFIX_LEN_96: + memcpy(ptr + 12, ptrv4, 4); + break; + case NAT64_PREFIX_LEN_64: + memcpy(ptr + 9, ptrv4, 4); + break; + case NAT64_PREFIX_LEN_56: + memcpy(ptr + 7, ptrv4, 1); + memcpy(ptr + 9, ptrv4 + 1, 3); + break; + case NAT64_PREFIX_LEN_48: + memcpy(ptr + 6, ptrv4, 2); + memcpy(ptr + 9, ptrv4 + 2, 2); + break; + case NAT64_PREFIX_LEN_40: + memcpy(ptr + 5, ptrv4, 3); + memcpy(ptr + 9, ptrv4 + 3, 1); + break; + case NAT64_PREFIX_LEN_32: + memcpy(ptr + 4, ptrv4, 4); + break; + default: + panic("NAT64-prefix len is wrong: %u\n", prefix_len); + } + + if (clat_debug) { + char buf[MAX_IPv6_STR_LEN]; + clat_log2((LOG_DEBUG, "%s synthesized %s\n", __func__, + inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)))); + } + + return (error); +} + +/* Synthesize ipv4 from ipv6 */ +int +nat464_synthesize_ipv4(ifnet_t ifp, const struct in6_addr *addr, struct in_addr 
*addrv4) +{ + struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES]; + int error = 0, i = 0; + + /* Below call is not optimized as it creates a copy of prefixes */ + if ((error = ifnet_get_nat64prefix(ifp, nat64prefixes)) != 0) + return error; + + for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) { + if (nat64prefixes[i].prefix_len != 0) + break; + } + + VERIFY (i < NAT64_MAX_NUM_PREFIXES); + + struct in6_addr prefix = nat64prefixes[i].ipv6_prefix; + int prefix_len = nat64prefixes[i].prefix_len; + + char *ptrv4 = __DECONST(void *, addrv4); + char *ptr = __DECONST(void *, addr); + + if (memcmp(addr, &prefix, prefix_len) != 0) + return (-1); + + switch (prefix_len) { + case NAT64_PREFIX_LEN_96: + memcpy(ptrv4, ptr + 12, 4); + break; + case NAT64_PREFIX_LEN_64: + memcpy(ptrv4, ptr + 9, 4); + break; + case NAT64_PREFIX_LEN_56: + memcpy(ptrv4, ptr + 7, 1); + memcpy(ptrv4 + 1, ptr + 9, 3); + break; + case NAT64_PREFIX_LEN_48: + memcpy(ptrv4, ptr + 6, 2); + memcpy(ptrv4 + 2, ptr + 9, 2); + break; + case NAT64_PREFIX_LEN_40: + memcpy(ptrv4, ptr + 5, 3); + memcpy(ptrv4 + 3, ptr + 9, 1); + break; + case NAT64_PREFIX_LEN_32: + memcpy(ptrv4, ptr + 4, 4); + break; + default: + panic("NAT64-prefix len is wrong: %u\n", + prefix_len); + } + + if(clat_debug) { + char buf[MAX_IPv4_STR_LEN]; + clat_log2((LOG_DEBUG, "%s desynthesized to %s\n", __func__, + inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)))); + } + return (error); +} + +#define PTR_IP(field) ((int32_t)offsetof(struct ip, field)) +#define PTR_IP6(field) ((int32_t)offsetof(struct ip6_hdr, field)) + +/* + Translate the ICMP header +*/ +int +nat464_translate_icmp(int naf, void *arg) +{ + struct icmp *icmp4; + struct icmp6_hdr *icmp6; + uint32_t mtu; + int32_t ptr = -1; + uint8_t type; + uint8_t code; + + switch (naf) { + case AF_INET: + icmp6 = arg; + type = icmp6->icmp6_type; + code = icmp6->icmp6_code; + mtu = ntohl(icmp6->icmp6_mtu); + + switch (type) { + case ICMP6_ECHO_REQUEST: + type = ICMP_ECHO; + break; + case 
ICMP6_ECHO_REPLY: + type = ICMP_ECHOREPLY; + break; + case ICMP6_DST_UNREACH: + type = ICMP_UNREACH; + switch (code) { + case ICMP6_DST_UNREACH_NOROUTE: + case ICMP6_DST_UNREACH_BEYONDSCOPE: + case ICMP6_DST_UNREACH_ADDR: + code = ICMP_UNREACH_HOST; + break; + case ICMP6_DST_UNREACH_ADMIN: + code = ICMP_UNREACH_HOST_PROHIB; + break; + case ICMP6_DST_UNREACH_NOPORT: + code = ICMP_UNREACH_PORT; + break; + default: + return (-1); + } + break; + case ICMP6_PACKET_TOO_BIG: + type = ICMP_UNREACH; + code = ICMP_UNREACH_NEEDFRAG; + mtu -= 20; + break; + case ICMP6_TIME_EXCEEDED: + type = ICMP_TIMXCEED; + break; + case ICMP6_PARAM_PROB: + switch (code) { + case ICMP6_PARAMPROB_HEADER: + type = ICMP_PARAMPROB; + code = ICMP_PARAMPROB_ERRATPTR; + ptr = ntohl(icmp6->icmp6_pptr); + + if (ptr == PTR_IP6(ip6_vfc)) + ; /* preserve */ + else if (ptr == PTR_IP6(ip6_vfc) + 1) + ptr = PTR_IP(ip_tos); + else if (ptr == PTR_IP6(ip6_plen) || + ptr == PTR_IP6(ip6_plen) + 1) + ptr = PTR_IP(ip_len); + else if (ptr == PTR_IP6(ip6_nxt)) + ptr = PTR_IP(ip_p); + else if (ptr == PTR_IP6(ip6_hlim)) + ptr = PTR_IP(ip_ttl); + else if (ptr >= PTR_IP6(ip6_src) && + ptr < PTR_IP6(ip6_dst)) + ptr = PTR_IP(ip_src); + else if (ptr >= PTR_IP6(ip6_dst) && + ptr < (int32_t)sizeof(struct ip6_hdr)) + ptr = PTR_IP(ip_dst); + else { + return (-1); + } + break; + case ICMP6_PARAMPROB_NEXTHEADER: + type = ICMP_UNREACH; + code = ICMP_UNREACH_PROTOCOL; + break; + default: + return (-1); + } + break; + default: + return (-1); + } + icmp6->icmp6_type = type; + icmp6->icmp6_code = code; + /* aligns well with a icmpv4 nextmtu */ + icmp6->icmp6_mtu = htonl(mtu); + /* icmpv4 pptr is a one most significant byte */ + if (ptr >= 0) + icmp6->icmp6_pptr = htonl(ptr << 24); + break; + + case AF_INET6: + icmp4 = arg; + type = icmp4->icmp_type; + code = icmp4->icmp_code; + mtu = ntohs(icmp4->icmp_nextmtu); + + switch (type) { + case ICMP_ECHO: + type = ICMP6_ECHO_REQUEST; + break; + case ICMP_ECHOREPLY: + type = 
ICMP6_ECHO_REPLY; + break; + case ICMP_UNREACH: + type = ICMP6_DST_UNREACH; + switch (code) { + case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_ISOLATED: + case ICMP_UNREACH_TOSNET: + case ICMP_UNREACH_TOSHOST: + code = ICMP6_DST_UNREACH_NOROUTE; + break; + case ICMP_UNREACH_PORT: + code = ICMP6_DST_UNREACH_NOPORT; + break; + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_FILTER_PROHIB: + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + code = ICMP6_DST_UNREACH_ADMIN; + break; + case ICMP_UNREACH_PROTOCOL: + type = ICMP6_PARAM_PROB; + code = ICMP6_PARAMPROB_NEXTHEADER; + ptr = offsetof(struct ip6_hdr, ip6_nxt); + break; + case ICMP_UNREACH_NEEDFRAG: + type = ICMP6_PACKET_TOO_BIG; + code = 0; + mtu += 20; + break; + default: + return (-1); + } + break; + case ICMP_TIMXCEED: + type = ICMP6_TIME_EXCEEDED; + break; + case ICMP_PARAMPROB: + type = ICMP6_PARAM_PROB; + switch (code) { + case ICMP_PARAMPROB_ERRATPTR: + code = ICMP6_PARAMPROB_HEADER; + break; + case ICMP_PARAMPROB_LENGTH: + code = ICMP6_PARAMPROB_HEADER; + break; + default: + return (-1); + } + + ptr = icmp4->icmp_pptr; + if (ptr == 0 || ptr == PTR_IP(ip_tos)) + ; /* preserve */ + else if (ptr == PTR_IP(ip_len) || + ptr == PTR_IP(ip_len) + 1) + ptr = PTR_IP6(ip6_plen); + else if (ptr == PTR_IP(ip_ttl)) + ptr = PTR_IP6(ip6_hlim); + else if (ptr == PTR_IP(ip_p)) + ptr = PTR_IP6(ip6_nxt); + else if (ptr >= PTR_IP(ip_src) && + ptr < PTR_IP(ip_dst)) + ptr = PTR_IP6(ip6_src); + else if (ptr >= PTR_IP(ip_dst) && + ptr < (int32_t)sizeof(struct ip)) + ptr = PTR_IP6(ip6_dst); + else { + return (-1); + } + break; + default: + return (-1); + } + icmp4->icmp_type = type; + icmp4->icmp_code = code; + icmp4->icmp_nextmtu = htons(mtu); + if (ptr >= 0) + icmp4->icmp_void = htonl(ptr); + break; + } + + return (0); +} + +/* + * @brief This routine is called to perform address family translation on the + * inner IP 
header (that may come as payload) of an ICMP(v4/v6) error + * response. + * + * @param pbuf Pointer to packet buffer + * @param off Points to end of ICMP header + * @param tot_len Pointer to total length of the outer IP header + * @param off2 Points to end of inner IP header + * @param proto2 Inner IP proto field + * @param ttl2 Inner IP ttl field + * @param tot_len2 Inner IP total length + * @param src Pointer to the generic v4/v6 src address + * @param dst Pointer to the generic v4/v6 dst address + * @param af Old protocol family + * @param naf New protocol family + * + * @return -1 on error and 0 on success + */ +int +nat464_translate_icmp_ip(pbuf_t *pbuf, uint32_t off, uint64_t *tot_len, uint32_t *off2, + uint8_t proto2, uint8_t ttl2, uint64_t tot_len2, struct nat464_addr *src, + struct nat464_addr *dst, protocol_family_t af, protocol_family_t naf) +{ + struct ip *ip4 = NULL; + struct ip6_hdr *ip6 = NULL; + void *hdr = NULL; + int hlen = 0, olen = 0; + + if (af == naf || (af != AF_INET && af != AF_INET6) || + (naf != AF_INET && naf != AF_INET6)) + return (-1); + + /* old header */ + olen = *off2 - off; + /* new header */ + hlen = naf == PF_INET ? sizeof(*ip4) : sizeof(*ip6); + + /* Modify the pbuf to accommodate the new header */ + hdr = pbuf_resize_segment(pbuf, off, olen, hlen); + if (hdr == NULL) + return (-1); + + /* translate inner ip/ip6 header */ + switch (naf) { + case AF_INET: + ip4 = hdr; + bzero(ip4, sizeof(*ip4)); + ip4->ip_v = IPVERSION; + ip4->ip_hl = sizeof(*ip4) >> 2; + ip4->ip_len = htons(sizeof(*ip4) + tot_len2 - olen); + ip4->ip_id = rfc6864 ? 
0 : htons(ip_randomid()); + ip4->ip_off = htons(IP_DF); + ip4->ip_ttl = ttl2; + if (proto2 == IPPROTO_ICMPV6) + ip4->ip_p = IPPROTO_ICMP; + else + ip4->ip_p = proto2; + ip4->ip_src = src->natv4addr; + ip4->ip_dst = dst->natv4addr; + ip4->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, ip4->ip_hl << 2); + + if (clat_debug) { + char buf[MAX_IPv4_STR_LEN]; + clat_log2((LOG_DEBUG, "%s translated to IPv4 (inner) " + "ip_len: %#x ip_p: %d ip_sum: %#x ip_src: %s ip_dst: %s \n", + __func__, ntohs(ip4->ip_len), ip4->ip_p, ntohs(ip4->ip_sum), + inet_ntop(AF_INET, (void *)&ip4->ip_src, buf, sizeof(buf)), + inet_ntop(AF_INET, (void *)&ip4->ip_dst, buf, sizeof(buf)))); + } + break; + case AF_INET6: + ip6 = hdr; + bzero(ip6, sizeof(*ip6)); + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_plen = htons(tot_len2 - olen); + if (proto2 == IPPROTO_ICMP) + ip6->ip6_nxt = IPPROTO_ICMPV6; + else + ip6->ip6_nxt = proto2; + if (!ttl2 || ttl2 > IPV6_DEFHLIM) + ip6->ip6_hlim = IPV6_DEFHLIM; + else + ip6->ip6_hlim = ttl2; + ip6->ip6_src = src->natv6addr; + ip6->ip6_dst = dst->natv6addr; + + if (clat_debug) { + char buf2[MAX_IPv6_STR_LEN]; + clat_log2((LOG_DEBUG, "%s translated to IPv6 (inner) " + "ip6_plen: %#x ip6_nxt: %d ip6_src: %s ip6_dst: %s \n", + __func__, ntohs(ip6->ip6_plen), ip6->ip6_nxt, + inet_ntop(AF_INET6, (void *)&ip6->ip6_src, buf2, sizeof(buf2)), + inet_ntop(AF_INET6, (void *)&ip6->ip6_dst, buf2, sizeof(buf2)))); + } + break; + } + + /* adjust payload offset and total packet length */ + *off2 += hlen - olen; + *tot_len += hlen - olen; + + return (0); +} +/* + * @brief The function inserts IPv6 fragmentation header + * and populates it with the passed parameters. + * + * @param pbuf Pointer to the packet buffer + * @param ip_id IP identifier (in network byte order) + * @param frag_offset Fragment offset (in network byte order) + * @param is_last_frag Boolean indicating if the fragment header is for + * last fragment or not. + * + * @return -1 on error and 0 on success. 
+ */ +int +nat464_insert_frag46(pbuf_t *pbuf, uint16_t ip_id_val, uint16_t frag_offset, + boolean_t is_last_frag) +{ + struct ip6_frag *p_ip6_frag = NULL; + struct ip6_hdr *p_ip6h = NULL; + + /* Insert IPv6 fragmentation header */ + if (pbuf_resize_segment(pbuf, sizeof(struct ip6_hdr), 0, + sizeof(struct ip6_frag)) == NULL) + return (-1); + + p_ip6h = mtod(pbuf->pb_mbuf, struct ip6_hdr *); + p_ip6_frag = (struct ip6_frag *)pbuf_contig_segment(pbuf, + sizeof(struct ip6_hdr), sizeof(struct ip6_frag)); + + if (p_ip6_frag == NULL) + return (-1); + + /* Populate IPv6 fragmentation header */ + p_ip6_frag->ip6f_nxt = p_ip6h->ip6_nxt; + p_ip6_frag->ip6f_reserved = 0; + p_ip6_frag->ip6f_offlg = (frag_offset) << 3; + if (!is_last_frag) + p_ip6_frag->ip6f_offlg |= 0x1; + p_ip6_frag->ip6f_offlg = htons(p_ip6_frag->ip6f_offlg); + p_ip6_frag->ip6f_ident = ip_id_val; + + /* Update IPv6 header */ + p_ip6h->ip6_nxt = IPPROTO_FRAGMENT; + p_ip6h->ip6_plen = htons(ntohs(p_ip6h->ip6_plen) + + sizeof(struct ip6_frag)); + + return (0); +} + +int +nat464_translate_64(pbuf_t *pbuf, int off, uint8_t tos, + uint8_t *proto, uint8_t ttl, struct in_addr src_v4, + struct in_addr dst_v4, uint64_t tot_len, boolean_t *p_is_first_frag) +{ + struct ip *ip4; + struct ip6_frag *p_frag6 = NULL; + struct ip6_frag frag6 = {}; + boolean_t is_frag = FALSE; + uint16_t ip_frag_off = 0; + + /* + * ip_input asserts for rcvif to be not NULL + * That may not be true for two corner cases + * 1. If for some reason a local app sends DNS + * AAAA query to local host + * 2. If IPv6 stack in kernel internally generates a + * message destined for a synthesized IPv6 end-point. 
+ */ + if (pbuf->pb_ifp == NULL) + return (NT_DROP); + + if (*proto == IPPROTO_FRAGMENT) { + p_frag6 = (struct ip6_frag *)pbuf_contig_segment(pbuf, + sizeof(struct ip6_hdr), sizeof(struct ip6_frag)); + if (p_frag6 == NULL) { + ip6stat.ip6s_clat464_in_64frag_transfail_drop++; + return (NT_DROP); + } + + frag6 = *p_frag6; + p_frag6 = NULL; + *proto = frag6.ip6f_nxt; + off += sizeof(struct ip6_frag); + is_frag = TRUE; + ip_frag_off = (ntohs(frag6.ip6f_offlg & IP6F_OFF_MASK)) >> 3; + if (ip_frag_off != 0) { + *p_is_first_frag = FALSE; + } + } + + ip4 = (struct ip *)pbuf_resize_segment(pbuf, 0, off, sizeof(*ip4)); + if (ip4 == NULL) + return (NT_DROP); + ip4->ip_v = 4; + ip4->ip_hl = 5; + ip4->ip_tos = tos; + ip4->ip_len = htons(sizeof(*ip4) + (tot_len - off)); + ip4->ip_id = 0; + ip4->ip_off = 0; + ip4->ip_ttl = ttl; + ip4->ip_p = *proto; + ip4->ip_sum = 0; + ip4->ip_src = src_v4; + ip4->ip_dst = dst_v4; + if (is_frag) { + /* + * https://tools.ietf.org/html/rfc7915#section-5.1.1 + * Identification: Copied from the low-order 16 bits in the + * Identification field in the Fragment Header. 
+ */ + ip4->ip_id = ntohl(frag6.ip6f_ident) & 0xffff; + ip4->ip_id = htons(ip4->ip_id); + if(frag6.ip6f_offlg & IP6F_MORE_FRAG) + ip_frag_off |= IP_MF; + ip4->ip_off = htons(ip_frag_off); + } else { + ip4->ip_off |= htons(IP_DF); + } + + /* + * Defer calculating ip_sum for ICMPv6 as we do it + * later in Protocol translation + */ + if (*proto != IPPROTO_ICMPV6) + ip4->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, ip4->ip_hl << 2); + + if (clat_debug) { + char buf1[MAX_IPv4_STR_LEN], buf2[MAX_IPv4_STR_LEN]; + clat_log2((LOG_DEBUG, "%s translated to IPv4 ip_len: %#x " + "ip_p: %d ip_sum: %#x ip_src: %s ip_dst: %s \n", __func__, + ntohs(ip4->ip_len), ip4->ip_p, ntohs(ip4->ip_sum), + inet_ntop(AF_INET, (void *)&ip4->ip_src, buf1, sizeof(buf1)), + inet_ntop(AF_INET, (void *)&ip4->ip_dst, buf2, sizeof(buf2)))); + } + return (NT_NAT64); +} +/* + * @brief The routine translates the IPv4 header to IPv6 header. + * + * @param pbuf Pointer to the generic packet buffer + * @param off Offset to the end of IP header + * @param tos Type of service + * @param proto Protocol running over IP + * @param ttl Time to live + * @param src_v6 Source IPv6 address + * @param dst_v6 Destination IPv6 address + * @param tot_len Total payload length + * + * @return NT_NAT64 if IP header translation is successful, else error + */ +int +nat464_translate_46(pbuf_t *pbuf, int off, uint8_t tos, + uint8_t proto, uint8_t ttl, struct in6_addr src_v6, + struct in6_addr dst_v6, uint64_t tot_len) +{ + struct ip6_hdr *ip6; + + if (pbuf->pb_ifp == NULL) + return (NT_DROP); + + /* + * Trim the buffer from head of size equal to to off (which is equal to + * the size of IP header and prepend IPv6 header length to the buffer + */ + ip6 = (struct ip6_hdr *)pbuf_resize_segment(pbuf, 0, off, sizeof(*ip6)); + if (ip6 == NULL) + return (NT_DROP); + ip6->ip6_flow = htonl((6 << 28) | (tos << 20)); + ip6->ip6_plen = htons(tot_len - off); + ip6->ip6_nxt = proto; + ip6->ip6_hlim = ttl; + ip6->ip6_src = src_v6; + ip6->ip6_dst = 
dst_v6; + + if (clat_debug) { + char buf1[MAX_IPv6_STR_LEN], buf2[MAX_IPv6_STR_LEN]; + clat_log2((LOG_DEBUG, "%s translated to IPv6 ip6_plen: %#x " + " ip6_nxt: %d ip6_src: %s ip6_dst: %s \n", __func__, + ntohs(ip6->ip6_plen), ip6->ip6_nxt, + inet_ntop(AF_INET6, (void *)&ip6->ip6_src, buf1, sizeof(buf1)), + inet_ntop(AF_INET6, (void *)&ip6->ip6_dst, buf2, sizeof(buf2)))); + } + return (NT_NAT64); +} + +/* Handle the next protocol checksum */ +/* + * @brief This routine translates the Proto running over IP and updates the checksum + * for IP header translation. It also updates pbuf checksum flags and related fields. + * + * @param pbuf Pointer to protocol buffer + * @param osrc Source address before translation + * @param odst Destination address before translation + * @param af Old family + * @param naf New family + * + * @return int status; NT_DROP when the packet cannot be translated + */ +int +nat464_translate_proto(pbuf_t *pbuf, struct nat464_addr *osrc, + struct nat464_addr *odst, uint8_t oproto, protocol_family_t af, + protocol_family_t naf, int direction, boolean_t only_csum) +{ + struct ip *iph = NULL; + struct ip6_hdr *ip6h = NULL; + uint32_t hlen = 0, plen = 0; + uint64_t tot_len = 0; + void *nsrc = NULL, *ndst = NULL; + uint8_t *proto = 0; + uint16_t *psum = NULL; + boolean_t do_ones_complement = FALSE; + + /* For now these routines only support 464 translations */ + VERIFY(af != naf); + VERIFY(af == PF_INET || af == PF_INET6); + + /* + * For now out must be for v4 to v6 translation + * and in must be for v6 to v4 translation.
+ */ + switch (naf) { + case PF_INET: { + iph = pbuf->pb_data; + hlen = iph->ip_hl << 2; + plen = ntohs(iph->ip_len) - hlen; + tot_len = ntohs(iph->ip_len); + nsrc = &iph->ip_src; + ndst = &iph->ip_dst; + proto = &iph->ip_p; + break; + } + case PF_INET6: { + ip6h = pbuf->pb_data; + hlen = sizeof(*ip6h); + plen = ntohs(ip6h->ip6_plen); + tot_len = hlen + plen; + nsrc = &ip6h->ip6_src; + ndst = &ip6h->ip6_dst; + proto = &ip6h->ip6_nxt; + break; + } + } + + VERIFY(*proto == oproto); + + /* + * We may want to manipulate csum flags in some cases + * and not act on the protocol header as it may not + * carry protocol checksums. + * For example, fragments other than the first one would + * not carry protocol headers. + */ + if (only_csum) { + /* + * Only translate ICMP proto in the header + * and adjust checksums + */ + if (*proto == IPPROTO_ICMP) { + if (naf != PF_INET6) + return (NT_DROP); + + *proto = IPPROTO_ICMPV6; + } + else if (*proto == IPPROTO_ICMPV6) { + if (naf != PF_INET) + return (NT_DROP); + + *proto = IPPROTO_ICMP; + /* Recalculate IP checksum as proto field has changed */ + iph->ip_sum = 0; + iph->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, hlen); + } + goto done; + } + + switch (*proto) { + case IPPROTO_UDP: { + struct udphdr *uh = (struct udphdr *)pbuf_contig_segment(pbuf, hlen, + sizeof(*uh)); + + if (uh == NULL) + return (NT_DROP); + + if (!(*pbuf->pb_csum_flags & (CSUM_UDP | CSUM_PARTIAL)) && + uh->uh_sum == 0 && af == PF_INET && naf == PF_INET6) { + uh->uh_sum = pbuf_inet6_cksum(pbuf, IPPROTO_UDP, + hlen, ntohs(ip6h->ip6_plen)); + if (uh->uh_sum == 0) + uh->uh_sum = 0xffff; + goto done; + } + + psum = &uh->uh_sum; + break; + } + case IPPROTO_TCP: { + struct tcphdr *th = (struct tcphdr *)pbuf_contig_segment(pbuf, hlen, + sizeof(*th)); + + if (th == NULL) + return (NT_DROP); + + psum = &th->th_sum; + break; + } + } + + /* + * Translate the protocol header, update IP header if needed, + * calculate checksums and update the checksum flags. 
+ */ + switch (*proto) { + case IPPROTO_UDP: + /* Fall through */ + case IPPROTO_TCP: + { + /* + * If it is a locally generated and has CSUM flags set + * for TCP and UDP it means we have pseudo header checksum + * that has not yet been one's complemented. + */ + if (direction == NT_OUT && + (*pbuf->pb_csum_flags & CSUM_DELAY_DATA)) + do_ones_complement = TRUE; + + nat464_addr_cksum_fixup(psum, osrc, (struct nat464_addr *)nsrc, + af, naf, (*proto == IPPROTO_UDP) ? 1 : 0, do_ones_complement); + nat464_addr_cksum_fixup(psum, odst, (struct nat464_addr *)ndst, + af, naf, (*proto == IPPROTO_UDP) ? 1 : 0, do_ones_complement); + + break; + } + case IPPROTO_ICMP: { + if (naf != PF_INET6) /* allow only v6 as naf for ICMP */ + return (NT_DROP); + + struct icmp *icmph = NULL; + struct icmp6_hdr *icmp6h = NULL; + uint32_t ip2off = 0, hlen2 = 0, tot_len2 = 0; + + icmph = (struct icmp*) pbuf_contig_segment(pbuf, hlen, + ICMP_MINLEN); + if (icmph == NULL) + return (NT_DROP); + + /* Translate the ICMP header */ + if (nat464_translate_icmp(PF_INET6, icmph) != 0) + return (NT_DROP); + + *proto = IPPROTO_ICMPV6; + icmp6h = (struct icmp6_hdr *)(uintptr_t)icmph; + pbuf_copy_back(pbuf, hlen, sizeof(struct icmp6_hdr), + icmp6h); + + /*Translate the inner IP header only for error messages */ + if (ICMP6_ERRORTYPE(icmp6h->icmp6_type)) { + ip2off = hlen + sizeof(*icmp6h); + struct ip *iph2; + iph2 = (struct ip*) pbuf_contig_segment(pbuf, ip2off, + sizeof (*iph2)); + if (iph2 == NULL) + return (NT_DROP); + + hlen2 = ip2off + (iph2->ip_hl << 2); + tot_len2 = ntohs(iph2->ip_len); + + /* Destination in outer IP should be Source in inner IP */ + VERIFY(IN_ARE_ADDR_EQUAL(&odst->natv4addr, &iph2->ip_src)); + if (nat464_translate_icmp_ip(pbuf, ip2off, &tot_len, + &hlen2, iph2->ip_p, iph2->ip_ttl, tot_len2, + (struct nat464_addr *)ndst, (struct nat464_addr *)nsrc, + PF_INET, PF_INET6) != 0) + return (NT_DROP); + /* Update total length/payload length for outer header */ + switch (naf) { + case 
PF_INET: + iph->ip_len = htons(tot_len); + break; + case PF_INET6: + ip6h->ip6_plen = htons(tot_len - hlen); + break; + } + iph2 = NULL; + } + + icmp6h->icmp6_cksum = 0; + icmp6h->icmp6_cksum = pbuf_inet6_cksum(pbuf, IPPROTO_ICMPV6, hlen, + ntohs(ip6h->ip6_plen)); + + clat_log2((LOG_DEBUG, "%s translated to ICMPV6 type: %d " + "code: %d checksum: %#x \n", __func__, icmp6h->icmp6_type, + icmp6h->icmp6_code, icmp6h->icmp6_cksum)); + + icmph = NULL; + icmp6h = NULL; + break; + } + case IPPROTO_ICMPV6: + { if (naf != PF_INET) /* allow only v4 as naf for ICMPV6 */ + return (NT_DROP); + + struct icmp6_hdr *icmp6h = NULL; + struct icmp *icmph = NULL; + uint32_t ip2off = 0, hlen2 = 0, tot_len2 = 0; + + icmp6h = (struct icmp6_hdr*) pbuf_contig_segment(pbuf, hlen, + sizeof(*icmp6h)); + if (icmp6h == NULL) + return (NT_DROP); + + /* Translate the ICMP header */ + if (nat464_translate_icmp(PF_INET, icmp6h) != 0) + return (NT_DROP); + + *proto = IPPROTO_ICMP; + icmph = (struct icmp *)(uintptr_t)icmp6h; + pbuf_copy_back(pbuf, hlen, ICMP_MINLEN, + icmph); + + /*Translate the inner IP header only for error messages */ + if (ICMP_ERRORTYPE(icmph->icmp_type)) { + ip2off = hlen + ICMP_MINLEN; + struct ip6_hdr *iph2; + iph2 = (struct ip6_hdr*) pbuf_contig_segment(pbuf, ip2off, + sizeof (*iph2)); + if (iph2 == NULL) + return (NT_DROP); + + /* hlen2 points to end of inner IP header from the beginning */ + hlen2 = ip2off + sizeof(struct ip6_hdr); + tot_len2 = ntohs(iph2->ip6_plen) + sizeof(struct ip6_hdr); + + if (nat464_translate_icmp_ip(pbuf, ip2off, &tot_len, + &hlen2, iph2->ip6_nxt, iph2->ip6_hlim, tot_len2, + (struct nat464_addr *)ndst, (struct nat464_addr *)nsrc, + PF_INET6, PF_INET) != 0) + return (NT_DROP); + + /* Update total length for outer header */ + switch (naf) { + case PF_INET: + iph->ip_len = htons(tot_len); + break; + case PF_INET6: + ip6h->ip6_plen = htons(tot_len - hlen); + break; + } + iph2 = NULL; + } + /* Recalculate IP checksum as some IP fields might have changed 
*/ + iph->ip_sum = 0; + iph->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, iph->ip_hl << 2); + icmph->icmp_cksum = 0; + icmph->icmp_cksum = pbuf_inet_cksum(pbuf, 0, hlen, + ntohs(iph->ip_len) - hlen); + + clat_log2((LOG_DEBUG, "%s translated to ICMP type: %d " + "code: %d checksum: %#x \n", __func__, icmph->icmp_type, + icmph->icmp_code, icmph->icmp_cksum)); + + icmp6h = NULL; + icmph = NULL; + break; + } + + /* + * https://tools.ietf.org/html/rfc7915#section-5.1.1 + * If the Next Header field of the Fragment Header is an + * extension header (except ESP, but including the Authentication + * Header (AH)), then the packet SHOULD be dropped and logged. + */ + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_DSTOPTS: + case IPPROTO_AH: + return (NT_DROP); + + case IPPROTO_FRAGMENT: + /* + * The fragment header is appended after or removed before + * calling into this routine. + */ + VERIFY(FALSE); + case IPPROTO_ESP: + break; + + default: + return (NT_DROP); + } + +done: + /* Update checksum flags and offsets based on direction */ + if (direction == NT_OUT) { + if ((*pbuf->pb_csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) == + (CSUM_DATA_VALID | CSUM_PARTIAL)) { + (pbuf->pb_mbuf)->m_pkthdr.csum_tx_start += CLAT46_HDR_EXPANSION_OVERHD; + (pbuf->pb_mbuf)->m_pkthdr.csum_tx_stuff += CLAT46_HDR_EXPANSION_OVERHD; + } + + if(*pbuf->pb_csum_flags & CSUM_TCP) + *pbuf->pb_csum_flags |= CSUM_TCPIPV6; + if(*pbuf->pb_csum_flags & CSUM_UDP) + *pbuf->pb_csum_flags |= CSUM_UDPIPV6; + if (*pbuf->pb_csum_flags & CSUM_FRAGMENT) + *pbuf->pb_csum_flags |= CSUM_FRAGMENT_IPV6; + + /* Clear IPv4 checksum flags */ + *pbuf->pb_csum_flags &= ~(CSUM_IP | CSUM_IP_FRAGS | CSUM_DELAY_DATA | CSUM_FRAGMENT); + } else if (direction == NT_IN) { + /* XXX On input just reset csum flags */ + *pbuf->pb_csum_flags = 0; /* Reset all flags for now */ +#if 0 + /* Update csum flags and offsets for rx */ + if (*pbuf->pb_csum_flags & CSUM_PARTIAL) { + (pbuf->pb_mbuf)->m_pkthdr.csum_rx_start -= 
CLAT46_HDR_EXPANSION_OVERHD; + } +#endif + } + return (NT_NAT64); +} + +/* Fix up the protocol checksum at *pc for the address change ao -> an between families af and naf (v4 <-> v6 only). u is handed to nat464_cksum_fixup() as its udp flag; when do_ones_complement is set the stored value is complemented before and after the fixup. */ +static void +nat464_addr_cksum_fixup(uint16_t *pc, struct nat464_addr *ao, struct nat464_addr *an, + protocol_family_t af, protocol_family_t naf, uint8_t u, boolean_t do_ones_complement) +{ + /* Currently we only support v4 to v6 and vice versa */ + VERIFY (af != naf); + + switch (af) { + case PF_INET: + switch (naf) { + case PF_INET6: /* v4 -> v6: words 0-1 of ao are replaced by words 0-1 of an; words 2-7 of an are added against an old value of 0 */ + if (do_ones_complement) { + *pc = ~nat464_cksum_fixup(nat464_cksum_fixup( + nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup( + nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(~*pc, + ao->nataddr16[0], an->nataddr16[0], u), + ao->nataddr16[1], an->nataddr16[1], u), + 0, an->nataddr16[2], u), + 0, an->nataddr16[3], u), + 0, an->nataddr16[4], u), + 0, an->nataddr16[5], u), + 0, an->nataddr16[6], u), + 0, an->nataddr16[7], u); + } else { + *pc = nat464_cksum_fixup(nat464_cksum_fixup( + nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup( + nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(*pc, + ao->nataddr16[0], an->nataddr16[0], u), + ao->nataddr16[1], an->nataddr16[1], u), + 0, an->nataddr16[2], u), + 0, an->nataddr16[3], u), + 0, an->nataddr16[4], u), + 0, an->nataddr16[5], u), + 0, an->nataddr16[6], u), + 0, an->nataddr16[7], u); + } + break; + } + break; + case PF_INET6: + /* + * XXX For NAT464 this only applies to the incoming path. + * The checksum therefore is already ones complemented. + * Therefore we just perform normal fixup. 
+ */ + switch (naf) { + case PF_INET: /* v6 -> v4: words 0-1 of ao are replaced by words 0-1 of an; words 2-7 of ao are replaced by 0 */ + *pc = nat464_cksum_fixup(nat464_cksum_fixup( + nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup( + nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(*pc, + ao->nataddr16[0], an->nataddr16[0], u), + ao->nataddr16[1], an->nataddr16[1], u), + ao->nataddr16[2], 0, u), + ao->nataddr16[3], 0, u), + ao->nataddr16[4], 0, u), + ao->nataddr16[5], 0, u), + ao->nataddr16[6], 0, u), + ao->nataddr16[7], 0, u); + break; + } + break; + } +} + +uint16_t +nat464_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp) /* Adjusts cksum for one 16-bit word changing from old to new; a non-zero udp enables the two zero-checksum special cases below */ +{ + uint32_t l; + + if (udp && !cksum) /* UDP with a checksum field of 0: returned unchanged */ + return (0); + l = cksum + old - new; + l = (l >> 16) + (l & 0xffff); /* add the upper 16 bits back into the lower 16 */ + l = l & 0xffff; + if (udp && !l) /* for UDP a computed result of 0 is returned as 0xffff */ + return (0xffff); + return (l); +} + +/* CLAT46 event handlers */ +void +in6_clat46_eventhdlr_callback(struct eventhandler_entry_arg arg0 __unused, + in6_clat46_evhdlr_code_t in6_clat46_ev_code, pid_t epid, uuid_t euuid) /* Posts a KEV_NETEVENT_CLAT46_EVENT kernel event whose payload carries the event code, epid and euuid */ +{ + struct kev_msg ev_msg; + struct kev_netevent_clat46_data clat46_event_data; + + bzero(&ev_msg, sizeof(ev_msg)); + bzero(&clat46_event_data, sizeof(clat46_event_data)); + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_NETEVENT_SUBCLASS; + ev_msg.event_code = KEV_NETEVENT_CLAT46_EVENT; + + bzero(&clat46_event_data, sizeof(clat46_event_data)); /* NOTE(review): redundant, clat46_event_data was already zeroed above and not written since */ + clat46_event_data.clat46_event_code = in6_clat46_ev_code; + clat46_event_data.epid = epid; + uuid_copy(clat46_event_data.euuid, euuid); + + ev_msg.dv[0].data_ptr = &clat46_event_data; + ev_msg.dv[0].data_length = sizeof(clat46_event_data); + + kev_post_msg(&ev_msg); +} + +static void +in6_clat46_event_callback(void *arg) /* arg is treated as a struct kev_netevent_clat46_data; its code, epid and euuid are passed to EVENTHANDLER_INVOKE for in6_clat46_event */ +{ + struct kev_netevent_clat46_data *p_in6_clat46_ev = + (struct kev_netevent_clat46_data *)arg; + + EVENTHANDLER_INVOKE(&in6_clat46_evhdlr_ctxt, in6_clat46_event, + p_in6_clat46_ev->clat46_event_code, p_in6_clat46_ev->epid, + p_in6_clat46_ev->euuid); +} + +struct in6_clat46_event_nwk_wq_entry +{ + struct nwk_wq_entry 
nwk_wqe; + struct kev_netevent_clat46_data in6_clat46_ev_arg; /* event payload; nwk_wqe.arg is pointed at this member by the enqueue routine */ +}; + +void +in6_clat46_event_enqueue_nwk_wq_entry(in6_clat46_evhdlr_code_t in6_clat46_event_code, + pid_t epid, uuid_t euuid) /* Allocates a zeroed work-queue entry, fills in the event code, pid and uuid, and enqueues it with in6_clat46_event_callback as its handler function */ +{ + struct in6_clat46_event_nwk_wq_entry *p_ev = NULL; + + MALLOC(p_ev, struct in6_clat46_event_nwk_wq_entry *, + sizeof(struct in6_clat46_event_nwk_wq_entry), + M_NWKWQ, M_WAITOK | M_ZERO); + + p_ev->nwk_wqe.func = in6_clat46_event_callback; + p_ev->nwk_wqe.is_arg_managed = TRUE; + p_ev->nwk_wqe.arg = &p_ev->in6_clat46_ev_arg; + + p_ev->in6_clat46_ev_arg.clat46_event_code = in6_clat46_event_code; + p_ev->in6_clat46_ev_arg.epid = epid; + uuid_copy(p_ev->in6_clat46_ev_arg.euuid, euuid); + + nwk_wq_enqueue((struct nwk_wq_entry*)p_ev); /* nwk_wqe is the first member of the entry, so the cast yields its address */ +} diff --git a/bsd/net/nat464_utils.h b/bsd/net/nat464_utils.h new file mode 100644 index 000000000..be938d23e --- /dev/null +++ b/bsd/net/nat464_utils.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002 - 2013 Henning Brauer + * NAT64 - Copyright (c) 2010 Viagenie Inc. (http://www.viagenie.ca) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + */ +#ifndef _NET_NAT464_UTILS_H_ +#define _NET_NAT464_UTILS_H_ +#include +#include + +#define clat_log0(x) do { log x; } while (0) +#define clat_log1(x) do { if (clat_debug >= 1) log x; } while (0) +#define clat_log2(x) do { if (clat_debug >= 2) log x; } while (0) + +#define CLAT46_NEEDED(x) \ + (!IN_LOOPBACK(x) && !IN_LINKLOCAL(x) && !IN_MULTICAST(x) && \ + INADDR_BROADCAST != (x)) /* true unless the IPv4 address x is loopback, link-local, multicast or INADDR_BROADCAST; x is parenthesized so an expression argument is compared as a whole */ + +#define CLAT64_NEEDED(x) \ + (!IN6_IS_ADDR_LOOPBACK(x) && !IN6_IS_ADDR_LINKLOCAL(x) && \ + !IN6_IS_ADDR_MULTICAST(x)) /* IPv6 counterpart: true unless x is loopback, link-local or multicast */ + +extern int clat_debug; + +enum { NT_DROP, NT_NAT64 }; +enum { NT_IN, NT_OUT }; +struct nat464_addr { + union { + struct in_addr _v4addr; + struct in6_addr _v6addr; + uint8_t _addr8[16]; + uint16_t _addr16[8]; + uint32_t _addr32[4]; + } nat464a; /* 128-bit address */ +#define natv4addr nat464a._v4addr +#define natv6addr nat464a._v6addr +#define nataddr8 nat464a._addr8 +#define nataddr16 nat464a._addr16 +#define nataddr32 nat464a._addr32 +}; + +int +nat464_translate_icmp(int , void *); + +int +nat464_translate_icmp_ip(pbuf_t *, uint32_t , uint64_t *, uint32_t *, + uint8_t , uint8_t , uint64_t , struct nat464_addr *, + struct nat464_addr *, protocol_family_t , protocol_family_t ); + +int +nat464_synthesize_ipv6(ifnet_t, 
const struct in_addr *, struct in6_addr *); + +int +nat464_synthesize_ipv4(ifnet_t, const struct in6_addr *, struct in_addr *); + +int +nat464_translate_64(pbuf_t *, int, uint8_t, uint8_t *, uint8_t, struct in_addr, + struct in_addr, uint64_t, boolean_t *); + +int +nat464_translate_46(pbuf_t *, int, uint8_t, uint8_t, uint8_t, struct in6_addr, + struct in6_addr, uint64_t); + +int +nat464_translate_proto(pbuf_t *, struct nat464_addr *, struct nat464_addr *, + uint8_t, protocol_family_t, protocol_family_t, int, boolean_t); + +int +nat464_insert_frag46(pbuf_t *, uint16_t, uint16_t, boolean_t); + +int +nat464_remove_frag64(pbuf_t *, uint32_t, uint16_t, boolean_t); + +uint16_t +nat464_cksum_fixup(uint16_t, uint16_t , uint16_t , uint8_t); +#endif /* !_NET_NAT464_UTILS_H_ */ diff --git a/bsd/net/necp.c b/bsd/net/necp.c index b4b9ff340..17d00fd4f 100644 --- a/bsd/net/necp.c +++ b/bsd/net/necp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Apple Inc. All rights reserved. + * Copyright (c) 2013-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -182,26 +182,27 @@ u_int32_t necp_session_count = 0; #define IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(x) ((x) == NECP_ROUTE_RULE_DENY_INTERFACE || (x) == NECP_ROUTE_RULE_ALLOW_INTERFACE) -#define NECP_KERNEL_CONDITION_ALL_INTERFACES 0x00001 -#define NECP_KERNEL_CONDITION_BOUND_INTERFACE 0x00002 -#define NECP_KERNEL_CONDITION_PROTOCOL 0x00004 -#define NECP_KERNEL_CONDITION_LOCAL_START 0x00008 -#define NECP_KERNEL_CONDITION_LOCAL_END 0x00010 -#define NECP_KERNEL_CONDITION_LOCAL_PREFIX 0x00020 -#define NECP_KERNEL_CONDITION_REMOTE_START 0x00040 -#define NECP_KERNEL_CONDITION_REMOTE_END 0x00080 -#define NECP_KERNEL_CONDITION_REMOTE_PREFIX 0x00100 -#define NECP_KERNEL_CONDITION_APP_ID 0x00200 -#define NECP_KERNEL_CONDITION_REAL_APP_ID 0x00400 -#define NECP_KERNEL_CONDITION_DOMAIN 0x00800 -#define NECP_KERNEL_CONDITION_ACCOUNT_ID 0x01000 -#define NECP_KERNEL_CONDITION_POLICY_ID 0x02000 -#define NECP_KERNEL_CONDITION_PID 0x04000 -#define NECP_KERNEL_CONDITION_UID 0x08000 -#define NECP_KERNEL_CONDITION_LAST_INTERFACE 0x10000 // Only set from packets looping between interfaces -#define NECP_KERNEL_CONDITION_TRAFFIC_CLASS 0x20000 -#define NECP_KERNEL_CONDITION_ENTITLEMENT 0x40000 -#define NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT 0x80000 +#define NECP_KERNEL_CONDITION_ALL_INTERFACES 0x000001 +#define NECP_KERNEL_CONDITION_BOUND_INTERFACE 0x000002 +#define NECP_KERNEL_CONDITION_PROTOCOL 0x000004 +#define NECP_KERNEL_CONDITION_LOCAL_START 0x000008 +#define NECP_KERNEL_CONDITION_LOCAL_END 0x000010 +#define NECP_KERNEL_CONDITION_LOCAL_PREFIX 0x000020 +#define NECP_KERNEL_CONDITION_REMOTE_START 0x000040 +#define NECP_KERNEL_CONDITION_REMOTE_END 0x000080 +#define NECP_KERNEL_CONDITION_REMOTE_PREFIX 0x000100 +#define NECP_KERNEL_CONDITION_APP_ID 0x000200 +#define NECP_KERNEL_CONDITION_REAL_APP_ID 0x000400 +#define NECP_KERNEL_CONDITION_DOMAIN 0x000800 +#define NECP_KERNEL_CONDITION_ACCOUNT_ID 0x001000 +#define NECP_KERNEL_CONDITION_POLICY_ID 
0x002000 +#define NECP_KERNEL_CONDITION_PID 0x004000 +#define NECP_KERNEL_CONDITION_UID 0x008000 +#define NECP_KERNEL_CONDITION_LAST_INTERFACE 0x010000 // Only set from packets looping between interfaces +#define NECP_KERNEL_CONDITION_TRAFFIC_CLASS 0x020000 +#define NECP_KERNEL_CONDITION_ENTITLEMENT 0x040000 +#define NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT 0x080000 +#define NECP_KERNEL_CONDITION_AGENT_TYPE 0x100000 #define NECP_MAX_POLICY_RESULT_SIZE 512 #define NECP_MAX_ROUTE_RULES_ARRAY_SIZE 1024 @@ -223,6 +224,8 @@ struct necp_session { u_int32_t session_priority; // Descriptive priority rating u_int32_t session_order; + necp_policy_id last_policy_id; + decl_lck_mtx_data(, lock); bool proc_locked; // Messages must come from proc_uuid @@ -270,13 +273,6 @@ static lck_attr_t *necp_route_rule_mtx_attr = NULL; static lck_grp_t *necp_route_rule_mtx_grp = NULL; decl_lck_rw_data(static, necp_route_rule_lock); -static necp_policy_id necp_last_policy_id = 0; -static necp_kernel_policy_id necp_last_kernel_policy_id = 0; -static u_int32_t necp_last_uuid_id = 0; -static u_int32_t necp_last_string_id = 0; -static u_int32_t necp_last_route_rule_id = 0; -static u_int32_t necp_last_aggregate_route_rule_id = 0; - /* * On modification, invalidate cached lookups by bumping the generation count. 
* Other calls will need to take the slowpath of taking @@ -342,13 +338,13 @@ static bool necp_policy_mark_all_for_deletion(struct necp_session *session); static bool necp_policy_delete(struct necp_session *session, struct necp_session_policy *policy); static void necp_policy_apply_all(struct necp_session *session); -static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); +static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); static bool 
necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id); static bool necp_kernel_socket_policies_reprocess(void); static bool necp_kernel_socket_policies_update_uuid_table(void); -static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count, proc_t proc); +static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id); -static necp_kernel_policy_id necp_kernel_ip_output_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); +static necp_kernel_policy_id 
necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter); static bool necp_kernel_ip_output_policy_delete(necp_kernel_policy_id policy_id); static bool necp_kernel_ip_output_policies_reprocess(void); @@ -630,6 +626,12 @@ necp_session_find_from_fd(int fd, struct necp_session **session) } *session = (struct necp_session *)fp->f_fglob->fg_data; + if ((*session)->necp_fd_type != necp_fd_type_session) { + // Not a session fd, ignore + error = EINVAL; + goto done; + } + done: proc_fdunlock(p); return (error); @@ -839,7 +841,7 @@ necp_session_list_all(struct necp_session *session, struct necp_session_action_a u_int8_t *cursor = response; LIST_FOREACH(policy, &session->policies, chain) { if (!policy->pending_deletion && cur_policy_index < num_policies) { - cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->id, response, response_size); + cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->local_id, response, response_size); cur_policy_index++; } } @@ -1208,13 +1210,6 @@ necp_init(void) necp_kernel_ip_output_policies_count = 0; necp_kernel_ip_output_policies_non_id_count = 0; - necp_last_policy_id = 0; - necp_last_kernel_policy_id = 0; - necp_last_uuid_id = 0; - necp_last_string_id = 0; - necp_last_route_rule_id = 0; - necp_last_aggregate_route_rule_id = 0; - necp_kernel_socket_policies_gencount = 1; 
memset(&necp_kernel_socket_policies_map, 0, sizeof(necp_kernel_socket_policies_map)); @@ -1982,53 +1977,39 @@ necp_policy_result_is_valid(u_int8_t *buffer, u_int32_t length) u_int8_t type = necp_policy_result_get_type_from_buffer(buffer, length); u_int32_t parameter_length = necp_policy_result_get_parameter_length_from_buffer(buffer, length); switch (type) { - case NECP_POLICY_RESULT_PASS: { - validated = TRUE; - break; - } - case NECP_POLICY_RESULT_SKIP: { - if (parameter_length >= sizeof(u_int32_t)) { - validated = TRUE; - } - break; - } - case NECP_POLICY_RESULT_DROP: { + case NECP_POLICY_RESULT_PASS: + case NECP_POLICY_RESULT_DROP: + case NECP_POLICY_RESULT_ROUTE_RULES: + case NECP_POLICY_RESULT_SCOPED_DIRECT: { validated = TRUE; break; } - case NECP_POLICY_RESULT_SOCKET_DIVERT: { + case NECP_POLICY_RESULT_SKIP: + case NECP_POLICY_RESULT_SOCKET_DIVERT: + case NECP_POLICY_RESULT_SOCKET_FILTER: { if (parameter_length >= sizeof(u_int32_t)) { validated = TRUE; } break; } - case NECP_POLICY_RESULT_SOCKET_SCOPED: { - if (parameter_length > 0) { - validated = TRUE; - } - break; - } case NECP_POLICY_RESULT_IP_TUNNEL: { if (parameter_length > sizeof(u_int32_t)) { validated = TRUE; } break; } - case NECP_POLICY_RESULT_SOCKET_FILTER: { - if (parameter_length >= sizeof(u_int32_t)) { + case NECP_POLICY_RESULT_SOCKET_SCOPED: { + if (parameter_length > 0) { validated = TRUE; } break; } - case NECP_POLICY_RESULT_ROUTE_RULES: { - validated = TRUE; - break; - } case NECP_POLICY_RESULT_TRIGGER: case NECP_POLICY_RESULT_TRIGGER_IF_NEEDED: case NECP_POLICY_RESULT_TRIGGER_SCOPED: case NECP_POLICY_RESULT_NO_TRIGGER_SCOPED: - case NECP_POLICY_RESULT_USE_NETAGENT: { + case NECP_POLICY_RESULT_USE_NETAGENT: + case NECP_POLICY_RESULT_NETAGENT_SCOPED:{ if (parameter_length >= sizeof(uuid_t)) { validated = TRUE; } @@ -2115,7 +2096,9 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli policy_result_type == NECP_POLICY_RESULT_NO_TRIGGER_SCOPED || 
policy_result_type == NECP_POLICY_RESULT_SOCKET_SCOPED || policy_result_type == NECP_POLICY_RESULT_ROUTE_RULES || - policy_result_type == NECP_POLICY_RESULT_USE_NETAGENT) ? TRUE : FALSE; + policy_result_type == NECP_POLICY_RESULT_USE_NETAGENT || + policy_result_type == NECP_POLICY_RESULT_NETAGENT_SCOPED || + policy_result_type == NECP_POLICY_RESULT_SCOPED_DIRECT) ? TRUE : FALSE; u_int32_t condition_length = necp_policy_condition_get_value_length_from_buffer(buffer, length); u_int8_t *condition_value = necp_policy_condition_get_value_pointer_from_buffer(buffer, length); u_int8_t type = necp_policy_condition_get_type_from_buffer(buffer, length); @@ -2190,6 +2173,13 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli } break; } + case NECP_POLICY_CONDITION_AGENT_TYPE: { + if (!(flags & NECP_POLICY_CONDITION_FLAGS_NEGATIVE) && + condition_length >= sizeof(struct necp_policy_condition_agent_type)) { + validated = TRUE; + } + break; + } default: { validated = FALSE; break; @@ -2228,6 +2218,10 @@ necp_policy_route_rule_is_valid(u_int8_t *buffer, u_int32_t length) validated = TRUE; break; } + case NECP_ROUTE_RULE_DENY_LQM_ABORT: { + validated = TRUE; + break; + } default: { validated = FALSE; break; @@ -2658,9 +2652,9 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_ } if (packet != NULL) { - necp_send_policy_id_response(session, NECP_PACKET_TYPE_POLICY_ADD, message_id, policy->id); + necp_send_policy_id_response(session, NECP_PACKET_TYPE_POLICY_ADD, message_id, policy->local_id); } - return (policy->id); + return (policy->local_id); fail: if (policy_result != NULL) { @@ -2815,7 +2809,7 @@ necp_handle_policy_list_all(struct necp_session *session, u_int32_t message_id, LIST_FOREACH(policy, &session->policies, chain) { if (!policy->pending_deletion && cur_policy_index < num_policies) { - cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->id, response, response_size); + 
cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->local_id, response, response_size); cur_policy_index++; } } @@ -2836,22 +2830,17 @@ necp_handle_policy_delete_all(struct necp_session *session, u_int32_t message_id } static necp_policy_id -necp_policy_get_new_id(void) +necp_policy_get_new_id(struct necp_session *session) { - necp_policy_id newid = 0; - - lck_rw_lock_exclusive(&necp_kernel_policy_lock); - - necp_last_policy_id++; - if (necp_last_policy_id < 1) { - necp_last_policy_id = 1; + session->last_policy_id++; + if (session->last_policy_id < 1) { + session->last_policy_id = 1; } - newid = necp_last_policy_id; - lck_rw_done(&necp_kernel_policy_lock); + necp_policy_id newid = session->last_policy_id; if (newid == 0) { - NECPLOG0(LOG_DEBUG, "Allocate policy id failed.\n"); + NECPLOG0(LOG_ERR, "Allocate policy id failed.\n"); return (0); } @@ -3112,6 +3101,10 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, } num_conditions++; } + if (condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) { + condition_tlv_length += sizeof(struct necp_policy_condition_agent_type); + num_conditions++; + } } condition_tlv_length += num_conditions * (sizeof(u_int8_t) + sizeof(u_int32_t)); // These are for the condition TLVs. The space for "value" is already accounted for above. 
@@ -3236,6 +3229,11 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id, cond_buf, condition_tlv_length); } } + if (condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) { + cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_AGENT_TYPE, + sizeof(policy->cond_agent_type), &policy->cond_agent_type, + cond_buf, condition_tlv_length); + } } cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_CONDITION, cond_buf_cursor - cond_buf, cond_buf, tlv_buffer, total_allocated_bytes); @@ -3377,7 +3375,7 @@ necp_policy_create(struct necp_session *session, necp_policy_order order, u_int8 new_policy->route_rules_size = route_rules_array_size; new_policy->result = result; new_policy->result_size = result_size; - new_policy->id = necp_policy_get_new_id(); + new_policy->local_id = necp_policy_get_new_id(session); LIST_INSERT_SORTED_ASCENDING(&session->policies, new_policy, chain, order, tmp_policy); @@ -3399,7 +3397,7 @@ necp_policy_find(struct necp_session *session, necp_policy_id policy_id) } LIST_FOREACH(policy, &session->policies, chain) { - if (policy->id == policy_id) { + if (policy->local_id == policy_id) { return (policy); } } @@ -3610,6 +3608,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli u_int32_t offset = 0; u_int8_t ultimate_result = 0; u_int32_t secondary_result = 0; + struct necp_policy_condition_agent_type cond_agent_type = {}; necp_kernel_policy_result_parameter secondary_result_parameter; memset(&secondary_result_parameter, 0, sizeof(secondary_result_parameter)); u_int32_t cond_last_interface_index = 0; @@ -3862,6 +3861,14 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli socket_ip_conditions = TRUE; break; } + case NECP_POLICY_CONDITION_AGENT_TYPE: { + if (condition_length >= sizeof(cond_agent_type)) { + master_condition_mask |= NECP_KERNEL_CONDITION_AGENT_TYPE; + memcpy(&cond_agent_type, condition_value, sizeof(cond_agent_type)); + 
socket_only_conditions = TRUE; + } + break; + } default: { break; } @@ -3978,7 +3985,8 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } break; } - case NECP_POLICY_RESULT_USE_NETAGENT: { + case NECP_POLICY_RESULT_USE_NETAGENT: + case NECP_POLICY_RESULT_NETAGENT_SCOPED: { uuid_t netagent_uuid; if (necp_policy_get_result_parameter(policy, (u_int8_t *)&netagent_uuid, sizeof(netagent_uuid))) { ultimate_result_parameter.netagent_id = necp_create_uuid_service_id_mapping(netagent_uuid); @@ -4004,6 +4012,10 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } break; } + case NECP_POLICY_RESULT_SCOPED_DIRECT: { + socket_layer_non_id_conditions = TRUE; + break; + } case NECP_POLICY_RESULT_ROUTE_RULES: { if (policy->route_rules != NULL && policy->route_rules_size > 0) { u_int32_t route_rule_id = necp_create_route_rule(&necp_route_rules, policy->route_rules, policy->route_rules_size); @@ -4021,7 +4033,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } if (socket_layer_non_id_conditions) { - necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->id, policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, 
cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, ultimate_result, ultimate_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying socket kernel policy"); @@ -4037,7 +4049,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli if (ip_output_layer_non_id_only) { condition_mask |= NECP_KERNEL_CONDITION_POLICY_ID; } - necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy"); @@ -4048,7 +4060,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } if (ip_output_layer_id_condition) { - necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, cond_ip_output_layer_id, NULL, 0, 0, NULL, NULL, 0, NULL, NULL, 0, ultimate_result, ultimate_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, 
NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, cond_ip_output_layer_id, NULL, 0, 0, NULL, NULL, 0, NULL, NULL, 0, ultimate_result, ultimate_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy"); @@ -4060,7 +4072,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli // Extra policies for IP Output tunnels for when packets loop back if (ip_output_layer_tunnel_condition_from_id) { - necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy"); @@ -4071,7 +4083,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli } if (ip_output_layer_tunnel_condition_from_id) { - necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, 
NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter); + necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter); if (policy_id == 0) { NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy"); @@ -4189,9 +4201,9 @@ necp_kernel_policy_get_new_id(bool socket_level) return (newid); } -#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT) +#define NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | 
NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE) static necp_kernel_policy_id -necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) +necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) 
{ struct necp_kernel_socket_policy *new_kernel_policy = NULL; struct necp_kernel_socket_policy *tmp_kernel_policy = NULL; @@ -4202,7 +4214,6 @@ necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order } memset(new_kernel_policy, 0, sizeof(*new_kernel_policy)); // M_ZERO is not supported for MALLOC_ZONE - new_kernel_policy->parent_policy_id = parent_policy_id; new_kernel_policy->id = necp_kernel_policy_get_new_id(true); new_kernel_policy->order = order; new_kernel_policy->session_order = session_order; @@ -4281,6 +4292,9 @@ necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_PREFIX) { new_kernel_policy->cond_remote_prefix = cond_remote_prefix; } + if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) { + memcpy(&new_kernel_policy->cond_agent_type, cond_agent_type, sizeof(*cond_agent_type)); + } new_kernel_policy->result = result; memcpy(&new_kernel_policy->result_parameter, &result_parameter, sizeof(result_parameter)); @@ -4388,6 +4402,10 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul snprintf(result_string, MAX_RESULT_STRING_LEN, "SocketScoped (%s%d)", ifnet_name(interface), ifnet_unit(interface)); break; } + case NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT: { + snprintf(result_string, MAX_RESULT_STRING_LEN, "ScopedDirect"); + break; + } case NECP_KERNEL_POLICY_RESULT_ROUTE_RULES: { int index = 0; char interface_names[IFXNAMSIZ][MAX_ROUTE_RULE_INTERFACES]; @@ -4498,6 +4516,16 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul snprintf(result_string, MAX_RESULT_STRING_LEN, "UseNetAgent (%s)", found_mapping ? 
uuid_string : "Unknown"); break; } + case NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED: { + bool found_mapping = FALSE; + struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(result_parameter.netagent_id); + if (mapping != NULL) { + uuid_unparse(mapping->uuid, uuid_string); + found_mapping = TRUE; + } + snprintf(result_string, MAX_RESULT_STRING_LEN, "NetAgentScoped (%s)", found_mapping ? uuid_string : "Unknown"); + break; + } case NECP_POLICY_RESULT_TRIGGER: { bool found_mapping = FALSE; struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(result_parameter.service.identifier); @@ -4597,7 +4625,8 @@ necp_kernel_socket_policy_results_overlap(struct necp_kernel_socket_policy *uppe return (TRUE); } else if (upper_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER || upper_policy->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES || - upper_policy->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT) { + upper_policy->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT || + upper_policy->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) { // Filters and route rules never cancel out lower policies return (FALSE); } else if (necp_kernel_socket_result_is_trigger_service_type(upper_policy)) { @@ -4766,6 +4795,11 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic } } + if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE && + memcmp(&compared_policy->cond_agent_type, &policy->cond_agent_type, sizeof(policy->cond_agent_type)) == 0) { + continue; + } + return (TRUE); } @@ -4813,6 +4847,11 @@ necp_kernel_socket_policies_reprocess(void) necp_kernel_application_policies_count++; app_layer_allocation_count++; + if ((kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE)) { + // Agent type conditions only apply to app layer + continue; + } + // Update socket layer bucket mask/counts necp_kernel_socket_policies_condition_mask |= kernel_policy->condition_mask; 
necp_kernel_socket_policies_count++; @@ -4850,7 +4889,19 @@ necp_kernel_socket_policies_reprocess(void) // Fill out maps LIST_FOREACH(kernel_policy, &necp_kernel_socket_policies, chain) { - // Insert pointers into map + // Add app layer policies + if (!necp_kernel_socket_policy_is_unnecessary(kernel_policy, necp_kernel_socket_policies_app_layer_map, app_layer_current_free_index)) { + necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = kernel_policy; + app_layer_current_free_index++; + necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = NULL; + } + + if ((kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE)) { + // Agent type conditions only apply to app layer + continue; + } + + // Add socket policies if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_APP_ID) || kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_APP_ID) { for (app_i = 0; app_i < NECP_KERNEL_SOCKET_POLICIES_MAP_NUM_APP_ID_BUCKETS; app_i++) { @@ -4868,12 +4919,6 @@ necp_kernel_socket_policies_reprocess(void) (necp_kernel_socket_policies_map[app_i])[(bucket_current_free_index[app_i])] = NULL; } } - - if (!necp_kernel_socket_policy_is_unnecessary(kernel_policy, necp_kernel_socket_policies_app_layer_map, app_layer_current_free_index)) { - necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = kernel_policy; - app_layer_current_free_index++; - necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = NULL; - } } necp_kernel_socket_policies_dump_all(); BUMP_KERNEL_SOCKET_POLICIES_GENERATION_COUNT(); @@ -4902,18 +4947,29 @@ necp_kernel_socket_policies_reprocess(void) static u_int32_t necp_get_new_string_id(void) { + static u_int32_t necp_last_string_id = 0; + u_int32_t newid = 0; LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - necp_last_string_id++; - if (necp_last_string_id < 1) { - necp_last_string_id = 1; - } + bool wrapped = FALSE; + do { + necp_last_string_id++; + if 
(necp_last_string_id < 1) { + if (wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free app UUID.\n"); + return (0); + } + necp_last_string_id = 1; + wrapped = TRUE; + } + newid = necp_last_string_id; + } while (necp_lookup_string_with_id_locked(&necp_account_id_list, newid) != NULL); // If already used, keep trying - newid = necp_last_string_id; if (newid == 0) { - NECPLOG0(LOG_DEBUG, "Allocate string id failed.\n"); + NECPLOG0(LOG_ERR, "Allocate string id failed.\n"); return (0); } @@ -5007,42 +5063,57 @@ necp_remove_string_to_id_mapping(struct necp_string_id_mapping_list *list, char return (FALSE); } +#define NECP_FIRST_VALID_ROUTE_RULE_ID 1 +#define NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID UINT16_MAX static u_int32_t -necp_get_new_route_rule_id(void) +necp_get_new_route_rule_id(bool aggregate) { - u_int32_t newid = 0; + static u_int32_t necp_last_route_rule_id = 0; + static u_int32_t necp_last_aggregate_route_rule_id = 0; - LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - - necp_last_route_rule_id++; - if (necp_last_route_rule_id < 1 || necp_last_route_rule_id > UINT16_MAX) { - necp_last_route_rule_id = 1; - } - - newid = necp_last_route_rule_id; - if (newid == 0) { - NECPLOG0(LOG_DEBUG, "Allocate route rule id failed.\n"); - return (0); - } - - return (newid); -} - -static u_int32_t -necp_get_new_aggregate_route_rule_id(void) -{ u_int32_t newid = 0; - LCK_RW_ASSERT(&necp_route_rule_lock, LCK_RW_ASSERT_EXCLUSIVE); + if (!aggregate) { + // Main necp_kernel_policy_lock protects non-aggregate rule IDs + LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - necp_last_aggregate_route_rule_id++; - if (necp_last_aggregate_route_rule_id <= UINT16_MAX) { - necp_last_aggregate_route_rule_id = UINT16_MAX + 1; + bool wrapped = FALSE; + do { + necp_last_route_rule_id++; + if (necp_last_route_rule_id < NECP_FIRST_VALID_ROUTE_RULE_ID || + necp_last_route_rule_id >= NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID) { + if 
(wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free route rule id.\n"); + return (0); + } + necp_last_route_rule_id = NECP_FIRST_VALID_ROUTE_RULE_ID; + wrapped = TRUE; + } + newid = necp_last_route_rule_id; + } while (necp_lookup_route_rule_locked(&necp_route_rules, newid) != NULL); // If already used, keep trying + } else { + // necp_route_rule_lock protects aggregate rule IDs + LCK_RW_ASSERT(&necp_route_rule_lock, LCK_RW_ASSERT_EXCLUSIVE); + + bool wrapped = FALSE; + do { + necp_last_aggregate_route_rule_id++; + if (necp_last_aggregate_route_rule_id < NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID) { + if (wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free aggregate route rule id.\n"); + return (0); + } + necp_last_aggregate_route_rule_id = NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID; + wrapped = TRUE; + } + newid = necp_last_aggregate_route_rule_id; + } while (necp_lookup_route_rule_locked(&necp_route_rules, newid) != NULL); // If already used, keep trying } - newid = necp_last_aggregate_route_rule_id; if (newid == 0) { - NECPLOG0(LOG_DEBUG, "Allocate aggregate route rule id failed.\n"); + NECPLOG0(LOG_ERR, "Allocate route rule ID failed.\n"); return (0); } @@ -5202,7 +5273,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_ MALLOC(new_rule, struct necp_route_rule *, sizeof(struct necp_route_rule), M_NECP, M_WAITOK); if (new_rule != NULL) { memset(new_rule, 0, sizeof(struct necp_route_rule)); - route_rule_id = new_rule->id = necp_get_new_route_rule_id(); + route_rule_id = new_rule->id = necp_get_new_route_rule_id(false); new_rule->default_action = default_action; new_rule->cellular_action = cellular_action; new_rule->wifi_action = wifi_action; @@ -5308,7 +5379,7 @@ necp_create_aggregate_route_rule(u_int32_t *rule_ids) MALLOC(new_rule, struct necp_aggregate_route_rule *, sizeof(struct necp_aggregate_route_rule), M_NECP, M_WAITOK); if (new_rule != NULL) { memset(new_rule, 0, 
sizeof(struct necp_aggregate_route_rule)); - aggregate_route_rule_id = new_rule->id = necp_get_new_aggregate_route_rule_id(); + aggregate_route_rule_id = new_rule->id = necp_get_new_route_rule_id(true); new_rule->id = aggregate_route_rule_id; memcpy(new_rule->rule_ids, rule_ids, (sizeof(u_int32_t) * MAX_AGGREGATE_ROUTE_RULES)); LIST_INSERT_HEAD(&necp_aggregate_route_rules, new_rule, chain); @@ -5319,22 +5390,54 @@ necp_create_aggregate_route_rule(u_int32_t *rule_ids) } #define NECP_NULL_SERVICE_ID 1 +#define NECP_FIRST_VALID_SERVICE_ID 2 +#define NECP_FIRST_VALID_APP_ID UINT16_MAX static u_int32_t -necp_get_new_uuid_id(void) +necp_get_new_uuid_id(bool service) { + static u_int32_t necp_last_service_uuid_id = 0; + static u_int32_t necp_last_app_uuid_id = 0; + u_int32_t newid = 0; LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE); - necp_last_uuid_id++; - if (necp_last_uuid_id < (NECP_NULL_SERVICE_ID + 1)) { - necp_last_uuid_id = (NECP_NULL_SERVICE_ID + 1); + if (service) { + bool wrapped = FALSE; + do { + necp_last_service_uuid_id++; + if (necp_last_service_uuid_id < NECP_FIRST_VALID_SERVICE_ID || + necp_last_service_uuid_id >= NECP_FIRST_VALID_APP_ID) { + if (wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free service UUID.\n"); + return (NECP_NULL_SERVICE_ID); + } + necp_last_service_uuid_id = NECP_FIRST_VALID_SERVICE_ID; + wrapped = TRUE; + } + newid = necp_last_service_uuid_id; + } while (necp_uuid_lookup_uuid_with_service_id_locked(newid) != NULL); // If already used, keep trying + } else { + bool wrapped = FALSE; + do { + necp_last_app_uuid_id++; + if (necp_last_app_uuid_id < NECP_FIRST_VALID_APP_ID) { + if (wrapped) { + // Already wrapped, give up + NECPLOG0(LOG_ERR, "Failed to find a free app UUID.\n"); + return (NECP_NULL_SERVICE_ID); + } + necp_last_app_uuid_id = NECP_FIRST_VALID_APP_ID; + wrapped = TRUE; + } + newid = necp_last_app_uuid_id; + } while (necp_uuid_lookup_uuid_with_app_id_locked(newid) != NULL); 
// If already used, keep trying } - newid = necp_last_uuid_id; - if (newid == 0) { - NECPLOG0(LOG_DEBUG, "Allocate uuid id failed.\n"); - return (0); + if (newid == NECP_NULL_SERVICE_ID) { + NECPLOG0(LOG_ERR, "Allocate uuid ID failed.\n"); + return (NECP_NULL_SERVICE_ID); } return (newid); @@ -5399,7 +5502,7 @@ necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_ MALLOC(new_mapping, struct necp_uuid_id_mapping *, sizeof(*new_mapping), M_NECP, M_WAITOK); if (new_mapping != NULL) { uuid_copy(new_mapping->uuid, uuid); - new_mapping->id = necp_get_new_uuid_id(); + new_mapping->id = necp_get_new_uuid_id(false); new_mapping->refcount = 1; if (uuid_policy_table) { new_mapping->table_refcount = 1; @@ -5520,7 +5623,7 @@ necp_create_uuid_service_id_mapping(uuid_t uuid) MALLOC(new_mapping, struct necp_uuid_id_mapping *, sizeof(*new_mapping), M_NECP, M_WAITOK); if (new_mapping != NULL) { uuid_copy(new_mapping->uuid, uuid); - new_mapping->id = necp_get_new_uuid_id(); + new_mapping->id = necp_get_new_uuid_id(true); new_mapping->refcount = 1; LIST_INSERT_HEAD(&necp_uuid_service_id_list, new_mapping, chain); @@ -5588,7 +5691,7 @@ necp_kernel_socket_policies_update_uuid_table(void) #define NECP_KERNEL_VALID_IP_OUTPUT_CONDITIONS (NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE) static necp_kernel_policy_id -necp_kernel_ip_output_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t 
cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) +necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter) { struct necp_kernel_ip_output_policy *new_kernel_policy = NULL; struct necp_kernel_ip_output_policy *tmp_kernel_policy = NULL; @@ -5599,7 +5702,6 @@ necp_kernel_ip_output_policy_add(necp_policy_id parent_policy_id, necp_policy_or } memset(new_kernel_policy, 0, sizeof(*new_kernel_policy)); // M_ZERO is not supported for MALLOC_ZONE - new_kernel_policy->parent_policy_id = parent_policy_id; new_kernel_policy->id = necp_kernel_policy_get_new_id(false); new_kernel_policy->suborder = suborder; new_kernel_policy->order = order; @@ -5894,12 +5996,17 @@ necp_kernel_ip_output_policies_reprocess(void) necp_kernel_ip_output_policies_condition_mask |= kernel_policy->condition_mask; necp_kernel_ip_output_policies_count++; - // Update bucket counts - if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID)) { - necp_kernel_ip_output_policies_non_id_count++; + /* Update bucket counts: + * Non-id and SKIP policies will be added to all buckets + */ + if 
(!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) || + kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) { for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) { bucket_allocation_counts[i]++; } + } + if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID)) { + necp_kernel_ip_output_policies_non_id_count++; } else { bucket_allocation_counts[NECP_IP_OUTPUT_MAP_ID_TO_BUCKET(kernel_policy->cond_policy_id)]++; } @@ -5921,7 +6028,8 @@ necp_kernel_ip_output_policies_reprocess(void) LIST_FOREACH(kernel_policy, &necp_kernel_ip_output_policies, chain) { // Insert pointers into map - if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID)) { + if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) || + kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) { for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) { if (!necp_kernel_ip_output_policy_is_unnecessary(kernel_policy, necp_kernel_ip_output_policies_map[i], bucket_current_free_index[i])) { (necp_kernel_ip_output_policies_map[i])[(bucket_current_free_index[i])] = kernel_policy; @@ -6074,6 +6182,27 @@ necp_copy_string(char *string, size_t length) return (copied_string); } +static u_int32_t +necp_get_primary_direct_interface_index(void) +{ + u_int32_t interface_index = IFSCOPE_NONE; + + ifnet_head_lock_shared(); + struct ifnet *ordered_interface = NULL; + TAILQ_FOREACH(ordered_interface, &ifnet_ordered_head, if_ordered_link) { + const u_int8_t functional_type = if_functional_type(ordered_interface, TRUE); + if (functional_type != IFRTYPE_FUNCTIONAL_UNKNOWN && + functional_type != IFRTYPE_FUNCTIONAL_LOOPBACK) { + // All known, non-loopback functional types represent direct physical interfaces (Wi-Fi, Cellular, Wired) + interface_index = ordered_interface->if_index; + break; + } + } + ifnet_head_done(); + + return interface_index; +} + static inline void necp_get_parent_cred_result(proc_t proc, struct 
necp_socket_info *info) { @@ -6244,8 +6373,15 @@ necp_application_find_policy_match_internal(proc_t proc, char *domain = NULL; char *account = NULL; +#define NECP_MAX_REQUIRED_AGENTS 16 + u_int32_t num_required_agent_types = 0; + struct necp_client_parameter_netagent_type required_agent_types[NECP_MAX_REQUIRED_AGENTS]; + memset(&required_agent_types, 0, sizeof(required_agent_types)); + u_int32_t netagent_ids[NECP_MAX_NETAGENTS]; + u_int32_t netagent_use_flags[NECP_MAX_NETAGENTS]; memset(&netagent_ids, 0, sizeof(netagent_ids)); + memset(&netagent_use_flags, 0, sizeof(netagent_use_flags)); int netagent_cursor; bool has_checked_delegation_entitlement = FALSE; @@ -6398,6 +6534,17 @@ necp_application_find_policy_match_internal(proc_t proc, if (length >= sizeof(client_flags)) { memcpy(&client_flags, value, sizeof(client_flags)); } + break; + } + case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: { + if (num_required_agent_types >= NECP_MAX_REQUIRED_AGENTS) { + break; + } + if (length >= sizeof(struct necp_client_parameter_netagent_type)) { + memcpy(&required_agent_types[num_required_agent_types], value, sizeof(struct necp_client_parameter_netagent_type)); + num_required_agent_types++; + } + break; } default: { break; @@ -6413,7 +6560,7 @@ necp_application_find_policy_match_internal(proc_t proc, lck_rw_lock_shared(&necp_kernel_policy_lock); necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, proc, &info); - matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, proc); + matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, netagent_use_flags, 
NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, proc, NULL); if (matched_policy) { returned_result->policy_id = matched_policy->id; returned_result->routing_result = matched_policy->result; @@ -6460,7 +6607,7 @@ necp_application_find_policy_match_internal(proc_t proc, mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id); if (mapping != NULL) { uuid_copy(returned_result->netagents[netagent_cursor], mapping->uuid); - returned_result->netagent_flags[netagent_cursor] = netagent_get_flags(mapping->uuid); + returned_result->netagent_use_flags[netagent_cursor] = netagent_use_flags[netagent_cursor]; } } @@ -6470,6 +6617,14 @@ necp_application_find_policy_match_internal(proc_t proc, output_bound_interface = returned_result->routing_result_parameter.scoped_interface_index; } else if (returned_result->routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL) { output_bound_interface = returned_result->routing_result_parameter.tunnel_interface_index; + } else if (returned_result->routing_result == NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT) { + output_bound_interface = necp_get_primary_direct_interface_index(); + if (output_bound_interface == IFSCOPE_NONE) { + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP; + } else { + returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED; + returned_result->routing_result_parameter.scoped_interface_index = output_bound_interface; + } } if (local_addr.sa.sa_len == 0 || @@ -6508,6 +6663,13 @@ necp_application_find_policy_match_internal(proc_t proc, rt = rtalloc1_scoped((struct sockaddr *)&remote_addr, 0, 0, output_bound_interface); + if (remote_addr.sa.sa_family == AF_INET && rt != NULL && + IS_INTF_CLAT46(rt->rt_ifp)) { + rtfree(rt); + rt = NULL; + returned_result->routed_interface_index = 0; + } + if (no_remote_addr && remote_family == 0 && (rt == NULL || rt->rt_ifp == NULL)) { // Route lookup for default IPv4 failed, try IPv6 @@ -6674,6 +6836,10 @@ 
necp_application_find_policy_match_internal(proc_t proc, if (necp_update_qos_marking(rt->rt_ifp, route_rule_id)) { *flags |= NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING; } + + if (IFNET_IS_LOW_POWER(rt->rt_ifp)) { + *flags |= NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER; + } } } @@ -6697,7 +6863,7 @@ necp_application_find_policy_match_internal(proc_t proc, returned_result->routed_interface_index); if (v4Route != NULL) { - if (v4Route->rt_ifp != NULL) { + if (v4Route->rt_ifp != NULL && !IS_INTF_CLAT46(v4Route->rt_ifp)) { *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV4; } rtfree(v4Route); @@ -6707,6 +6873,10 @@ necp_application_find_policy_match_internal(proc_t proc, if (v6Route != NULL) { if (v6Route->rt_ifp != NULL) { *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV6; + + if (ifnet_get_nat64prefix(v6Route->rt_ifp, NULL) == 0) { + *flags |= NECP_CLIENT_RESULT_FLAG_HAS_NAT64; + } } rtfree(v6Route); v6Route = NULL; @@ -6741,7 +6911,7 @@ necp_application_find_policy_match_internal(proc_t proc, } static bool -necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, proc_t proc) +necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc) { if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) { if 
(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { @@ -6914,6 +7084,24 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a } } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) { + bool matches_agent_type = FALSE; + for (u_int32_t i = 0; i < num_required_agent_types; i++) { + struct necp_client_parameter_netagent_type *required_agent_type = &required_agent_types[i]; + if ((strlen(kernel_policy->cond_agent_type.agent_domain) == 0 || + strncmp(required_agent_type->netagent_domain, kernel_policy->cond_agent_type.agent_domain, NETAGENT_DOMAINSIZE) == 0) && + (strlen(kernel_policy->cond_agent_type.agent_type) == 0 || + strncmp(required_agent_type->netagent_type, kernel_policy->cond_agent_type.agent_type, NETAGENT_TYPESIZE) == 0)) { + // Found a required agent that matches + matches_agent_type = TRUE; + break; + } + } + if (!matches_agent_type) { + return (FALSE); + } + } + if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) { bool inRange = necp_is_addr_in_range((struct sockaddr *)local, (struct sockaddr *)&kernel_policy->cond_local_start, (struct sockaddr *)&kernel_policy->cond_local_end); @@ -7097,7 +7285,12 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc } static inline struct necp_kernel_socket_policy * -necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count, proc_t proc) +necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, + necp_kernel_policy_filter *return_filter, u_int32_t 
*return_route_rule_id, + necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, + u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, + struct necp_client_parameter_netagent_type *required_agent_types, + u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id) { struct necp_kernel_socket_policy *matched_policy = NULL; u_int32_t skip_order = 0; @@ -7152,7 +7345,7 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy // Skip this policy continue; } - if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, proc)) { + if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, proc)) { if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER) { if (return_filter && *return_filter == 0) { *return_filter = policy_search_array[i]->result_parameter.filter_control_unit; @@ -7184,13 +7377,21 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy } } continue; - } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT) { + } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT || + policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) { if (return_netagent_array != NULL && netagent_cursor < netagent_array_count) { return_netagent_array[netagent_cursor] = 
policy_search_array[i]->result_parameter.netagent_id; + if (return_netagent_use_flags_array != NULL && + policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) { + return_netagent_use_flags_array[netagent_cursor] |= NECP_AGENT_USE_FLAG_SCOPE; + } netagent_cursor++; if (necp_debug > 1) { - NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) Use Netagent %d", info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, policy_search_array[i]->result_parameter.netagent_id); + NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) %s Netagent %d", + info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, + policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT ? "Use" : "Scope", + policy_search_array[i]->result_parameter.netagent_id); } } continue; @@ -7200,6 +7401,9 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) { skip_order = policy_search_array[i]->result_parameter.skip_policy_order; skip_session_order = policy_search_array[i]->session_order + 1; + if (skip_policy_id) { + *skip_policy_id = policy_search_array[i]->id; + } continue; } @@ -7324,6 +7528,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local (!(inp->inp_flags2 & INP2_WANT_APP_POLICY) && necp_kernel_socket_policies_non_app_count == 0)) { if (necp_drop_all_order > 0) { inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = 0; inp->inp_policyresult.app_id = 0; inp->inp_policyresult.flowhash = 0; @@ -7342,6 +7547,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local if (necp_socket_bypass(override_local_addr, 
override_remote_addr, inp)) { // Mark socket as a pass inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = 0; inp->inp_policyresult.app_id = 0; inp->inp_policyresult.flowhash = 0; @@ -7371,7 +7577,8 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local } // Match socket to policy - matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, current_proc()); + necp_kernel_policy_id skip_policy_id; + matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), &skip_policy_id); // If the socket matched a scoped service policy, mark as Drop if not registered. // This covers the cases in which a service is required (on demand) but hasn't started yet. 
if ((service_action == NECP_KERNEL_POLICY_RESULT_TRIGGER_SCOPED || @@ -7389,6 +7596,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local if (!service_is_registered) { // Mark socket as a drop if service is not registered inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; inp->inp_policyresult.flowhash = flowhash; inp->inp_policyresult.results.filter_control_unit = 0; @@ -7429,6 +7637,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local // Mark socket as a drop if required agent is not active inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; inp->inp_policyresult.flowhash = flowhash; inp->inp_policyresult.results.filter_control_unit = 0; @@ -7449,6 +7658,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local if (matched_policy) { matched_policy_id = matched_policy->id; inp->inp_policyresult.policy_id = matched_policy->id; + inp->inp_policyresult.skip_policy_id = skip_policy_id; inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; inp->inp_policyresult.flowhash = flowhash; inp->inp_policyresult.results.filter_control_unit = filter_control_unit; @@ -7476,6 +7686,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local } else if (necp_drop_all_order > 0) { // Mark socket as a drop if set inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; inp->inp_policyresult.flowhash = flowhash; inp->inp_policyresult.results.filter_control_unit = 
0; @@ -7484,6 +7695,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local } else { // Mark non-matching socket so we don't re-check it inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; + inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH; inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount; inp->inp_policyresult.flowhash = flowhash; inp->inp_policyresult.results.filter_control_unit = filter_control_unit; // We may have matched a filter, so mark it! @@ -7498,7 +7710,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local } static bool -necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote) +necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote) { if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) { if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) { @@ -7527,7 +7739,9 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, } if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) { - if (socket_policy_id != kernel_policy->cond_policy_id) { + necp_kernel_policy_id matched_policy_id = + kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP ? 
socket_skip_policy_id : socket_policy_id; + if (matched_policy_id != kernel_policy->cond_policy_id) { // No match, does not match required id return (FALSE); } @@ -7609,7 +7823,7 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, } static inline struct necp_kernel_ip_output_policy * -necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr) +necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr) { u_int32_t skip_order = 0; u_int32_t skip_session_order = 0; @@ -7640,7 +7854,7 @@ necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, // Skip this policy continue; } - if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr)) { + if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr)) { // Passed all tests, found a match matched_policy = policy_search_array[i]; @@ -7679,6 +7893,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a struct ip *ip = NULL; int hlen = sizeof(struct ip); necp_kernel_policy_id socket_policy_id = NECP_KERNEL_POLICY_ID_NONE; + necp_kernel_policy_id socket_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE; necp_kernel_policy_id matched_policy_id = NECP_KERNEL_POLICY_ID_NONE; struct necp_kernel_ip_output_policy *matched_policy = NULL; u_int16_t protocol = 0; @@ -7700,6 +7915,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int 
flags, struct ip_out_a } socket_policy_id = necp_get_policy_id_from_packet(packet); + socket_skip_policy_id = necp_get_skip_policy_id_from_packet(packet); // Exit early for an empty list // Don't lock. Possible race condition, but we don't want the performance hit. @@ -7782,7 +7998,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a // Match packet to policy lck_rw_lock_shared(&necp_kernel_policy_lock); - matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr); + matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr); if (matched_policy) { matched_policy_id = matched_policy->id; if (result) { @@ -7815,6 +8031,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out int next = -1; int offset = 0; necp_kernel_policy_id socket_policy_id = NECP_KERNEL_POLICY_ID_NONE; + necp_kernel_policy_id socket_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE; necp_kernel_policy_id matched_policy_id = NECP_KERNEL_POLICY_ID_NONE; struct necp_kernel_ip_output_policy *matched_policy = NULL; u_int16_t protocol = 0; @@ -7836,6 +8053,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out } socket_policy_id = necp_get_policy_id_from_packet(packet); + socket_skip_policy_id = necp_get_skip_policy_id_from_packet(packet); // Exit early for an empty list // Don't lock. Possible race condition, but we don't want the performance hit. 
@@ -7915,7 +8133,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out // Match packet to policy lck_rw_lock_shared(&necp_kernel_policy_lock); - matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr); + matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr); if (matched_policy) { matched_policy_id = matched_policy->id; if (result) { @@ -8266,6 +8484,22 @@ necp_socket_update_qos_marking(struct inpcb *inp, struct rtentry *route, struct } } +static bool +necp_route_is_lqm_abort(struct ifnet *ifp, struct ifnet *delegated_ifp) +{ + if (ifp != NULL && + (ifp->if_interface_state.valid_bitmask & IF_INTERFACE_STATE_LQM_STATE_VALID) && + ifp->if_interface_state.lqm_state == IFNET_LQM_THRESH_ABORT) { + return true; + } + if (delegated_ifp != NULL && + (delegated_ifp->if_interface_state.valid_bitmask & IF_INTERFACE_STATE_LQM_STATE_VALID) && + delegated_ifp->if_interface_state.lqm_state == IFNET_LQM_THRESH_ABORT) { + return true; + } + return false; +} + static bool necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t route_rule_id, u_int32_t *interface_type_denied) { @@ -8296,65 +8530,104 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t if (route_rule->exception_if_indices[exception_index] == 0) { break; } - if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->exception_if_actions[exception_index]) == FALSE) { - continue; - } if (route_rule->exception_if_indices[exception_index] == ifp->if_index || (delegated_ifp != NULL && route_rule->exception_if_indices[exception_index] == delegated_ifp->if_index)) { - if (necp_debug > 1) { - NECPLOG(LOG_DEBUG, "Route Allowed: Interface match %d for Rule %d Allowed %d", route_rule->exception_if_indices[exception_index], 
route_rule_id, ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE)); + if (route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_LQM_ABORT) { + const bool lqm_abort = necp_route_is_lqm_abort(ifp, delegated_ifp); + if (necp_debug > 1 && lqm_abort) { + NECPLOG(LOG_DEBUG, "Route Allowed: Interface match %d for Rule %d Deny LQM Abort", + route_rule->exception_if_indices[exception_index], route_rule_id); + } + return false; + } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->exception_if_actions[exception_index])) { + if (necp_debug > 1) { + NECPLOG(LOG_DEBUG, "Route Allowed: Interface match %d for Rule %d Allowed %d", route_rule->exception_if_indices[exception_index], route_rule_id, ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE)); + } + return ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE); } - return ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? 
FALSE : TRUE); } } - if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->cellular_action) && - IFNET_IS_CELLULAR(ifp)) { - if (interface_type_denied != NULL) { - *interface_type_denied = IFRTYPE_FUNCTIONAL_CELLULAR; + if (IFNET_IS_CELLULAR(ifp)) { + if (route_rule->cellular_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) { + if (necp_route_is_lqm_abort(ifp, delegated_ifp)) { + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_CELLULAR; + } + // Mark aggregate action as deny + type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE; + } + } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->cellular_action)) { + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_CELLULAR; + } + if (type_aggregate_action == NECP_ROUTE_RULE_NONE || + (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && + route_rule->cellular_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { + // Deny wins if there is a conflict + type_aggregate_action = route_rule->cellular_action; + } } - if (type_aggregate_action == NECP_ROUTE_RULE_NONE || - (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && - route_rule->cellular_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { - // Deny wins if there is a conflict - type_aggregate_action = route_rule->cellular_action; - } } - if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wifi_action) && - IFNET_IS_WIFI(ifp)) { - if (interface_type_denied != NULL) { - *interface_type_denied = IFRTYPE_FUNCTIONAL_WIFI_INFRA; + if (IFNET_IS_WIFI(ifp)) { + if (route_rule->wifi_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) { + if (necp_route_is_lqm_abort(ifp, delegated_ifp)) { + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_WIFI_INFRA; + } + // Mark aggregate action as deny + type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE; + } + } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wifi_action)) { + if (interface_type_denied != NULL) { + *interface_type_denied = 
IFRTYPE_FUNCTIONAL_WIFI_INFRA; + } + if (type_aggregate_action == NECP_ROUTE_RULE_NONE || + (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && + route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { + // Deny wins if there is a conflict + type_aggregate_action = route_rule->wifi_action; + } } - if (type_aggregate_action == NECP_ROUTE_RULE_NONE || - (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && - route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { - // Deny wins if there is a conflict - type_aggregate_action = route_rule->wifi_action; - } } - if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wired_action) && - IFNET_IS_WIRED(ifp)) { - if (interface_type_denied != NULL) { - *interface_type_denied = IFRTYPE_FUNCTIONAL_WIRED; + if (IFNET_IS_WIRED(ifp)) { + if (route_rule->wired_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) { + if (necp_route_is_lqm_abort(ifp, delegated_ifp)) { + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_WIRED; + } + // Mark aggregate action as deny + type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE; + } + } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wired_action)) { + if (interface_type_denied != NULL) { + *interface_type_denied = IFRTYPE_FUNCTIONAL_WIRED; + } + if (type_aggregate_action == NECP_ROUTE_RULE_NONE || + (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && + route_rule->wired_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { + // Deny wins if there is a conflict + type_aggregate_action = route_rule->wired_action; + } } - if (type_aggregate_action == NECP_ROUTE_RULE_NONE || - (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && - route_rule->wired_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { - // Deny wins if there is a conflict - type_aggregate_action = route_rule->wired_action; - } } - if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->expensive_action) && - IFNET_IS_EXPENSIVE(ifp)) { - if (type_aggregate_action == NECP_ROUTE_RULE_NONE 
|| - (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && - route_rule->expensive_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { - // Deny wins if there is a conflict - type_aggregate_action = route_rule->expensive_action; + if (IFNET_IS_EXPENSIVE(ifp)) { + if (route_rule->expensive_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) { + if (necp_route_is_lqm_abort(ifp, delegated_ifp)) { + // Mark aggregate action as deny + type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE; } + } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->expensive_action)) { + if (type_aggregate_action == NECP_ROUTE_RULE_NONE || + (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE && + route_rule->expensive_action == NECP_ROUTE_RULE_DENY_INTERFACE)) { + // Deny wins if there is a conflict + type_aggregate_action = route_rule->expensive_action; + } + } } if (type_aggregate_action != NECP_ROUTE_RULE_NONE) { @@ -8442,7 +8715,7 @@ necp_netagents_allow_traffic(u_int32_t *netagent_ids, size_t netagent_id_count) } static bool -necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id) +necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id) { u_int32_t verifyifindex = interface ? 
interface->if_index : 0; bool allowed_to_receive = TRUE; @@ -8460,6 +8733,9 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr if (return_policy_id) { *return_policy_id = NECP_KERNEL_POLICY_ID_NONE; } + if (return_skip_policy_id) { + *return_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE; + } if (return_route_rule_id) { *return_route_rule_id = 0; } @@ -8511,6 +8787,9 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr if (return_policy_id) { *return_policy_id = inp->inp_policyresult.policy_id; } + if (return_skip_policy_id) { + *return_skip_policy_id = inp->inp_policyresult.skip_policy_id; + } if (return_route_rule_id) { *return_route_rule_id = inp->inp_policyresult.results.route_rule_id; } @@ -8552,7 +8831,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr goto done; } - struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, current_proc()); + struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), return_skip_policy_id); if (matched_policy != NULL) { if (matched_policy->result == NECP_KERNEL_POLICY_RESULT_DROP || matched_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT || @@ -8601,10 +8880,10 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr } bool -necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, 
u_int32_t *return_route_rule_id) +necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id) { - struct sockaddr_in local; - struct sockaddr_in remote; + struct sockaddr_in local = {}; + struct sockaddr_in remote = {}; local.sin_family = remote.sin_family = AF_INET; local.sin_len = remote.sin_len = sizeof(struct sockaddr_in); local.sin_port = local_port; @@ -8612,14 +8891,15 @@ necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, memcpy(&local.sin_addr, local_addr, sizeof(local.sin_addr)); memcpy(&remote.sin_addr, remote_addr, sizeof(remote.sin_addr)); - return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, return_policy_id, return_route_rule_id)); + return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, + return_policy_id, return_route_rule_id, return_skip_policy_id)); } bool -necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id) +necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id) { - struct sockaddr_in6 local; - struct sockaddr_in6 remote; + struct sockaddr_in6 local = {}; + struct sockaddr_in6 remote = {}; local.sin6_family = remote.sin6_family = AF_INET6; local.sin6_len = remote.sin6_len = sizeof(struct sockaddr_in6); 
local.sin6_port = local_port; @@ -8627,17 +8907,20 @@ necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, memcpy(&local.sin6_addr, local_addr, sizeof(local.sin6_addr)); memcpy(&remote.sin6_addr, remote_addr, sizeof(remote.sin6_addr)); - return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, return_policy_id, return_route_rule_id)); + return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, + return_policy_id, return_route_rule_id, return_skip_policy_id)); } bool -necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id) +necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, + necp_kernel_policy_id *return_skip_policy_id) { - return (necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, NULL, return_policy_id, return_route_rule_id)); + return (necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, NULL, return_policy_id, return_route_rule_id, return_skip_policy_id)); } int -necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, u_int32_t route_rule_id) +necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, u_int32_t route_rule_id, + necp_kernel_policy_id skip_policy_id) { if (packet == NULL || inp == NULL || !(packet->m_flags & M_PKTHDR)) { return (EINVAL); @@ -8660,6 +8943,10 @@ necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel } packet->m_pkthdr.necp_mtag.necp_app_id = inp->inp_policyresult.app_id; + if (skip_policy_id != NECP_KERNEL_POLICY_ID_NONE) { + packet->m_pkthdr.necp_mtag.necp_skip_policy_id = skip_policy_id; + } + return (0); } @@ -8721,6 +9008,16 @@ necp_get_policy_id_from_packet(struct mbuf *packet) return 
(packet->m_pkthdr.necp_mtag.necp_policy_id); } +necp_kernel_policy_id +necp_get_skip_policy_id_from_packet(struct mbuf *packet) +{ + if (packet == NULL || !(packet->m_flags & M_PKTHDR)) { + return (NECP_KERNEL_POLICY_ID_NONE); + } + + return (packet->m_pkthdr.necp_mtag.necp_skip_policy_id); +} + u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet) { @@ -8817,7 +9114,8 @@ necp_socket_should_rescope(struct inpcb *inp) return (FALSE); } - return (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED); + return (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED || + inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT); } u_int @@ -8829,6 +9127,8 @@ necp_socket_get_rescope_if_index(struct inpcb *inp) if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) { return (inp->inp_policyresult.results.result_parameter.scoped_interface_index); + } else if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT) { + return (necp_get_primary_direct_interface_index()); } return (0); diff --git a/bsd/net/necp.h b/bsd/net/necp.h index d4f42f386..8eb159c17 100644 --- a/bsd/net/necp.h +++ b/bsd/net/necp.h @@ -135,6 +135,7 @@ struct necp_packet_header { #define NECP_POLICY_CONDITION_REMOTE_ADDR 13 // necp_policy_condition_addr #define NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE 14 // necp_policy_condition_addr_range #define NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE 15 // necp_policy_condition_addr_range +#define NECP_POLICY_CONDITION_AGENT_TYPE 16 // struct necp_policy_condition_agent_type /* * Results @@ -153,8 +154,10 @@ struct necp_packet_header { #define NECP_POLICY_RESULT_SOCKET_SCOPED 12 // String, interface name #define NECP_POLICY_RESULT_ROUTE_RULES 13 // N/A, must have route rules defined #define NECP_POLICY_RESULT_USE_NETAGENT 14 // netagent uuid_t +#define NECP_POLICY_RESULT_NETAGENT_SCOPED 15 // netagent uuid_t +#define 
NECP_POLICY_RESULT_SCOPED_DIRECT 16 // N/A, scopes to primary physical interface -#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_USE_NETAGENT +#define NECP_POLICY_RESULT_MAX NECP_POLICY_RESULT_SCOPED_DIRECT /* * Route Rules @@ -163,7 +166,8 @@ struct necp_packet_header { #define NECP_ROUTE_RULE_NONE 0 // N/A #define NECP_ROUTE_RULE_DENY_INTERFACE 1 // String, or empty to match all #define NECP_ROUTE_RULE_ALLOW_INTERFACE 2 // String, or empty to match all -#define NECP_ROUTE_RULE_QOS_MARKING 3 // String, or empty to match all +#define NECP_ROUTE_RULE_QOS_MARKING 3 // String, or empty to match all +#define NECP_ROUTE_RULE_DENY_LQM_ABORT 4 // String, or empty to match all #define NECP_ROUTE_RULE_FLAG_CELLULAR 0x01 #define NECP_ROUTE_RULE_FLAG_WIFI 0x02 @@ -212,6 +216,11 @@ struct necp_policy_condition_addr_range { } end_address; } __attribute__((__packed__)); +struct necp_policy_condition_agent_type { + char agent_domain[32]; + char agent_type[32]; +} __attribute__((__packed__)); + #define NECP_SESSION_PRIORITY_UNKNOWN 0 #define NECP_SESSION_PRIORITY_CONTROL 1 #define NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL 2 @@ -238,6 +247,8 @@ typedef union { #define NECP_SERVICE_FLAGS_REGISTERED 0x01 #define NECP_MAX_NETAGENTS 8 +#define NECP_AGENT_USE_FLAG_SCOPE 0x01 + #define NECP_TFO_COOKIE_LEN_MAX 16 struct necp_aggregate_result { necp_kernel_policy_result routing_result; @@ -250,7 +261,7 @@ struct necp_aggregate_result { u_int routed_interface_index; u_int32_t policy_id; uuid_t netagents[NECP_MAX_NETAGENTS]; - u_int32_t netagent_flags[NECP_MAX_NETAGENTS]; + u_int32_t netagent_use_flags[NECP_MAX_NETAGENTS]; u_int8_t mss_recommended; }; @@ -438,6 +449,9 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_ACTION_UPDATE_CACHE 14 // Update heuristics and cache #define NECP_CLIENT_ACTION_COPY_CLIENT_UPDATE 15 // Fetch an updated client for push-mode observer. 
Output: Client id, struct necp_client_observer_update in buffer #define NECP_CLIENT_ACTION_COPY_UPDATED_RESULT 16 // Copy client result only if changed. Input: client_id; Output: result in buffer +#define NECP_CLIENT_ACTION_ADD_FLOW 17 // Add a flow. Input: client_id; Output: struct necp_client_add_flow +#define NECP_CLIENT_ACTION_REMOVE_FLOW 18 // Remove a flow. Input: flow_id, optional struct ifnet_stats_per_flow + #define NECP_CLIENT_PARAMETER_APPLICATION NECP_POLICY_CONDITION_APPLICATION // Requires entitlement #define NECP_CLIENT_PARAMETER_REAL_APPLICATION NECP_POLICY_CONDITION_REAL_APPLICATION // Requires entitlement @@ -463,10 +477,14 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_PARAMETER_REQUIRE_AGENT 112 // uuid_t, network agent UUID #define NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE 113 // struct necp_client_parameter_netagent_type -// "Prefer" will choose an interface with that property, or best otherwise if not found +// "Prefer" will choose an interface with an agent, or best otherwise if not found #define NECP_CLIENT_PARAMETER_PREFER_AGENT 122 // uuid_t, network agent UUID #define NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE 123 // struct necp_client_parameter_netagent_type +// "Avoid" will choose an interface without an agent, or best otherwise if unavoidable +#define NECP_CLIENT_PARAMETER_AVOID_AGENT 124 // uuid_t, network agent UUID +#define NECP_CLIENT_PARAMETER_AVOID_AGENT_TYPE 125 // struct necp_client_parameter_netagent_type + // Use actions with NECP_CLIENT_ACTION_AGENT #define NECP_CLIENT_PARAMETER_TRIGGER_AGENT 130 // uuid_t, network agent UUID #define NECP_CLIENT_PARAMETER_ASSERT_AGENT 131 // uuid_t, network agent UUID @@ -486,6 +504,8 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_PARAMETER_FLAG_ECN_ENABLE 0x0020 // Client is requesting to enable ECN #define NECP_CLIENT_PARAMETER_FLAG_ECN_DISABLE 0x0040 // Client is requesting to disable ECN #define NECP_CLIENT_PARAMETER_FLAG_TFO_ENABLE 0x0080 // Client is requesting to 
enable TFO +#define NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE 0x0100 // Interpret NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE only for primary + // interface, and allow exceptions for multipath or listeners #define NECP_CLIENT_RESULT_CLIENT_ID 1 // uuid_t #define NECP_CLIENT_RESULT_POLICY_RESULT 2 // u_int32_t @@ -495,19 +515,21 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_NETAGENT 6 // struct necp_client_result_netagent #define NECP_CLIENT_RESULT_FLAGS 7 // u_int32_t, see NECP_CLIENT_RESULT_FLAG_* values #define NECP_CLIENT_RESULT_INTERFACE 8 // struct necp_client_result_interface -#define NECP_CLIENT_RESULT_MULTIPATH_INTERFACE 9 // struct necp_client_result_interface +#define NECP_CLIENT_RESULT_INTERFACE_OPTION 9 // struct necp_client_interface_option #define NECP_CLIENT_RESULT_EFFECTIVE_MTU 10 // u_int32_t #define NECP_CLIENT_RESULT_FLOW 11 // TLV array of a single flow's state #define NECP_CLIENT_RESULT_PROTO_CTL_EVENT 12 #define NECP_CLIENT_RESULT_TFO_COOKIE 13 // NECP_TFO_COOKIE_LEN_MAX #define NECP_CLIENT_RESULT_TFO_FLAGS 14 // u_int8_t #define NECP_CLIENT_RESULT_RECOMMENDED_MSS 15 // u_int8_t +#define NECP_CLIENT_RESULT_FLOW_ID 16 // uuid_t #define NECP_CLIENT_RESULT_INTERFACE_TIME_DELTA 17 // u_int32_t, seconds since interface up/down #define NECP_CLIENT_RESULT_NEXUS_INSTANCE 100 // uuid_t #define NECP_CLIENT_RESULT_NEXUS_PORT 101 // u_int16_t #define NECP_CLIENT_RESULT_NEXUS_KEY 102 // uuid_t #define NECP_CLIENT_RESULT_NEXUS_PORT_FLOW_INDEX 103 // u_int32_t +#define NECP_CLIENT_RESULT_NEXUS_FLOW_STATS 104 // struct sk_stats_flow * #define NECP_CLIENT_RESULT_LOCAL_ENDPOINT 200 // struct necp_client_endpoint #define NECP_CLIENT_RESULT_REMOTE_ENDPOINT 201 // struct necp_client_endpoint @@ -528,6 +550,10 @@ typedef struct necp_cache_buffer { #define NECP_CLIENT_RESULT_FLAG_FAST_OPEN_BLOCKED 0x0400 // Fast open should not be used #define NECP_CLIENT_RESULT_FLAG_LINK_QUALITY_ABORT 0x0800 // Link quality is very bad, recommend close 
connections #define NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING 0x1000 // QoS marking is allowed +#define NECP_CLIENT_RESULT_FLAG_HAS_NAT64 0x2000 // Has NAT64 prefix +#define NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER 0x4000 // Interface is in low-power mode + +#define NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6 | NECP_CLIENT_RESULT_FLAG_HAS_NAT64 | NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER) #define NECP_CLIENT_RESULT_FAST_OPEN_SND_PROBE 0x01 // DEPRECATED - Fast open send probe #define NECP_CLIENT_RESULT_FAST_OPEN_RCV_PROBE 0x02 // DEPRECATED - Fast open receive probe @@ -556,6 +582,8 @@ struct necp_interface_details { #define NECP_INTERFACE_FLAG_EXPENSIVE 0x0001 #define NECP_INTERFACE_FLAG_TXSTART 0X0002 #define NECP_INTERFACE_FLAG_NOACKPRI 0x0004 +#define NECP_INTERFACE_FLAG_3CARRIERAGG 0x0008 +#define NECP_INTERFACE_FLAG_IS_LOW_POWER 0x0010 struct necp_client_parameter_netagent_type { char netagent_domain[32]; @@ -572,6 +600,12 @@ struct necp_client_result_interface { u_int32_t index; }; +struct necp_client_interface_option { + u_int32_t interface_index; + u_int32_t interface_generation; + uuid_t nexus_agent; +}; + struct necp_client_endpoint { union { struct sockaddr sa; @@ -596,6 +630,24 @@ struct kev_necp_policies_changed_data { u_int32_t changed_count; // Defaults to 0. 
}; +#define NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS 0x01 // Request a nexus instance upon adding a flow +#define NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID 0x02 // Register the client ID rather than the flow registration ID with network agents + +struct necp_client_flow_stats { + u_int32_t stats_type; // NECP_CLIENT_STATISTICS_TYPE_* + u_int32_t stats_version; // NECP_CLIENT_STATISTICS_TYPE_*_VER + u_int32_t stats_size; + mach_vm_address_t stats_addr; +}; + +struct necp_client_add_flow { + uuid_t agent_uuid; + uuid_t registration_id; + u_int16_t flags; // NECP_CLIENT_FLOW_FLAGS_* + u_int16_t stats_request_count; + struct necp_client_flow_stats stats_requests[0]; +} __attribute__((__packed__)); + struct necp_agent_use_parameters { uuid_t agent_uuid; uint64_t out_use_count; @@ -622,7 +674,6 @@ struct necp_client_observer_update { #include #include #include -#include #include #include #include @@ -739,6 +790,8 @@ typedef u_int32_t necp_app_id; #define NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED NECP_POLICY_RESULT_SOCKET_SCOPED #define NECP_KERNEL_POLICY_RESULT_ROUTE_RULES NECP_POLICY_RESULT_ROUTE_RULES #define NECP_KERNEL_POLICY_RESULT_USE_NETAGENT NECP_POLICY_RESULT_USE_NETAGENT +#define NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED NECP_POLICY_RESULT_NETAGENT_SCOPED +#define NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT NECP_POLICY_RESULT_SCOPED_DIRECT typedef struct { u_int32_t identifier; @@ -764,7 +817,6 @@ enum necp_boolean_state { struct necp_kernel_socket_policy { LIST_ENTRY(necp_kernel_socket_policy) chain; - necp_policy_id parent_policy_id; necp_kernel_policy_id id; necp_policy_order order; u_int32_t session_order; @@ -791,6 +843,7 @@ struct necp_kernel_socket_policy { union necp_sockaddr_union cond_remote_start; // Matches remote IP address (or start) union necp_sockaddr_union cond_remote_end; // Matches IP address range u_int8_t cond_remote_prefix; // Defines subnet + struct necp_policy_condition_agent_type cond_agent_type; necp_kernel_policy_result result; 
necp_kernel_policy_result_parameter result_parameter; @@ -798,7 +851,6 @@ struct necp_kernel_socket_policy { struct necp_kernel_ip_output_policy { LIST_ENTRY(necp_kernel_ip_output_policy) chain; - necp_policy_id parent_policy_id; necp_kernel_policy_id id; necp_policy_order suborder; necp_policy_order order; @@ -829,7 +881,7 @@ struct necp_session_policy { bool applied; // Applied into the kernel table bool pending_deletion; // Waiting to be removed from kernel table bool pending_update; // Policy has been modified since creation/last application - necp_policy_id id; + necp_policy_id local_id; necp_policy_order order; u_int8_t *result; u_int32_t result_size; @@ -861,6 +913,7 @@ struct necp_aggregate_socket_result { struct necp_inpcb_result { u_int32_t app_id; necp_kernel_policy_id policy_id; + necp_kernel_policy_id skip_policy_id; int32_t policy_gencount; u_int32_t flowhash; struct necp_aggregate_socket_result results; @@ -872,7 +925,6 @@ extern errno_t necp_set_socket_attributes(struct socket *so, struct sockopt *sop extern errno_t necp_get_socket_attributes(struct socket *so, struct sockopt *sopt); extern void necp_inpcb_remove_cb(struct inpcb *inp); extern void necp_inpcb_dispose(struct inpcb *inp); -extern void necp_mppcb_dispose(struct mppcb *mpp); extern u_int32_t necp_socket_get_content_filter_control_unit(struct socket *so); @@ -884,19 +936,23 @@ extern u_int necp_socket_get_rescope_if_index(struct inpcb *inp); extern u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t current_mtu); extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, - u_int32_t *return_route_rule_id); + u_int32_t *return_route_rule_id, + necp_kernel_policy_id *return_skip_policy_id); extern bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, - necp_kernel_policy_id *return_policy_id, 
u_int32_t *return_route_rule_id); + necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, + necp_kernel_policy_id *return_skip_policy_id); extern bool necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, - necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id); + necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, + necp_kernel_policy_id *return_skip_policy_id); extern void necp_socket_update_qos_marking(struct inpcb *inp, struct rtentry *route, struct ifnet *interface, u_int32_t route_rule_id); extern int necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, - u_int32_t route_rule_id); + u_int32_t route_rule_id, necp_kernel_policy_id skip_policy_id); extern necp_kernel_policy_id necp_get_policy_id_from_packet(struct mbuf *packet); +extern necp_kernel_policy_id necp_get_skip_policy_id_from_packet(struct mbuf *packet); extern u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet); extern u_int32_t necp_get_route_rule_id_from_packet(struct mbuf *packet); extern int necp_get_app_uuid_from_packet(struct mbuf *packet, @@ -924,9 +980,7 @@ extern bool necp_get_is_keepalive_from_packet(struct mbuf *packet); extern void necp_update_all_clients(void); // Handle general re-evaluate event -extern void necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid); // Cause a single client to get an update event - -extern void necp_client_early_close(uuid_t client_id); // Cause a single client to close stats, etc +extern void necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_t agent_generation); // Cause a single client to get an update event extern void necp_set_client_as_background(proc_t proc, struct fileproc *fp, bool background); // Set all clients for an fp as background or not @@ 
-936,7 +990,7 @@ extern void necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd); // Set extern int necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp); -extern int necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp); +extern int necp_client_assert_bb_radio_manager(uuid_t client_id, bool assert); extern int necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp); @@ -956,7 +1010,7 @@ necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id, #define NECP_FLOWADV_IDX_INVALID UINT32_MAX extern void *necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length, struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, - u_int32_t flow_adv_index, size_t *message_length); + u_int32_t flow_adv_index, void *flow_stats, size_t *message_length); struct necp_client_nexus_parameters { pid_t pid; @@ -971,41 +1025,21 @@ struct necp_client_nexus_parameters { unsigned allow_qos_marking:1; }; -extern int necp_client_copy_parameters(uuid_t client_uuid, struct necp_client_nexus_parameters *parameters); - #define NECP_CLIENT_CBACTION_NONVIABLE 1 #define NECP_CLIENT_CBACTION_VIABLE 2 #define NECP_CLIENT_CBACTION_INITIAL 3 -struct necp_client_flow { - LIST_ENTRY(necp_client_flow) flow_chain; - unsigned invalid : 1; - unsigned nexus : 1; // If true, flow is a nexus; if false, flow is attached to socket - unsigned socket : 1; - unsigned viable : 1; - unsigned requested_nexus : 1; - unsigned assigned : 1; - unsigned has_protoctl_event : 1; - unsigned check_tcp_heuristics : 1; - union { - uuid_t nexus_agent; - struct { - void *socket_handle; - void (*cb)(void *handle, int action, struct necp_client_flow *flow); - }; - } u; - uint32_t interface_index; - uint16_t interface_flags; - uint32_t necp_flow_flags; - struct necp_client_flow_protoctl_event protoctl_event; - union necp_sockaddr_union local_addr; - union 
necp_sockaddr_union remote_addr; +struct necp_client_add_flow_default { + uuid_t agent_uuid; + uuid_t registration_id; + u_int16_t flags; // NECP_CLIENT_FLOW_FLAGS_* + u_int16_t stats_request_count; + struct necp_client_flow_stats stats_requests[1]; +} __attribute__((__packed__)); - size_t assigned_results_length; - u_int8_t *assigned_results; -}; +typedef void (*necp_client_flow_cb)(void *handle, int action, uint32_t interface_index, uint32_t necp_flags, bool *viable); -extern void necp_client_reap_caches(boolean_t); +extern void necp_client_reap_caches(boolean_t purge); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/net/necp_client.c b/bsd/net/necp_client.c index 41e6efaa8..814f2f0be 100644 --- a/bsd/net/necp_client.c +++ b/bsd/net/necp_client.c @@ -145,10 +145,6 @@ extern u_int32_t necp_debug; -// proc_best_name() is declared here in advance of it landing in a header file. -// See comment in kern_proc.c -extern char *proc_best_name(proc_t p); - static int noop_read(struct fileproc *, struct uio *, int, vfs_context_t); static int noop_write(struct fileproc *, struct uio *, int, vfs_context_t); static int noop_ioctl(struct fileproc *, unsigned long, caddr_t, @@ -192,15 +188,17 @@ extern unsigned int get_maxmtu(struct rtentry *); #define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT 0x00040 #define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT 0x00080 #define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT 0x00100 -#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE 0x00200 -#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE 0x00400 -#define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE 0x00800 -#define NECP_PARSED_PARAMETERS_FIELD_FLAGS 0x01000 -#define NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL 0x02000 -#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_PID 0x04000 -#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_UUID 0x08000 -#define NECP_PARSED_PARAMETERS_FIELD_TRAFFIC_CLASS 0x10000 -#define NECP_PARSED_PARAMETERS_FIELD_LOCAL_PORT 0x20000 +#define 
NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT 0x00200 +#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE 0x00400 +#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE 0x00800 +#define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE 0x01000 +#define NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE 0x02000 +#define NECP_PARSED_PARAMETERS_FIELD_FLAGS 0x04000 +#define NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL 0x08000 +#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_PID 0x10000 +#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_UUID 0x20000 +#define NECP_PARSED_PARAMETERS_FIELD_TRAFFIC_CLASS 0x40000 +#define NECP_PARSED_PARAMETERS_FIELD_LOCAL_PORT 0x80000 #define NECP_MAX_PARSED_PARAMETERS 16 struct necp_client_parsed_parameters { @@ -215,9 +213,11 @@ struct necp_client_parsed_parameters { struct necp_client_parameter_netagent_type required_netagent_types[NECP_MAX_PARSED_PARAMETERS]; struct necp_client_parameter_netagent_type prohibited_netagent_types[NECP_MAX_PARSED_PARAMETERS]; struct necp_client_parameter_netagent_type preferred_netagent_types[NECP_MAX_PARSED_PARAMETERS]; + struct necp_client_parameter_netagent_type avoided_netagent_types[NECP_MAX_PARSED_PARAMETERS]; uuid_t required_netagents[NECP_MAX_PARSED_PARAMETERS]; uuid_t prohibited_netagents[NECP_MAX_PARSED_PARAMETERS]; uuid_t preferred_netagents[NECP_MAX_PARSED_PARAMETERS]; + uuid_t avoided_netagents[NECP_MAX_PARSED_PARAMETERS]; u_int16_t ip_protocol; pid_t effective_pid; uuid_t effective_uuid; @@ -226,7 +226,7 @@ struct necp_client_parsed_parameters { static bool necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_parameters, - u_int *return_ifindex); + u_int *return_ifindex, bool *validate_agents); static bool necp_ifnet_matches_local_address(struct ifnet *ifp, struct sockaddr *sa); @@ -234,7 +234,8 @@ necp_ifnet_matches_local_address(struct ifnet *ifp, struct sockaddr *sa); static bool necp_ifnet_matches_parameters(struct ifnet *ifp, struct necp_client_parsed_parameters 
*parsed_parameters, - u_int32_t *preferred_count, bool ignore_require_if); + u_int32_t *preferred_count, + bool secondary_interface); static const struct fileops necp_fd_ops = { .fo_type = DTYPE_NETPOLICY, @@ -254,6 +255,8 @@ struct necp_client_assertion { struct necp_client_flow_header { struct necp_tlv_header outer_header; + struct necp_tlv_header flow_id_tlv_header; + uuid_t flow_id; struct necp_tlv_header flags_tlv_header; u_int32_t flags_value; struct necp_tlv_header interface_tlv_header; @@ -274,10 +277,64 @@ struct necp_client_nexus_flow_header { } __attribute__((__packed__)); +struct necp_client_flow { + LIST_ENTRY(necp_client_flow) flow_chain; + unsigned invalid : 1; + unsigned nexus : 1; // If true, flow is a nexus; if false, flow is attached to socket + unsigned socket : 1; + unsigned viable : 1; + unsigned assigned : 1; + unsigned has_protoctl_event : 1; + unsigned check_tcp_heuristics : 1; + unsigned _reserved : 1; + union { + uuid_t nexus_agent; + struct { + void *socket_handle; + necp_client_flow_cb cb; + }; + } u; + uint32_t interface_index; + uint16_t interface_flags; + uint32_t necp_flow_flags; + struct necp_client_flow_protoctl_event protoctl_event; + union necp_sockaddr_union local_addr; + union necp_sockaddr_union remote_addr; + + size_t assigned_results_length; + u_int8_t *assigned_results; +}; + +struct necp_client_flow_registration { + RB_ENTRY(necp_client_flow_registration) fd_link; + RB_ENTRY(necp_client_flow_registration) global_link; + RB_ENTRY(necp_client_flow_registration) client_link; + LIST_ENTRY(necp_client_flow_registration) collect_stats_chain; + uuid_t registration_id; + u_int32_t flags; + unsigned flow_result_read : 1; + unsigned defunct : 1; + void *interface_handle; + necp_client_flow_cb interface_cb; + struct necp_client *client; + LIST_HEAD(_necp_registration_flow_list, necp_client_flow) flow_list; + u_int64_t last_interface_details __attribute__((aligned(sizeof(u_int64_t)))); +}; + +static int necp_client_flow_id_cmp(struct 
necp_client_flow_registration *flow0, struct necp_client_flow_registration *flow1); + +RB_HEAD(_necp_client_flow_tree, necp_client_flow_registration); +RB_PROTOTYPE_PREV(_necp_client_flow_tree, necp_client_flow_registration, client_link, necp_client_flow_id_cmp); +RB_GENERATE_PREV(_necp_client_flow_tree, necp_client_flow_registration, client_link, necp_client_flow_id_cmp); + +#define NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT 4 +#define NECP_CLIENT_MAX_INTERFACE_OPTIONS 16 + +#define NECP_CLIENT_INTERFACE_OPTION_EXTRA_COUNT (NECP_CLIENT_MAX_INTERFACE_OPTIONS - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) + struct necp_client { RB_ENTRY(necp_client) link; RB_ENTRY(necp_client) global_link; - LIST_ENTRY(necp_client) collect_stats_chain; decl_lck_mtx_data(, lock); decl_lck_mtx_data(, route_lock); @@ -285,10 +342,9 @@ struct necp_client { uuid_t client_id; unsigned result_read : 1; - unsigned flow_result_read : 1; unsigned allow_multiple_flows : 1; + unsigned legacy_client_is_flow : 1; - unsigned defunct : 1; unsigned background : 1; unsigned background_update : 1; unsigned platform_binary : 1; @@ -301,13 +357,18 @@ struct necp_client { u_int16_t ip_protocol; int proc_pid; - LIST_HEAD(_necp_client_flow_list, necp_client_flow) flow_list; + struct _necp_client_flow_tree flow_registrations; LIST_HEAD(_necp_client_assertion_list, necp_client_assertion) assertion_list; struct rtentry *current_route; - void *interface_handle; - void (*interface_cb)(void *handle, int action, struct necp_client_flow *flow); + struct necp_client_interface_option interface_options[NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT]; + struct necp_client_interface_option *extra_interface_options; + u_int8_t interface_option_count; // Number in interface_options + extra_interface_options + + struct necp_client_result_netagent failed_trigger_agent; + + void *agent_handle; size_t parameters_length; u_int8_t parameters[0]; @@ -331,18 +392,19 @@ necp_client_add_assertion(struct necp_client *client, uuid_t 
netagent_uuid); static bool necp_client_remove_assertion(struct necp_client *client, uuid_t netagent_uuid); -LIST_HEAD(_necp_client_list, necp_client); -static struct _necp_client_list necp_collect_stats_client_list; +LIST_HEAD(_necp_flow_registration_list, necp_client_flow_registration); +static struct _necp_flow_registration_list necp_collect_stats_flow_list; -struct necp_client_defunct { - LIST_ENTRY(necp_client_defunct) chain; +struct necp_flow_defunct { + LIST_ENTRY(necp_flow_defunct) chain; - uuid_t client_id; + uuid_t flow_id; uuid_t nexus_agent; + void *agent_handle; int proc_pid; }; -LIST_HEAD(_necp_client_defunct_list, necp_client_defunct); +LIST_HEAD(_necp_flow_defunct_list, necp_flow_defunct); static int necp_client_id_cmp(struct necp_client *client0, struct necp_client *client1); @@ -354,7 +416,16 @@ RB_HEAD(_necp_client_global_tree, necp_client); RB_PROTOTYPE_PREV(_necp_client_global_tree, necp_client, global_link, necp_client_id_cmp); RB_GENERATE_PREV(_necp_client_global_tree, necp_client, global_link, necp_client_id_cmp); +RB_HEAD(_necp_fd_flow_tree, necp_client_flow_registration); +RB_PROTOTYPE_PREV(_necp_fd_flow_tree, necp_client_flow_registration, fd_link, necp_client_flow_id_cmp); +RB_GENERATE_PREV(_necp_fd_flow_tree, necp_client_flow_registration, fd_link, necp_client_flow_id_cmp); + +RB_HEAD(_necp_client_flow_global_tree, necp_client_flow_registration); +RB_PROTOTYPE_PREV(_necp_client_flow_global_tree, necp_client_flow_registration, global_link, necp_client_flow_id_cmp); +RB_GENERATE_PREV(_necp_client_flow_global_tree, necp_client_flow_registration, global_link, necp_client_flow_id_cmp); + static struct _necp_client_global_tree necp_client_global_tree; +static struct _necp_client_flow_global_tree necp_client_flow_global_tree; struct necp_client_update { TAILQ_ENTRY(necp_client_update) chain; @@ -366,10 +437,15 @@ struct necp_client_update { }; +#define NAIF_ATTACHED 0x1 // arena is attached to list +#define NAIF_REDIRECT 0x2 // arena mmap has 
been redirected +#define NAIF_DEFUNCT 0x4 // arena is now defunct + struct necp_fd_data { u_int8_t necp_fd_type; LIST_ENTRY(necp_fd_data) chain; struct _necp_client_tree clients; + struct _necp_fd_flow_tree flows; TAILQ_HEAD(_necp_client_update_list, necp_client_update) update_list; int update_count; int flags; @@ -392,11 +468,17 @@ static LIST_HEAD(_necp_fd_observer_list, necp_fd_data) necp_fd_observer_list; static unsigned int necp_client_fd_size; /* size of zone element */ static struct zone *necp_client_fd_zone; /* zone for necp_fd_data */ -#define NECP_FLOW_ZONE_MAX 512 -#define NECP_FLOW_ZONE_NAME "necp.flow" +#define NECP_FLOW_ZONE_NAME "necp.flow" +#define NECP_FLOW_REGISTRATION_ZONE_NAME "necp.flowregistration" static unsigned int necp_flow_size; /* size of necp_client_flow */ -static struct mcache *necp_flow_cache; /* cache for necp_client_flow */ +static struct mcache *necp_flow_cache; /* cache for necp_client_flow */ + +static unsigned int necp_flow_registration_size; /* size of necp_client_flow_registration */ +static struct mcache *necp_flow_registration_cache; /* cache for necp_client_flow_registration */ + +#define NECP_ARENA_INFO_ZONE_MAX 128 +#define NECP_ARENA_INFO_ZONE_NAME "necp.arenainfo" static lck_grp_attr_t *necp_fd_grp_attr = NULL; @@ -406,6 +488,7 @@ static lck_grp_t *necp_fd_mtx_grp = NULL; decl_lck_rw_data(static, necp_fd_lock); decl_lck_rw_data(static, necp_observer_lock); decl_lck_rw_data(static, necp_client_tree_lock); +decl_lck_rw_data(static, necp_flow_tree_lock); decl_lck_rw_data(static, necp_collect_stats_list_lock); #define NECP_STATS_LIST_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_collect_stats_list_lock) @@ -415,6 +498,12 @@ decl_lck_rw_data(static, necp_collect_stats_list_lock); #define NECP_CLIENT_TREE_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_client_tree_lock) #define NECP_CLIENT_TREE_LOCK_SHARED() lck_rw_lock_shared(&necp_client_tree_lock) #define NECP_CLIENT_TREE_UNLOCK() lck_rw_done(&necp_client_tree_lock) +#define 
NECP_CLIENT_TREE_ASSERT_LOCKED() LCK_RW_ASSERT(&necp_client_tree_lock, LCK_RW_ASSERT_HELD) + +#define NECP_FLOW_TREE_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_flow_tree_lock) +#define NECP_FLOW_TREE_LOCK_SHARED() lck_rw_lock_shared(&necp_flow_tree_lock) +#define NECP_FLOW_TREE_UNLOCK() lck_rw_done(&necp_flow_tree_lock) +#define NECP_FLOW_TREE_ASSERT_LOCKED() LCK_RW_ASSERT(&necp_flow_tree_lock, LCK_RW_ASSERT_HELD) #define NECP_FD_LIST_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_fd_lock) #define NECP_FD_LIST_LOCK_SHARED() lck_rw_lock_shared(&necp_fd_lock) @@ -428,7 +517,8 @@ decl_lck_rw_data(static, necp_collect_stats_list_lock); // Take NECP_FD_LIST_LOCK when accessing or modifying the necp_fd_list // Take NECP_CLIENT_TREE_LOCK when accessing or modifying the necp_client_global_tree -// Take NECP_STATS_LIST_LOCK when accessing or modifying the necp_collect_stats_client_list +// Take NECP_FLOW_TREE_LOCK when accessing or modifying the necp_client_flow_global_tree +// Take NECP_STATS_LIST_LOCK when accessing or modifying the necp_collect_stats_flow_list // Take NECP_FD_LOCK when accessing or modifying an necp_fd_data entry // Take NECP_CLIENT_LOCK when accessing or modifying a single necp_client // Take NECP_CLIENT_ROUTE_LOCK when accessing or modifying a client's route @@ -438,8 +528,9 @@ decl_lck_rw_data(static, necp_collect_stats_list_lock); // 2. NECP_FD_LOCK (any) // 3. NECP_CLIENT_TREE_LOCK // 4. NECP_CLIENT_LOCK (any) -// 5. NECP_STATS_LIST_LOCK -// 6. NECP_CLIENT_ROUTE_LOCK (any) +// 5. NECP_FLOW_TREE_LOCK +// 6. NECP_STATS_LIST_LOCK +// 7. 
NECP_CLIENT_ROUTE_LOCK (any) static thread_call_t necp_client_update_tcall; @@ -489,6 +580,19 @@ necp_fd_notify(struct necp_fd_data *fd_data, bool locked) } } +static inline bool +necp_client_has_unread_flows(struct necp_client *client) +{ + NECP_CLIENT_ASSERT_LOCKED(client); + struct necp_client_flow_registration *flow_registration = NULL; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + if (!flow_registration->flow_result_read) { + return true; + } + } + return false; +} + static int necp_fd_poll(struct necp_fd_data *fd_data, int events, void *wql, struct proc *p, int is_kevent) { @@ -508,7 +612,7 @@ necp_fd_poll(struct necp_fd_data *fd_data, int events, void *wql, struct proc *p bool has_unread_clients = FALSE; RB_FOREACH(client, _necp_client_tree, &fd_data->clients) { NECP_CLIENT_LOCK(client); - if (!client->result_read || !client->flow_result_read) { + if (!client->result_read || necp_client_has_unread_flows(client)) { has_unread_clients = TRUE; } NECP_CLIENT_UNLOCK(client); @@ -526,14 +630,96 @@ necp_fd_poll(struct necp_fd_data *fd_data, int events, void *wql, struct proc *p return (revents); } +static inline void +necp_generate_client_id(uuid_t client_id, bool is_flow) +{ + uuid_generate_random(client_id); + + if (is_flow) { + client_id[9] |= 0x01; + } else { + client_id[9] &= ~0x01; + } +} + +static inline bool +necp_client_id_is_flow(uuid_t client_id) +{ + return (client_id[9] & 0x01); +} + static struct necp_client * -necp_client_fd_find_client_and_lock(struct necp_fd_data *client_fd, uuid_t client_id) +necp_find_client_and_lock(uuid_t client_id) +{ + NECP_CLIENT_TREE_ASSERT_LOCKED(); + + struct necp_client *client = NULL; + + if (necp_client_id_is_flow(client_id)) { + NECP_FLOW_TREE_LOCK_SHARED(); + struct necp_client_flow_registration find; + uuid_copy(find.registration_id, client_id); + struct necp_client_flow_registration *flow = RB_FIND(_necp_client_flow_global_tree, &necp_client_flow_global_tree, &find); + if 
(flow != NULL) { + client = flow->client; + } + NECP_FLOW_TREE_UNLOCK(); + } else { + struct necp_client find; + uuid_copy(find.client_id, client_id); + client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find); + } + + if (client != NULL) { + NECP_CLIENT_LOCK(client); + } + + return (client); +} + +static struct necp_client_flow_registration * +necp_client_find_flow(struct necp_client *client, uuid_t flow_id) +{ + NECP_CLIENT_ASSERT_LOCKED(client); + struct necp_client_flow_registration *flow = NULL; + + if (necp_client_id_is_flow(flow_id)) { + struct necp_client_flow_registration find; + uuid_copy(find.registration_id, flow_id); + flow = RB_FIND(_necp_client_flow_tree, &client->flow_registrations, &find); + } else { + flow = RB_ROOT(&client->flow_registrations); + } + + return (flow); +} + +static struct necp_client * +necp_client_fd_find_client_unlocked(struct necp_fd_data *client_fd, uuid_t client_id) { - struct necp_client find; NECP_FD_ASSERT_LOCKED(client_fd); - uuid_copy(find.client_id, client_id); - struct necp_client *client = RB_FIND(_necp_client_tree, &client_fd->clients, &find); + struct necp_client *client = NULL; + + if (necp_client_id_is_flow(client_id)) { + struct necp_client_flow_registration find; + uuid_copy(find.registration_id, client_id); + struct necp_client_flow_registration *flow = RB_FIND(_necp_fd_flow_tree, &client_fd->flows, &find); + if (flow != NULL) { + client = flow->client; + } + } else { + struct necp_client find; + uuid_copy(find.client_id, client_id); + client = RB_FIND(_necp_client_tree, &client_fd->clients, &find); + } + + return (client); +} +static struct necp_client * +necp_client_fd_find_client_and_lock(struct necp_fd_data *client_fd, uuid_t client_id) +{ + struct necp_client *client = necp_client_fd_find_client_unlocked(client_fd, client_id); if (client != NULL) { NECP_CLIENT_LOCK(client); } @@ -547,6 +733,12 @@ necp_client_id_cmp(struct necp_client *client0, struct necp_client *client1) return 
(uuid_compare(client0->client_id, client1->client_id)); } +static inline int +necp_client_flow_id_cmp(struct necp_client_flow_registration *flow0, struct necp_client_flow_registration *flow1) +{ + return (uuid_compare(flow0->registration_id, flow1->registration_id)); +} + static int necpop_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx) { @@ -630,8 +822,6 @@ necp_fd_knrtouch(struct knote *kn, struct kevent_internal_s *kev) fd_data = (struct necp_fd_data *)kn->kn_hook; NECP_FD_LOCK(fd_data); - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; revents = necp_fd_poll(fd_data, POLLIN, NULL, current_proc(), 1); NECP_FD_UNLOCK(fd_data); @@ -681,58 +871,47 @@ necpop_kqfilter(struct fileproc *fp, struct knote *kn, return ((revents & POLLIN) != 0); } +#define INTERFACE_FLAGS_SHIFT 32 +#define INTERFACE_FLAGS_MASK 0xffff +#define INTERFACE_INDEX_SHIFT 0 +#define INTERFACE_INDEX_MASK 0xffffffff -static bool -necp_set_client_defunct(struct necp_client *client) +static uint64_t +combine_interface_details(uint32_t interface_index, uint16_t interface_flags) { - bool updated = FALSE; - u_int32_t flags = 0; - u_int32_t value_size = 0; - - client->defunct = TRUE; - - u_int8_t *flags_pointer = necp_buffer_get_tlv_value(client->result, 0, &value_size); - if (flags_pointer && value_size == sizeof(flags)) { - memcpy(&flags, flags_pointer, value_size); - - flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT; - - (void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS, - sizeof(flags), &flags, &updated, client->result, sizeof(client->result)); - - if (updated) { - client->result_read = FALSE; - } - } - - return (updated); + return (((uint64_t)interface_flags & INTERFACE_FLAGS_MASK) << INTERFACE_FLAGS_SHIFT | + ((uint64_t)interface_index & INTERFACE_INDEX_MASK) << INTERFACE_INDEX_SHIFT); } + static void -necp_defunct_client_for_policy(struct necp_client *client, - struct _necp_client_defunct_list *defunct_list) 
+necp_defunct_flow_registration(struct necp_client *client, + struct necp_client_flow_registration *flow_registration, + struct _necp_flow_defunct_list *defunct_list) { NECP_CLIENT_ASSERT_LOCKED(client); - - if (!client->defunct) { + + if (!flow_registration->defunct) { bool needs_defunct = false; struct necp_client_flow *search_flow = NULL; - LIST_FOREACH(search_flow, &client->flow_list, flow_chain) { + LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) { if (search_flow->nexus && - !uuid_is_null(search_flow->u.nexus_agent) && - search_flow->requested_nexus) { + !uuid_is_null(search_flow->u.nexus_agent)) { // Save defunct values for the nexus if (defunct_list != NULL) { // Sleeping alloc won't fail; copy only what's necessary - struct necp_client_defunct *client_defunct = _MALLOC(sizeof (struct necp_client_defunct), - M_NECP, M_WAITOK | M_ZERO); - uuid_copy(client_defunct->nexus_agent, search_flow->u.nexus_agent); - uuid_copy(client_defunct->client_id, client->client_id); - client_defunct->proc_pid = client->proc_pid; + struct necp_flow_defunct *flow_defunct = _MALLOC(sizeof (struct necp_flow_defunct), + M_NECP, M_WAITOK | M_ZERO); + uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent); + uuid_copy(flow_defunct->flow_id, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ? 
+ client->client_id : + flow_registration->registration_id)); + flow_defunct->proc_pid = client->proc_pid; + flow_defunct->agent_handle = client->agent_handle; // Add to the list provided by caller - LIST_INSERT_HEAD(defunct_list, client_defunct, chain); + LIST_INSERT_HEAD(defunct_list, flow_defunct, chain); } needs_defunct = true; @@ -740,12 +919,25 @@ necp_defunct_client_for_policy(struct necp_client *client, } if (needs_defunct) { + // Only set defunct if there was some assigned flow - client->defunct = true; + flow_registration->defunct = true; } } } +static void +necp_defunct_client_for_policy(struct necp_client *client, + struct _necp_flow_defunct_list *defunct_list) +{ + NECP_CLIENT_ASSERT_LOCKED(client); + + struct necp_client_flow_registration *flow_registration = NULL; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + necp_defunct_flow_registration(client, flow_registration, defunct_list); + } +} + static void necp_client_free(struct necp_client *client) { @@ -753,6 +945,9 @@ necp_client_free(struct necp_client *client) NECP_CLIENT_UNLOCK(client); + FREE(client->extra_interface_options, M_NECP); + client->extra_interface_options = NULL; + lck_mtx_destroy(&client->route_lock, necp_fd_mtx_grp); lck_mtx_destroy(&client->lock, necp_fd_mtx_grp); @@ -927,32 +1122,26 @@ necp_client_update_observer_remove(struct necp_client *client) } static void -necp_destroy_client(struct necp_client *client, pid_t pid, bool abort) +necp_destroy_client_flow_registration(struct necp_client *client, + struct necp_client_flow_registration *flow_registration, + pid_t pid, bool abort) { - NECP_CLIENT_ASSERT_UNLOCKED(client); - - necp_client_update_observer_remove(client); - - NECP_CLIENT_LOCK(client); + NECP_CLIENT_ASSERT_LOCKED(client); - // Free route - NECP_CLIENT_ROUTE_LOCK(client); - if (client->current_route != NULL) { - rtfree(client->current_route); - client->current_route = NULL; - } - NECP_CLIENT_ROUTE_UNLOCK(client); - // Remove flow 
assignments struct necp_client_flow *search_flow = NULL; struct necp_client_flow *temp_flow = NULL; - LIST_FOREACH_SAFE(search_flow, &client->flow_list, flow_chain, temp_flow) { + LIST_FOREACH_SAFE(search_flow, &flow_registration->flow_list, flow_chain, temp_flow) { if (search_flow->nexus && - !uuid_is_null(search_flow->u.nexus_agent) && - search_flow->requested_nexus) { + !uuid_is_null(search_flow->u.nexus_agent)) { // Note that if we had defuncted the client earlier, this would result in a harmless ENOENT - int netagent_error = netagent_client_message(search_flow->u.nexus_agent, client->client_id, pid, - abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS : NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS); + int netagent_error = netagent_client_message(search_flow->u.nexus_agent, + ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ? + client->client_id : + flow_registration->registration_id), + pid, client->agent_handle, + (abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS : + NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS)); if (netagent_error != 0 && netagent_error != ENOENT) { NECPLOG(LOG_ERR, "necp_client_remove close nexus error (%d)", netagent_error); } @@ -971,11 +1160,42 @@ necp_destroy_client(struct necp_client *client, pid_t pid, bool abort) mcache_free(necp_flow_cache, search_flow); } + RB_REMOVE(_necp_client_flow_tree, &client->flow_registrations, flow_registration); + flow_registration->client = NULL; + + mcache_free(necp_flow_registration_cache, flow_registration); +} + +static void +necp_destroy_client(struct necp_client *client, pid_t pid, bool abort) +{ + NECP_CLIENT_ASSERT_UNLOCKED(client); + + necp_client_update_observer_remove(client); + + NECP_CLIENT_LOCK(client); + + // Free route + NECP_CLIENT_ROUTE_LOCK(client); + if (client->current_route != NULL) { + rtfree(client->current_route); + client->current_route = NULL; + } + NECP_CLIENT_ROUTE_UNLOCK(client); + + // Remove flow assignments + struct necp_client_flow_registration *flow_registration = NULL; + struct 
necp_client_flow_registration *temp_flow_registration = NULL; + RB_FOREACH_SAFE(flow_registration, _necp_client_flow_tree, &client->flow_registrations, temp_flow_registration) { + necp_destroy_client_flow_registration(client, flow_registration, pid, abort); + } + // Remove agent assertions struct necp_client_assertion *search_assertion = NULL; struct necp_client_assertion *temp_assertion = NULL; LIST_FOREACH_SAFE(search_assertion, &client->assertion_list, assertion_chain, temp_assertion) { - int netagent_error = netagent_client_message(search_assertion->asserted_netagent, client->client_id, pid, NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT); + int netagent_error = netagent_client_message(search_assertion->asserted_netagent, client->client_id, pid, + client->agent_handle, NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT); if (netagent_error != 0) { NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_client_remove unassert agent error (%d)", netagent_error); @@ -1018,6 +1238,16 @@ necpop_close(struct fileglob *fg, vfs_context_t ctx) NECP_FD_LOCK(fd_data); pid_t pid = fd_data->proc_pid; + + struct necp_client_flow_registration *flow_registration = NULL; + struct necp_client_flow_registration *temp_flow_registration = NULL; + RB_FOREACH_SAFE(flow_registration, _necp_fd_flow_tree, &fd_data->flows, temp_flow_registration) { + NECP_FLOW_TREE_LOCK_EXCLUSIVE(); + RB_REMOVE(_necp_client_flow_global_tree, &necp_client_flow_global_tree, flow_registration); + NECP_FLOW_TREE_UNLOCK(); + RB_REMOVE(_necp_fd_flow_tree, &fd_data->flows, flow_registration); + } + struct necp_client *client = NULL; struct necp_client *temp_client = NULL; RB_FOREACH_SAFE(client, _necp_client_tree, &fd_data->clients, temp_client) { @@ -1089,75 +1319,143 @@ necp_find_fd_data(int fd, struct necp_fd_data **fd_data) } *fd_data = (struct necp_fd_data *)fp->f_fglob->fg_data; + if ((*fd_data)->necp_fd_type != necp_fd_type_client) { + // Not a client fd, ignore + error = EINVAL; + goto done; + } + done: 
proc_fdunlock(p); return (error); } -static void -necp_client_add_socket_flow(struct necp_client *client, struct inpcb *inp) -{ - struct necp_client_flow *new_flow = mcache_alloc(necp_flow_cache, MCR_SLEEP); - if (new_flow == NULL) { - NECPLOG0(LOG_ERR, "Failed to allocate socket flow"); - return; - } - - memset(new_flow, 0, sizeof(*new_flow)); - - new_flow->socket = TRUE; - new_flow->u.socket_handle = inp; - new_flow->u.cb = inp->necp_cb; - - OSIncrementAtomic(&necp_socket_flow_count); - - LIST_INSERT_HEAD(&client->flow_list, new_flow, flow_chain); -} - -static void -necp_client_add_interface_flow(struct necp_client *client, uint32_t interface_index) +static struct necp_client_flow * +necp_client_add_interface_flow(struct necp_client_flow_registration *flow_registration, + uint32_t interface_index) { struct necp_client_flow *new_flow = mcache_alloc(necp_flow_cache, MCR_SLEEP); if (new_flow == NULL) { NECPLOG0(LOG_ERR, "Failed to allocate interface flow"); - return; + return NULL; } memset(new_flow, 0, sizeof(*new_flow)); // Neither nexus nor socket new_flow->interface_index = interface_index; - new_flow->u.socket_handle = client->interface_handle; - new_flow->u.cb = client->interface_cb; + new_flow->u.socket_handle = flow_registration->interface_handle; + new_flow->u.cb = flow_registration->interface_cb; OSIncrementAtomic(&necp_if_flow_count); - LIST_INSERT_HEAD(&client->flow_list, new_flow, flow_chain); + LIST_INSERT_HEAD(&flow_registration->flow_list, new_flow, flow_chain); + + return new_flow; } -static void -necp_client_add_interface_flow_if_needed(struct necp_client *client, uint32_t interface_index) +static struct necp_client_flow * +necp_client_add_interface_flow_if_needed(struct necp_client *client, + struct necp_client_flow_registration *flow_registration, + uint32_t interface_index) { if (!client->allow_multiple_flows || interface_index == IFSCOPE_NONE) { // Interface not set, or client not allowed to use this mode - return; + return NULL; } struct 
necp_client_flow *flow = NULL; - LIST_FOREACH(flow, &client->flow_list, flow_chain) { + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { if (!flow->nexus && !flow->socket && flow->interface_index == interface_index) { // Already have the flow flow->invalid = FALSE; + flow->u.socket_handle = flow_registration->interface_handle; + flow->u.cb = flow_registration->interface_cb; + return NULL; + } + } + return necp_client_add_interface_flow(flow_registration, interface_index); +} + +static void +necp_client_add_interface_option_if_needed(struct necp_client *client, + uint32_t interface_index, + uint32_t interface_generation, + uuid_t *nexus_agent) +{ + if (interface_index == IFSCOPE_NONE || + (client->interface_option_count != 0 && !client->allow_multiple_flows)) { + // Interface not set, or client not allowed to use this mode + return; + } + + if (client->interface_option_count >= NECP_CLIENT_MAX_INTERFACE_OPTIONS) { + // Cannot take any more interface options + return; + } - flow->u.socket_handle = client->interface_handle; - flow->u.cb = client->interface_cb; - return; + // Check if already present + for (u_int32_t option_i = 0; option_i < client->interface_option_count; option_i++) { + if (option_i < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) { + struct necp_client_interface_option *option = &client->interface_options[option_i]; + if (option->interface_index == interface_index) { + if (nexus_agent == NULL) { + return; + } + if (uuid_compare(option->nexus_agent, *nexus_agent) == 0) { + return; + } + if (uuid_is_null(option->nexus_agent)) { + uuid_copy(option->nexus_agent, *nexus_agent); + return; + } + // If we get to this point, this is a new nexus flow + } + } else { + struct necp_client_interface_option *option = &client->extra_interface_options[option_i - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT]; + if (option->interface_index == interface_index) { + if (nexus_agent == NULL) { + return; + } + if (uuid_compare(option->nexus_agent, *nexus_agent) == 
0) { + return; + } + if (uuid_is_null(option->nexus_agent)) { + uuid_copy(option->nexus_agent, *nexus_agent); + return; + } + // If we get to this point, this is a new nexus flow + } } } - necp_client_add_interface_flow(client, interface_index); + // Add a new entry + if (client->interface_option_count < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) { + // Add to static + struct necp_client_interface_option *option = &client->interface_options[client->interface_option_count]; + option->interface_index = interface_index; + option->interface_generation = interface_generation; + if (nexus_agent != NULL) { + uuid_copy(option->nexus_agent, *nexus_agent); + } + client->interface_option_count++; + } else { + // Add to extra + if (client->extra_interface_options == NULL) { + client->extra_interface_options = _MALLOC(sizeof(struct necp_client_interface_option) * NECP_CLIENT_INTERFACE_OPTION_EXTRA_COUNT, M_NECP, M_WAITOK | M_ZERO); + } + if (client->extra_interface_options != NULL) { + struct necp_client_interface_option *option = &client->extra_interface_options[client->interface_option_count - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT]; + option->interface_index = interface_index; + option->interface_generation = interface_generation; + if (nexus_agent != NULL) { + uuid_copy(option->nexus_agent, *nexus_agent); + } + client->interface_option_count++; + } + } } static bool @@ -1179,66 +1477,110 @@ necp_client_flow_is_viable(proc_t proc, struct necp_client *client, result.routing_result != NECP_KERNEL_POLICY_RESULT_DROP); } +static void +necp_flow_add_interface_flows(proc_t proc, + struct necp_client *client, + struct necp_client_flow_registration *flow_registration, + bool send_initial) +{ + // Traverse all interfaces and add a tracking flow if needed + for (u_int32_t option_i = 0; option_i < client->interface_option_count; option_i++) { + if (option_i < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) { + struct necp_client_interface_option *option = 
&client->interface_options[option_i]; + struct necp_client_flow *flow = necp_client_add_interface_flow_if_needed(client, flow_registration, option->interface_index); + if (flow != NULL && send_initial) { + flow->viable = necp_client_flow_is_viable(proc, client, flow); + if (flow->viable && flow->u.cb) { + bool viable = flow->viable; + flow->u.cb(flow_registration->interface_handle, NECP_CLIENT_CBACTION_INITIAL, flow->interface_index, flow->necp_flow_flags, &viable); + flow->viable = viable; + } + } + } else { + struct necp_client_interface_option *option = &client->extra_interface_options[option_i - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT]; + struct necp_client_flow *flow = necp_client_add_interface_flow_if_needed(client, flow_registration, option->interface_index); + if (flow != NULL && send_initial) { + flow->viable = necp_client_flow_is_viable(proc, client, flow); + if (flow->viable && flow->u.cb) { + bool viable = flow->viable; + flow->u.cb(flow_registration->interface_handle, NECP_CLIENT_CBACTION_INITIAL, flow->interface_index, flow->necp_flow_flags, &viable); + flow->viable = viable; + } + } + } + } +} + static bool necp_client_update_flows(proc_t proc, struct necp_client *client, - struct _necp_client_defunct_list *defunct_list, - bool *defuncted_by_flow) + struct _necp_flow_defunct_list *defunct_list) { NECP_CLIENT_ASSERT_LOCKED(client); bool client_updated = FALSE; struct necp_client_flow *flow = NULL; struct necp_client_flow *temp_flow = NULL; - LIST_FOREACH_SAFE(flow, &client->flow_list, flow_chain, temp_flow) { - // Check policy result for flow - int old_flags = flow->necp_flow_flags; - bool viable = necp_client_flow_is_viable(proc, client, flow); - - // TODO: Defunct nexus flows that are blocked by policy - - if (flow->viable != viable) { - flow->viable = viable; - client_updated = TRUE; + struct necp_client_flow_registration *flow_registration = NULL; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + if 
(flow_registration->interface_cb != NULL) { + // Add any interface flows that are not already tracked + necp_flow_add_interface_flows(proc, client, flow_registration, false); } - if ((old_flags & (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6)) != - (flow->necp_flow_flags & (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6))) { - client_updated = TRUE; - } + LIST_FOREACH_SAFE(flow, &flow_registration->flow_list, flow_chain, temp_flow) { + // Check policy result for flow + int old_flags = flow->necp_flow_flags; + bool viable = necp_client_flow_is_viable(proc, client, flow); - if (flow->viable && client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) { - flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow); - } + // TODO: Defunct nexus flows that are blocked by policy - if (!flow->viable || flow->invalid) { - if (client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) { - flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow); + if (flow->viable != viable) { + flow->viable = viable; + client_updated = TRUE; + } + + if ((old_flags & NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE) != + (flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE)) { + client_updated = TRUE; } - // The callback might change the viable-flag of the - // flow depending on its policy. Thus, we need to - // check again the flags after the callback. 
- } - (void)defunct_list; - (void)defuncted_by_flow; + if (flow->viable && client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) { + bool flow_viable = flow->viable; + flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow->interface_index, flow->necp_flow_flags, &viable); + flow->viable = flow_viable; + } - // Handle flows that no longer match - if (!flow->viable || flow->invalid) { - // Drop them as long as they aren't assigned data - if (!flow->requested_nexus && !flow->assigned) { - if (flow->assigned_results != NULL) { - FREE(flow->assigned_results, M_NETAGENT); - flow->assigned_results = NULL; - client_updated = TRUE; + if (!flow->viable || flow->invalid) { + if (client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) { + bool flow_viable = flow->viable; + flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow->interface_index, flow->necp_flow_flags, &viable); + flow->viable = flow_viable; } - LIST_REMOVE(flow, flow_chain); - if (flow->socket) { - OSDecrementAtomic(&necp_socket_flow_count); - } else { - OSDecrementAtomic(&necp_if_flow_count); + // The callback might change the viable-flag of the + // flow depending on its policy. Thus, we need to + // check the flags again after the callback. 
+ } + + (void)defunct_list; + + // Handle flows that no longer match + if (!flow->viable || flow->invalid) { + // Drop them as long as they aren't assigned data + if (!flow->nexus && !flow->assigned) { + if (flow->assigned_results != NULL) { + FREE(flow->assigned_results, M_NETAGENT); + flow->assigned_results = NULL; + client_updated = TRUE; + } + LIST_REMOVE(flow, flow_chain); + if (flow->socket) { + OSDecrementAtomic(&necp_socket_flow_count); + } else { + OSDecrementAtomic(&necp_if_flow_count); + } + mcache_free(necp_flow_cache, flow); } - mcache_free(necp_flow_cache, flow); } } } @@ -1249,23 +1591,29 @@ necp_client_update_flows(proc_t proc, static void necp_client_mark_all_nonsocket_flows_as_invalid(struct necp_client *client) { + struct necp_client_flow_registration *flow_registration = NULL; struct necp_client_flow *flow = NULL; - LIST_FOREACH(flow, &client->flow_list, flow_chain) { - if (!flow->socket) { // Socket flows are not marked as invalid - flow->invalid = TRUE; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { + if (!flow->socket) { // Socket flows are not marked as invalid + flow->invalid = TRUE; + } } } + + // Reset option count every update + client->interface_option_count = 0; } static bool -necp_netagent_applies_to_client(__unused struct necp_client *client, +necp_netagent_applies_to_client(struct necp_client *client, const struct necp_client_parsed_parameters *parameters, - uuid_t netagent_uuid, bool allow_nexus, - uint32_t interface_index, u_int16_t interface_flags) + uuid_t *netagent_uuid, bool allow_nexus, + uint32_t interface_index, uint32_t interface_generation) { -#pragma unused(interface_index, interface_flags) +#pragma unused(interface_index, interface_generation) bool applies = FALSE; - u_int32_t flags = netagent_get_flags(netagent_uuid); + u_int32_t flags = netagent_get_flags(*netagent_uuid); if (!(flags & NETAGENT_FLAG_REGISTERED)) { 
// Unregistered agents never apply return (applies); @@ -1279,6 +1627,17 @@ necp_netagent_applies_to_client(__unused struct necp_client *client, return (applies); } + if (uuid_compare(client->failed_trigger_agent.netagent_uuid, *netagent_uuid) == 0) { + if (client->failed_trigger_agent.generation == netagent_get_generation(*netagent_uuid)) { + // If this agent was triggered, and failed, and hasn't changed, keep hiding it + return (applies); + } else { + // Mismatch generation, clear out old trigger + uuid_clear(client->failed_trigger_agent.netagent_uuid); + client->failed_trigger_agent.generation = 0; + } + } + if (flags & NETAGENT_FLAG_SPECIFIC_USE_ONLY) { // Specific use agents only apply when required bool required = FALSE; @@ -1288,7 +1647,7 @@ necp_netagent_applies_to_client(__unused struct necp_client *client, if (uuid_is_null(parameters->required_netagents[i])) { break; } - if (uuid_compare(parameters->required_netagents[i], netagent_uuid) == 0) { + if (uuid_compare(parameters->required_netagents[i], *netagent_uuid) == 0) { required = TRUE; break; } @@ -1309,7 +1668,7 @@ necp_netagent_applies_to_client(__unused struct necp_client *client, } if (!fetched_type) { - if (netagent_get_agent_domain_and_type(netagent_uuid, netagent_domain, netagent_type)) { + if (netagent_get_agent_domain_and_type(*netagent_uuid, netagent_domain, netagent_type)) { fetched_type = TRUE; } else { break; @@ -1337,18 +1696,18 @@ necp_netagent_applies_to_client(__unused struct necp_client *client, } static void -necp_client_add_agent_flows_for_interface(struct necp_client *client, - const struct necp_client_parsed_parameters *parsed_parameters, - ifnet_t ifp) +necp_client_add_agent_interface_options(struct necp_client *client, + const struct necp_client_parsed_parameters *parsed_parameters, + ifnet_t ifp) { if (ifp != NULL && ifp->if_agentids != NULL) { for (u_int32_t i = 0; i < ifp->if_agentcount; i++) { if (uuid_is_null(ifp->if_agentids[i])) { continue; } - u_int16_t if_flags = 
nstat_ifnet_to_flags(ifp); // Relies on the side effect that nexus agents that apply will create flows - (void)necp_netagent_applies_to_client(client, parsed_parameters, ifp->if_agentids[i], TRUE, ifp->if_index, if_flags); + (void)necp_netagent_applies_to_client(client, parsed_parameters, &ifp->if_agentids[i], TRUE, + ifp->if_index, ifnet_get_generation(ifp)); } } } @@ -1378,9 +1737,11 @@ necp_client_parse_parameters(u_int8_t *parameters, u_int32_t num_required_agents = 0; u_int32_t num_prohibited_agents = 0; u_int32_t num_preferred_agents = 0; + u_int32_t num_avoided_agents = 0; u_int32_t num_required_agent_types = 0; u_int32_t num_prohibited_agent_types = 0; u_int32_t num_preferred_agent_types = 0; + u_int32_t num_avoided_agent_types = 0; if (parsed_parameters == NULL) { return (EINVAL); @@ -1536,6 +1897,17 @@ necp_client_parse_parameters(u_int8_t *parameters, } break; } + case NECP_CLIENT_PARAMETER_AVOID_AGENT: { + if (num_avoided_agents >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(uuid_t)) { + memcpy(&parsed_parameters->avoided_netagents[num_avoided_agents], value, sizeof(uuid_t)); + num_avoided_agents++; + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT; + } + break; + } case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: { if (num_required_agent_types >= NECP_MAX_PARSED_PARAMETERS) { break; @@ -1569,9 +1941,20 @@ necp_client_parse_parameters(u_int8_t *parameters, } break; } - case NECP_CLIENT_PARAMETER_FLAGS: { - if (length >= sizeof(u_int32_t)) { - memcpy(&parsed_parameters->flags, value, sizeof(parsed_parameters->flags)); + case NECP_CLIENT_PARAMETER_AVOID_AGENT_TYPE: { + if (num_avoided_agent_types >= NECP_MAX_PARSED_PARAMETERS) { + break; + } + if (length >= sizeof(struct necp_client_parameter_netagent_type)) { + memcpy(&parsed_parameters->avoided_netagent_types[num_avoided_agent_types], value, sizeof(struct necp_client_parameter_netagent_type)); + num_avoided_agent_types++; + 
parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE; + } + break; + } + case NECP_CLIENT_PARAMETER_FLAGS: { + if (length >= sizeof(u_int32_t)) { + memcpy(&parsed_parameters->flags, value, sizeof(parsed_parameters->flags)); parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_FLAGS; } break; @@ -1621,8 +2004,10 @@ static int necp_client_parse_result(u_int8_t *result, u_int32_t result_size, union necp_sockaddr_union *local_address, - union necp_sockaddr_union *remote_address) + union necp_sockaddr_union *remote_address, + void **flow_stats) { +#pragma unused(flow_stats) int error = 0; size_t offset = 0; @@ -1665,37 +2050,109 @@ necp_client_parse_result(u_int8_t *result, return (error); } +static struct necp_client_flow_registration * +necp_client_create_flow_registration(struct necp_fd_data *fd_data, struct necp_client *client) +{ + NECP_FD_ASSERT_LOCKED(fd_data); + NECP_CLIENT_ASSERT_LOCKED(client); + + struct necp_client_flow_registration *new_registration = mcache_alloc(necp_flow_registration_cache, MCR_SLEEP); + if (new_registration == NULL) { + return NULL; + } + + memset(new_registration, 0, sizeof(*new_registration)); + + new_registration->last_interface_details = combine_interface_details(IFSCOPE_NONE, NSTAT_IFNET_IS_UNKNOWN_TYPE); + + necp_generate_client_id(new_registration->registration_id, true); + LIST_INIT(&new_registration->flow_list); + + // Add registration to client list + RB_INSERT(_necp_client_flow_tree, &client->flow_registrations, new_registration); + + // Add registration to fd list + RB_INSERT(_necp_fd_flow_tree, &fd_data->flows, new_registration); + + // Add registration to global tree for lookup + NECP_FLOW_TREE_LOCK_EXCLUSIVE(); + RB_INSERT(_necp_client_flow_global_tree, &necp_client_flow_global_tree, new_registration); + NECP_FLOW_TREE_UNLOCK(); + + new_registration->client = client; + + // Start out assuming there is nothing to read from the flow + new_registration->flow_result_read = true; + + 
return new_registration; +} + +static void +necp_client_add_socket_flow(struct necp_client_flow_registration *flow_registration, + struct inpcb *inp) +{ + struct necp_client_flow *new_flow = mcache_alloc(necp_flow_cache, MCR_SLEEP); + if (new_flow == NULL) { + NECPLOG0(LOG_ERR, "Failed to allocate socket flow"); + return; + } + + memset(new_flow, 0, sizeof(*new_flow)); + + new_flow->socket = TRUE; + new_flow->u.socket_handle = inp; + new_flow->u.cb = inp->necp_cb; + + OSIncrementAtomic(&necp_socket_flow_count); + + LIST_INSERT_HEAD(&flow_registration->flow_list, new_flow, flow_chain); +} + int necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp) { int error = 0; + struct necp_fd_data *client_fd = NULL; bool found_client = FALSE; - NECP_CLIENT_TREE_LOCK_SHARED(); - - struct necp_client find; - uuid_copy(find.client_id, client_id); - struct necp_client *client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find); - if (client != NULL) { - NECP_CLIENT_LOCK(client); + NECP_FD_LIST_LOCK_SHARED(); + LIST_FOREACH(client_fd, &necp_fd_list, chain) { + NECP_FD_LOCK(client_fd); + struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); + if (client != NULL) { + if (!pid || client->proc_pid == pid) { + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration != NULL) { + // Found the right client and flow registration, add a new flow + found_client = TRUE; + necp_client_add_socket_flow(flow_registration, inp); + } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) { + // No flows yet on this client, add a new registration + flow_registration = necp_client_create_flow_registration(client_fd, client); + if (flow_registration == NULL) { + error = ENOMEM; + } else { + // Add a new flow + found_client = TRUE; + necp_client_add_socket_flow(flow_registration, inp); + } + } + } - if (!pid || client->proc_pid == 
pid) { - // Found the right client! - found_client = TRUE; - necp_client_add_socket_flow(client, inp); + NECP_CLIENT_UNLOCK(client); } + NECP_FD_UNLOCK(client_fd); - NECP_CLIENT_UNLOCK(client); + if (found_client) { + break; + } } - - NECP_CLIENT_TREE_UNLOCK(); + NECP_FD_LIST_UNLOCK(); if (!found_client) { error = ENOENT; } else { - /* - * Count the sockets that have the NECP client UUID set - */ + // Count the sockets that have the NECP client UUID set struct socket *so = inp->inp_socket; if (!(so->so_flags1 & SOF1_HAS_NECP_CLIENT_UUID)) { so->so_flags1 |= SOF1_HAS_NECP_CLIENT_UUID; @@ -1707,62 +2164,192 @@ necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp) } static void -necp_client_add_multipath_cb(struct necp_client *client, struct mppcb *mpp) +necp_client_add_multipath_interface_flows(struct necp_client_flow_registration *flow_registration, + struct necp_client *client, + struct mppcb *mpp) { - struct necp_client_flow *flow = NULL; + flow_registration->interface_handle = mpp; + flow_registration->interface_cb = mpp->necp_cb; - client->interface_handle = mpp; - client->interface_cb = mpp->necp_cb; + proc_t proc = proc_find(client->proc_pid); + if (proc == PROC_NULL) { + return; + } - LIST_FOREACH(flow, &client->flow_list, flow_chain) { - if (flow->nexus || flow->socket) { + // Traverse all interfaces and add a tracking flow if needed + necp_flow_add_interface_flows(proc, client, flow_registration, true); + + proc_rele(proc); + proc = PROC_NULL; +} + +int +necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp) +{ + int error = 0; + struct necp_fd_data *client_fd = NULL; + bool found_client = FALSE; + + NECP_FD_LIST_LOCK_SHARED(); + LIST_FOREACH(client_fd, &necp_fd_list, chain) { + NECP_FD_LOCK(client_fd); + struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); + if (client != NULL) { + if (!pid || client->proc_pid == pid) { + struct necp_client_flow_registration 
*flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration != NULL) { + // Found the right client and flow registration, add a new flow + found_client = TRUE; + necp_client_add_multipath_interface_flows(flow_registration, client, mpp); + } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) { + // No flows yet on this client, add a new registration + flow_registration = necp_client_create_flow_registration(client_fd, client); + if (flow_registration == NULL) { + error = ENOMEM; + } else { + // Add a new flow + found_client = TRUE; + necp_client_add_multipath_interface_flows(flow_registration, client, mpp); + } + } + } + + NECP_CLIENT_UNLOCK(client); + } + NECP_FD_UNLOCK(client_fd); + + if (found_client) { + break; + } + } + NECP_FD_LIST_UNLOCK(); + + if (!found_client && error == 0) { + error = ENOENT; + } + + return (error); +} + +#define NETAGENT_DOMAIN_RADIO_MANAGER "WirelessRadioManager" +#define NETAGENT_TYPE_RADIO_MANAGER "WirelessRadioManager:BB Manager" + +static int +necp_client_lookup_bb_radio_manager(struct necp_client *client, + uuid_t netagent_uuid) +{ + char netagent_domain[NETAGENT_DOMAINSIZE]; + char netagent_type[NETAGENT_TYPESIZE]; + struct necp_aggregate_result result; + proc_t proc; + int error; + + proc = proc_find(client->proc_pid); + if (proc == PROC_NULL) { + return ESRCH; + } + + error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length, + &result, NULL, 0, NULL, NULL, NULL, true); + + proc_rele(proc); + proc = PROC_NULL; + + if (error) { + return error; + } + + for (int i = 0; i < NECP_MAX_NETAGENTS; i++) { + if (uuid_is_null(result.netagents[i])) { + // Passed end of valid agents + break; + } + + memset(&netagent_domain, 0, NETAGENT_DOMAINSIZE); + memset(&netagent_type, 0, NETAGENT_TYPESIZE); + if (netagent_get_agent_domain_and_type(result.netagents[i], netagent_domain, netagent_type) == FALSE) { + continue; + } + + if 
(strncmp(netagent_domain, NETAGENT_DOMAIN_RADIO_MANAGER, NETAGENT_DOMAINSIZE) != 0) { continue; } - flow->u.socket_handle = mpp; - flow->u.cb = mpp->necp_cb; + if (strncmp(netagent_type, NETAGENT_TYPE_RADIO_MANAGER, NETAGENT_TYPESIZE) != 0) { + continue; + } + + uuid_copy(netagent_uuid, result.netagents[i]); - if (flow->viable && flow->u.cb) { - flow->u.cb(mpp, NECP_CLIENT_CBACTION_INITIAL, flow); + break; + } + + return 0; +} + +static int +necp_client_assert_bb_radio_manager_common(struct necp_client *client, bool assert) +{ + uuid_t netagent_uuid; + uint8_t assert_type; + int error; + + error = necp_client_lookup_bb_radio_manager(client, netagent_uuid); + if (error) { + NECPLOG0(LOG_ERR, "BB radio manager agent not found"); + return error; + } + + // Before unasserting, verify that the assertion was already taken + if (assert == FALSE) { + assert_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT; + + if (!necp_client_remove_assertion(client, netagent_uuid)) { + return EINVAL; } + } else { + assert_type = NETAGENT_MESSAGE_TYPE_CLIENT_ASSERT; + } + + error = netagent_client_message(netagent_uuid, client->client_id, client->proc_pid, client->agent_handle, assert_type); + if (error) { + NECPLOG0(LOG_ERR, "netagent_client_message failed"); + return error; } + + // Only save the assertion if the action succeeded + if (assert == TRUE) { + necp_client_add_assertion(client, netagent_uuid); + } + + return 0; } int -necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp) +necp_client_assert_bb_radio_manager(uuid_t client_id, bool assert) { + struct necp_client *client; int error = 0; - bool found_client = FALSE; NECP_CLIENT_TREE_LOCK_SHARED(); - struct necp_client find; - uuid_copy(find.client_id, client_id); - struct necp_client *client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find); - if (client != NULL) { - NECP_CLIENT_LOCK(client); + client = necp_find_client_and_lock(client_id); - if (!pid || client->proc_pid == pid) { - // 
Found the right client! - found_client = TRUE; - necp_client_add_multipath_cb(client, mpp); - } + if (client) { + // Found the right client! + error = necp_client_assert_bb_radio_manager_common(client, assert); NECP_CLIENT_UNLOCK(client); + } else { + NECPLOG0(LOG_ERR, "Couldn't find client"); + error = ENOENT; } NECP_CLIENT_TREE_UNLOCK(); - if (!found_client) { - error = ENOENT; - } - return (error); } -#define NETAGENT_DOMAIN_NETEXT "NetworkExtension" -#define NETAGENT_TYPE_PATHCTRL "PathController" - static int necp_client_unregister_socket_flow(uuid_t client_id, void *handle) { @@ -1777,22 +2364,26 @@ necp_client_unregister_socket_flow(uuid_t client_id, void *handle) struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); if (client != NULL) { - // Found the right client! - found_client = TRUE; + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration != NULL) { + // Found the right client and flow! 
+ found_client = TRUE; - // Remove flow assignment - struct necp_client_flow *search_flow = NULL; - struct necp_client_flow *temp_flow = NULL; - LIST_FOREACH_SAFE(search_flow, &client->flow_list, flow_chain, temp_flow) { - if (search_flow->socket && search_flow->u.socket_handle == handle) { - if (search_flow->assigned_results != NULL) { - FREE(search_flow->assigned_results, M_NETAGENT); - search_flow->assigned_results = NULL; + // Remove flow assignment + struct necp_client_flow *search_flow = NULL; + struct necp_client_flow *temp_flow = NULL; + LIST_FOREACH_SAFE(search_flow, &flow_registration->flow_list, flow_chain, temp_flow) { + if (search_flow->socket && search_flow->u.socket_handle == handle) { + if (search_flow->assigned_results != NULL) { + FREE(search_flow->assigned_results, M_NETAGENT); + search_flow->assigned_results = NULL; + } + client_updated = TRUE; + flow_registration->flow_result_read = FALSE; + LIST_REMOVE(search_flow, flow_chain); + OSDecrementAtomic(&necp_socket_flow_count); + mcache_free(necp_flow_cache, search_flow); } - client_updated = TRUE; - LIST_REMOVE(search_flow, flow_chain); - OSDecrementAtomic(&necp_socket_flow_count); - mcache_free(necp_flow_cache, search_flow); } } @@ -1800,7 +2391,6 @@ necp_client_unregister_socket_flow(uuid_t client_id, void *handle) } if (client_updated) { - client->flow_result_read = FALSE; necp_fd_notify(client_fd, true); } NECP_FD_UNLOCK(client_fd); @@ -1826,28 +2416,27 @@ necp_client_unregister_multipath_cb(uuid_t client_id, void *handle) NECP_CLIENT_TREE_LOCK_SHARED(); - struct necp_client find; - uuid_copy(find.client_id, client_id); - struct necp_client *client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find); + struct necp_client *client = necp_find_client_and_lock(client_id); if (client != NULL) { - NECP_CLIENT_LOCK(client); - - // Found the right client! 
- found_client = TRUE; + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration != NULL) { + // Found the right client and flow! + found_client = TRUE; - // Remove flow assignment - struct necp_client_flow *search_flow = NULL; - struct necp_client_flow *temp_flow = NULL; - LIST_FOREACH_SAFE(search_flow, &client->flow_list, flow_chain, temp_flow) { - if (!search_flow->socket && !search_flow->nexus && - search_flow->u.socket_handle == handle) { - search_flow->u.socket_handle = NULL; - search_flow->u.cb = NULL; + // Remove flow assignment + struct necp_client_flow *search_flow = NULL; + struct necp_client_flow *temp_flow = NULL; + LIST_FOREACH_SAFE(search_flow, &flow_registration->flow_list, flow_chain, temp_flow) { + if (!search_flow->socket && !search_flow->nexus && + search_flow->u.socket_handle == handle) { + search_flow->u.socket_handle = NULL; + search_flow->u.cb = NULL; + } } - } - client->interface_handle = NULL; - client->interface_cb = NULL; + flow_registration->interface_handle = NULL; + flow_registration->interface_cb = NULL; + } NECP_CLIENT_UNLOCK(client); } @@ -1884,58 +2473,68 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp) struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); if (client != NULL) { - // Found the right client! 
- found_client = TRUE; - - struct necp_client_flow *flow = NULL; - LIST_FOREACH(flow, &client->flow_list, flow_chain) { - if (flow->socket && flow->u.socket_handle == inp) { - // Release prior results and route - if (flow->assigned_results != NULL) { - FREE(flow->assigned_results, M_NETAGENT); - flow->assigned_results = NULL; - } + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration == NULL && RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) { + // No flows yet on this client, add a new registration + flow_registration = necp_client_create_flow_registration(client_fd, client); + if (flow_registration == NULL) { + error = ENOMEM; + } + } + if (flow_registration != NULL) { + // Found the right client and flow! + found_client = TRUE; - ifnet_t ifp = NULL; - if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp) { - ifp = inp->inp_boundifp; - } else { - ifp = inp->inp_last_outifp; - } + struct necp_client_flow *flow = NULL; + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { + if (flow->socket && flow->u.socket_handle == inp) { + // Release prior results and route + if (flow->assigned_results != NULL) { + FREE(flow->assigned_results, M_NETAGENT); + flow->assigned_results = NULL; + } - if (ifp != NULL) { - flow->interface_index = ifp->if_index; - } else { - flow->interface_index = IFSCOPE_NONE; - } + ifnet_t ifp = NULL; + if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp) { + ifp = inp->inp_boundifp; + } else { + ifp = inp->inp_last_outifp; + } - if (inp->inp_vflag & INP_IPV4) { - flow->local_addr.sin.sin_family = AF_INET; - flow->local_addr.sin.sin_len = sizeof(struct sockaddr_in); - flow->local_addr.sin.sin_port = inp->inp_lport; - memcpy(&flow->local_addr.sin.sin_addr, &inp->inp_laddr, sizeof(struct in_addr)); - - flow->remote_addr.sin.sin_family = AF_INET; - flow->remote_addr.sin.sin_len = sizeof(struct sockaddr_in); - 
flow->remote_addr.sin.sin_port = inp->inp_fport; - memcpy(&flow->remote_addr.sin.sin_addr, &inp->inp_faddr, sizeof(struct in_addr)); - } else if (inp->inp_vflag & INP_IPV6) { - in6_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, &flow->local_addr.sin6, sizeof(flow->local_addr)); - in6_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport, &flow->remote_addr.sin6, sizeof(flow->remote_addr)); - } + if (ifp != NULL) { + flow->interface_index = ifp->if_index; + } else { + flow->interface_index = IFSCOPE_NONE; + } - flow->viable = necp_client_flow_is_viable(proc, client, flow); + if (inp->inp_vflag & INP_IPV4) { + flow->local_addr.sin.sin_family = AF_INET; + flow->local_addr.sin.sin_len = sizeof(struct sockaddr_in); + flow->local_addr.sin.sin_port = inp->inp_lport; + memcpy(&flow->local_addr.sin.sin_addr, &inp->inp_laddr, sizeof(struct in_addr)); + + flow->remote_addr.sin.sin_family = AF_INET; + flow->remote_addr.sin.sin_len = sizeof(struct sockaddr_in); + flow->remote_addr.sin.sin_port = inp->inp_fport; + memcpy(&flow->remote_addr.sin.sin_addr, &inp->inp_faddr, sizeof(struct in_addr)); + } else if (inp->inp_vflag & INP_IPV6) { + in6_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, &flow->local_addr.sin6, sizeof(flow->local_addr)); + in6_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport, &flow->remote_addr.sin6, sizeof(flow->remote_addr)); + } - uuid_t empty_uuid; - uuid_clear(empty_uuid); - flow->assigned = TRUE; - flow->assigned_results = necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0, - (struct necp_client_endpoint *)&flow->local_addr, - (struct necp_client_endpoint *)&flow->remote_addr, - 0, &flow->assigned_results_length); - client->flow_result_read = FALSE; - client_updated = TRUE; - break; + flow->viable = necp_client_flow_is_viable(proc, client, flow); + + uuid_t empty_uuid; + uuid_clear(empty_uuid); + flow->assigned = TRUE; + flow->assigned_results = necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0, + (struct necp_client_endpoint 
*)&flow->local_addr, + (struct necp_client_endpoint *)&flow->remote_addr, + 0, NULL, &flow->assigned_results_length); + flow_registration->flow_result_read = FALSE; + client_updated = TRUE; + break; + } } } @@ -1955,10 +2554,12 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp) } NECP_FD_LIST_UNLOCK(); - if (!found_client) { - error = ENOENT; - } else if (!client_updated) { - error = EINVAL; + if (error == 0) { + if (!found_client) { + error = ENOENT; + } else if (!client_updated) { + error = EINVAL; + } } return (error); @@ -1985,22 +2586,25 @@ necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id, struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); if (client != NULL) { - /* Found the right client! */ - found_client = TRUE; + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration != NULL) { + // Found the right client and flow! + found_client = TRUE; - struct necp_client_flow *flow = NULL; - LIST_FOREACH(flow, &client->flow_list, flow_chain) { - // Verify that the client nexus agent matches - if (flow->nexus && - uuid_compare(flow->u.nexus_agent, - netagent_uuid) == 0) { - flow->has_protoctl_event = TRUE; - flow->protoctl_event.protoctl_event_code = protoctl_event_code; - flow->protoctl_event.protoctl_event_val = protoctl_event_val; - flow->protoctl_event.protoctl_event_tcp_seq_num = protoctl_event_tcp_seq_number; - client->flow_result_read = FALSE; - client_updated = TRUE; - break; + struct necp_client_flow *flow = NULL; + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { + // Verify that the client nexus agent matches + if (flow->nexus && + uuid_compare(flow->u.nexus_agent, + netagent_uuid) == 0) { + flow->has_protoctl_event = TRUE; + flow->protoctl_event.protoctl_event_code = protoctl_event_code; + flow->protoctl_event.protoctl_event_val = protoctl_event_val; + 
flow->protoctl_event.protoctl_event_tcp_seq_num = protoctl_event_tcp_seq_number; + flow_registration->flow_result_read = FALSE; + client_updated = TRUE; + break; + } } } @@ -2033,6 +2637,7 @@ static bool necp_assign_client_result_locked(struct proc *proc, struct necp_fd_data *client_fd, struct necp_client *client, + struct necp_client_flow_registration *flow_registration, uuid_t netagent_uuid, u_int8_t *assigned_results, size_t assigned_results_length, @@ -2044,7 +2649,7 @@ necp_assign_client_result_locked(struct proc *proc, NECP_CLIENT_ASSERT_LOCKED(client); struct necp_client_flow *flow = NULL; - LIST_FOREACH(flow, &client->flow_list, flow_chain) { + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { // Verify that the client nexus agent matches if (flow->nexus && uuid_compare(flow->u.nexus_agent, netagent_uuid) == 0) { @@ -2054,9 +2659,10 @@ necp_assign_client_result_locked(struct proc *proc, flow->assigned_results = NULL; } + void *nexus_stats = NULL; if (assigned_results != NULL && assigned_results_length > 0) { int error = necp_client_parse_result(assigned_results, (u_int32_t)assigned_results_length, - &flow->local_addr, &flow->remote_addr); + &flow->local_addr, &flow->remote_addr, &nexus_stats); VERIFY(error == 0); } @@ -2065,7 +2671,7 @@ necp_assign_client_result_locked(struct proc *proc, flow->assigned = TRUE; flow->assigned_results = assigned_results; flow->assigned_results_length = assigned_results_length; - client->flow_result_read = FALSE; + flow_registration->flow_result_read = FALSE; client_updated = TRUE; break; } @@ -2099,12 +2705,14 @@ necp_assign_client_result(uuid_t netagent_uuid, uuid_t client_id, NECP_FD_LOCK(client_fd); struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); if (client != NULL) { - // Found the right client! 
- found_client = TRUE; - - if (necp_assign_client_result_locked(proc, client_fd, client, netagent_uuid, - assigned_results, assigned_results_length, true)) { - client_updated = TRUE; + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration != NULL) { + // Found the right client and flow! + found_client = TRUE; + if (necp_assign_client_result_locked(proc, client_fd, client, flow_registration, netagent_uuid, + assigned_results, assigned_results_length, true)) { + client_updated = TRUE; + } } NECP_CLIENT_UNLOCK(client); @@ -2114,36 +2722,251 @@ necp_assign_client_result(uuid_t netagent_uuid, uuid_t client_id, proc_rele(proc); proc = PROC_NULL; - if (found_client) { - break; + if (found_client) { + break; + } + } + + NECP_FD_LIST_UNLOCK(); + + // upon error, client must free assigned_results + if (!found_client) { + error = ENOENT; + } else if (!client_updated) { + error = EINVAL; + } + + return (error); +} + +/// Client updating + +static bool +necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_parameters, + struct necp_aggregate_result *result) +{ + if (parsed_parameters == NULL || + result == NULL) { + return (false); + } + + bool updated = false; + for (int i = 0; i < NECP_MAX_NETAGENTS; i++) { + if (uuid_is_null(result->netagents[i])) { + // Passed end of valid agents + break; + } + + if (!(result->netagent_use_flags[i] & NECP_AGENT_USE_FLAG_SCOPE)) { + // Not a scoped agent, ignore + continue; + } + + // This is a scoped agent. Add it to the required agents. 
+ if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) { + // Already some required agents, add this at the end + for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) { + if (uuid_compare(parsed_parameters->required_netagents[j], result->netagents[i]) == 0) { + // Already required, break + break; + } + if (uuid_is_null(parsed_parameters->required_netagents[j])) { + // Add here + memcpy(&parsed_parameters->required_netagents[j], result->netagents[i], sizeof(uuid_t)); + updated = true; + break; + } + } + } else { + // No required agents yet, add this one + parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT; + memcpy(&parsed_parameters->required_netagents[0], result->netagents[i], sizeof(uuid_t)); + updated = true; + } + + // Remove requirements for agents of the same type + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) { + char remove_agent_domain[NETAGENT_DOMAINSIZE] = { 0 }; + char remove_agent_type[NETAGENT_TYPESIZE] = { 0 }; + if (netagent_get_agent_domain_and_type(result->netagents[i], remove_agent_domain, remove_agent_type)) { + for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) { + if (strlen(parsed_parameters->required_netagent_types[j].netagent_domain) == 0 && + strlen(parsed_parameters->required_netagent_types[j].netagent_type) == 0) { + break; + } + + if (strncmp(parsed_parameters->required_netagent_types[j].netagent_domain, remove_agent_domain, NETAGENT_DOMAINSIZE) == 0 && + strncmp(parsed_parameters->required_netagent_types[j].netagent_type, remove_agent_type, NETAGENT_TYPESIZE) == 0) { + + updated = true; + + if (j == NECP_MAX_PARSED_PARAMETERS - 1) { + // Last field, just clear and break + memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type)); + break; + } else { + // Move the parameters down, clear the last entry + 
memmove(&parsed_parameters->required_netagent_types[j], + &parsed_parameters->required_netagent_types[j + 1], + sizeof(struct necp_client_parameter_netagent_type) * (NECP_MAX_PARSED_PARAMETERS - (j + 1))); + memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type)); + // Continue, don't increment but look at the new shifted item instead + continue; + } + } + + // Increment j to look at the next agent type parameter + j++; + } + } + } + } + + if (updated && + parsed_parameters->required_interface_index != IFSCOPE_NONE && + (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF) == 0) { + // A required interface index was added after the fact. Clear it. + parsed_parameters->required_interface_index = IFSCOPE_NONE; + } + + + return (updated); +} + +static inline bool +necp_agent_types_match(const char *agent_domain1, const char *agent_type1, + const char *agent_domain2, const char *agent_type2) +{ + return ((strlen(agent_domain1) == 0 || + strncmp(agent_domain2, agent_domain1, NETAGENT_DOMAINSIZE) == 0) && + (strlen(agent_type1) == 0 || + strncmp(agent_type2, agent_type1, NETAGENT_TYPESIZE) == 0)); +} + +static inline bool +necp_calculate_client_result(proc_t proc, + struct necp_client *client, + struct necp_client_parsed_parameters *parsed_parameters, + struct necp_aggregate_result *result, + u_int32_t *flags) +{ + struct rtentry *route = NULL; + + // Check parameters to find best interface + bool validate_agents = false; + u_int matching_if_index = 0; + if (necp_find_matching_interface_index(parsed_parameters, &matching_if_index, &validate_agents)) { + if (matching_if_index != 0) { + parsed_parameters->required_interface_index = matching_if_index; + } + // Interface found or not needed, match policy. 
+ memset(result, 0, sizeof(*result)); + int error = necp_application_find_policy_match_internal(proc, client->parameters, + (u_int32_t)client->parameters_length, + result, flags, matching_if_index, + NULL, NULL, &route, false); + if (error != 0) { + if (route != NULL) { + rtfree(route); + } + return (FALSE); + } + + if (validate_agents) { + bool requirement_failed = FALSE; + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (uuid_is_null(parsed_parameters->required_netagents[i])) { + break; + } + + bool requirement_found = FALSE; + for (int j = 0; j < NECP_MAX_NETAGENTS; j++) { + if (uuid_is_null(result->netagents[j])) { + break; + } + + if (uuid_compare(parsed_parameters->required_netagents[i], result->netagents[j]) == 0) { + requirement_found = TRUE; + break; + } + } + + if (!requirement_found) { + requirement_failed = TRUE; + break; + } + } + } + + if (!requirement_failed && parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (strlen(parsed_parameters->required_netagent_types[i].netagent_domain) == 0 && + strlen(parsed_parameters->required_netagent_types[i].netagent_type) == 0) { + break; + } + + bool requirement_found = FALSE; + for (int j = 0; j < NECP_MAX_NETAGENTS; j++) { + if (uuid_is_null(result->netagents[j])) { + break; + } + + char policy_agent_domain[NETAGENT_DOMAINSIZE] = { 0 }; + char policy_agent_type[NETAGENT_TYPESIZE] = { 0 }; + + if (netagent_get_agent_domain_and_type(result->netagents[j], policy_agent_domain, policy_agent_type)) { + if (necp_agent_types_match(parsed_parameters->required_netagent_types[i].netagent_domain, + parsed_parameters->required_netagent_types[i].netagent_type, + policy_agent_domain, policy_agent_type)) { + requirement_found = TRUE; + break; + } + } + } + + if (!requirement_found) { + requirement_failed = TRUE; + break; + } + } + } + 
+ if (requirement_failed) { + // Agent requirement failed. Clear out the whole result, make everything fail. + memset(result, 0, sizeof(*result)); + if (route != NULL) { + rtfree(route); + } + return (TRUE); + } } - } - - NECP_FD_LIST_UNLOCK(); - // upon error, client must free assigned_results - if (!found_client) { - error = ENOENT; - } else if (!client_updated) { - error = EINVAL; + // Reset current route + NECP_CLIENT_ROUTE_LOCK(client); + if (client->current_route != NULL) { + rtfree(client->current_route); + } + client->current_route = route; + NECP_CLIENT_ROUTE_UNLOCK(client); + } else { + // Interface not found. Clear out the whole result, make everything fail. + memset(result, 0, sizeof(*result)); } - return (error); + return (TRUE); } -/// Client updating - static bool necp_update_client_result(proc_t proc, struct necp_fd_data *client_fd, struct necp_client *client, - struct _necp_client_defunct_list *defunct_list) + struct _necp_flow_defunct_list *defunct_list) { struct necp_client_result_netagent netagent; struct necp_aggregate_result result; struct necp_client_parsed_parameters *parsed_parameters = NULL; u_int32_t flags = 0; - struct rtentry *route = NULL; NECP_CLIENT_ASSERT_LOCKED(client); @@ -2165,35 +2988,18 @@ necp_update_client_result(proc_t proc, // Update saved IP protocol client->ip_protocol = parsed_parameters->ip_protocol; - // Check parameters to find best interface - u_int matching_if_index = 0; - if (necp_find_matching_interface_index(parsed_parameters, &matching_if_index)) { - if (matching_if_index != 0) { - parsed_parameters->required_interface_index = matching_if_index; - } - // Interface found or not needed, match policy. 
- error = necp_application_find_policy_match_internal(proc, client->parameters, - (u_int32_t)client->parameters_length, - &result, &flags, matching_if_index, - NULL, NULL, &route, false); - if (error != 0) { - if (route != NULL) { - rtfree(route); - } + // Calculate the policy result + if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) { + FREE(parsed_parameters, M_NECP); + return (FALSE); + } + + if (necp_update_parsed_parameters(parsed_parameters, &result)) { + // Changed the parameters based on result, try again (only once) + if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) { FREE(parsed_parameters, M_NECP); return (FALSE); } - - // Reset current route - NECP_CLIENT_ROUTE_LOCK(client); - if (client->current_route != NULL) { - rtfree(client->current_route); - } - client->current_route = route; - NECP_CLIENT_ROUTE_UNLOCK(client); - } else { - // Interface not found. Clear out the whole result, make everything fail. - memset(&result, 0, sizeof(result)); } // Save the last policy id on the client @@ -2223,9 +3029,6 @@ necp_update_client_result(proc_t proc, } // Recalculate flags - if (client->defunct) { - flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT; - } if (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) { // Listeners are valid as long as they aren't dropped if (result.routing_result != NECP_KERNEL_POLICY_RESULT_DROP) { @@ -2303,7 +3106,7 @@ necp_update_client_result(proc_t proc, } uuid_copy(netagent.netagent_uuid, result.netagents[i]); netagent.generation = netagent_get_generation(netagent.netagent_uuid); - if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, TRUE, 0, 0)) { + if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, TRUE, 0, 0)) { cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated, client->result, sizeof(client->result)); } @@ -2374,10 
+3177,11 @@ necp_update_client_result(proc_t proc, TAILQ_FOREACH(multi_interface, &ifnet_ordered_head, if_ordered_link) { if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, NULL, true)) { // Add multipath interface flows for kernel MPTCP - necp_client_add_interface_flow_if_needed(client, multi_interface->if_index); + necp_client_add_interface_option_if_needed(client, multi_interface->if_index, + ifnet_get_generation(multi_interface), NULL); // Add nexus agents for multipath - necp_client_add_agent_flows_for_interface(client, parsed_parameters, multi_interface); + necp_client_add_agent_interface_options(client, parsed_parameters, multi_interface); } } } else if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) && @@ -2385,9 +3189,9 @@ necp_update_client_result(proc_t proc, // Get listener interface options from global list struct ifnet *listen_interface = NULL; TAILQ_FOREACH(listen_interface, &ifnet_head, if_link) { - if (necp_ifnet_matches_parameters(listen_interface, parsed_parameters, NULL, false)) { + if (necp_ifnet_matches_parameters(listen_interface, parsed_parameters, NULL, true)) { // Add nexus agents for listeners - necp_client_add_agent_flows_for_interface(client, parsed_parameters, listen_interface); + necp_client_add_agent_interface_options(client, parsed_parameters, listen_interface); } } } @@ -2400,10 +3204,10 @@ necp_update_client_result(proc_t proc, if (uuid_is_null(original_scoped_interface->if_agentids[i])) { continue; } - u_int16_t if_flags = nstat_ifnet_to_flags(original_scoped_interface); uuid_copy(netagent.netagent_uuid, original_scoped_interface->if_agentids[i]); netagent.generation = netagent_get_generation(netagent.netagent_uuid); - if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, FALSE, original_scoped_interface->if_index, if_flags)) { + if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, FALSE, + original_scoped_interface->if_index, 
ifnet_get_generation(original_scoped_interface))) { cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated, client->result, sizeof(client->result)); } @@ -2418,10 +3222,10 @@ necp_update_client_result(proc_t proc, if (uuid_is_null(direct_interface->if_agentids[i])) { continue; } - u_int16_t if_flags = nstat_ifnet_to_flags(direct_interface); uuid_copy(netagent.netagent_uuid, direct_interface->if_agentids[i]); netagent.generation = netagent_get_generation(netagent.netagent_uuid); - if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, TRUE, direct_interface->if_index, if_flags)) { + if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, TRUE, + direct_interface->if_index, ifnet_get_generation(direct_interface))) { cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated, client->result, sizeof(client->result)); } @@ -2436,10 +3240,10 @@ necp_update_client_result(proc_t proc, if (uuid_is_null(delegate_interface->if_agentids[i])) { continue; } - u_int16_t if_flags = nstat_ifnet_to_flags(delegate_interface); uuid_copy(netagent.netagent_uuid, delegate_interface->if_agentids[i]); netagent.generation = netagent_get_generation(netagent.netagent_uuid); - if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, FALSE, delegate_interface->if_index, if_flags)) { + if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, FALSE, + delegate_interface->if_index, ifnet_get_generation(delegate_interface))) { cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated, client->result, sizeof(client->result)); } @@ -2449,6 +3253,19 @@ necp_update_client_result(proc_t proc, } ifnet_head_done(); + // Add interface options + for (u_int32_t option_i = 0; option_i < 
client->interface_option_count; option_i++) { + if (option_i < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) { + struct necp_client_interface_option *option = &client->interface_options[option_i]; + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_INTERFACE_OPTION, sizeof(*option), option, &updated, + client->result, sizeof(client->result)); + } else { + struct necp_client_interface_option *option = &client->extra_interface_options[option_i - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT]; + cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_INTERFACE_OPTION, sizeof(*option), option, &updated, + client->result, sizeof(client->result)); + } + } + size_t new_result_length = (cursor - client->result); if (new_result_length != client->result_length) { client->result_length = new_result_length; @@ -2456,14 +3273,8 @@ necp_update_client_result(proc_t proc, } // Update flow viability/flags - bool defuncted_by_flow = FALSE; - if (necp_client_update_flows(proc, client, defunct_list, &defuncted_by_flow)) { + if (necp_client_update_flows(proc, client, defunct_list)) { updated = TRUE; - if (defuncted_by_flow && client->defunct) { - // Reset initial TLV - flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT; - (void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS, sizeof(flags), &flags, &updated, client->result, sizeof(client->result)); - } } if (updated) { @@ -2476,7 +3287,7 @@ necp_update_client_result(proc_t proc, } static inline void -necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_client_defunct_list *defunct_list, struct proc *proc) +necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_flow_defunct_list *defunct_list, struct proc *proc) { #pragma unused(proc) bool updated_result = FALSE; @@ -2485,27 +3296,34 @@ necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_clien NECP_FD_ASSERT_LOCKED(client_fd); RB_FOREACH(client, _necp_client_tree, 
&client_fd->clients) { + struct necp_client_flow_registration *flow_registration = NULL; + NECP_CLIENT_LOCK(client); - if (!client->defunct) { - updated_result = necp_set_client_defunct(client); - // Prepare close events to be sent to the nexus to effectively remove the flows - struct necp_client_flow *search_flow = NULL; - LIST_FOREACH(search_flow, &client->flow_list, flow_chain) { + // Prepare close events to be sent to the nexus to effectively remove the flows + struct necp_client_flow *search_flow = NULL; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) { if (search_flow->nexus && - !uuid_is_null(search_flow->u.nexus_agent) && - search_flow->requested_nexus) { + !uuid_is_null(search_flow->u.nexus_agent)) { - struct necp_client_defunct *client_defunct; + struct necp_flow_defunct *flow_defunct; // Sleeping alloc won't fail; copy only what's necessary - client_defunct = _MALLOC(sizeof (struct necp_client_defunct), M_NECP, M_WAITOK | M_ZERO); - uuid_copy(client_defunct->nexus_agent, search_flow->u.nexus_agent); - uuid_copy(client_defunct->client_id, client->client_id); - client_defunct->proc_pid = client->proc_pid; + flow_defunct = _MALLOC(sizeof (struct necp_flow_defunct), M_NECP, M_WAITOK | M_ZERO); + uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent); + uuid_copy(flow_defunct->flow_id, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ? 
+ client->client_id : + flow_registration->registration_id)); + flow_defunct->proc_pid = client->proc_pid; + flow_defunct->agent_handle = client->agent_handle; // Add to the list provided by caller - LIST_INSERT_HEAD(defunct_list, client_defunct, chain); + LIST_INSERT_HEAD(defunct_list, flow_defunct, chain); + + flow_registration->defunct = true; + flow_registration->flow_result_read = false; + updated_result = true; } } } @@ -2521,7 +3339,7 @@ necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_clien static inline void necp_update_client_fd_locked(struct necp_fd_data *client_fd, proc_t proc, - struct _necp_client_defunct_list *defunct_list) + struct _necp_flow_defunct_list *defunct_list) { struct necp_client *client = NULL; bool updated_result = FALSE; @@ -2545,7 +3363,7 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy, { struct necp_fd_data *client_fd = NULL; - struct _necp_client_defunct_list defunct_list; + struct _necp_flow_defunct_list defunct_list; LIST_INIT(&defunct_list); NECP_FD_LIST_LOCK_SHARED(); @@ -2569,25 +3387,26 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy, // Handle the case in which some clients became newly defunct if (!LIST_EMPTY(&defunct_list)) { - struct necp_client_defunct *client_defunct = NULL; - struct necp_client_defunct *temp_client_defunct = NULL; + struct necp_flow_defunct *flow_defunct = NULL; + struct necp_flow_defunct *temp_flow_defunct = NULL; // For each newly defunct client, send a message to the nexus to remove the flow - LIST_FOREACH_SAFE(client_defunct, &defunct_list, chain, temp_client_defunct) { - if (!uuid_is_null(client_defunct->nexus_agent)) { - int netagent_error = netagent_client_message(client_defunct->nexus_agent, - client_defunct->client_id, - client_defunct->proc_pid, + LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) { + if (!uuid_is_null(flow_defunct->nexus_agent)) { + int netagent_error = 
netagent_client_message(flow_defunct->nexus_agent, + flow_defunct->flow_id, + flow_defunct->proc_pid, + flow_defunct->agent_handle, NETAGENT_MESSAGE_TYPE_ABORT_NEXUS); if (netagent_error != 0) { char namebuf[MAXCOMLEN+1]; (void) strlcpy(namebuf, "unknown", sizeof (namebuf)); - proc_name(client_defunct->proc_pid, namebuf, sizeof (namebuf)); - NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_update_client abort nexus error (%d) for pid %d %s", netagent_error, client_defunct->proc_pid, namebuf); + proc_name(flow_defunct->proc_pid, namebuf, sizeof (namebuf)); + NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_update_client abort nexus error (%d) for pid %d %s", netagent_error, flow_defunct->proc_pid, namebuf); } } - LIST_REMOVE(client_defunct, chain); - FREE(client_defunct, M_NECP); + LIST_REMOVE(flow_defunct, chain); + FREE(flow_defunct, M_NECP); } } ASSERT(LIST_EMPTY(&defunct_list)); @@ -2646,11 +3465,14 @@ necp_set_client_as_background(proc_t proc, NECP_CLIENT_LOCK(client); bool has_assigned_flow = FALSE; + struct necp_client_flow_registration *flow_registration = NULL; struct necp_client_flow *search_flow = NULL; - LIST_FOREACH(search_flow, &client->flow_list, flow_chain) { - if (search_flow->assigned) { - has_assigned_flow = TRUE; - break; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) { + if (search_flow->assigned) { + has_assigned_flow = TRUE; + break; + } } } @@ -2683,7 +3505,7 @@ necp_fd_memstatus(proc_t proc, uint32_t status, void necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd) { - struct _necp_client_defunct_list defunct_list; + struct _necp_flow_defunct_list defunct_list; ASSERT(proc != PROC_NULL); ASSERT(client_fd != NULL); @@ -2702,22 +3524,23 @@ necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd) NECP_FD_UNLOCK(client_fd); if (!LIST_EMPTY(&defunct_list)) { - struct necp_client_defunct 
*client_defunct = NULL; - struct necp_client_defunct *temp_client_defunct = NULL; + struct necp_flow_defunct *flow_defunct = NULL; + struct necp_flow_defunct *temp_flow_defunct = NULL; // For each defunct client, remove flow from the nexus - LIST_FOREACH_SAFE(client_defunct, &defunct_list, chain, temp_client_defunct) { - if (!uuid_is_null(client_defunct->nexus_agent)) { - int netagent_error = netagent_client_message(client_defunct->nexus_agent, - client_defunct->client_id, - client_defunct->proc_pid, + LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) { + if (!uuid_is_null(flow_defunct->nexus_agent)) { + int netagent_error = netagent_client_message(flow_defunct->nexus_agent, + flow_defunct->flow_id, + flow_defunct->proc_pid, + flow_defunct->agent_handle, NETAGENT_MESSAGE_TYPE_ABORT_NEXUS); if (netagent_error != 0) { NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_defunct_client abort nexus error (%d)", netagent_error); } } - LIST_REMOVE(client_defunct, chain); - FREE(client_defunct, M_NECP); + LIST_REMOVE(flow_defunct, chain); + FREE(flow_defunct, M_NECP); } } ASSERT(LIST_EMPTY(&defunct_list)); @@ -2757,7 +3580,7 @@ necp_client_remove_agent_from_result(struct necp_client *client, uuid_t netagent } void -necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid) +necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_t agent_generation) { struct necp_fd_data *client_fd = NULL; @@ -2768,10 +3591,12 @@ necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid) NECP_FD_LOCK(client_fd); struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id); if (client != NULL) { + client->failed_trigger_agent.generation = agent_generation; + uuid_copy(client->failed_trigger_agent.netagent_uuid, remove_netagent_uuid); if (!uuid_is_null(remove_netagent_uuid)) { necp_client_remove_agent_from_result(client, remove_netagent_uuid); } - client->flow_result_read = FALSE; + 
client->result_read = FALSE; // Found the client, break updated_result = TRUE; NECP_CLIENT_UNLOCK(client); @@ -2799,19 +3624,28 @@ necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid) NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT | \ NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT | \ NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE | \ NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE | \ - NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) - -#define NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR | \ - NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE | \ - NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ - NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ - NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE | \ - NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) - -#define NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ - NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE) + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE | \ + NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) + +#define NECP_PARSED_PARAMETERS_SCOPED_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE | \ + NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) + +#define NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR | \ + NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) + +#define NECP_PARSED_PARAMETERS_PREFERRED_FIELDS (NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT | \ + NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE | \ + 
NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) static bool necp_ifnet_matches_type(struct ifnet *ifp, u_int8_t interface_type, bool check_delegates) @@ -2872,7 +3706,7 @@ necp_ifnet_matches_agent(struct ifnet *ifp, uuid_t *agent_uuid, bool check_deleg } static bool -necp_necp_ifnet_matches_agent_type(struct ifnet *ifp, const char *agent_domain, const char *agent_type, bool check_delegates) +necp_ifnet_matches_agent_type(struct ifnet *ifp, const char *agent_domain, const char *agent_type, bool check_delegates) { struct ifnet *check_ifp = ifp; @@ -2888,13 +3722,10 @@ necp_necp_ifnet_matches_agent_type(struct ifnet *ifp, const char *agent_domain, char if_agent_type[NETAGENT_TYPESIZE] = { 0 }; if (netagent_get_agent_domain_and_type(check_ifp->if_agentids[index], if_agent_domain, if_agent_type)) { - if ((strlen(agent_domain) == 0 || - strncmp(if_agent_domain, agent_domain, NETAGENT_DOMAINSIZE) == 0) && - (strlen(agent_type) == 0 || - strncmp(if_agent_type, agent_type, NETAGENT_TYPESIZE) == 0)) { - ifnet_lock_done(check_ifp); - return (TRUE); - } + if (necp_agent_types_match(agent_domain, agent_type, if_agent_domain, if_agent_type)) { + ifnet_lock_done(check_ifp); + return (TRUE); + } } } } @@ -2952,10 +3783,13 @@ necp_interface_type_is_primary_eligible(u_int8_t interface_type) #define NECP_IFP_IS_ON_ORDERED_LIST(_ifp) ((_ifp)->if_ordered_link.tqe_next != NULL || (_ifp)->if_ordered_link.tqe_prev != NULL) +// Secondary interface flag indicates that the interface is being +// used for multipath or a listener as an extra path static bool necp_ifnet_matches_parameters(struct ifnet *ifp, struct necp_client_parsed_parameters *parsed_parameters, - u_int32_t *preferred_count, bool ignore_require_if) + u_int32_t *preferred_count, + bool secondary_interface) { if (preferred_count) { *preferred_count = 0; @@ -2974,7 +3808,9 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, } } - if (!ignore_require_if && + if ((!secondary_interface || // Enforce interface type if this is the 
primary interface + !(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) || // or if there are no flags + !(parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE)) && // or if the flags don't give an exception (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) && !necp_ifnet_matches_type(ifp, parsed_parameters->required_interface_type, FALSE)) { return (FALSE); @@ -3035,7 +3871,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, break; } - if (!necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->required_netagent_types[i].netagent_domain, parsed_parameters->required_netagent_types[i].netagent_type, FALSE)) { + if (!necp_ifnet_matches_agent_type(ifp, parsed_parameters->required_netagent_types[i].netagent_domain, parsed_parameters->required_netagent_types[i].netagent_type, FALSE)) { return (FALSE); } } @@ -3048,7 +3884,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, break; } - if (necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->prohibited_netagent_types[i].netagent_domain, parsed_parameters->prohibited_netagent_types[i].netagent_type, TRUE)) { + if (necp_ifnet_matches_agent_type(ifp, parsed_parameters->prohibited_netagent_types[i].netagent_domain, parsed_parameters->prohibited_netagent_types[i].netagent_type, TRUE)) { return (FALSE); } } @@ -3075,7 +3911,33 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, break; } - if (necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->preferred_netagent_types[i].netagent_domain, parsed_parameters->preferred_netagent_types[i].netagent_type, TRUE)) { + if (necp_ifnet_matches_agent_type(ifp, parsed_parameters->preferred_netagent_types[i].netagent_domain, parsed_parameters->preferred_netagent_types[i].netagent_type, TRUE)) { + (*preferred_count)++; + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if 
(uuid_is_null(parsed_parameters->avoided_netagents[i])) { + break; + } + + if (!necp_ifnet_matches_agent(ifp, &parsed_parameters->avoided_netagents[i], TRUE)) { + (*preferred_count)++; + } + } + } + + if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) { + for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) { + if (strlen(parsed_parameters->avoided_netagent_types[i].netagent_domain) == 0 && + strlen(parsed_parameters->avoided_netagent_types[i].netagent_type) == 0) { + break; + } + + if (!necp_ifnet_matches_agent_type(ifp, parsed_parameters->avoided_netagent_types[i].netagent_domain, + parsed_parameters->avoided_netagent_types[i].netagent_type, TRUE)) { (*preferred_count)++; } } @@ -3086,7 +3948,8 @@ necp_ifnet_matches_parameters(struct ifnet *ifp, } static bool -necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_parameters, u_int *return_ifindex) +necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_parameters, + u_int *return_ifindex, bool *validate_agents) { struct ifnet *ifp = NULL; u_int32_t best_preferred_count = 0; @@ -3102,12 +3965,12 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_ return (TRUE); } - has_preferred_fields = (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS); + has_preferred_fields = (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_FIELDS); // We have interesting parameters to parse and find a matching interface ifnet_head_lock_shared(); - if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS)) { + if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS)) { // We do have fields to match, but they are only prohibitory // If the first interface in the list matches, or there are no ordered interfaces, we don't need to scope ifp = TAILQ_FIRST(&ifnet_ordered_head); @@ -3137,7 +4000,7 @@ 
necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_ } // Then check the remaining interfaces - if ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS) && + if ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS) && ((!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE)) || !necp_interface_type_is_primary_eligible(parsed_parameters->required_interface_type)) && *return_ifindex == 0) { @@ -3165,13 +4028,21 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_ ifnet_head_done(); - if ((parsed_parameters->valid_fields == (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS)) && + if ((parsed_parameters->valid_fields == (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_FIELDS)) && best_preferred_count == 0) { // If only has preferred fields, and nothing was found, clear the interface index and return TRUE *return_ifindex = 0; return (TRUE); } + if (*return_ifindex == 0 && + !(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS)) { + // Has required fields, but not including specific interface fields. Pass for now, and check + // to see if agents are satisfied by policy. 
+ *validate_agents = TRUE; + return (TRUE); + } + return (*return_ifindex != 0); } @@ -3194,7 +4065,8 @@ necp_open(struct proc *p, struct necp_open_args *uap, int *retval) struct fileproc *fp = NULL; int fd = -1; - if (uap->flags & NECP_OPEN_FLAG_OBSERVER) { + if (uap->flags & NECP_OPEN_FLAG_OBSERVER || + uap->flags & NECP_OPEN_FLAG_PUSH_OBSERVER) { if (necp_skywalk_priv_check_cred(p, kauth_cred_get()) != 0 && priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0) != 0) { NECPLOG0(LOG_ERR, "Client does not hold necessary entitlement to observe other NECP clients"); @@ -3218,6 +4090,7 @@ necp_open(struct proc *p, struct necp_open_args *uap, int *retval) fd_data->necp_fd_type = necp_fd_type_client; fd_data->flags = uap->flags; RB_INIT(&fd_data->clients); + RB_INIT(&fd_data->flows); TAILQ_INIT(&fd_data->update_list); lck_mtx_init(&fd_data->fd_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); klist_init(&fd_data->si.si_note); @@ -3309,11 +4182,12 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client client->parameters_length = uap->buffer_size; client->proc_pid = fd_data->proc_pid; // Save off proc pid in case the client will persist past fd + client->agent_handle = (void *)fd_data; client->platform_binary = ((csproc_get_platform_binary(p) == 0) ? 
0 : 1); - uuid_generate_random(client->client_id); + necp_generate_client_id(client->client_id, false); LIST_INIT(&client->assertion_list); - LIST_INIT(&client->flow_list); + RB_INIT(&client->flow_registrations); error = copyout(client->client_id, uap->client_id, sizeof(uuid_t)); if (error) { @@ -3351,8 +4225,6 @@ static int necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { int error = 0; - struct necp_client *client = NULL; - struct necp_client find = {}; uuid_t client_id = {}; struct ifnet_stats_per_flow flow_ifnet_stats = {}; @@ -3382,15 +4254,27 @@ necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args NECP_FD_LOCK(fd_data); pid_t pid = fd_data->proc_pid; - uuid_copy(find.client_id, client_id); - client = RB_FIND(_necp_client_tree, &fd_data->clients, &find); + struct necp_client *client = necp_client_fd_find_client_unlocked(fd_data, client_id); if (client != NULL) { + // Remove any flow registrations that match + struct necp_client_flow_registration *flow_registration = NULL; + struct necp_client_flow_registration *temp_flow_registration = NULL; + RB_FOREACH_SAFE(flow_registration, _necp_fd_flow_tree, &fd_data->flows, temp_flow_registration) { + if (flow_registration->client == client) { + NECP_FLOW_TREE_LOCK_EXCLUSIVE(); + RB_REMOVE(_necp_client_flow_global_tree, &necp_client_flow_global_tree, flow_registration); + NECP_FLOW_TREE_UNLOCK(); + RB_REMOVE(_necp_fd_flow_tree, &fd_data->flows, flow_registration); + } + } + // Remove client from lists NECP_CLIENT_TREE_LOCK_EXCLUSIVE(); RB_REMOVE(_necp_client_global_tree, &necp_client_global_tree, client); NECP_CLIENT_TREE_UNLOCK(); RB_REMOVE(_necp_client_tree, &fd_data->clients, client); } + NECP_FD_UNLOCK(fd_data); if (client != NULL) { @@ -3400,13 +4284,13 @@ necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args error = ENOENT; NECPLOG(LOG_ERR, "necp_client_remove invalid client_id (%d)", error); } - done: *retval 
= error; return (error); } + static int necp_client_check_tcp_heuristics(struct necp_client *client, struct necp_client_flow *flow, u_int32_t *flags, u_int8_t *tfo_cookie, u_int8_t *tfo_cookie_len) { @@ -3492,188 +4376,278 @@ necp_client_check_tcp_heuristics(struct necp_client *client, struct necp_client_ return (error); } +static size_t +necp_client_calculate_flow_tlv_size(struct necp_client_flow_registration *flow_registration) +{ + size_t assigned_results_size = 0; + struct necp_client_flow *flow = NULL; + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { + if (flow->assigned) { + size_t header_length = 0; + if (flow->nexus) { + header_length = sizeof(struct necp_client_nexus_flow_header); + } else { + header_length = sizeof(struct necp_client_flow_header); + } + assigned_results_size += (header_length + flow->assigned_results_length); + + if (flow->has_protoctl_event) { + assigned_results_size += sizeof(struct necp_client_flow_protoctl_event_header); + } + } + } + return assigned_results_size; +} + +static int +necp_client_fillout_flow_tlvs(struct necp_client *client, + bool client_is_observed, + struct necp_client_flow_registration *flow_registration, + struct necp_client_action_args *uap, + size_t *assigned_results_cursor) +{ + int error = 0; + struct necp_client_flow *flow = NULL; + LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) { + if (flow->assigned) { + // Write TLV headers + struct necp_client_nexus_flow_header header = {}; + u_int32_t length = 0; + u_int32_t flags = 0; + u_int8_t tfo_cookie_len = 0; + u_int8_t type = 0; + + type = NECP_CLIENT_RESULT_FLOW_ID; + length = sizeof(header.flow_header.flow_id); + memcpy(&header.flow_header.flow_id_tlv_header.type, &type, sizeof(type)); + memcpy(&header.flow_header.flow_id_tlv_header.length, &length, sizeof(length)); + uuid_copy(header.flow_header.flow_id, flow_registration->registration_id); + + if (flow->nexus) { + if (flow->check_tcp_heuristics) { + u_int8_t 
tfo_cookie[NECP_TFO_COOKIE_LEN_MAX]; + tfo_cookie_len = NECP_TFO_COOKIE_LEN_MAX; + + if (necp_client_check_tcp_heuristics(client, flow, &flags, + tfo_cookie, &tfo_cookie_len) != 0) { + tfo_cookie_len = 0; + } else { + flow->check_tcp_heuristics = FALSE; + + if (tfo_cookie_len != 0) { + type = NECP_CLIENT_RESULT_TFO_COOKIE; + length = tfo_cookie_len; + memcpy(&header.tfo_cookie_tlv_header.type, &type, sizeof(type)); + memcpy(&header.tfo_cookie_tlv_header.length, &length, sizeof(length)); + memcpy(&header.tfo_cookie_value, tfo_cookie, tfo_cookie_len); + } + } + } + } + + size_t header_length = 0; + if (flow->nexus) { + if (tfo_cookie_len != 0) { + header_length = sizeof(struct necp_client_nexus_flow_header) - (NECP_TFO_COOKIE_LEN_MAX - tfo_cookie_len); + } else { + header_length = sizeof(struct necp_client_nexus_flow_header) - sizeof(struct necp_tlv_header) - NECP_TFO_COOKIE_LEN_MAX; + } + } else { + header_length = sizeof(struct necp_client_flow_header); + } + + type = NECP_CLIENT_RESULT_FLAGS; + length = sizeof(header.flow_header.flags_value); + memcpy(&header.flow_header.flags_tlv_header.type, &type, sizeof(type)); + memcpy(&header.flow_header.flags_tlv_header.length, &length, sizeof(length)); + if (flow->assigned) { + flags |= NECP_CLIENT_RESULT_FLAG_FLOW_ASSIGNED; + } + if (flow->viable) { + flags |= NECP_CLIENT_RESULT_FLAG_FLOW_VIABLE; + } + if (flow_registration->defunct) { + flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT; + } + flags |= flow->necp_flow_flags; + memcpy(&header.flow_header.flags_value, &flags, sizeof(flags)); + + type = NECP_CLIENT_RESULT_INTERFACE; + length = sizeof(header.flow_header.interface_value); + memcpy(&header.flow_header.interface_tlv_header.type, &type, sizeof(type)); + memcpy(&header.flow_header.interface_tlv_header.length, &length, sizeof(length)); + + struct necp_client_result_interface interface_struct; + interface_struct.generation = 0; + interface_struct.index = flow->interface_index; + + memcpy(&header.flow_header.interface_value, 
&interface_struct, sizeof(interface_struct)); + if (flow->nexus) { + type = NECP_CLIENT_RESULT_NETAGENT; + length = sizeof(header.agent_value); + memcpy(&header.agent_tlv_header.type, &type, sizeof(type)); + memcpy(&header.agent_tlv_header.length, &length, sizeof(length)); + + struct necp_client_result_netagent agent_struct; + agent_struct.generation = 0; + uuid_copy(agent_struct.netagent_uuid, flow->u.nexus_agent); + + memcpy(&header.agent_value, &agent_struct, sizeof(agent_struct)); + } + + // Don't include outer TLV header in length field + type = NECP_CLIENT_RESULT_FLOW; + length = (header_length - sizeof(struct necp_tlv_header) + flow->assigned_results_length); + if (flow->has_protoctl_event) { + length += sizeof(struct necp_client_flow_protoctl_event_header); + } + memcpy(&header.flow_header.outer_header.type, &type, sizeof(type)); + memcpy(&header.flow_header.outer_header.length, &length, sizeof(length)); + + error = copyout(&header, uap->buffer + client->result_length + *assigned_results_cursor, header_length); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy assigned results tlv_header copyout error (%d)", error); + return (error); + } + *assigned_results_cursor += header_length; + + if (flow->assigned_results && flow->assigned_results_length) { + // Write inner TLVs + error = copyout(flow->assigned_results, uap->buffer + client->result_length + *assigned_results_cursor, + flow->assigned_results_length); + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy assigned results copyout error (%d)", error); + return (error); + } + } + *assigned_results_cursor += flow->assigned_results_length; + + /* Read the protocol event and reset it */ + if (flow->has_protoctl_event) { + struct necp_client_flow_protoctl_event_header protoctl_event_header = {}; + + type = NECP_CLIENT_RESULT_PROTO_CTL_EVENT; + length = sizeof(protoctl_event_header.protoctl_event); + + memcpy(&protoctl_event_header.protoctl_tlv_header.type, &type, sizeof(type)); + 
memcpy(&protoctl_event_header.protoctl_tlv_header.length, &length, sizeof(length)); + memcpy(&protoctl_event_header.protoctl_event, &flow->protoctl_event, + sizeof(flow->protoctl_event)); + + error = copyout(&protoctl_event_header, uap->buffer + client->result_length + *assigned_results_cursor, + sizeof(protoctl_event_header)); + + if (error) { + NECPLOG(LOG_ERR, "necp_client_copy protocol control event results" + " tlv_header copyout error (%d)", error); + return (error); + } + *assigned_results_cursor += sizeof(protoctl_event_header); + flow->has_protoctl_event = FALSE; + flow->protoctl_event.protoctl_event_code = 0; + flow->protoctl_event.protoctl_event_val = 0; + flow->protoctl_event.protoctl_event_tcp_seq_num = 0; + } + } + } + if (!client_is_observed) { + flow_registration->flow_result_read = TRUE; + } + return (0); +} + static int -necp_client_copy_internal(struct necp_client *client, bool client_is_observed, struct necp_client_action_args *uap, int *retval) +necp_client_copy_internal(struct necp_client *client, uuid_t client_id, bool client_is_observed, struct necp_client_action_args *uap, int *retval) { + NECP_CLIENT_ASSERT_LOCKED(client); int error = 0; // Copy results out if (uap->action == NECP_CLIENT_ACTION_COPY_PARAMETERS) { if (uap->buffer_size < client->parameters_length) { - error = EINVAL; - goto done; + return (EINVAL); } error = copyout(client->parameters, uap->buffer, client->parameters_length); if (error) { NECPLOG(LOG_ERR, "necp_client_copy parameters copyout error (%d)", error); - goto done; + return (error); } *retval = client->parameters_length; } else if (uap->action == NECP_CLIENT_ACTION_COPY_UPDATED_RESULT && - client->result_read && client->flow_result_read) { + client->result_read && !necp_client_has_unread_flows(client)) { // Copy updates only, but nothing to read // Just return 0 for bytes read *retval = 0; } else if (uap->action == NECP_CLIENT_ACTION_COPY_RESULT || uap->action == NECP_CLIENT_ACTION_COPY_UPDATED_RESULT) { size_t 
assigned_results_size = 0; - struct necp_client_flow *flow = NULL; - LIST_FOREACH(flow, &client->flow_list, flow_chain) { - if (flow->nexus || (flow->socket && flow->assigned)) { - size_t header_length = 0; - if (flow->nexus) { - header_length = sizeof(struct necp_client_nexus_flow_header); - } else { - header_length = sizeof(struct necp_client_flow_header); - } - assigned_results_size += (header_length + flow->assigned_results_length); - if (flow->has_protoctl_event) { - assigned_results_size += sizeof(struct necp_client_flow_protoctl_event_header); + bool some_flow_is_defunct = false; + struct necp_client_flow_registration *single_flow_registration = NULL; + if (necp_client_id_is_flow(client_id)) { + single_flow_registration = necp_client_find_flow(client, client_id); + if (single_flow_registration != NULL) { + assigned_results_size += necp_client_calculate_flow_tlv_size(single_flow_registration); + } + } else { + // This request is for the client, so copy everything + struct necp_client_flow_registration *flow_registration = NULL; + RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + if (flow_registration->defunct) { + some_flow_is_defunct = true; } + assigned_results_size += necp_client_calculate_flow_tlv_size(flow_registration); } } if (uap->buffer_size < (client->result_length + assigned_results_size)) { - error = EINVAL; - goto done; + return (EINVAL); + } + + u_int32_t original_flags = 0; + bool flags_updated = false; + if (some_flow_is_defunct && client->legacy_client_is_flow) { + // If our client expects the defunct flag in the client, add it now + u_int32_t client_flags = 0; + u_int32_t value_size = 0; + u_int8_t *flags_pointer = necp_buffer_get_tlv_value(client->result, 0, &value_size); + if (flags_pointer != NULL && value_size == sizeof(client_flags)) { + memcpy(&client_flags, flags_pointer, value_size); + original_flags = client_flags; + client_flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT; + 
(void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS, + sizeof(client_flags), &client_flags, &flags_updated, + client->result, sizeof(client->result)); + } } + error = copyout(client->result, uap->buffer, client->result_length); + + if (flags_updated) { + // Revert stored flags + (void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS, + sizeof(original_flags), &original_flags, &flags_updated, + client->result, sizeof(client->result)); + } + if (error) { NECPLOG(LOG_ERR, "necp_client_copy result copyout error (%d)", error); - goto done; + return (error); } size_t assigned_results_cursor = 0; - - flow = NULL; - LIST_FOREACH(flow, &client->flow_list, flow_chain) { - if (flow->nexus || (flow->socket && flow->assigned)) { - // Write TLV headers - struct necp_client_nexus_flow_header header = {}; - u_int32_t length = 0; - u_int32_t flags = 0; - u_int8_t tfo_cookie_len = 0; - u_int8_t type = 0; - - if (flow->nexus) { - if (flow->check_tcp_heuristics) { - u_int8_t tfo_cookie[NECP_TFO_COOKIE_LEN_MAX]; - tfo_cookie_len = NECP_TFO_COOKIE_LEN_MAX; - - if (necp_client_check_tcp_heuristics(client, flow, &flags, - tfo_cookie, &tfo_cookie_len) != 0) { - tfo_cookie_len = 0; - } else { - flow->check_tcp_heuristics = FALSE; - - if (tfo_cookie_len != 0) { - type = NECP_CLIENT_RESULT_TFO_COOKIE; - length = tfo_cookie_len; - memcpy(&header.tfo_cookie_tlv_header.type, &type, sizeof(type)); - memcpy(&header.tfo_cookie_tlv_header.length, &length, sizeof(length)); - memcpy(&header.tfo_cookie_value, tfo_cookie, tfo_cookie_len); - } - } - } - } - - size_t header_length = 0; - if (flow->nexus) { - if (tfo_cookie_len != 0) { - header_length = sizeof(struct necp_client_nexus_flow_header) - (NECP_TFO_COOKIE_LEN_MAX - tfo_cookie_len); - } else { - header_length = sizeof(struct necp_client_nexus_flow_header) - sizeof(struct necp_tlv_header) - NECP_TFO_COOKIE_LEN_MAX; - } - } else { - header_length = sizeof(struct necp_client_flow_header); - } 
- - type = NECP_CLIENT_RESULT_FLAGS; - length = sizeof(header.flow_header.flags_value); - memcpy(&header.flow_header.flags_tlv_header.type, &type, sizeof(type)); - memcpy(&header.flow_header.flags_tlv_header.length, &length, sizeof(length)); - if (flow->assigned) { - flags |= NECP_CLIENT_RESULT_FLAG_FLOW_ASSIGNED; - } - if (flow->viable) { - flags |= NECP_CLIENT_RESULT_FLAG_FLOW_VIABLE; - } - memcpy(&header.flow_header.flags_value, &flags, sizeof(flags)); - - type = NECP_CLIENT_RESULT_INTERFACE; - length = sizeof(header.flow_header.interface_value); - memcpy(&header.flow_header.interface_tlv_header.type, &type, sizeof(type)); - memcpy(&header.flow_header.interface_tlv_header.length, &length, sizeof(length)); - - struct necp_client_result_interface interface_struct; - interface_struct.generation = 0; - interface_struct.index = flow->interface_index; - - memcpy(&header.flow_header.interface_value, &interface_struct, sizeof(interface_struct)); - if (flow->nexus) { - type = NECP_CLIENT_RESULT_NETAGENT; - length = sizeof(header.agent_value); - memcpy(&header.agent_tlv_header.type, &type, sizeof(type)); - memcpy(&header.agent_tlv_header.length, &length, sizeof(length)); - - struct necp_client_result_netagent agent_struct; - agent_struct.generation = 0; - uuid_copy(agent_struct.netagent_uuid, flow->u.nexus_agent); - - memcpy(&header.agent_value, &agent_struct, sizeof(agent_struct)); + if (necp_client_id_is_flow(client_id)) { + if (single_flow_registration != NULL) { + error = necp_client_fillout_flow_tlvs(client, client_is_observed, single_flow_registration, uap, &assigned_results_cursor); + if (error != 0) { + return (error); } - - // Don't include outer TLV header in length field - type = NECP_CLIENT_RESULT_FLOW; - length = (header_length - sizeof(struct necp_tlv_header) + flow->assigned_results_length); - if (flow->has_protoctl_event) { - length += sizeof(struct necp_client_flow_protoctl_event_header); - } - memcpy(&header.flow_header.outer_header.type, &type, 
sizeof(type)); - memcpy(&header.flow_header.outer_header.length, &length, sizeof(length)); - - error = copyout(&header, uap->buffer + client->result_length + assigned_results_cursor, header_length); - if (error) { - NECPLOG(LOG_ERR, "necp_client_copy assigned results tlv_header copyout error (%d)", error); - goto done; - } - assigned_results_cursor += header_length; - - if (flow->assigned_results && flow->assigned_results_length) { - // Write inner TLVs - error = copyout(flow->assigned_results, uap->buffer + client->result_length + assigned_results_cursor, - flow->assigned_results_length); - if (error) { - NECPLOG(LOG_ERR, "necp_client_copy assigned results copyout error (%d)", error); - goto done; - } - } - assigned_results_cursor += flow->assigned_results_length; - - /* Read the protocol event and reset it */ - if (flow->has_protoctl_event) { - struct necp_client_flow_protoctl_event_header protoctl_event_header = {}; - - type = NECP_CLIENT_RESULT_PROTO_CTL_EVENT; - length = sizeof(protoctl_event_header.protoctl_event); - - memcpy(&protoctl_event_header.protoctl_tlv_header.type, &type, sizeof(type)); - memcpy(&protoctl_event_header.protoctl_tlv_header.length, &length, sizeof(length)); - memcpy(&protoctl_event_header.protoctl_event, &flow->protoctl_event, - sizeof(flow->protoctl_event)); - - error = copyout(&protoctl_event_header, uap->buffer + client->result_length + assigned_results_cursor, - sizeof(protoctl_event_header)); - - if (error) { - NECPLOG(LOG_ERR, "necp_client_copy protocol control event results" - " tlv_header copyout error (%d)", error); - goto done; - } - assigned_results_cursor += sizeof(protoctl_event_header); - flow->has_protoctl_event = FALSE; - flow->protoctl_event.protoctl_event_code = 0; - flow->protoctl_event.protoctl_event_val = 0; - flow->protoctl_event.protoctl_event_tcp_seq_num = 0; + } + } else { + // This request is for the client, so copy everything + struct necp_client_flow_registration *flow_registration = NULL; + 
RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) { + error = necp_client_fillout_flow_tlvs(client, client_is_observed, flow_registration, uap, &assigned_results_cursor); + if (error != 0) { + return (error); } } } @@ -3682,12 +4656,10 @@ necp_client_copy_internal(struct necp_client *client, bool client_is_observed, s if (!client_is_observed) { client->result_read = TRUE; - client->flow_result_read = TRUE; } } -done: - return (error); + return (0); } static int @@ -3701,28 +4673,25 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u *retval = 0; if (uap->buffer_size == 0 || uap->buffer == 0) { - error = EINVAL; - goto done; + return (EINVAL); } if (uap->action != NECP_CLIENT_ACTION_COPY_PARAMETERS && uap->action != NECP_CLIENT_ACTION_COPY_RESULT && uap->action != NECP_CLIENT_ACTION_COPY_UPDATED_RESULT) { - error = EINVAL; - goto done; + return (EINVAL); } if (uap->client_id) { if (uap->client_id_len != sizeof(uuid_t)) { NECPLOG(LOG_ERR, "Incorrect length (got %d, expected %d)", uap->client_id_len, sizeof(uuid_t)); - error = ERANGE; - goto done; + return (ERANGE); } error = copyin(uap->client_id, client_id, sizeof(uuid_t)); if (error) { NECPLOG(LOG_ERR, "necp_client_copy client_id copyin error (%d)", error); - goto done; + return (error); } } @@ -3735,7 +4704,7 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u struct necp_client *find_client = NULL; RB_FOREACH(find_client, _necp_client_tree, &fd_data->clients) { NECP_CLIENT_LOCK(find_client); - if (!find_client->result_read || !find_client->flow_result_read) { + if (!find_client->result_read || necp_client_has_unread_flows(find_client)) { client = find_client; // Leave the client locked, and break break; @@ -3749,7 +4718,7 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u if (client != NULL) { // If client is set, it is locked - error = necp_client_copy_internal(client, FALSE, uap, 
retval); + error = necp_client_copy_internal(client, client_id, FALSE, uap, retval); NECP_CLIENT_UNLOCK(client); } @@ -3765,16 +4734,11 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u bool found_client = FALSE; - struct necp_client find; - uuid_copy(find.client_id, client_id); - client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find); + client = necp_find_client_and_lock(client_id); if (client != NULL) { - NECP_CLIENT_LOCK(client); - // Matched, copy out data found_client = TRUE; - error = necp_client_copy_internal(client, TRUE, uap, retval); - + error = necp_client_copy_internal(client, client_id, TRUE, uap, retval); NECP_CLIENT_UNLOCK(client); } @@ -3783,17 +4747,14 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u // No client found, fail if (!found_client) { - error = ENOENT; - goto done; + return (ENOENT); } } else { // No client found, and not allowed to search other fds, fail - error = ENOENT; - goto done; + return (ENOENT); } } -done: return (error); } @@ -3856,7 +4817,8 @@ necp_client_copy_client_update(struct necp_fd_data *fd_data, struct necp_client_ } static int -necp_client_copy_parameters_locked(struct necp_client *client, struct necp_client_nexus_parameters *parameters) +necp_client_copy_parameters_locked(struct necp_client *client, + struct necp_client_nexus_parameters *parameters) { VERIFY(parameters != NULL); @@ -3890,44 +4852,6 @@ necp_client_copy_parameters_locked(struct necp_client *client, struct necp_clien return (error); } -int -necp_client_copy_parameters(uuid_t client_id, struct necp_client_nexus_parameters *parameters) -{ - int error = 0; - struct necp_client *client = NULL; - - if (parameters == NULL) { - return EINVAL; - } - - // Lock tree - NECP_CLIENT_TREE_LOCK_SHARED(); - - bool found_client = FALSE; - struct necp_client find; - uuid_copy(find.client_id, client_id); - client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find); - 
if (client != NULL) { - NECP_CLIENT_LOCK(client); - - // Matched, parse parameters - found_client = TRUE; - error = necp_client_copy_parameters_locked(client, parameters); - - NECP_CLIENT_UNLOCK(client); - } - - // Unlock tree - NECP_CLIENT_TREE_UNLOCK(); - - // No client found, fail - if (!found_client) { - return ENOENT; - } - - return error; -} - static int necp_client_list(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval) { @@ -4145,6 +5069,7 @@ necp_client_agent_action(struct necp_fd_data *fd_data, struct necp_client_action error = netagent_client_message_with_params(agent_uuid, client_id, fd_data->proc_pid, + client->agent_handle, netagent_message_type, &parsed_parameters, NULL, NULL); @@ -4316,6 +5241,12 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl if ((interface->if_eflags & IFEF_NOACKPRI) == IFEF_NOACKPRI) { interface_details.flags |= NECP_INTERFACE_FLAG_NOACKPRI; } + if ((interface->if_eflags & IFEF_3CA) == IFEF_3CA) { + interface_details.flags |= NECP_INTERFACE_FLAG_3CARRIERAGG; + } + if (IFNET_IS_LOW_POWER(interface)) { + interface_details.flags |= NECP_INTERFACE_FLAG_IS_LOW_POWER; + } interface_details.mtu = interface->if_mtu; u_int8_t ipv4_signature_len = sizeof(interface_details.ipv4_signature.signature); @@ -4439,9 +5370,17 @@ necp_client_update_cache(struct necp_fd_data *fd_data, struct necp_client_action goto done; } + struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id); + if (flow_registration == NULL) { + NECP_CLIENT_UNLOCK(client); + NECP_FD_UNLOCK(fd_data); + error = ENOENT; + goto done; + } + NECP_CLIENT_ROUTE_LOCK(client); // This needs to be changed when TFO/ECN is supported by multiple flows - struct necp_client_flow *flow = LIST_FIRST(&client->flow_list); + struct necp_client_flow *flow = LIST_FIRST(&flow_registration->flow_list); if (flow == NULL || (flow->remote_addr.sa.sa_family != AF_INET && 
flow->remote_addr.sa.sa_family != AF_INET6) || @@ -4608,7 +5547,7 @@ necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *r { #pragma unused(retval) u_int8_t *parameters = NULL; - struct necp_aggregate_result returned_result = {}; + struct necp_aggregate_result returned_result; int error = 0; if (uap == NULL) { @@ -4821,7 +5760,7 @@ necp_get_socket_attributes(struct socket *so, struct sockopt *sopt) void * necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length, struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint, - u_int32_t flow_adv_index, size_t *message_length) + u_int32_t flow_adv_index, void *flow_stats, size_t *message_length) { u_int8_t *buffer = NULL; u_int8_t *cursor = NULL; @@ -4846,6 +5785,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo if (remote_endpoint != NULL) { valsize += sizeof(struct necp_tlv_header) + sizeof(struct necp_client_endpoint); } + if (flow_stats != NULL) { + valsize += sizeof(struct necp_tlv_header) + sizeof(void *); + } if (valsize == 0) { return (NULL); } @@ -4872,6 +5814,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo if (remote_endpoint != NULL) { cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_REMOTE_ENDPOINT, sizeof(struct necp_client_endpoint), remote_endpoint, buffer, valsize); } + if (flow_stats != NULL) { + cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_NEXUS_FLOW_STATS, sizeof(void *), &flow_stats, buffer, valsize); + } *message_length = valsize; @@ -4949,6 +5894,13 @@ necp_client_init(void) /* NOTREACHED */ } + necp_flow_registration_size = sizeof(struct necp_client_flow_registration); + necp_flow_registration_cache = mcache_create(NECP_FLOW_REGISTRATION_ZONE_NAME, necp_flow_registration_size, sizeof (uint64_t), 0, MCR_SLEEP); + if (necp_flow_registration_cache == NULL) { + 
panic("mcache_create(necp_client_flow_registration) failed\n"); + /* NOTREACHED */ + } + necp_client_update_tcall = thread_call_allocate_with_options(necp_update_all_clients_callout, NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE); VERIFY(necp_client_update_tcall != NULL); @@ -4956,13 +5908,15 @@ necp_client_init(void) lck_rw_init(&necp_fd_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); lck_rw_init(&necp_observer_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); lck_rw_init(&necp_client_tree_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); + lck_rw_init(&necp_flow_tree_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); lck_rw_init(&necp_collect_stats_list_lock, necp_fd_mtx_grp, necp_fd_mtx_attr); LIST_INIT(&necp_fd_list); LIST_INIT(&necp_fd_observer_list); - LIST_INIT(&necp_collect_stats_client_list); + LIST_INIT(&necp_collect_stats_flow_list); RB_INIT(&necp_client_global_tree); + RB_INIT(&necp_client_flow_global_tree); return (0); } @@ -4971,5 +5925,6 @@ void necp_client_reap_caches(boolean_t purge) { mcache_reap_now(necp_flow_cache, purge); + mcache_reap_now(necp_flow_registration_cache, purge); } diff --git a/bsd/net/net_kev.h b/bsd/net/net_kev.h index 366b801a3..f7fd5a699 100644 --- a/bsd/net/net_kev.h +++ b/bsd/net/net_kev.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Apple Inc. All rights reserved. + * Copyright (c) 2016-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,7 @@ #define KEV_DL_AWDL_UNRESTRICTED 27 #define KEV_DL_RRC_STATE_CHANGED 28 #define KEV_DL_QOS_MODE_CHANGED 29 +#define KEV_DL_LOW_POWER_MODE_CHANGED 30 #ifdef PRIVATE #define KEV_NETPOLICY_SUBCLASS 3 /* Network policy subclass */ @@ -135,6 +136,7 @@ #define KEV_NETEVENT_SUBCLASS 11 /* Generic Net events subclass */ /* KEV_NETEVENT_SUBCLASS event codes */ #define KEV_NETEVENT_APNFALLBACK 1 +#define KEV_NETEVENT_CLAT46_EVENT 2 #define KEV_MPTCP_SUBCLASS 12 /* Global MPTCP events subclass */ /* KEV_MPTCP_SUBCLASS event codes */ diff --git a/bsd/net/net_stubs.c b/bsd/net/net_stubs.c index 36385a019..24bc5426f 100644 --- a/bsd/net/net_stubs.c +++ b/bsd/net/net_stubs.c @@ -28,6 +28,8 @@ #include +#if !NETWORKING + #define STUB(name) \ int name(void); \ int name(void) \ @@ -36,8 +38,6 @@ return (0); \ } -#if !NETWORKING - STUB(bpf_attach); STUB(bpf_tap_in); STUB(bpf_tap_out); @@ -350,6 +350,7 @@ STUB(ifnet_get_fastlane_capable); STUB(ifnet_get_unsent_bytes); STUB(ifnet_get_buffer_status); STUB(ifnet_normalise_unsent_data); +STUB(ifnet_set_low_power_mode); STUB(in6_localaddr); STUB(in_localaddr); STUB(in6addr_local); @@ -365,7 +366,6 @@ STUB(m_mtod); STUB(m_prepend_2); STUB(m_pullup); STUB(m_split); -STUB(m_trailingspace); STUB(mbuf_get_driver_scratch); STUB(mbuf_get_unsent_data_bytes); STUB(mbuf_get_buffer_status); @@ -461,13 +461,10 @@ STUB(sock_socket_internal); /* * Called from vm_pageout.c. Nothing to be done when there's no networking. 
*/ -void m_drain(void); -void m_drain(void) +void mbuf_drain(boolean_t); +void mbuf_drain(boolean_t) { return; } -#else /* NETWORKING */ - - #endif /* !NETWORKING */ diff --git a/bsd/net/network_agent.c b/bsd/net/network_agent.c index 392665f18..a52cd6506 100644 --- a/bsd/net/network_agent.c +++ b/bsd/net/network_agent.c @@ -417,6 +417,12 @@ netagent_send_error_response(struct netagent_session *session, u_int8_t message_ int error = 0; u_int8_t *response = NULL; size_t response_size = sizeof(struct netagent_message_header); + + if (session == NULL) { + NETAGENTLOG0(LOG_ERR, "Got a NULL session"); + return (EINVAL); + } + MALLOC(response, u_int8_t *, response_size, M_NETAGENT, M_WAITOK); if (response == NULL) { return (ENOMEM); @@ -1038,7 +1044,7 @@ netagent_handle_update_inner(struct netagent_session *session, struct netagent_w search_client = NULL; temp_client = NULL; LIST_FOREACH_SAFE(search_client, &pending_triggers_list_copy, client_chain, temp_client) { - necp_force_update_client(search_client->client_id, session->wrapper->netagent.netagent_uuid); + necp_force_update_client(search_client->client_id, session->wrapper->netagent.netagent_uuid, session->wrapper->generation); netagent_send_cellular_failed_event(new_wrapper, search_client->client_pid, search_client->client_proc_uuid); LIST_REMOVE(search_client, client_chain); FREE(search_client, M_NETAGENT); @@ -1826,7 +1832,7 @@ netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char *type) memcpy(domain, wrapper->netagent.netagent_domain, NETAGENT_DOMAINSIZE); memcpy(type, wrapper->netagent.netagent_type, NETAGENT_TYPESIZE); } else { - NETAGENTLOG0(LOG_DEBUG, "Type requested for invalid netagent"); + NETAGENTLOG0(LOG_ERR, "Type requested for invalid netagent"); } lck_rw_done(&netagent_lock); @@ -1871,6 +1877,7 @@ int netagent_client_message_with_params(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, + void *handle, u_int8_t message_type, struct necp_client_nexus_parameters *parameters, void 
**assigned_results, @@ -1938,13 +1945,16 @@ netagent_client_message_with_params(uuid_t agent_uuid, } if (wrapper->control_unit == 0) { - should_unlock = FALSE; - lck_rw_done(&netagent_lock); if (wrapper->event_handler == NULL) { // No event handler registered for kernel agent error = EINVAL; } else { - error = wrapper->event_handler(message_type, necp_client_uuid, pid, wrapper->event_context, parameters, assigned_results, assigned_results_length); + // We hold the shared lock during the event handler callout, so it is expected + // that the event handler will not lead to any registrations or unregistrations + // of network agents. + error = wrapper->event_handler(message_type, necp_client_uuid, pid, handle, + wrapper->event_context, parameters, + assigned_results, assigned_results_length); if (error != 0) { VERIFY(assigned_results == NULL || *assigned_results == NULL); VERIFY(assigned_results_length == NULL || *assigned_results_length == 0); @@ -1998,9 +2008,9 @@ netagent_client_message_with_params(uuid_t agent_uuid, } int -netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, u_int8_t message_type) +netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, void *handle, u_int8_t message_type) { - return (netagent_client_message_with_params(agent_uuid, necp_client_uuid, pid, message_type, NULL, NULL, NULL)); + return (netagent_client_message_with_params(agent_uuid, necp_client_uuid, pid, handle, message_type, NULL, NULL, NULL)); } int diff --git a/bsd/net/network_agent.h b/bsd/net/network_agent.h index 3e2c86417..0eddfa2aa 100644 --- a/bsd/net/network_agent.h +++ b/bsd/net/network_agent.h @@ -219,11 +219,12 @@ extern bool netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char * extern int netagent_kernel_trigger(uuid_t uuid); -extern int netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, u_int8_t message_type); +extern int netagent_client_message(uuid_t agent_uuid, uuid_t 
necp_client_uuid, pid_t pid, void *handle, u_int8_t message_type); extern int netagent_client_message_with_params(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, + void *handle, u_int8_t message_type, struct necp_client_nexus_parameters *parameters, void **assigned_results, @@ -248,7 +249,7 @@ struct netagent_nexus_agent { #define NETAGENT_EVENT_NEXUS_FLOW_REMOVE NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS #define NETAGENT_EVENT_NEXUS_FLOW_ABORT NETAGENT_MESSAGE_TYPE_ABORT_NEXUS -typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *context, struct necp_client_nexus_parameters *parameters, void **assigned_results, size_t *assigned_results_length); +typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *necp_handle, void *context, struct necp_client_nexus_parameters *parameters, void **assigned_results, size_t *assigned_results_length); extern netagent_session_t netagent_create(netagent_event_f event_handler, void *handle); diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c index e370d7c93..eefb69aaf 100644 --- a/bsd/net/ntstat.c +++ b/bsd/net/ntstat.c @@ -3044,6 +3044,9 @@ nstat_sysinfo_send_data_internal( nstat_set_keyval_scalar(&kv[i++], NSTAT_SYSINFO_MPTCP_CELL_PROXY, data->u.tcp_stats.mptcp_cell_proxy); + nstat_set_keyval_scalar(&kv[i++], + NSTAT_SYSINFO_MPTCP_TRIGGERED_CELL, + data->u.tcp_stats.mptcp_triggered_cell); VERIFY(i == nkeyvals); break; } @@ -5106,3 +5109,97 @@ nstat_control_send( } +static int +tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxduration, struct xtcpprogress_indicators *indicators) +{ + int error = 0; + struct inpcb *inp; + uint64_t min_recent_start_time; + + min_recent_start_time = mach_continuous_time() - recentflow_maxduration; + bzero(indicators, sizeof(*indicators)); + + lck_rw_lock_shared(tcbinfo.ipi_lock); + /* + * For progress indicators we don't need to special case TCP to collect time wait connections + */ + 
LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) + { + struct tcpcb *tp = intotcpcb(inp); + if (tp && inp->inp_last_outifp && + inp->inp_last_outifp->if_index == ifindex && + inp->inp_state != INPCB_STATE_DEAD && + !(tp->t_flags & TF_LOCAL)) + { + struct tcp_conn_status connstatus; + indicators->xp_numflows++; + tcp_get_connectivity_status(tp, &connstatus); + if (connstatus.write_probe_failed) + indicators->xp_write_probe_fails++; + if (connstatus.read_probe_failed) + indicators->xp_read_probe_fails++; + if (connstatus.conn_probe_failed) + indicators->xp_conn_probe_fails++; + if (inp->inp_start_timestamp > min_recent_start_time) + { + uint64_t flow_count; + + indicators->xp_recentflows++; + atomic_get_64(flow_count, &inp->inp_stat->rxbytes); + indicators->xp_recentflows_rxbytes += flow_count; + atomic_get_64(flow_count, &inp->inp_stat->txbytes); + indicators->xp_recentflows_txbytes += flow_count; + + indicators->xp_recentflows_rxooo += tp->t_stat.rxoutoforderbytes; + indicators->xp_recentflows_rxdup += tp->t_stat.rxduplicatebytes; + indicators->xp_recentflows_retx += tp->t_stat.txretransmitbytes; + if (tp->snd_max - tp->snd_una) + { + indicators->xp_recentflows_unacked++; + } + } + } + } + lck_rw_done(tcbinfo.ipi_lock); + + return (error); +} + + +__private_extern__ int +ntstat_tcp_progress_indicators(struct sysctl_req *req) +{ + struct xtcpprogress_indicators indicators = {}; + int error = 0; + struct tcpprogressreq requested; + + if (priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0) != 0) + { + return EACCES; + } + if (req->newptr == USER_ADDR_NULL) + { + return EINVAL; + } + if (req->newlen < sizeof(req)) + { + return EINVAL; + } + error = SYSCTL_IN(req, &requested, sizeof(requested)); + if (error != 0) + { + return error; + } + error = tcp_progress_indicators_for_interface(requested.ifindex, requested.recentflow_maxduration, &indicators); + if (error != 0) + { + return error; + } + error = SYSCTL_OUT(req, &indicators, 
sizeof(indicators)); + + return (error); +} + + + + diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h index af474f67a..82577499f 100644 --- a/bsd/net/ntstat.h +++ b/bsd/net/ntstat.h @@ -316,9 +316,10 @@ enum ,NSTAT_SYSINFO_MPTCP_WIFI_PROXY = 184 ,NSTAT_SYSINFO_MPTCP_CELL_PROXY = 185 ,NSTAT_SYSINFO_ECN_IFNET_FALLBACK_SYNRST = 186 + ,NSTAT_SYSINFO_MPTCP_TRIGGERED_CELL = 187 // NSTAT_SYSINFO_ENUM_VERSION must be updated any time a value is added -#define NSTAT_SYSINFO_ENUM_VERSION 20170623 +#define NSTAT_SYSINFO_ENUM_VERSION 20180416 }; #define NSTAT_SYSINFO_API_FIRST NSTAT_SYSINFO_API_IF_FLTR_ATTACH @@ -1058,9 +1059,11 @@ typedef struct nstat_sysinfo_tcp_stats u_int64_t mptcp_aggregate_all_bytes; u_int32_t mptcp_wifi_proxy; /* Total number of new subflows that fell back to regular TCP on cell */ u_int32_t mptcp_cell_proxy; /* Total number of new subflows that fell back to regular TCP on WiFi */ + u_int32_t mptcp_triggered_cell; /* Total number of times an MPTCP-connection triggered cell bringup */ + u_int32_t _padding; /* When adding/removing here, also adjust NSTAT_SYSINFO_TCP_STATS_COUNT */ } nstat_sysinfo_tcp_stats; -#define NSTAT_SYSINFO_TCP_STATS_COUNT 70 +#define NSTAT_SYSINFO_TCP_STATS_COUNT 71 enum { NSTAT_IFNET_ECN_PROTO_IPV4 = 1 @@ -1158,6 +1161,8 @@ void nstat_ifnet_threshold_reached(unsigned int ifindex); void nstat_sysinfo_send_data(struct nstat_sysinfo_data *); +int ntstat_tcp_progress_indicators(struct sysctl_req *req); + // Utilities for userland stats reporting u_int16_t nstat_ifnet_to_flags(struct ifnet *ifp); diff --git a/bsd/net/packet_mangler.c b/bsd/net/packet_mangler.c index a09e7c74c..24d18870a 100644 --- a/bsd/net/packet_mangler.c +++ b/bsd/net/packet_mangler.c @@ -1068,6 +1068,7 @@ static void chksm_update(mbuf_t data) u_int16_t ip_sum; u_int16_t tsum; struct tcphdr *tcp; + errno_t err; unsigned char *ptr = (unsigned char *)mbuf_data(data); struct ip *ip = (struct ip *)(void *)ptr; @@ -1076,16 +1077,17 @@ static void chksm_update(mbuf_t data) 
} ip->ip_sum = 0; - mbuf_inet_cksum(data, 0, 0, ip->ip_hl << 2, &ip_sum); // ip sum - - ip->ip_sum = ip_sum; + err = mbuf_inet_cksum(data, 0, 0, ip->ip_hl << 2, &ip_sum); // ip sum + if (err == 0) + ip->ip_sum = ip_sum; switch (ip->ip_p) { case IPPROTO_TCP: tcp = (struct tcphdr *)(void *)(ptr + (ip->ip_hl << 2)); tcp->th_sum = 0; - mbuf_inet_cksum(data, IPPROTO_TCP, ip->ip_hl << 2, + err = mbuf_inet_cksum(data, IPPROTO_TCP, ip->ip_hl << 2, ntohs(ip->ip_len) - (ip->ip_hl << 2), &tsum); - tcp->th_sum = tsum; + if (err == 0) + tcp->th_sum = tsum; break; case IPPROTO_UDP: /* Don't handle UDP */ diff --git a/bsd/net/pf.c b/bsd/net/pf.c index 0ddbf167a..70f1f906d 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2017 Apple Inc. All rights reserved. + * Copyright (c) 2007-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,7 +82,6 @@ #include #include -#include #include #include @@ -108,6 +107,7 @@ #include #include #include +#include #include #include @@ -2061,16 +2061,7 @@ pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) u_int16_t pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { - u_int32_t l; - - if (udp && !cksum) - return (0); - l = cksum + old - new; - l = (l >> 16) + (l & 0xffff); - l = l & 0xffff; - if (udp && !l) - return (0xffff); - return (l); + return (nat464_cksum_fixup(cksum, old, new, udp)); } /* @@ -2111,17 +2102,23 @@ pf_change_ap(int dir, pbuf_t *pbuf, struct pf_addr *a, u_int16_t *p, ao.addr16[0], an->addr16[0], 0), ao.addr16[1], an->addr16[1], 0); *p = pn; - /* - * If the packet is originated from an ALG on the NAT gateway - * (source address is loopback or local), in which case the - * TCP/UDP checksum field contains the pseudo header checksum - * that's not yet complemented. A packet generated locally - * will have UDP/TCP CSUM flag set (gets set in protocol - * output). 
- */ + /* + * If the packet is originated from an ALG on the NAT gateway + * (source address is loopback or local), in which case the + * TCP/UDP checksum field contains the pseudo header checksum + * that's not yet complemented. + * In that case we do not need to fixup the checksum for port + * translation as the pseudo header checksum doesn't include ports. + * + * A packet generated locally will have UDP/TCP CSUM flag + * set (gets set in protocol output). + * + * It should be noted that the fixup doesn't do anything if the + * checksum is 0. + */ if (dir == PF_OUT && pbuf != NULL && - (*pbuf->pb_csum_flags & (CSUM_TCP | CSUM_UDP))) { - /* Pseudo-header checksum does not include ports */ + (*pbuf->pb_csum_flags & (CSUM_TCP | CSUM_UDP))) { + /* Pseudo-header checksum does not include ports */ *pc = ~pf_cksum_fixup(pf_cksum_fixup(~*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u); @@ -4062,7 +4059,16 @@ pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer) } if (rt && rt->rt_ifp) { - mss = rt->rt_ifp->if_mtu - hlen - sizeof (struct tcphdr); + /* This is relevant only for PF SYN Proxy */ + int interface_mtu = rt->rt_ifp->if_mtu; + + if (af == AF_INET && + INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) { + interface_mtu = IN6_LINKMTU(rt->rt_ifp); + /* Further adjust the size for CLAT46 expansion */ + interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + mss = interface_mtu - hlen - sizeof (struct tcphdr); mss = max(tcp_mssdflt, mss); rtfree(rt); } @@ -4483,10 +4489,10 @@ pf_nat64_ipv6(pbuf_t *pbuf, int off, struct pf_pdesc *pd) ip4->ip_hl = 5; ip4->ip_tos = pd->tos & htonl(0x0ff00000); ip4->ip_len = htons(sizeof(*ip4) + (pd->tot_len - off)); - ip4->ip_id = 0; - ip4->ip_off = htons(IP_DF); - ip4->ip_ttl = pd->ttl; - ip4->ip_p = pd->proto; + ip4->ip_id = 0; + ip4->ip_off = htons(IP_DF); + ip4->ip_ttl = pd->ttl; + ip4->ip_p = pd->proto; ip4->ip_sum = 0; ip4->ip_src = pd->naddr.v4addr; ip4->ip_dst = pd->ndaddr.v4addr; @@ -4500,7 +4506,7 @@ 
pf_nat64_ipv6(pbuf_t *pbuf, int off, struct pf_pdesc *pd) icmp = (struct icmp *)pbuf_contig_segment(pbuf, hlen, ICMP_MINLEN); if (icmp == NULL) - return (PF_NAT64); + return (PF_DROP); icmp->icmp_cksum = 0; icmp->icmp_cksum = pbuf_inet_cksum(pbuf, 0, hlen, @@ -4628,11 +4634,7 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; - if (icmptype == ICMP_UNREACH || - icmptype == ICMP_SOURCEQUENCH || - icmptype == ICMP_REDIRECT || - icmptype == ICMP_TIMXCEED || - icmptype == ICMP_PARAMPROB) + if (ICMP_ERRORTYPE(icmptype)) state_icmp++; break; #endif /* INET */ @@ -4645,10 +4647,7 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; - if (icmptype == ICMP6_DST_UNREACH || - icmptype == ICMP6_PACKET_TOO_BIG || - icmptype == ICMP6_TIME_EXCEEDED || - icmptype == ICMP6_PARAM_PROB) + if (ICMP6_ERRORTYPE(icmptype)) state_icmp++; break; #endif /* INET6 */ @@ -7374,11 +7373,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, icmpid = pd->hdr.icmp->icmp_id; icmpsum = &pd->hdr.icmp->icmp_cksum; - if (icmptype == ICMP_UNREACH || - icmptype == ICMP_SOURCEQUENCH || - icmptype == ICMP_REDIRECT || - icmptype == ICMP_TIMXCEED || - icmptype == ICMP_PARAMPROB) + if (ICMP_ERRORTYPE(icmptype)) state_icmp++; break; #endif /* INET */ @@ -7388,10 +7383,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, icmpid = pd->hdr.icmp6->icmp6_id; icmpsum = &pd->hdr.icmp6->icmp6_cksum; - if (icmptype == ICMP6_DST_UNREACH || - icmptype == ICMP6_PACKET_TOO_BIG || - icmptype == ICMP6_TIME_EXCEEDED || - icmptype == ICMP6_PARAM_PROB) + if (ICMP6_ERRORTYPE(icmptype)) state_icmp++; break; #endif /* INET6 */ @@ -8735,7 +8727,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_src_node *sn = NULL; int error = 0; uint32_t 
sw_csum; - + int interface_mtu = 0; bzero(&iproute, sizeof (iproute)); if (pbufp == NULL || !pbuf_is_valid(*pbufp) || r == NULL || @@ -8837,7 +8829,15 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, ip_output_checksum(ifp, m0, ((ip->ip_hl) << 2), ntohs(ip->ip_len), &sw_csum); - if (ntohs(ip->ip_len) <= ifp->if_mtu || TSO_IPV4_OK(ifp, m0) || + interface_mtu = ifp->if_mtu; + + if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) { + interface_mtu = IN6_LINKMTU(ifp); + /* Further adjust the size for CLAT46 expansion */ + interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + + if (ntohs(ip->ip_len) <= interface_mtu || TSO_IPV4_OK(ifp, m0) || (!(ip->ip_off & htons(IP_DF)) && (ifp->if_hwassist & CSUM_FRAGMENT))) { ip->ip_sum = 0; @@ -8860,7 +8860,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, ipstat.ips_cantfrag++; if (r->rt != PF_DUPTO) { icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, - ifp->if_mtu); + interface_mtu); goto done; } else goto bad; @@ -8873,7 +8873,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp, NTOHS(ip->ip_off); NTOHS(ip->ip_len); #endif - error = ip_fragment(m0, ifp, ifp->if_mtu, sw_csum); + error = ip_fragment(m0, ifp, interface_mtu, sw_csum); if (error) { m0 = NULL; diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 977751814..395568c48 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -459,6 +459,7 @@ pfinit(void) _CASSERT((SC_AV & SCIDX_MASK) == SCIDX_AV); _CASSERT((SC_RV & SCIDX_MASK) == SCIDX_RV); _CASSERT((SC_VI & SCIDX_MASK) == SCIDX_VI); + _CASSERT((SC_SIG & SCIDX_MASK) == SCIDX_SIG); _CASSERT((SC_VO & SCIDX_MASK) == SCIDX_VO); _CASSERT((SC_CTL & SCIDX_MASK) == SCIDX_CTL); diff --git a/bsd/net/pf_pbuf.c b/bsd/net/pf_pbuf.c index a5b69b226..86cc47c3b 100644 --- a/bsd/net/pf_pbuf.c +++ b/bsd/net/pf_pbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Apple Inc. All rights reserved. + * Copyright (c) 2016-2018 Apple Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -257,8 +257,7 @@ pbuf_resize_segment(pbuf_t *pbuf, int off, int olen, int nlen) } pbuf_sync(pbuf); - } else - if (pbuf->pb_type == PBUF_TYPE_MEMORY) { + } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) { struct pbuf_memory *nm = &pbuf->pb_memory; u_int true_offset, move_len; int delta_len; @@ -280,9 +279,9 @@ pbuf_resize_segment(pbuf_t *pbuf, int off, int olen, int nlen) VERIFY((nm->pm_len + nm->pm_offset) <= nm->pm_buffer_len); pbuf_sync(pbuf); - } else + } else { panic("pbuf_csum_flags_get: bad pb_type: %d", pbuf->pb_type); - + } return (rv); } @@ -293,7 +292,7 @@ pbuf_contig_segment(pbuf_t *pbuf, int off, int len) VERIFY(off >= 0); VERIFY(len >= 0); - VERIFY((u_int)(off + len) < pbuf->pb_packet_len); + VERIFY((u_int)(off + len) <= pbuf->pb_packet_len); /* * Note: If this fails, then the pbuf is destroyed. This is a @@ -301,7 +300,6 @@ pbuf_contig_segment(pbuf_t *pbuf, int off, int len) * * PF expects this behaviour so it's not a real problem. */ - if (pbuf->pb_type == PBUF_TYPE_MBUF) { struct mbuf *n; int moff; diff --git a/bsd/net/pf_pbuf.h b/bsd/net/pf_pbuf.h index 55c7f0aa8..ec6d0333a 100644 --- a/bsd/net/pf_pbuf.h +++ b/bsd/net/pf_pbuf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2016-2018 Apple Inc. All rights reserved. 
* * @APPLE_LICENSE_HEADER_START@ * @@ -68,7 +68,7 @@ typedef struct pbuf { uint32_t pb_packet_len; uint32_t pb_contig_len; uint32_t *pb_csum_flags; - uint32_t *pb_csum_data; + uint32_t *pb_csum_data; /* data field used by csum routines */ uint8_t *pb_proto; uint8_t *pb_flowsrc; uint32_t *pb_flowid; @@ -76,6 +76,7 @@ typedef struct pbuf { struct pf_mtag *pb_pftag; struct ifnet *pb_ifp; struct pbuf *pb_next; + } pbuf_t; #define pbuf_is_valid(pb) (!((pb) == NULL || (pb)->pb_type == PBUF_TYPE_ZOMBIE)) diff --git a/bsd/net/pfvar.h b/bsd/net/pfvar.h index d7ea4c6d1..8b6f61ced 100644 --- a/bsd/net/pfvar.h +++ b/bsd/net/pfvar.h @@ -780,6 +780,7 @@ struct pf_rule { #define SC_AV 0x15 #define SC_RV 0x16 #define SC_VI 0x17 +#define SC_SIG 0x17 #define SC_VO 0x18 #define SC_CTL 0x19 diff --git a/bsd/net/pktap.c b/bsd/net/pktap.c index 66616a73f..41da2471d 100644 --- a/bsd/net/pktap.c +++ b/bsd/net/pktap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2017 Apple Inc. All rights reserved. + * Copyright (c) 2012-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -27,6 +27,7 @@ */ #include +#include #include #include @@ -127,12 +128,17 @@ static LIST_HEAD(pktap_list, pktap_softc) pktap_list = int pktap_clone_create(struct if_clone *, u_int32_t, void *); int pktap_clone_destroy(struct ifnet *); +#define PKTAP_MAXUNIT IF_MAXUNIT +#define PKTAP_ZONE_MAX_ELEM MIN(IFNETS_MAX, PKTAP_MAXUNIT) + static struct if_clone pktap_cloner = IF_CLONE_INITIALIZER(PKTAP_IFNAME, pktap_clone_create, pktap_clone_destroy, 0, - IF_MAXUNIT); + PKTAP_MAXUNIT, + PKTAP_ZONE_MAX_ELEM, + sizeof(struct pktap_softc)); errno_t pktap_if_output(ifnet_t, mbuf_t); errno_t pktap_demux(ifnet_t, mbuf_t, char *, protocol_family_t *); @@ -175,12 +181,17 @@ pktap_hexdump(int mask, void *addr, size_t len) printf("\n"); } +#define _CASSERT_OFFFSETOF_FIELD(s1, s2, f) \ + _CASSERT(offsetof(struct s1, f) == offsetof(struct s2, f)) + __private_extern__ void pktap_init(void) { int error = 0; lck_grp_attr_t *lck_grp_attr = NULL; + _CASSERT_OFFFSETOF_FIELD(pktap_header, pktap_v2_hdr, pth_flags); + /* Make sure we're called only once */ VERIFY(pktap_inited == 0); @@ -212,8 +223,7 @@ pktap_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) PKTAP_LOG(PKTP_LOG_FUNC, "unit %u\n", unit); - pktap = _MALLOC(sizeof(struct pktap_softc), M_DEVBUF, - M_WAITOK | M_ZERO); + pktap = if_clone_softc_allocate(&pktap_cloner); if (pktap == NULL) { printf("%s: _MALLOC failed\n", __func__); error = ENOMEM; @@ -291,10 +301,8 @@ pktap_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) LIST_INSERT_HEAD(&pktap_list, pktap, pktp_link); lck_rw_done(pktap_lck_rw); done: - if (error != 0) { - if (pktap != NULL) - _FREE(pktap, M_DEVBUF); - } + if (error != 0 && pktap != NULL) + if_clone_softc_deallocate(&pktap_cloner, pktap); return (error); } @@ -682,8 +690,7 @@ pktap_detach(ifnet_t ifp) /* Drop reference as it's no more on the global list */ ifnet_release(ifp); - _FREE(pktap, M_DEVBUF); - + 
if_clone_softc_deallocate(&pktap_cloner, pktap); /* This is for the reference taken by ifnet_attach() */ (void) ifnet_release(ifp); } @@ -766,16 +773,15 @@ static void pktap_set_procinfo(struct pktap_header *hdr, struct so_procinfo *soprocinfo) { hdr->pth_pid = soprocinfo->spi_pid; - proc_name(soprocinfo->spi_pid, hdr->pth_comm, MAXCOMLEN); + if (hdr->pth_comm[0] == 0) + proc_name(soprocinfo->spi_pid, hdr->pth_comm, MAXCOMLEN); if (soprocinfo->spi_pid != 0) uuid_copy(hdr->pth_uuid, soprocinfo->spi_uuid); - /* - * When not delegated, the effective pid is the same as the real pid - */ if (soprocinfo->spi_delegated != 0) { hdr->pth_flags |= PTH_FLAG_PROC_DELEGATED; hdr->pth_epid = soprocinfo->spi_epid; + if (hdr->pth_ecomm[0] == 0) proc_name(soprocinfo->spi_epid, hdr->pth_ecomm, MAXCOMLEN); uuid_copy(hdr->pth_euuid, soprocinfo->spi_euuid); } @@ -790,11 +796,6 @@ pktap_finalize_proc_info(struct pktap_header *hdr) if (!(hdr->pth_flags & PTH_FLAG_DELAY_PKTAP)) return; - /* - * Clear the flag as it's internal - */ - hdr->pth_flags &= ~PTH_FLAG_DELAY_PKTAP; - if (hdr->pth_ipproto == IPPROTO_TCP) found = inp_findinpcb_procinfo(&tcbinfo, hdr->pth_flowid, &soprocinfo); @@ -809,13 +810,83 @@ pktap_finalize_proc_info(struct pktap_header *hdr) pktap_set_procinfo(hdr, &soprocinfo); } +static void +pktap_v2_set_procinfo(struct pktap_v2_hdr *pktap_v2_hdr, + struct so_procinfo *soprocinfo) +{ + pktap_v2_hdr->pth_pid = soprocinfo->spi_pid; + + if (soprocinfo->spi_pid != 0 && soprocinfo->spi_pid != -1) { + if (pktap_v2_hdr->pth_comm_offset != 0) { + char *ptr = ((char *)pktap_v2_hdr) + + pktap_v2_hdr->pth_comm_offset; + + proc_name(soprocinfo->spi_pid, + ptr, PKTAP_MAX_COMM_SIZE); + } + if (pktap_v2_hdr->pth_uuid_offset != 0) { + uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) + + pktap_v2_hdr->pth_uuid_offset); + + uuid_copy(*ptr, soprocinfo->spi_uuid); + } + } + + if (!(pktap_v2_hdr->pth_flags & PTH_FLAG_PROC_DELEGATED)) + return; + + /* + * The effective UUID may be set 
independently from the effective pid + */ + if (soprocinfo->spi_delegated != 0) { + pktap_v2_hdr->pth_flags |= PTH_FLAG_PROC_DELEGATED; + pktap_v2_hdr->pth_e_pid = soprocinfo->spi_epid; + + if (soprocinfo->spi_pid != 0 && soprocinfo->spi_pid != -1 && + pktap_v2_hdr->pth_e_comm_offset != 0) { + char *ptr = ((char *)pktap_v2_hdr) + + pktap_v2_hdr->pth_e_comm_offset; + + proc_name(soprocinfo->spi_epid, + ptr, PKTAP_MAX_COMM_SIZE); + } + if (pktap_v2_hdr->pth_e_uuid_offset != 0) { + uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) + + pktap_v2_hdr->pth_e_uuid_offset); + + uuid_copy(*ptr, soprocinfo->spi_euuid); + } + } +} + __private_extern__ void -pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto, - struct mbuf *m, u_int32_t pre, int outgoing, struct ifnet *ifp) +pktap_v2_finalize_proc_info(struct pktap_v2_hdr *pktap_v2_hdr) { - int found = 0; + int found; struct so_procinfo soprocinfo; + if (!(pktap_v2_hdr->pth_flags & PTH_FLAG_DELAY_PKTAP)) + return; + + if (pktap_v2_hdr->pth_ipproto == IPPROTO_TCP) { + found = inp_findinpcb_procinfo(&tcbinfo, + pktap_v2_hdr->pth_flowid, &soprocinfo); + } else if (pktap_v2_hdr->pth_ipproto == IPPROTO_UDP) { + found = inp_findinpcb_procinfo(&udbinfo, + pktap_v2_hdr->pth_flowid, &soprocinfo); + } else { + found = inp_findinpcb_procinfo(&ripcbinfo, + pktap_v2_hdr->pth_flowid, &soprocinfo); + } + if (found == 1) { + pktap_v2_set_procinfo(pktap_v2_hdr, &soprocinfo); + } +} + +__private_extern__ void +pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto, + struct mbuf *m, u_int32_t pre, int outgoing, struct ifnet *ifp) +{ /* * Getting the pid and procname is expensive * For outgoing, do the lookup only if there's an @@ -823,22 +894,54 @@ pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto, */ if (outgoing != 0 && m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) { /* - * To avoid lock ordering issues we delay the process lookup + * To avoid lock ordering issues we delay the proc UUID 
lookup * to the BPF read as we cannot * assume the socket lock is unlocked on output */ - found = 0; hdr->pth_flags |= PTH_FLAG_DELAY_PKTAP; + hdr->pth_flags |= PTH_FLAG_SOCKET; hdr->pth_flowid = m->m_pkthdr.pkt_flowid; - if (m->m_pkthdr.pkt_flags & PKTF_FLOW_RAWSOCK) + + if (m->m_pkthdr.pkt_flags & PKTF_FLOW_RAWSOCK) { hdr->pth_ipproto = IPPROTO_RAW; - else + } else { hdr->pth_ipproto = m->m_pkthdr.pkt_proto; - if (m->m_pkthdr.pkt_flags & PKTF_NEW_FLOW) + } + + if (hdr->pth_ipproto == IPPROTO_TCP) { + hdr->pth_pid = m->m_pkthdr.tx_tcp_pid; + hdr->pth_epid = m->m_pkthdr.tx_tcp_e_pid; + } else if (hdr->pth_ipproto == IPPROTO_UDP) { + hdr->pth_pid = m->m_pkthdr.tx_udp_pid; + hdr->pth_epid = m->m_pkthdr.tx_udp_e_pid; + } else if (hdr->pth_ipproto == IPPROTO_RAW) { + hdr->pth_pid = m->m_pkthdr.tx_rawip_pid; + hdr->pth_epid = m->m_pkthdr.tx_rawip_e_pid; + } + + if (hdr->pth_pid != 0 && hdr->pth_pid != -1) { + proc_name(hdr->pth_pid, hdr->pth_comm, MAXCOMLEN); + } else { + hdr->pth_pid = -1; + } + + if (hdr->pth_epid != 0 && hdr->pth_epid != -1) { + hdr->pth_flags|= PTH_FLAG_PROC_DELEGATED; + proc_name(hdr->pth_epid, hdr->pth_ecomm, MAXCOMLEN); + } else { + hdr->pth_epid = -1; + } + + if (m->m_pkthdr.pkt_flags & PKTF_NEW_FLOW) { hdr->pth_flags |= PTH_FLAG_NEW_FLOW; + } } else if (outgoing == 0) { + int found = 0; + struct so_procinfo soprocinfo; struct inpcb *inp = NULL; + memset(&soprocinfo, 0, sizeof(struct so_procinfo)); + if (proto == PF_INET) { struct ip ip; errno_t error; @@ -969,22 +1072,24 @@ pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto, } } if (inp != NULL) { + hdr->pth_flags |= PTH_FLAG_SOCKET; if (inp->inp_state != INPCB_STATE_DEAD && inp->inp_socket != NULL) { found = 1; inp_get_soprocinfo(inp, &soprocinfo); } in_pcb_checkstate(inp, WNT_RELEASE, 0); } - } done: - /* - * -1 means PID not found - */ - hdr->pth_pid = -1; - hdr->pth_epid = -1; + /* + * -1 means PID not found + */ + hdr->pth_pid = -1; + hdr->pth_epid = -1; + if (found != 
0) pktap_set_procinfo(hdr, &soprocinfo); } +} __private_extern__ void pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, @@ -994,7 +1099,6 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, void (*bpf_tap_func)(ifnet_t, u_int32_t, mbuf_t, void *, size_t) = outgoing ? bpf_tap_out : bpf_tap_in; - /* * Skip the coprocessor interface */ @@ -1084,7 +1188,8 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, hdr->pth_dlt = DLT_APPLE_IP_OVER_IEEE1394; break; case IFT_OTHER: - if (strncmp(ifp->if_name, "utun", strlen("utun")) == 0) { + if (ifp->if_subfamily == IFNET_SUBFAMILY_IPSEC || + ifp->if_subfamily == IFNET_SUBFAMILY_UTUN) { /* * For utun: * - incoming packets do not have the prefix set to four @@ -1141,6 +1246,11 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, hdr->pth_iftype = ifp->if_type; hdr->pth_ifunit = ifp->if_unit; + if (m->m_pkthdr.pkt_flags & PKTF_KEEPALIVE) + hdr->pth_flags |= PTH_FLAG_KEEP_ALIVE; + if (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT) + hdr->pth_flags |= PTH_FLAG_REXMIT; + pktap_fill_proc_info(hdr, proto, m, pre, outgoing, ifp); hdr->pth_svc = so_svc2tc(m->m_pkthdr.pkt_svc); @@ -1212,3 +1322,163 @@ pktap_output(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m, pktap_bpf_tap(ifp, proto, m, pre, post, 1); } + +void +convert_to_pktap_header_to_v2(struct bpf_packet *bpf_pkt, bool truncate) +{ + struct pktap_header *pktap_header; + size_t extra_src_size; + struct pktap_buffer_v2_hdr_extra pktap_buffer_v2_hdr_extra; + struct pktap_v2_hdr_space *pktap_v2_hdr_space; + struct pktap_v2_hdr *pktap_v2_hdr; + uint8_t *ptr; + + pktap_header = (struct pktap_header *)bpf_pkt->bpfp_header; + + if (pktap_header->pth_type_next != PTH_TYPE_PACKET) { + return; + } + + VERIFY(bpf_pkt->bpfp_header_length >= sizeof(struct pktap_header)); + + /* + * extra_src_size is the length of the optional link layer header + */ + extra_src_size = 
bpf_pkt->bpfp_header_length - + sizeof(struct pktap_header); + + VERIFY(extra_src_size <= sizeof(union pktap_header_extra)); + + pktap_v2_hdr_space = &pktap_buffer_v2_hdr_extra.hdr_space; + pktap_v2_hdr = &pktap_v2_hdr_space->pth_hdr; + ptr = (uint8_t *) (pktap_v2_hdr + 1); + + COPY_PKTAP_COMMON_FIELDS_TO_V2(pktap_v2_hdr, pktap_header); + + /* + * When truncating don't bother with the process UUIDs + */ + if (!truncate) { + if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) { + pktap_v2_hdr->pth_uuid_offset = pktap_v2_hdr->pth_length; + pktap_v2_hdr->pth_length += sizeof(uuid_t); + uuid_clear(*(uuid_t *)ptr); + ptr += sizeof(uuid_t); + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } else if (!uuid_is_null(pktap_header->pth_uuid)) { + pktap_v2_hdr->pth_uuid_offset = pktap_v2_hdr->pth_length; + uuid_copy(*(uuid_t *)ptr, pktap_header->pth_uuid); + pktap_v2_hdr->pth_length += sizeof(uuid_t); + ptr += sizeof(uuid_t); + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } + + if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) { + if (pktap_header->pth_flags & PTH_FLAG_PROC_DELEGATED) { + pktap_v2_hdr->pth_e_uuid_offset = pktap_v2_hdr->pth_length; + uuid_clear(*(uuid_t *)ptr); + pktap_v2_hdr->pth_length += sizeof(uuid_t); + ptr += sizeof(uuid_t); + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } + } else if(!uuid_is_null(pktap_header->pth_euuid)) { + pktap_v2_hdr->pth_e_uuid_offset = pktap_v2_hdr->pth_length; + uuid_copy(*(uuid_t *)ptr, pktap_header->pth_euuid); + pktap_v2_hdr->pth_length += sizeof(uuid_t); + ptr += sizeof(uuid_t); + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } + } + + if (pktap_header->pth_ifname[0] != 0) { + size_t strsize; + + pktap_v2_hdr->pth_ifname_offset = pktap_v2_hdr->pth_length; + + /* + * Note: strlcpy() returns the length of the string so we need + * to add one for the end-of-string + */ + strsize = 1 + strlcpy((char *)ptr, pktap_header->pth_ifname, + sizeof(pktap_v2_hdr_space->pth_ifname)); 
+ pktap_v2_hdr->pth_length += strsize; + ptr += strsize; + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } + + /* + * Do not waste space with the process name if we do not have a pid + */ + if (pktap_header->pth_pid != 0 && pktap_header->pth_pid != -1) { + if (pktap_header->pth_comm[0] != 0) { + size_t strsize; + + pktap_v2_hdr->pth_comm_offset = pktap_v2_hdr->pth_length; + + strsize = 1 + strlcpy((char *)ptr, pktap_header->pth_comm, + sizeof(pktap_v2_hdr_space->pth_comm)); + pktap_v2_hdr->pth_length += strsize; + ptr += strsize; + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } else if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) { + size_t strsize = sizeof(pktap_v2_hdr_space->pth_comm); + + pktap_v2_hdr->pth_comm_offset = pktap_v2_hdr->pth_length; + + *ptr = 0; /* empty string by default */ + pktap_v2_hdr->pth_length += strsize; + ptr += strsize; + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } + } + + /* + * Do not waste space with the effective process name if we do not have + * an effective pid or it's the same as the pid + */ + if (pktap_header->pth_epid != 0 && pktap_header->pth_epid != -1 && + pktap_header->pth_epid != pktap_header->pth_pid) { + if (pktap_header->pth_ecomm[0] != 0) { + size_t strsize; + + pktap_v2_hdr->pth_e_comm_offset = pktap_v2_hdr->pth_length; + + strsize = 1 + strlcpy((char *)ptr, pktap_header->pth_ecomm, + sizeof(pktap_v2_hdr_space->pth_e_comm)); + pktap_v2_hdr->pth_length += strsize; + ptr += strsize; + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } else if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) { + size_t strsize = sizeof(pktap_v2_hdr_space->pth_e_comm); + + pktap_v2_hdr->pth_e_comm_offset = pktap_v2_hdr->pth_length; + *ptr = 0; /* empty string by default */ + pktap_v2_hdr->pth_length += strsize; + ptr += strsize; + VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1)); + } + } + + if (extra_src_size > 0) { + char *extra_src_ptr = (char *)(pktap_header + 1); + char 
*extra_dst_ptr = ((char *)pktap_v2_hdr) + + pktap_v2_hdr->pth_length; + + VERIFY(pktap_v2_hdr->pth_length + extra_src_size <= + sizeof(struct pktap_buffer_v2_hdr_extra)); + + memcpy(extra_dst_ptr, extra_src_ptr, extra_src_size); + } + + VERIFY(pktap_v2_hdr->pth_length + extra_src_size <= + bpf_pkt->bpfp_header_length); + + memcpy(bpf_pkt->bpfp_header, pktap_v2_hdr, + pktap_v2_hdr->pth_length + extra_src_size); + + bpf_pkt->bpfp_total_length += pktap_v2_hdr->pth_length - + sizeof(struct pktap_header); + bpf_pkt->bpfp_header_length += pktap_v2_hdr->pth_length - + sizeof(struct pktap_header); +} + diff --git a/bsd/net/pktap.h b/bsd/net/pktap.h index 74b0b5bd1..25ed642fd 100644 --- a/bsd/net/pktap.h +++ b/bsd/net/pktap.h @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef PRIVATE @@ -123,23 +124,113 @@ struct pktap_header { }; /* - * + * The original version 1 of the pktap_header structure always had the field + * pth_type_next set to PTH_TYPE_PACKET */ #define PTH_TYPE_NONE 0 /* No more data following */ #define PTH_TYPE_PACKET 1 /* Actual captured packet data */ -#define PTH_FLAG_DIR_IN 0x0001 /* Outgoing packet */ -#define PTH_FLAG_DIR_OUT 0x0002 /* Incoming packet */ -#define PTH_FLAG_PROC_DELEGATED 0x0004 /* Process delegated */ -#define PTH_FLAG_IF_DELEGATED 0x0008 /* Interface delegated */ +/* + * Size of buffer that can contain any pktap header + * followed by the optional 4 bytes protocol field + * or 16 bytes link layer header + */ +union pktap_header_extra { + uint8_t llhdr[16]; + uint32_t proto; +}; + +/* + * Version 2 version of the header + * + * The field pth_flags is at the same offset as the orignal pktap_header and + * the flag PTH_FLAG_V2_HDR allows to differentiate the header version. 
+ */ + +#define PKTAP_MAX_COMM_SIZE (MAXCOMLEN + 1) + +struct pktap_v2_hdr { + uint8_t pth_length; /* length of this header */ + uint8_t pth_uuid_offset; /* max size: sizeof(uuid_t) */ + uint8_t pth_e_uuid_offset; /* max size: sizeof(uuid_t) */ + uint8_t pth_ifname_offset; /* max size: PKTAP_IFXNAMESIZE*/ + uint8_t pth_comm_offset; /* max size: PKTAP_MAX_COMM_SIZE */ + uint8_t pth_e_comm_offset; /* max size: PKTAP_MAX_COMM_SIZE */ + uint16_t pth_dlt; /* DLT of packet */ + uint16_t pth_frame_pre_length; + uint16_t pth_frame_post_length; + uint16_t pth_iftype; + uint16_t pth_ipproto; + uint32_t pth_protocol_family; + uint32_t pth_svc; /* service class */ + uint32_t pth_flowid; + pid_t pth_pid; /* process ID */ + pid_t pth_e_pid; /* effective process ID */ + uint32_t pth_flags; /* flags */ +}; + +struct pktap_v2_hdr_space { + struct pktap_v2_hdr pth_hdr; + uint8_t pth_uuid[sizeof(uuid_t)]; + uint8_t pth_e_uuid[sizeof(uuid_t)]; + uint8_t pth_ifname[PKTAP_IFXNAMESIZE]; + uint8_t pth_comm[PKTAP_MAX_COMM_SIZE]; + uint8_t pth_e_comm[PKTAP_MAX_COMM_SIZE]; +}; + +struct pktap_buffer_v2_hdr_extra { + struct pktap_v2_hdr_space hdr_space; + union pktap_header_extra extra; +}; + +#define COPY_PKTAP_COMMON_FIELDS_TO_V2(pktap_v2_hdr_dst, pktap_header_src) { \ + (pktap_v2_hdr_dst)->pth_length = sizeof(struct pktap_v2_hdr); \ + (pktap_v2_hdr_dst)->pth_uuid_offset = 0; \ + (pktap_v2_hdr_dst)->pth_e_uuid_offset = 0; \ + (pktap_v2_hdr_dst)->pth_ifname_offset = 0; \ + (pktap_v2_hdr_dst)->pth_comm_offset = 0; \ + (pktap_v2_hdr_dst)->pth_e_comm_offset = 0; \ + (pktap_v2_hdr_dst)->pth_dlt = (pktap_header_src)->pth_dlt; \ + (pktap_v2_hdr_dst)->pth_frame_pre_length = (pktap_header_src)->pth_frame_pre_length; \ + (pktap_v2_hdr_dst)->pth_frame_post_length = (pktap_header_src)->pth_frame_post_length; \ + (pktap_v2_hdr_dst)->pth_iftype = (pktap_header_src)->pth_iftype; \ + (pktap_v2_hdr_dst)->pth_ipproto = (pktap_header_src)->pth_ipproto; \ + (pktap_v2_hdr_dst)->pth_protocol_family = 
(pktap_header_src)->pth_protocol_family; \ + (pktap_v2_hdr_dst)->pth_svc = (pktap_header_src)->pth_svc; \ + (pktap_v2_hdr_dst)->pth_flowid = (pktap_header_src)->pth_flowid; \ + (pktap_v2_hdr_dst)->pth_pid = (pktap_header_src)->pth_pid; \ + (pktap_v2_hdr_dst)->pth_e_pid = (pktap_header_src)->pth_epid; \ + (pktap_v2_hdr_dst)->pth_flags = (pktap_header_src)->pth_flags; \ + (pktap_v2_hdr_dst)->pth_flags |= PTH_FLAG_V2_HDR; \ +} + +/* + * Values for field pth_flags + */ +#define PTH_FLAG_DIR_IN 0x00000001 /* Outgoing packet */ +#define PTH_FLAG_DIR_OUT 0x00000002 /* Incoming packet */ +#define PTH_FLAG_PROC_DELEGATED 0x00000004 /* Process delegated */ +#define PTH_FLAG_IF_DELEGATED 0x00000008 /* Interface delegated */ #ifdef BSD_KERNEL_PRIVATE -#define PTH_FLAG_DELAY_PKTAP 0x1000 /* Finalize pktap header on read */ +#define PTH_FLAG_DELAY_PKTAP 0x00001000 /* Finalize pktap header on read */ #endif /* BSD_KERNEL_PRIVATE */ -#define PTH_FLAG_TSTAMP 0x2000 /* Has time stamp */ -#define PTH_FLAG_NEW_FLOW 0x4000 /* Packet from a new flow */ -#define PTH_FLAG_MSFSW 0x8000 /* Multi stack flow switch */ +#define PTH_FLAG_TSTAMP 0x00002000 /* Has time stamp */ +#define PTH_FLAG_NEW_FLOW 0x00004000 /* Packet from a new flow */ +#define PTH_FLAG_REXMIT 0x00008000 /* Packet is a retransmission */ +#define PTH_FLAG_KEEP_ALIVE 0x00010000 /* Is keep alive packet */ +#define PTH_FLAG_SOCKET 0x00020000 /* Packet on a Socket */ +#define PTH_FLAG_NEXUS_CHAN 0x00040000 /* Packet on a nexus channel */ +#define PTH_FLAG_V2_HDR 0x00080000 /* Version 2 of pktap */ #ifdef BSD_KERNEL_PRIVATE + +#include + +struct pktap_header_buffer { + struct pktap_header pkth; + union pktap_header_extra extra; +} ; + extern uint32_t pktap_total_tap_count; extern void pktap_init(void); @@ -149,7 +240,8 @@ extern void pktap_output(struct ifnet *, protocol_family_t, struct mbuf *, extern void pktap_fill_proc_info(struct pktap_header *, protocol_family_t , struct mbuf *, u_int32_t , int , struct ifnet *); extern 
void pktap_finalize_proc_info(struct pktap_header *); - +extern void pktap_v2_finalize_proc_info(struct pktap_v2_hdr *); +extern void convert_to_pktap_header_to_v2(struct bpf_packet *bpf_pkt, bool truncate); #endif /* BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ diff --git a/bsd/net/pktsched/pktsched_fq_codel.c b/bsd/net/pktsched/pktsched_fq_codel.c index c30dc2eb1..425173a5f 100644 --- a/bsd/net/pktsched/pktsched_fq_codel.c +++ b/bsd/net/pktsched/pktsched_fq_codel.c @@ -206,6 +206,7 @@ fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc) case MBUF_SC_AV: case MBUF_SC_RV: case MBUF_SC_VI: + case MBUF_SC_SIG: pri = FQ_IF_VI_INDEX; break; case MBUF_SC_VO: @@ -245,6 +246,9 @@ fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc) case MBUF_SC_VI: pri = FQ_IF_VI_INDEX; break; + case MBUF_SC_SIG: + pri = FQ_IF_SIG_INDEX; + break; case MBUF_SC_VO: pri = FQ_IF_VO_INDEX; break; @@ -827,6 +831,10 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags, fq_if_classq_init(fqs, FQ_IF_VO_INDEX, 600, 8, MBUF_SC_VO); } else { + /* SIG shares same INDEX with VI */ + _CASSERT(SCIDX_SIG == SCIDX_VI); + _CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX); + fq_if_classq_init(fqs, FQ_IF_BK_SYS_INDEX, 1500, 2, MBUF_SC_BK_SYS); fq_if_classq_init(fqs, FQ_IF_BK_INDEX, 1500, diff --git a/bsd/net/pktsched/pktsched_fq_codel.h b/bsd/net/pktsched/pktsched_fq_codel.h index be7629a71..0929882a6 100644 --- a/bsd/net/pktsched/pktsched_fq_codel.h +++ b/bsd/net/pktsched/pktsched_fq_codel.h @@ -105,6 +105,7 @@ enum fq_if_state { #define FQ_IF_AV_INDEX 4 #define FQ_IF_RV_INDEX 3 #define FQ_IF_VI_INDEX 2 +#define FQ_IF_SIG_INDEX 2 #define FQ_IF_VO_INDEX 1 #define FQ_IF_CTL_INDEX 0 diff --git a/bsd/net/route.c b/bsd/net/route.c index 1eda0fa30..986c2f6a2 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include @@ -1687,6 +1688,16 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, ifa = NULL; } + /* + * 
ifa's address family must match destination's address family + * after all is said and done. + */ + if (ifa != NULL && + ifa->ifa_addr->sa_family != dst->sa_family) { + IFA_REMREF(ifa); + ifa = NULL; + } + return (ifa); } @@ -3464,8 +3475,15 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * If rmx_mtu is not locked, update it * to the MTU used by the new interface. */ - if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) + if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) { rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + if (dst->sa_family == AF_INET && + INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) { + rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp); + /* Further adjust the size for CLAT46 expansion */ + rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + } /* * Now ask the protocol to check if it needs diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index dff054212..8ae08e206 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,8 @@ #include #include #include +#include +#include #include extern struct rtstat rtstat; @@ -552,12 +554,7 @@ route_output(struct mbuf *m, struct socket *so) struct ifaddr *ifa2; report: cred = kauth_cred_proc_ref(current_proc()); - - if (rt->rt_ifp == lo_ifp || - route_op_entitlement_check(so, NULL, ROUTE_OP_READ, TRUE) != 0) - credp = &cred; - else - credp = NULL; + credp = &cred; ifa2 = NULL; RT_LOCK_ASSERT_HELD(rt); @@ -961,8 +958,15 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, * If rmx_mtu is not locked, update it * to the MTU used by the new interface. 
*/ - if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) + if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) { rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + if (rt_key(rt)->sa_family == AF_INET && + INTF_ADJUST_MTU_FOR_CLAT46(ifp)) { + rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp); + /* Further adjust the size for CLAT46 expansion */ + rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + } if (rt->rt_ifa != NULL) { IFA_LOCK_SPIN(rt->rt_ifa); @@ -1522,15 +1526,25 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) kauth_cred_t *credp; cred = kauth_cred_proc_ref(current_proc()); - if (rt->rt_ifp == lo_ifp || - route_op_entitlement_check(NULL, cred, ROUTE_OP_READ, TRUE) != 0) - credp = &cred; - else - credp = NULL; + credp = &cred; RT_LOCK(rt); - if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) + if ((w->w_op == NET_RT_FLAGS || w->w_op == NET_RT_FLAGS_PRIV) && + !(rt->rt_flags & w->w_arg)) goto done; + + /* + * If the matching route has RTF_LLINFO set, then we can skip scrubbing the MAC + * only if the outgoing interface is not loopback and the process has entitlement + * for neighbor cache read. 
+ */ + if (w->w_op == NET_RT_FLAGS_PRIV && (rt->rt_flags & RTF_LLINFO)) { + if (rt->rt_ifp != lo_ifp && + (route_op_entitlement_check(NULL, cred, ROUTE_OP_READ, TRUE) == 0)) { + credp = NULL; + } + } + bzero((caddr_t)&info, sizeof (info)); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; @@ -1720,6 +1734,12 @@ sysctl_iflist(int af, struct walkarg *w) IFA_UNLOCK(ifa); continue; } + if (ifa->ifa_addr->sa_family == AF_INET6 && + (((struct in6_ifaddr *)ifa)->ia6_flags & + IN6_IFF_CLAT46) != 0) { + IFA_UNLOCK(ifa); + continue; + } info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; @@ -1877,6 +1897,13 @@ sysctl_iflist2(int af, struct walkarg *w) IFA_UNLOCK(ifa); continue; } + if (ifa->ifa_addr->sa_family == AF_INET6 && + (((struct in6_ifaddr *)ifa)->ia6_flags & + IN6_IFF_CLAT46) != 0) { + IFA_UNLOCK(ifa); + continue; + } + info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; @@ -2051,6 +2078,7 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS case NET_RT_DUMP: case NET_RT_DUMP2: case NET_RT_FLAGS: + case NET_RT_FLAGS_PRIV: lck_mtx_lock(rnh_lock); for (i = 1; i <= AF_MAX; i++) if ((rnh = rt_tables[i]) && (af == 0 || af == i) && diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h index 2176a0299..7e19edab7 100644 --- a/bsd/netinet/icmp6.h +++ b/bsd/netinet/icmp6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -229,6 +229,10 @@ struct mld_hdr { #define mld_v2_reserved mld_icmp6_hdr.icmp6_data16[0] #define mld_v2_numrecs mld_icmp6_hdr.icmp6_data16[1] + +#define ICMP6_ERRORTYPE(type) \ + ((type) == ICMP6_DST_UNREACH || (type) == ICMP6_PACKET_TOO_BIG || \ + (type) == ICMP6_TIME_EXCEEDED || (type) == ICMP6_PARAM_PROB) /* * Neighbor Discovery */ diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index cbb7f8cb6..61de1526d 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -2603,3 +2603,57 @@ in_lltattach(struct ifnet *ifp) return (llt); } + +struct in_ifaddr* +inifa_ifpwithflag(struct ifnet * ifp, uint32_t flag) +{ + struct ifaddr *ifa; + + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_link) + { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); + continue; + } + if ((((struct in_ifaddr *)ifa)->ia_flags & flag) == flag) { + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); + break; + } + IFA_UNLOCK(ifa); + } + ifnet_lock_done(ifp); + + return ((struct in_ifaddr *)ifa); +} + +struct in_ifaddr * +inifa_ifpclatv4(struct ifnet * ifp) +{ + struct ifaddr *ifa; + + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_link) + { + uint32_t addr = 0; + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); + continue; + } + + addr = ntohl(SIN(ifa->ifa_addr)->sin_addr.s_addr); + if (!IN_LINKLOCAL(addr) && + !IN_LOOPBACK(addr)) { + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); + break; + } + IFA_UNLOCK(ifa); + } + ifnet_lock_done(ifp); + + return ((struct in_ifaddr *)ifa); +} diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index 07732679a..5a8400e22 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -392,6 +392,10 @@ struct sockaddr_in { char sin_zero[8]; }; +#define IN_ARE_ADDR_EQUAL(a, b) \ + (bcmp(&(a)->s_addr, &(b)->s_addr, \ + sizeof (struct in_addr)) == 0) + #ifdef PRIVATE /* * sockaddr_in with scope ID field; this is used internally to keep @@ -811,6 +815,8 @@ union sockaddr_in_4_6 { struct sockaddr_in6 sin6; }; +#define CLAT46_HDR_EXPANSION_OVERHD (sizeof(struct ip6_hdr) - sizeof(struct ip)) + /* * Recommended DiffServ Code Point values */ @@ -880,6 +886,8 @@ extern uint32_t in_cksum_mbuf_ref(struct mbuf *, int, int, uint32_t); extern int in_getconninfo(struct socket *, sae_connid_t, uint32_t *, uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *, uint32_t *, user_addr_t, uint32_t *); +extern struct in_ifaddr * inifa_ifpwithflag(struct ifnet *, uint32_t); +extern struct in_ifaddr * inifa_ifpclatv4(struct ifnet *); #define in_cksum(_m, _l) \ inet_cksum(_m, 0, 0, _l) diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index 674da52bf..2b717a5d9 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -83,6 +83,8 @@ #include #include +#include +#include #include #include @@ -1955,8 +1957,14 @@ arp_ip_handle_input(ifnet_t ifp, u_short arpop, * If rmx_mtu is not locked, update it * to the MTU used by the new interface. 
*/ - if (!(route->rt_rmx.rmx_locks & RTV_MTU)) + if (!(route->rt_rmx.rmx_locks & RTV_MTU)) { route->rt_rmx.rmx_mtu = route->rt_ifp->if_mtu; + if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) { + route->rt_rmx.rmx_mtu = IN6_LINKMTU(route->rt_ifp); + /* Further adjust the size for CLAT46 expansion */ + route->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + } rtsetifa(route, &best_ia->ia_ifa); gateway->sdl_index = ifp->if_index; diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c index 29594c123..538794d8d 100644 --- a/bsd/netinet/in_cksum.c +++ b/bsd/netinet/in_cksum.c @@ -305,7 +305,6 @@ inet_cksum_buffer(const void *buffer, uint32_t nxt, uint32_t off, } #if DEBUG || DEVELOPMENT -#include #include #define CKSUM_ERR kprintf diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index b74b0af2f..3d2e8c91d 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -92,6 +92,7 @@ #include #include #include +#include #include #include @@ -132,8 +133,6 @@ static boolean_t inpcb_garbage_collecting = FALSE; /* gc timer is scheduled */ static boolean_t inpcb_ticking = FALSE; /* "slow" timer is scheduled */ static boolean_t inpcb_fast_timer_on = FALSE; -extern char *proc_best_name(proc_t); - #define INPCB_GCREQ_THRESHOLD 50000 static thread_call_t inpcb_thread_call, inpcb_fast_thread_call; @@ -219,6 +218,8 @@ static boolean_t apn_fallbk_enabled = TRUE; SYSCTL_DECL(_net_inet); SYSCTL_NODE(_net_inet, OID_AUTO, apn_fallback, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "APN Fallback"); +SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED, + &apn_fallbk_enabled, 0, "APN fallback enable"); SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &apn_fallbk_debug, 0, "APN fallback debug enable"); #else @@ -806,7 +807,8 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) uid_t u; #if !CONFIG_EMBEDDED - if (ntohs(lport) < IPPORT_RESERVED) { + if (ntohs(lport) < IPPORT_RESERVED && + SIN(nam)->sin_addr.s_addr != 0) { cred = kauth_cred_proc_ref(p); error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); @@ -1148,7 +1150,7 @@ apn_fallback_required (proc_t proc, struct socket *so, struct sockaddr_in *p_dst } static void -apn_fallback_trigger(proc_t proc) +apn_fallback_trigger(proc_t proc, struct socket *so) { pid_t pid = 0; struct kev_msg ev_msg; @@ -1168,8 +1170,14 @@ apn_fallback_trigger(proc_t proc) ev_msg.event_code = KEV_NETEVENT_APNFALLBACK; bzero(&apnfallbk_data, sizeof(apnfallbk_data)); - apnfallbk_data.epid = pid; - uuid_copy(apnfallbk_data.euuid, application_uuid); + + if (so->so_flags & SOF_DELEGATED) { + apnfallbk_data.epid = so->e_pid; + uuid_copy(apnfallbk_data.euuid, so->e_uuid); + } else { + apnfallbk_data.epid = so->last_pid; + uuid_copy(apnfallbk_data.euuid, so->last_uuid); + } ev_msg.dv[0].data_ptr = 
&apnfallbk_data; ev_msg.dv[0].data_length = sizeof(apnfallbk_data); @@ -1306,7 +1314,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr, if (apn_fallback_required(proc, inp->inp_socket, (void *)nam)) - apn_fallback_trigger(proc); + apn_fallback_trigger(proc, inp->inp_socket); goto done; } @@ -1333,6 +1341,20 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr, RT_CONVERT_LOCK(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); IFA_ADDREF(&ia->ia_ifa); + + /* + * Mark the control block for notification of + * a possible flow that might undergo clat46 + * translation. + * + * We defer the decision to a later point when + * inpcb is being disposed of. + * The reason is that we only want to send notification + * if the flow was ever used to send data. + */ + if (IS_INTF_CLAT46(ro->ro_rt->rt_ifp)) + inp->inp_flags2 |= INP2_CLAT46_FLOW; + RT_UNLOCK(ro->ro_rt); error = 0; } @@ -1464,6 +1486,11 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, int error; struct socket *so = inp->inp_socket; +#if CONTENT_FILTER + if (so) + so->so_state_change_cnt++; +#endif + /* * Call inner routine, to assign local interface address. */ @@ -1548,6 +1575,11 @@ in_pcbdisconnect(struct inpcb *inp) inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; +#if CONTENT_FILTER + if (so) + so->so_state_change_cnt++; +#endif + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) { /* lock inversion issue, mostly with udp multicast packets */ socket_unlock(so, 0); @@ -1624,6 +1656,35 @@ in_pcbdetach(struct inpcb *inp) inp->inp_moptions = NULL; sofreelastref(so, 0); inp->inp_state = INPCB_STATE_DEAD; + + /* + * Enqueue an event to send kernel event notification + * if the flow has to CLAT46 for data packets + */ + if (inp->inp_flags2 & INP2_CLAT46_FLOW) { + /* + * If there has been any exchange of data bytes + * over this flow. + * Schedule a notification to report that flow is + * using client side translation.
+ */ + if (inp->inp_stat != NULL && + (inp->inp_stat->txbytes != 0 || + inp->inp_stat->rxbytes !=0)) { + if (so->so_flags & SOF_DELEGATED) { + in6_clat46_event_enqueue_nwk_wq_entry( + IN6_CLAT46_EVENT_V4_FLOW, + so->e_pid, + so->e_uuid); + } else { + in6_clat46_event_enqueue_nwk_wq_entry( + IN6_CLAT46_EVENT_V4_FLOW, + so->last_pid, + so->last_uuid); + } + } + } + /* makes sure we're not called twice from so_close */ so->so_flags |= SOF_PCBCLEARING; diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 588a4d054..e1a7c9941 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -221,7 +221,7 @@ struct inpcb { } inp_necp_attributes; struct necp_inpcb_result inp_policyresult; uuid_t necp_client_uuid; - void (*necp_cb)(void *, int, struct necp_client_flow *); + necp_client_flow_cb necp_cb; #endif u_char *inp_keepalive_data; /* for keepalive offload */ u_int8_t inp_keepalive_datalen; /* keepalive data length */ @@ -692,7 +692,7 @@ struct inpcbinfo { IN6P_RTHDR|IN6P_RTHDRDSTOPTS|IN6P_TCLASS|IN6P_RFC2292|IN6P_MTU) #define INP_UNMAPPABLEOPTS \ - (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR| IN6P_TCLASS|IN6P_AUTOFLOWLABEL) + (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|IN6P_AUTOFLOWLABEL) /* * Flags for inp_flags2. 
@@ -706,8 +706,9 @@ struct inpcbinfo { #define INP2_INHASHLIST 0x00000010 /* pcb is in inp_hash list */ #define INP2_AWDL_UNRESTRICTED 0x00000020 /* AWDL restricted mode allowed */ #define INP2_KEEPALIVE_OFFLOAD 0x00000040 /* Enable UDP or TCP keepalive offload */ -#define INP2_INTCOPROC_ALLOWED 0x00000080 /* Allow communication via internal co-processor interfaces */ -#define INP2_CONNECT_IN_PROGRESS 0x00000100 /* A connect call is in progress, so binds are intermediate steps */ +#define INP2_INTCOPROC_ALLOWED 0x00000080 /* Allow communication via internal co-processor interfaces */ +#define INP2_CONNECT_IN_PROGRESS 0x00000100 /* A connect call is in progress, so binds are intermediate steps */ +#define INP2_CLAT46_FLOW 0x00000200 /* The flow is going to use CLAT46 path */ /* * Flags passed to in_pcblookup*() functions. diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c index 266754acc..7865d6a1e 100644 --- a/bsd/netinet/in_pcblist.c +++ b/bsd/netinet/in_pcblist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2017 Apple Inc. All rights reserved. + * Copyright (c) 2010-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,6 +64,8 @@ #include #include #include +#include +#include #include #include #include @@ -89,6 +91,8 @@ #include #include +#include + #ifndef ROUNDUP64 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t)) #endif @@ -99,6 +103,8 @@ static void inpcb_to_xinpcb_n(struct inpcb *, struct xinpcb_n *); static void tcpcb_to_xtcpcb_n(struct tcpcb *, struct xtcpcb_n *); +void shutdown_sockets_on_interface(struct ifnet *ifp); + __private_extern__ void sotoxsocket_n(struct socket *so, struct xsocket_n *xso) @@ -442,10 +448,39 @@ inpcb_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags, (so->so_state & SS_ISDISCONNECTED)) continue; - if (!(protocol == PF_UNSPEC || - (protocol == PF_INET && (inp->inp_vflag & INP_IPV4)) || - (protocol == PF_INET6 && (inp->inp_vflag & INP_IPV6)))) - continue; + /* + * If protocol is specified, filter out inpcbs that + * are not relevant to the protocol family of interest. + */ + if (protocol != PF_UNSPEC) { + if (protocol == PF_INET) { + /* + * If protocol of interest is IPv4, skip the inpcb + * if the family is not IPv4. + * OR + * If the family is IPv4, skip if the IPv4 flow is + * CLAT46 translated. + */ + if ((inp->inp_vflag & INP_IPV4) == 0 || + (inp->inp_flags2 & INP2_CLAT46_FLOW) != 0) { + continue; + } + } else if (protocol == PF_INET6) { + /* + * If protocol of interest is IPv6, skip the inpcb + * if the family is not IPv6. + * AND + * The flow is not a CLAT46'd flow. 
+ */ + if ((inp->inp_vflag & INP_IPV6) == 0 && + (inp->inp_flags2 & INP2_CLAT46_FLOW) == 0) { + continue; + } + } else { + /* Protocol family not supported */ + continue; + } + } if (SOCK_PROTO(inp->inp_socket) != IPPROTO_UDP && SOCK_PROTO(inp->inp_socket) != IPPROTO_TCP) @@ -631,3 +666,87 @@ inpcb_find_anypcb_byaddr(struct ifaddr *ifa, struct inpcbinfo *pcbinfo) lck_rw_done(pcbinfo->ipi_lock); return (0); } + +static int +shutdown_sockets_on_interface_proc_callout(proc_t p, void *arg) +{ + struct filedesc *fdp; + int i; + struct ifnet *ifp = (struct ifnet *)arg; + + if (ifp == NULL) + return (PROC_RETURNED); + + proc_fdlock(p); + fdp = p->p_fd; + for (i = 0; i < fdp->fd_nfiles; i++) { + struct fileproc *fp = fdp->fd_ofiles[i]; + struct fileglob *fg; + struct socket *so; + struct inpcb *inp; + struct ifnet *inp_ifp; + int error; + + if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0) { + continue; + } + + fg = fp->f_fglob; + if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) + continue; + + so = (struct socket *)fp->f_fglob->fg_data; + if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) + continue; + + inp = (struct inpcb *)so->so_pcb; + + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + + socket_lock(so, 1); + + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + socket_unlock(so, 1); + continue; + } + + if (inp->inp_boundifp != NULL) { + inp_ifp = inp->inp_boundifp; + } else if (inp->inp_last_outifp != NULL) { + inp_ifp = inp->inp_last_outifp; + } else { + socket_unlock(so, 1); + continue; + } + + if (inp_ifp != ifp && inp_ifp->if_delegated.ifp != ifp) { + socket_unlock(so, 1); + continue; + } + error = sosetdefunct(p, so, 0, TRUE); + if (error != 0) { + log(LOG_ERR, "%s: sosetdefunct() error %d", + __func__, error); + } else { + error = sodefunct(p, so, 0); + if (error != 0) { + log(LOG_ERR, "%s: sodefunct() error %d", + __func__, error); + } + } + + socket_unlock(so, 1); + } + proc_fdunlock(p); + + return 
(PROC_RETURNED); +} + +void +shutdown_sockets_on_interface(struct ifnet *ifp) +{ + proc_iterate(PROC_ALLPROCLIST, + shutdown_sockets_on_interface_proc_callout, + ifp, NULL, NULL); +} diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index 1c0fc3696..4aa686402 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,9 @@ #include #include #include +#include +#include +#include extern int tvtohz(struct timeval *); @@ -163,8 +166,14 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, } if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) && - rt->rt_ifp) + rt->rt_ifp) { rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + if (INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) { + rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp); + /* Further adjust the size for CLAT46 expansion */ + rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + } ret = rn_addroute(v_arg, n_arg, head, treenodes); if (ret == NULL && (rt->rt_flags & RTF_HOST)) { diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c index ff40a2872..530161345 100644 --- a/bsd/netinet/in_tclass.c +++ b/bsd/netinet/in_tclass.c @@ -260,8 +260,6 @@ mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE]; #if (DEVELOPMENT || DEBUG) -extern char *proc_best_name(proc_t p); - static int tfp_count = 0; static TAILQ_HEAD(, tclass_for_proc) tfp_head = @@ -1308,6 +1306,9 @@ so_tc2msc(int tc) case _SO_TC_VI: msc = MBUF_SC_VI; break; + case SO_TC_NETSVC_SIG: + msc = MBUF_SC_SIG; + break; case SO_TC_VO: case _SO_TC_VO: msc = MBUF_SC_VO; @@ -1344,6 +1345,8 @@ so_svc2tc(mbuf_svc_class_t svc) return (SO_TC_RV); case MBUF_SC_VI: return (SO_TC_VI); + case MBUF_SC_SIG: + return (SO_TC_NETSVC_SIG); case MBUF_SC_VO: return (SO_TC_VO); case MBUF_SC_CTL: diff --git a/bsd/netinet/ip6.h b/bsd/netinet/ip6.h index 
8088cb4cb..7d181e66c 100644 --- a/bsd/netinet/ip6.h +++ b/bsd/netinet/ip6.h @@ -132,7 +132,7 @@ struct ip6_hdr { #if BYTE_ORDER == LITTLE_ENDIAN #define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */ #define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */ -#define IPV6_FLOW_ECN_MASK 0x00000300 /* the 2 ECN bits */ +#define IPV6_FLOW_ECN_MASK 0x00003000 /* the 2 ECN bits */ #endif /* LITTLE_ENDIAN */ #endif #if 1 @@ -141,8 +141,6 @@ struct ip6_hdr { #define IP6TOS_ECT 0x02 /* ECN-capable transport */ #endif -#define IP6FLOW_ECN_MASK 0x00300000 - /* * To access the 6 bits of the DSCP value in the 32 bits ip6_flow field */ diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index 122698fa4..c9f566822 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -2421,7 +2421,7 @@ dummynet_get(struct sockopt *sopt) for (i = 0; i < 10; i++) { size = dn_calc_size(is64user); lck_mtx_unlock(dn_mutex); - buf = _MALLOC(size, M_TEMP, M_WAITOK); + buf = _MALLOC(size, M_TEMP, M_WAITOK | M_ZERO); if (buf == NULL) return(ENOBUFS); lck_mtx_lock(dn_mutex); diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c index 3d69b2029..9b365b2c0 100644 --- a/bsd/netinet/ip_fw2.c +++ b/bsd/netinet/ip_fw2.c @@ -3646,7 +3646,7 @@ ipfw_ctl(struct sockopt *sopt) struct ip_old_fw *buf2, *rule_vers0; lck_mtx_lock(ipfw_mutex); - buf2 = _MALLOC(static_count * sizeof(struct ip_old_fw), M_TEMP, M_WAITOK); + buf2 = _MALLOC(static_count * sizeof(struct ip_old_fw), M_TEMP, M_WAITOK | M_ZERO); if (buf2 == 0) { lck_mtx_unlock(ipfw_mutex); error = ENOBUFS; @@ -3687,7 +3687,7 @@ ipfw_ctl(struct sockopt *sopt) buf_size = static_count * ipfwcompsize + dyn_count * ipfwdyncompsize; - buf2 = _MALLOC(buf_size, M_TEMP, M_WAITOK); + buf2 = _MALLOC(buf_size, M_TEMP, M_WAITOK | M_ZERO); if (buf2 == 0) { lck_mtx_unlock(ipfw_mutex); error = ENOBUFS; diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index e67b329ca..260449c30 100644 --- a/bsd/netinet/ip_icmp.c +++ 
b/bsd/netinet/ip_icmp.c @@ -207,58 +207,84 @@ icmp_error( u_int32_t dest, u_int32_t nextmtu) { - struct ip *oip, *nip; - struct icmp *icp; - struct mbuf *m; - u_int32_t oiphlen, icmplen, icmpelen, nlen; - + struct ip *oip = NULL; + struct ip *nip = NULL; + struct icmp *icp = NULL; + struct mbuf *m = NULL; + u_int32_t oiphlen = 0; + u_int32_t icmplen = 0; + u_int32_t icmpelen = 0; + u_int32_t nlen = 0; + + VERIFY((u_int)type <= ICMP_MAXTYPE); /* Expect 32-bit aligned data pointer on strict-align platforms */ MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(n); + if (type != ICMP_REDIRECT) + icmpstat.icps_error++; + /* + * Don't send error: + * if not the first fragment of message + * if original packet was a multicast or broadcast packet + * if the old packet protocol was ICMP + * error message, only known informational types. + */ + if (n->m_flags & (M_BCAST|M_MCAST)) + goto freeit; + + /* + * Drop if IP header plus ICMP_MINLEN bytes are not contiguous + * in first mbuf. + */ + if (n->m_len < sizeof(struct ip) + ICMP_MINLEN) + goto freeit; + oip = mtod(n, struct ip *); oiphlen = IP_VHL_HL(oip->ip_vhl) << 2; + if (n->m_len < oiphlen + ICMP_MINLEN) + goto freeit; #if (DEBUG | DEVELOPMENT) if (icmpprintfs > 1) printf("icmp_error(0x%llx, %x, %d)\n", (uint64_t)VM_KERNEL_ADDRPERM(oip), type, code); #endif - if (type != ICMP_REDIRECT) - icmpstat.icps_error++; - /* - * Don't send error if not the first fragment of message. - * Don't error if the old packet protocol was ICMP - * error message, only known informational types. 
- */ + if (oip->ip_off & ~(IP_MF|IP_DF)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && - n->m_len >= oiphlen + ICMP_MINLEN && - !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))-> - icmp_type)) { + n->m_len >= oiphlen + ICMP_MINLEN && + !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))-> + icmp_type)) { icmpstat.icps_oldicmp++; goto freeit; } - /* - * Don't send error in response to a multicast or - * broadcast packet - */ - if (n->m_flags & (M_BCAST|M_MCAST)) - goto freeit; /* * Calculate the length to quote from original packet and prevent * the ICMP mbuf from overflowing. + * Unfortunately this is non-trivial since ip_forward() + * sends us truncated packets. */ nlen = m_length(n); if (oip->ip_p == IPPROTO_TCP) { - struct tcphdr *th; - u_int16_t tcphlen; + struct tcphdr *th = NULL; + u_int16_t tcphlen = 0; + /* + * If the packet got truncated and TCP header + * is not contained in the packet, send out + * standard reply with only IP header as payload + */ if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; + + /* + * Otherwise, pull up to get IP and TCP headers + * together + */ if (n->m_len < (oiphlen + sizeof(struct tcphdr)) && (n = m_pullup(n, (oiphlen + sizeof(struct tcphdr)))) == NULL) goto freeit; @@ -274,6 +300,8 @@ icmp_error( sizeof(u_int32_t)))) goto freeit; tcphlen = th->th_off << 2; + + /* Sanity checks */ if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (oip->ip_len < (oiphlen + tcphlen)) @@ -297,11 +325,14 @@ icmp_error( stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen, (oip->ip_len - oiphlen))); - icmplen = min(oiphlen + icmpelen, min(nlen, oip->ip_len)); + icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; + /* * First, formulate icmp message + * Allocate enough space for the IP header, ICMP header + * and the payload (part of the original message to be sent back).
*/ if (MHLEN > (sizeof(struct ip) + ICMP_MINLEN + icmplen)) m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */ @@ -311,24 +342,20 @@ stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen, if (m == NULL) goto freeit; - if (n->m_flags & M_SKIP_FIREWALL) { - /* - * set M_SKIP_FIREWALL to skip firewall check, since - * we're called from firewall - */ - m->m_flags |= M_SKIP_FIREWALL; - } - #if CONFIG_MACF_NET mac_mbuf_label_associate_netlayer(n, m); #endif - m->m_len = icmplen + ICMP_MINLEN; /* for ICMP header and data */ - MH_ALIGN(m, m->m_len); + /* + * Further refine the payload length to the space + * remaining in mbuf after including the IP header and ICMP + * header. + */ + icmplen = min(icmplen, M_TRAILINGSPACE(m) - + sizeof(struct ip) - ICMP_MINLEN); + m_align(m, ICMP_MINLEN + icmplen); + m->m_len = ICMP_MINLEN + icmplen; /* for ICMP header and data */ + icp = mtod(m, struct icmp *); - if ((u_int)type > ICMP_MAXTYPE) { - m_freem(m); - goto freeit; - } icmpstat.icps_outhist[type]++; icp->icmp_type = type; if (type == ICMP_REDIRECT) @@ -349,6 +376,11 @@ stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen, } icp->icmp_code = code; + + /* + * Copy icmplen worth of content from original + * mbuf (n) to the new packet after ICMP header. + */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; @@ -360,13 +392,12 @@ stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen, HTONS(nip->ip_off); #endif /* - * Now, copy old ip header (without options) - * in front of icmp message. - */ - if (m->m_data - sizeof(struct ip) < m->m_pktdat) { - m_freem(m); - goto freeit; - } + * Set up ICMP message mbuf and copy old IP header (without options) + * in front of ICMP message. + * If the original mbuf was meant to bypass the firewall, the error + * reply should bypass as well.
+ */ + m->m_flags |= n->m_flags & M_SKIP_FIREWALL; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; @@ -379,7 +410,6 @@ stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen, nip->ip_tos = 0; nip->ip_off = 0; icmp_reflect(m); - freeit: m_freem(n); } diff --git a/bsd/netinet/ip_icmp.h b/bsd/netinet/ip_icmp.h index 3438a1bdb..2de986d94 100644 --- a/bsd/netinet/ip_icmp.h +++ b/bsd/netinet/ip_icmp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2013 Apple Inc. All rights reserved. + * Copyright (c) 2008-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -227,6 +227,11 @@ struct icmp { (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) +#define ICMP_ERRORTYPE(type) \ + ((type) == ICMP_UNREACH || (type) == ICMP_SOURCEQUENCH || \ + (type) == ICMP_REDIRECT || (type) == ICMP_TIMXCEED || \ + (type) == ICMP_PARAMPROB) + #ifdef BSD_KERNEL_PRIVATE void icmp_error(struct mbuf *, int, int, n_long, u_int32_t); void icmp_input(struct mbuf *, int); diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 97750ef57..70ee5fac6 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -4197,6 +4197,16 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, goto no_mbufs; } } + if (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) { + uint64_t time; + + time = mach_continuous_time(); + mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof (time), + SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } if (inp->inp_flags & INP_RECVDSTADDR) { mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst, sizeof (struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp); diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index f5b51ac52..35f778d25 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,6 +105,9 @@ #include #include #include +#include + +#include #if CONFIG_MACF_NET #include @@ -350,6 +353,8 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, uint32_t raw; } ipobf = { .raw = 0 }; + int interface_mtu = 0; + /* * Here we check for restrictions when sending frames. * N.B.: IPv4 over internal co-processor interfaces is not allowed. @@ -357,7 +362,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \ (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \ ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \ - (IFNET_IS_INTCOPROC(_ifp)) || \ + (IFNET_IS_INTCOPROC(_ifp)) || \ (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp))) if (ip_output_measure) @@ -1822,11 +1827,19 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2), ip->ip_len, &sw_csum); + interface_mtu = ifp->if_mtu; + + if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) { + interface_mtu = IN6_LINKMTU(ifp); + /* Further adjust the size for CLAT46 expansion */ + interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + /* * If small enough for interface, or the interface will take * care of the fragmentation for us, can just send directly. */ - if ((u_short)ip->ip_len <= ifp->if_mtu || TSO_IPV4_OK(ifp, m) || + if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) || (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) { #if BYTE_ORDER != BIG_ENDIAN HTONS(ip->ip_len); @@ -1899,6 +1912,8 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, goto loopit; } } + + VERIFY(interface_mtu != 0); /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. 
@@ -1918,8 +1933,8 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, RT_LOCK_SPIN(ro->ro_rt); if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && - (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { - ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu; } RT_UNLOCK(ro->ro_rt); } @@ -1930,7 +1945,46 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, goto bad; } - error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum); + /* + * XXX Only TCP seems to be passing a list of packets here. + * The following issue is limited to UDP datagrams with 0 checksum. + * For now limit it to the case when single packet is passed down. + */ + if (packetchain == 0 && IS_INTF_CLAT46(ifp)) { + /* + * If it is a UDP packet that has checksum set to 0 + * and is also not being offloaded, compute a full checksum + * and update the UDP checksum. + */ + if (ip->ip_p == IPPROTO_UDP && + !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) { + struct udphdr *uh = NULL; + + if (m->m_len < hlen + sizeof (struct udphdr)) { + m = m_pullup(m, hlen + sizeof (struct udphdr)); + if (m == NULL) { + error = ENOBUFS; + m0 = m; + goto bad; + } + m0 = m; + ip = mtod(m, struct ip *); + } + /* + * Get UDP header and if checksum is 0, then compute the full + * checksum. 
+ */ + uh = (struct udphdr *)(void *)((caddr_t)ip + hlen); + if (uh->uh_sum == 0) { + uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen, + ip->ip_len - hlen); + if (uh->uh_sum == 0) + uh->uh_sum = 0xffff; + } + } + } + + error = ip_fragment(m, ifp, interface_mtu, sw_csum); if (error != 0) { m0 = m = NULL; goto bad; @@ -2029,6 +2083,16 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) hlen = ip->ip_hl << 2; #endif /* !_IP_VHL */ +#ifdef INET6 + /* + * We need to adjust the fragment sizes to account + * for IPv6 fragment header if it needs to be translated + * from IPv4 to IPv6. + */ + if (IS_INTF_CLAT46(ifp)) + mtu -= sizeof(struct ip6_frag); + +#endif firstlen = len = (mtu - hlen) &~ 7; if (len < 8) { m_freem(m); @@ -3435,6 +3499,19 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) return (ifa); } +/* + * @brief Given outgoing interface it determines what checksum needs + * to be computed in software and what needs to be offloaded to the + * interface. + * + * @param ifp Pointer to the outgoing interface + * @param m Pointer to the packet + * @param hlen IP header length + * @param ip_len Total packet size i.e. 
headers + data payload + * @param sw_csum Pointer to a software checksum flag set + * + * @return void + */ void ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len, uint32_t *sw_csum) @@ -3458,6 +3535,14 @@ ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len, *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) & m->m_pkthdr.csum_flags); } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) { + int interface_mtu = ifp->if_mtu; + + if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) { + interface_mtu = IN6_LINKMTU(ifp); + /* Further adjust the size for CLAT46 expansion */ + interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + /* * Partial checksum offload, if non-IP fragment, and TCP only * (no UDP support, as the hardware may not be able to convert @@ -3468,7 +3553,7 @@ ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len, ((m->m_pkthdr.csum_flags & CSUM_TCP) || ((hwcap & CSUM_ZERO_INVERT) && (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) && - ip_len <= ifp->if_mtu) { + ip_len <= interface_mtu) { uint16_t start = sizeof (struct ip); uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff; m->m_pkthdr.csum_flags |= diff --git a/bsd/netinet/isakmp.h b/bsd/netinet/isakmp.h new file mode 100644 index 000000000..299e90a66 --- /dev/null +++ b/bsd/netinet/isakmp.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * RFC 2408 Internet Security Association and Key Management Protocol + */ + +#ifndef _NETINET_ISAKMP_H_ +#define _NETINET_ISAKMP_H_ + +typedef u_char cookie_t[8]; +typedef u_char msgid_t[4]; + +/* 3.1 ISAKMP Header Format (IKEv1 and IKEv2) + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ! Initiator ! + ! Cookie ! + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ! Responder ! + ! Cookie ! + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ! Next Payload ! MjVer ! MnVer ! Exchange Type ! Flags ! + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ! Message ID ! + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ! Length ! 
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct isakmp { + cookie_t i_ck; /* Initiator Cookie */ + cookie_t r_ck; /* Responder Cookie */ + uint8_t np; /* Next Payload Type */ + uint8_t vers; +#define ISAKMP_VERS_MAJOR 0xf0 +#define ISAKMP_VERS_MAJOR_SHIFT 4 +#define ISAKMP_VERS_MINOR 0x0f +#define ISAKMP_VERS_MINOR_SHIFT 0 + uint8_t etype; /* Exchange Type */ + uint8_t flags; /* Flags */ + msgid_t msgid; + uint32_t len; /* Length */ +}; + +/* 3.2 Payload Generic Header + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + ! Next Payload ! RESERVED ! Payload Length ! + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + */ +struct isakmp_gen { + uint8_t np; /* Next Payload */ + uint8_t critical; /* bit 7 - critical, rest is RESERVED */ + uint16_t len; /* Payload Length */ +}; + +#endif /* _NETINET_ISAKMP_H_ */ diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c index ea54abae4..ad5e5f664 100644 --- a/bsd/netinet/kpi_ipfilter.c +++ b/bsd/netinet/kpi_ipfilter.c @@ -278,10 +278,15 @@ ipf_inject_input( struct mbuf *m = (struct mbuf *)data; struct m_tag *mtag = 0; struct ip *ip = mtod(m, struct ip *); + struct ip6_hdr *ip6; u_int8_t vers; int hlen; errno_t error = 0; protocol_family_t proto; + struct in_ifaddr *ia = NULL; + struct in_addr *pkt_dst = NULL; + struct in6_ifaddr *ia6 = NULL; + struct sockaddr_in6 pkt_dst6; vers = IP_VHL_V(ip->ip_vhl); @@ -298,7 +303,46 @@ ipf_inject_input( } if (filter_ref == 0 && m->m_pkthdr.rcvif == 0) { - m->m_pkthdr.rcvif = lo_ifp; + /* + * Search for interface with the local address + */ + switch (proto) { + case PF_INET: + pkt_dst = &ip->ip_dst; + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst->s_addr), ia_hash) { + if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst->s_addr) { + m->m_pkthdr.rcvif = ia->ia_ifp; + break; + } + } + lck_rw_done(in_ifaddr_rwlock); + break; 
+ + case PF_INET6: + ip6 = mtod(m, struct ip6_hdr *); + pkt_dst6.sin6_addr = ip6->ip6_dst; + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia6 = in6_ifaddrs; ia6 != NULL; ia6 = ia6->ia_next) { + if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &pkt_dst6.sin6_addr)) { + m->m_pkthdr.rcvif = ia6->ia_ifp; + break; + } + } + lck_rw_done(&in6_ifaddr_rwlock); + break; + + default: + break; + } + + /* + * If none found, fallback to loopback + */ + if (m->m_pkthdr.rcvif == NULL) { + m->m_pkthdr.rcvif = lo_ifp; + } + m->m_pkthdr.csum_data = 0; m->m_pkthdr.csum_flags = 0; if (vers == 4) { diff --git a/bsd/netinet/kpi_ipfilter.h b/bsd/netinet/kpi_ipfilter.h index 392d0650b..ae9adf13d 100644 --- a/bsd/netinet/kpi_ipfilter.h +++ b/bsd/netinet/kpi_ipfilter.h @@ -50,14 +50,14 @@ struct ipf_pktopts { int ippo_mcast_loop; u_int8_t ippo_mcast_ttl; }; -#define IPPOF_MCAST_OPTS 0x1 +#define IPPOF_MCAST_OPTS 0x1 #ifdef PRIVATE -#define IPPOF_BOUND_IF 0x2 -#define IPPOF_NO_IFT_CELLULAR 0x4 -#define IPPOF_SELECT_SRCIF 0x8 -#define IPPOF_BOUND_SRCADDR 0x10 -#define IPPOF_SHIFT_IFSCOPE 16 -#define IPPOF_NO_IFF_EXPENSIVE 0x20 +#define IPPOF_BOUND_IF 0x2 +#define IPPOF_NO_IFT_CELLULAR 0x4 +#define IPPOF_SELECT_SRCIF 0x8 +#define IPPOF_BOUND_SRCADDR 0x10 +#define IPPOF_SHIFT_IFSCOPE 16 +#define IPPOF_NO_IFF_EXPENSIVE 0x20 #endif /* PRIVATE */ typedef struct ipf_pktopts *ipf_pktopts_t; diff --git a/bsd/netinet/mp_pcb.h b/bsd/netinet/mp_pcb.h index f8fb188c0..5d1cd3ef0 100644 --- a/bsd/netinet/mp_pcb.h +++ b/bsd/netinet/mp_pcb.h @@ -43,10 +43,6 @@ typedef enum mppcb_state { MPPCB_STATE_DEAD = 2, } mppcb_state_t; - -/* net/necp.h already includes mp_pcb.h - so we have to forward-declare */ -struct necp_client_flow; - /* * Multipath Protocol Control Block */ @@ -61,7 +57,7 @@ struct mppcb { #if NECP uuid_t necp_client_uuid; - void (*necp_cb)(void *, int, struct necp_client_flow *); + void (*necp_cb)(void *, int, uint32_t, uint32_t, bool *); #endif }; @@ -120,6 +116,10 @@ extern void 
mptcp_timer_sched(void); extern void mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag); extern int mp_getsockaddr(struct socket *mp_so, struct sockaddr **nam); extern int mp_getpeeraddr(struct socket *mp_so, struct sockaddr **nam); +#if NECP +extern int necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp); +extern void necp_mppcb_dispose(struct mppcb *mpp); +#endif __END_DECLS #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/netinet/mptcp.c b/bsd/netinet/mptcp.c index 55829e0a1..80db1552d 100644 --- a/bsd/netinet/mptcp.c +++ b/bsd/netinet/mptcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2017 Apple Inc. All rights reserved. + * Copyright (c) 2012-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -499,13 +499,6 @@ mptcp_input(struct mptses *mpte, struct mbuf *m) m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN; } - if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) || - !LIST_EMPTY(&mp_tp->mpt_segq)) { - mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m); - - goto next; - } - mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN); if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) { if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen), @@ -531,6 +524,14 @@ mptcp_input(struct mptses *mpte, struct mbuf *m) MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE); } + if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) || + !LIST_EMPTY(&mp_tp->mpt_segq)) { + mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m); + + goto next; + } + mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN); + mptcp_sbrcv_grow(mp_tp); if (sbappendstream_rcvdemux(mp_so, m, 0, 0)) @@ -885,9 +886,8 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr /* * Only handover if Symptoms tells us to do so. 
*/ - if (IFNET_IS_WIFI(bestinp->inp_last_outifp) && - mptcp_is_wifi_unusable() && - besttp->t_rxtshift >= mptcp_fail_thresh) + if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) && + mptcp_is_wifi_unusable(mpte) != 0 && mptcp_subflow_is_bad(mpte, best)) return (mptcp_return_subflow(second_best)); return (mptcp_return_subflow(best)); @@ -896,8 +896,8 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr int rto_thresh = mptcp_rtothresh; /* Adjust with symptoms information */ - if (IFNET_IS_WIFI(bestinp->inp_last_outifp) && - mptcp_is_wifi_unusable()) { + if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) && + mptcp_is_wifi_unusable(mpte) != 0) { rtt_thresh /= 2; rto_thresh /= 2; } @@ -914,7 +914,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr return (mptcp_return_subflow(second_best)); } - if (besttp->t_rxtshift >= mptcp_fail_thresh && + if (mptcp_subflow_is_bad(mpte, best) && secondtp->t_rxtshift == 0) { return (mptcp_return_subflow(second_best)); } @@ -1136,8 +1136,8 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp, return; } mptcplog((LOG_DEBUG, - "%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n", __func__, - seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt), + "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__, + seqn, mdss_data_len, (uint32_t)full_dsn, (uint32_t)mp_tp->mpt_rcvnxt), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE); mptcp_notify_mpready(tp->t_inpcb->inp_socket); @@ -1356,11 +1356,17 @@ mptcp_reset_itfinfo(struct mpt_itf_info *info) info->ifindex = 0; info->has_v4_conn = 0; info->has_v6_conn = 0; + info->has_nat64_conn = 0; } void -mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) +mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index, + uint32_t necp_flags, __unused bool *viable) { + boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4); + boolean_t has_v6 = !!(necp_flags & 
NECP_CLIENT_RESULT_FLAG_HAS_IPV6); + boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64); + boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER); struct mppcb *mp = (struct mppcb *)handle; struct mptses *mpte = mptompte(mp); struct socket *mp_so; @@ -1368,7 +1374,7 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) int locked = 0; uint32_t i, ifindex; - ifindex = flow->interface_index; + ifindex = interface_index; VERIFY(ifindex != IFSCOPE_NONE); /* About to be garbage-collected (see note about MPTCP/NECP interactions) */ @@ -1389,15 +1395,26 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) mp_tp = mpte->mpte_mptcb; mp_so = mptetoso(mpte); - os_log_debug(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u\n", - __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state); + os_log_info(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n", + __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state, + has_v4, has_v6, has_nat64, low_power); /* No need on fallen back sockets */ if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) goto out; + /* + * When the interface goes in low-power mode we don't want to establish + * new subflows on it. Thus, mark it internally as non-viable. 
+ */ + if (low_power) + action = NECP_CLIENT_CBACTION_NONVIABLE; + if (action == NECP_CLIENT_CBACTION_NONVIABLE) { for (i = 0; i < mpte->mpte_itfinfo_size; i++) { + if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) + continue; + if (mpte->mpte_itfinfo[i].ifindex == ifindex) mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]); } @@ -1406,8 +1423,6 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) } else if (action == NECP_CLIENT_CBACTION_VIABLE || action == NECP_CLIENT_CBACTION_INITIAL) { int found_slot = 0, slot_index = -1; - boolean_t has_v4 = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4); - boolean_t has_v6 = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6); struct ifnet *ifp; ifnet_head_lock_shared(); @@ -1425,6 +1440,9 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) goto out; + if (IS_INTF_CLAT46(ifp)) + has_v4 = FALSE; + /* Look for the slot on where to store/update the interface-info. 
*/ for (i = 0; i < mpte->mpte_itfinfo_size; i++) { /* Found a potential empty slot where we can put it */ @@ -1439,7 +1457,8 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) */ if (mpte->mpte_itfinfo[i].ifindex == ifindex && (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 || - mpte->mpte_itfinfo[i].has_v6_conn != has_v6)) { + mpte->mpte_itfinfo[i].has_v6_conn != has_v6 || + mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) { found_slot = 1; slot_index = i; break; @@ -1455,8 +1474,12 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) } if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) && - !(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4) && - ifnet_get_nat64prefix(ifp, NULL) == ENOENT) { + !has_nat64 && !has_v4) { + if (found_slot) { + mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4; + mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6; + mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64; + } mptcp_ask_for_nat64(ifp); goto out; } @@ -1466,8 +1489,8 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO); if (info == NULL) { - mptcplog((LOG_ERR, "%s malloc failed for %u\n", __func__, new_size), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s malloc failed for %u\n", + __func__, new_size); goto out; } @@ -1481,15 +1504,13 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow) mpte->mpte_itfinfo = info; mpte->mpte_itfinfo_size = new_size; - - mptcplog((LOG_DEBUG, "%s Needed to realloc to %u\n", __func__, new_size), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); } VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size); mpte->mpte_itfinfo[slot_index].ifindex = ifindex; mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4; mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6; + 
mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64; mptcp_sched_create_subflows(mpte); } @@ -1518,6 +1539,8 @@ mptcp_set_restrictions(struct socket *mp_so) continue; ifp = ifindex2ifnet[ifindex]; + if (ifp == NULL) + continue; if (IFNET_IS_EXPENSIVE(ifp) && (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) diff --git a/bsd/netinet/mptcp_opt.c b/bsd/netinet/mptcp_opt.c index 6da8235e8..13a205586 100644 --- a/bsd/netinet/mptcp_opt.c +++ b/bsd/netinet/mptcp_opt.c @@ -1080,39 +1080,31 @@ mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack) } void -mptcp_update_window_fallback(struct tcpcb *tp) +mptcp_update_window_wakeup(struct tcpcb *tp) { struct mptcb *mp_tp = tptomptp(tp); mpte_lock_assert_held(mp_tp->mpt_mpte); - if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) - return; - - mptcplog((LOG_DEBUG, "%s: update window to %u\n", __func__, tp->snd_wnd), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - - mp_tp->mpt_sndwnd = tp->snd_wnd; - mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt; - mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna; + if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { + mp_tp->mpt_sndwnd = tp->snd_wnd; + mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt; + mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna; + } sowwakeup(tp->t_inpcb->inp_socket); } static void -mptcp_update_window(struct mptcb *mp_tp, u_int64_t ack, u_int64_t seq, - u_int32_t tiwin) +mptcp_update_window(struct mptcb *mp_tp, u_int64_t ack, u_int64_t seq, u_int32_t tiwin) { - /* Don't look at the window if there is no ACK flag */ - if ((SEQ_LT(mp_tp->mpt_sndwl1, seq) || - (mp_tp->mpt_sndwl1 == seq && (SEQ_LT(mp_tp->mpt_sndwl2, ack) || - (mp_tp->mpt_sndwl2 == ack && tiwin > mp_tp->mpt_sndwnd))))) { + if (SEQ_LT(mp_tp->mpt_sndwl1, seq) || + (mp_tp->mpt_sndwl1 == seq && + (SEQ_LT(mp_tp->mpt_sndwl2, ack) || + (mp_tp->mpt_sndwl2 == ack && tiwin > mp_tp->mpt_sndwnd)))) { mp_tp->mpt_sndwnd = tiwin; mp_tp->mpt_sndwl1 = seq; mp_tp->mpt_sndwl2 = ack; - - mptcplog((LOG_DEBUG, "%s: Updating window to %u\n", 
__func__, - mp_tp->mpt_sndwnd), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE); } } @@ -1138,11 +1130,11 @@ mptcp_do_dss_opt_ack_meat(u_int64_t full_dack, u_int64_t full_dsn, if (close_notify) mptcp_notify_close(tp->t_inpcb->inp_socket); } else { - mptcplog((LOG_ERR,"%s: unexpected dack %u snduna %u sndmax %u\n", __func__, - (u_int32_t)full_dack, (u_int32_t)mp_tp->mpt_snduna, - (u_int32_t)mp_tp->mpt_sndmax), - (MPTCP_SOCKET_DBG|MPTCP_RECEIVER_DBG), - MPTCP_LOGLVL_LOG); + os_log_error(mptcp_log_handle, + "%s: unexpected dack %u snduna %u sndmax %u\n", + __func__, (u_int32_t)full_dack, + (u_int32_t)mp_tp->mpt_snduna, + (u_int32_t)mp_tp->mpt_sndmax); } mptcp_update_window(mp_tp, full_dack, full_dsn, tiwin); diff --git a/bsd/netinet/mptcp_opt.h b/bsd/netinet/mptcp_opt.h index 785e1a998..f00653f08 100644 --- a/bsd/netinet/mptcp_opt.h +++ b/bsd/netinet/mptcp_opt.h @@ -42,7 +42,7 @@ __BEGIN_DECLS extern void mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack); -extern void mptcp_update_window_fallback(struct tcpcb *tp); +extern void mptcp_update_window_wakeup(struct tcpcb *tp); extern void tcp_do_mptcp_options(struct tcpcb *, u_char *, struct tcphdr *, struct tcpopt *, int); extern unsigned mptcp_setup_syn_opts(struct socket *, u_char*, unsigned); diff --git a/bsd/netinet/mptcp_subr.c b/bsd/netinet/mptcp_subr.c index a2a656883..1606cdb62 100644 --- a/bsd/netinet/mptcp_subr.c +++ b/bsd/netinet/mptcp_subr.c @@ -652,6 +652,9 @@ mptcpstats_session_wrapup(struct mptses *mpte) if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) tcpstat.tcps_mptcp_back_to_wifi++; + + if (mpte->mpte_triggered_cell) + tcpstat.tcps_mptcp_triggered_cell++; } /* @@ -695,7 +698,7 @@ static boolean_t mptcp_ok_to_create_subflows(struct mptcb *mp_tp) { return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED && - mp_tp->mpt_state < MPTCPS_TIME_WAIT && + mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 && !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)); } @@ -711,12 +714,12 @@ 
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addr char *ptrv4 = (char *)addrv4; char *ptr = (char *)addr; - if (IN_ZERONET(addrv4->s_addr) || // 0.0.0.0/8 Source hosts on local network - IN_LOOPBACK(addrv4->s_addr) || // 127.0.0.0/8 Loopback - IN_LINKLOCAL(addrv4->s_addr) || // 169.254.0.0/16 Link Local - IN_DS_LITE(addrv4->s_addr) || // 192.0.0.0/29 DS-Lite - IN_6TO4_RELAY_ANYCAST(addrv4->s_addr) || // 192.88.99.0/24 6to4 Relay Anycast - IN_MULTICAST(addrv4->s_addr) || // 224.0.0.0/4 Multicast + if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network + IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback + IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local + IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite + IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast + IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast return (-1); } @@ -724,8 +727,8 @@ mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addr /* Check for the well-known prefix */ if (len == NAT64_PREFIX_LEN_96 && IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) { - if (IN_PRIVATE(addrv4->s_addr) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use - IN_SHARED_ADDRESS_SPACE(addrv4->s_addr)) // 100.64.0.0/10 Shared Address Space + if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use + IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) // 100.64.0.0/10 Shared Address Space return (-1); } @@ -762,10 +765,36 @@ mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addr return (0); } +static void +mptcp_trigger_cell_bringup(struct mptses *mpte) +{ + struct socket *mp_so = mptetoso(mpte); + + if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) { + uuid_string_t uuidstr; + int err; + + err = 
necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid, + TRUE); + + if (err == 0) + mpte->mpte_triggered_cell = 1; + + uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr); + os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n", + __func__, uuidstr, err); + } else { + os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__); + } +} + + void mptcp_check_subflows_and_add(struct mptses *mpte) { struct mptcb *mp_tp = mpte->mpte_mptcb; + boolean_t cellular_viable = FALSE; + boolean_t want_cellular = TRUE; uint32_t i; if (!mptcp_ok_to_create_subflows(mp_tp)) @@ -774,6 +803,7 @@ mptcp_check_subflows_and_add(struct mptses *mpte) for (i = 0; i < mpte->mpte_itfinfo_size; i++) { struct mpt_itf_info *info; struct mptsub *mpts; + struct ifnet *ifp; uint32_t ifindex; int found = 0; @@ -786,23 +816,22 @@ mptcp_check_subflows_and_add(struct mptses *mpte) if (ifindex == IFSCOPE_NONE) continue; + ifnet_head_lock_shared(); + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + + if (ifp == NULL) + continue; + + if (IFNET_IS_CELLULAR(ifp)) + cellular_viable = TRUE; + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { - const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; + const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp; - if (ifp == NULL) + if (subifp == NULL) continue; - if (ifp->if_index == ifindex && - !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) && - sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) { - /* - * We found a subflow on this interface. - * No need to create a new one. - */ - found = 1; - break; - } - /* * In Handover mode, only create cell subflow if * 1. Wi-Fi Assist is active @@ -821,15 +850,37 @@ mptcp_check_subflows_and_add(struct mptses *mpte) * good performance. 
*/ if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && - !IFNET_IS_CELLULAR(ifp) && + !IFNET_IS_CELLULAR(subifp) && !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) && - (!mptcp_is_wifi_unusable() || - (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh && - mptetoso(mpte)->so_snd.sb_cc))) { - mptcplog((LOG_DEBUG, "%s handover, wifi state %u rxt %u ifindex %u this %u\n", - __func__, mptcp_is_wifi_unusable(), sototcpcb(mpts->mpts_socket)->t_rxtshift, ifindex, - ifp->if_index), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); + (mptcp_is_wifi_unusable(mpte) == 0 || + (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 && + ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) { + os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n", + __func__, mptcp_is_wifi_unusable(mpte), + sototcpcb(mpts->mpts_socket)->t_rxtshift, + !!(mpte->mpte_flags & MPTE_FIRSTPARTY), + mptetoso(mpte)->so_snd.sb_cc, + ifindex, subifp->if_index); + found = 1; + + /* We found a proper subflow on WiFi - no need for cell */ + want_cellular = FALSE; + break; + } else { + os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n", + __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags, + mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift, + !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc); + + } + + if (subifp->if_index == ifindex && + !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) && + sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) { + /* + * We found a subflow on this interface. + * No need to create a new one. 
+ */ found = 1; break; } @@ -847,22 +898,16 @@ mptcp_check_subflows_and_add(struct mptses *mpte) struct sockaddr_in6 nat64pre; if (mpte->mpte_dst.sa_family == AF_INET && - !info->has_v4_conn && info->has_v6_conn) { + !info->has_v4_conn && info->has_nat64_conn) { struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES]; - struct ifnet *ifp; int error, j; bzero(&nat64pre, sizeof(struct sockaddr_in6)); - ifnet_head_lock_shared(); - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - error = ifnet_get_nat64prefix(ifp, nat64prefixes); if (error) { - mptcplog((LOG_ERR, "%s: no NAT64-prefix on itf %s, error %d\n", - __func__, ifp->if_name, error), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n", + __func__, ifp->if_name, error); continue; } @@ -877,8 +922,8 @@ mptcp_check_subflows_and_add(struct mptses *mpte) nat64prefixes[j].prefix_len, &mpte->__mpte_dst_v4.sin_addr); if (error != 0) { - mptcplog((LOG_INFO, "%s: cannot synthesize this addr\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG); + os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n", + __func__); continue; } @@ -908,6 +953,11 @@ mptcp_check_subflows_and_add(struct mptses *mpte) mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL); } } + + if (!cellular_viable && want_cellular) { + /* Trigger Cell Bringup */ + mptcp_trigger_cell_bringup(mpte); + } } /* @@ -919,7 +969,7 @@ mptcp_check_subflows_and_remove(struct mptses *mpte) { struct mptsub *mpts, *tmpts; int found_working_subflow = 0, removed_some = 0; - int wifi_unusable = mptcp_is_wifi_unusable(); + int wifi_unusable = mptcp_is_wifi_unusable(mpte); if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER) return; @@ -943,8 +993,8 @@ mptcp_check_subflows_and_remove(struct mptses *mpte) tp->t_state != TCPS_ESTABLISHED) continue; - /* Either this subflow is in good condition while we try to send */ - if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc) + /* Is this subflow in good 
condition? */ + if (tp->t_rxtshift == 0) found_working_subflow = 1; /* Or WiFi is fine */ @@ -1225,13 +1275,18 @@ mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so static void mptcp_subflow_necp_cb(void *handle, __unused int action, - __unused struct necp_client_flow *flow) + __unused uint32_t interface_index, + uint32_t necp_flags, bool *viable) { + boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER); struct inpcb *inp = (struct inpcb *)handle; struct socket *so = inp->inp_socket; struct mptsub *mpts; struct mptses *mpte; + if (low_power) + action = NECP_CLIENT_CBACTION_NONVIABLE; + if (action != NECP_CLIENT_CBACTION_NONVIABLE) return; @@ -1251,15 +1306,15 @@ mptcp_subflow_necp_cb(void *handle, __unused int action, mpte = tptomptp(sototcpcb(so))->mpt_mpte; mpts = sototcpcb(so)->t_mpsub; - mptcplog((LOG_DEBUG, "%s: Subflow became non-viable", __func__), - MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE); + os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u", + __func__, mpts->mpts_ifscope, low_power); mpts->mpts_flags |= MPTSF_CLOSE_REQD; mptcp_sched_create_subflows(mpte); - if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) - flow->viable = 1; + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL) + *viable = 1; out: socket_unlock(so, 1); @@ -1797,8 +1852,8 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa, * Check if the full mapping is now present */ if ((int)so->so_rcv.sb_cc < dlen - dfin) { - mptcplog((LOG_INFO, "%s not enough data (%u) need %u\n", - __func__, so->so_rcv.sb_cc, dlen), + mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n", + __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG); if (*mp0 == NULL) @@ -3751,9 +3806,9 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts, if (mpts->mpts_flags & MPTSF_MP_DEGRADED) goto done; mpts->mpts_flags |= MPTSF_MP_DEGRADED; - } - 
else + } else { mpts->mpts_flags &= ~MPTSF_MP_DEGRADED; + } if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) mpts->mpts_flags |= MPTSF_MP_READY; @@ -3768,6 +3823,9 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts, if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)); ret = MPTS_EVRET_DISCONNECT_FALLBACK; + + m_freem_list(mpte->mpte_reinjectq); + mpte->mpte_reinjectq = NULL; } else if (mpts->mpts_flags & MPTSF_MP_READY) { mp_tp->mpt_flags |= MPTCPF_JOIN_READY; ret = MPTS_EVRET_CONNECT_PENDING; @@ -3955,10 +4013,12 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt * if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED && mpo->mpo_level == SOL_SOCKET && mpo->mpo_name == SO_MARK_CELLFALLBACK) { - mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %u lastcell? %d boundcell? %d\n", - __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(), + struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope]; + + mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n", + __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte), sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1, - mpts->mpts_ifscope != IFSCOPE_NONE ? IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]) : -1), + mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); /* @@ -3980,8 +4040,8 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt * * interface, then it definitely is not a cell-fallback * connection. 
*/ - if (mpts->mpts_ifscope == IFSCOPE_NONE || - !IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope])) + if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL || + !IFNET_IS_CELLULAR(ifp)) return (0); } @@ -5667,13 +5727,12 @@ symptoms_advisory_t mptcp_advisory; static errno_t mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac, - void **unitinfo) + void **unitinfo) { #pragma unused(kctlref, sac, unitinfo) if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) - mptcplog((LOG_ERR, "%s MPTCP kernel-control socket already open!", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__); mptcp_kern_skt_unit = sac->sc_unit; @@ -5760,8 +5819,7 @@ mptcp_ask_symptoms(struct mptses *mpte) int pid, prio, err; if (mptcp_kern_skt_unit == 0) { - mptcplog((LOG_ERR, "%s skt_unit is still 0\n", __func__), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__); return; } @@ -5774,8 +5832,7 @@ mptcp_ask_symptoms(struct mptses *mpte) p = proc_find(pid); if (p == PROC_NULL) { - mptcplog((LOG_ERR, "%s Couldn't find proc for pid %u\n", __func__, - pid), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid); return; } @@ -5795,14 +5852,12 @@ mptcp_ask_symptoms(struct mptses *mpte) else ask.priority = MPTCP_SYMPTOMS_UNKNOWN; - mptcplog((LOG_DEBUG, "%s ask symptoms about pid %u, prio %u\n", __func__, - pid, ask.priority), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE); - err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit, &ask, sizeof(ask), CTL_DATA_EOR); - if (err) - mptcplog((LOG_ERR, "%s ctl_enqueuedata failed %d\n", __func__, err), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + + os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n", + __func__, pid, ask.priority, err); + proc_rele(p); } @@ -5826,19 +5881,20 @@ 
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, symptoms_advisory_t *sa = NULL; if (kcunit != mptcp_kern_skt_unit) - mptcplog((LOG_ERR, "%s kcunit %u is different from expected one %u\n", - __func__, kcunit, mptcp_kern_skt_unit), - MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR); + os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n", + __func__, kcunit, mptcp_kern_skt_unit); if (mbuf_pkthdr_len(m) < sizeof(*sa)) { mbuf_freem(m); return (EINVAL); } - if (mbuf_len(m) >= sizeof(*sa)) - sa = mbuf_data(m); - else + if (mbuf_len(m) < sizeof(*sa)) { + mbuf_freem(m); return (EINVAL); + } + + sa = mbuf_data(m); if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT && sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) { @@ -5870,6 +5926,7 @@ mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mptcp_allow_uuid(uuid); } + mbuf_freem(m); return (0); } @@ -5890,11 +5947,40 @@ mptcp_control_register(void) (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref); } +/* + * Three return-values: + * 1 : WiFi is bad + * 0 : WiFi is good + * -1 : WiFi-state is unknown, use subflow-only heuristics + */ int -mptcp_is_wifi_unusable(void) +mptcp_is_wifi_unusable(struct mptses *mpte) { - /* a false return val indicates there is no info or wifi is ok */ - return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD); + if (mpte->mpte_flags & MPTE_FIRSTPARTY) { + if (mptcp_advisory.sa_wifi_status) + return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0); + + /* + * If it's a first-party app and we don't have any info + * about the Wi-Fi state, let's be pessimistic. + */ + return (-1); + } + + return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 
1 : 0); +} + +boolean_t +mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts) +{ + struct tcpcb *tp = sototcpcb(mpts->mpts_socket); + int fail_thresh = mptcp_fail_thresh; + + if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) + fail_thresh *= 2; + + return (tp->t_rxtshift >= fail_thresh && + (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq)); } /* If TFO data is succesfully acked, it must be dropped from the mptcp so */ diff --git a/bsd/netinet/mptcp_var.h b/bsd/netinet/mptcp_var.h index d09642033..1a85f2e3e 100644 --- a/bsd/netinet/mptcp_var.h +++ b/bsd/netinet/mptcp_var.h @@ -47,6 +47,7 @@ struct mpt_itf_info { uint32_t ifindex; uint32_t has_v4_conn:1, has_v6_conn:1, + has_nat64_conn:1, no_mptcp_support:1; }; @@ -106,6 +107,7 @@ struct mptses { uint32_t mpte_used_cell:1, mpte_used_wifi:1, mpte_initial_cell:1, + mpte_triggered_cell, mpte_handshake_success:1; struct mptcp_itf_stats mpte_itfstats[MPTCP_ITFSTATS_SIZE]; @@ -652,9 +654,10 @@ extern u_int32_t mptcp_get_notsent_lowat(struct mptses *mpte); extern int mptcp_notsent_lowat_check(struct socket *so); extern void mptcp_ask_symptoms(struct mptses *mpte); extern void mptcp_control_register(void); -extern int mptcp_is_wifi_unusable(void); +extern int mptcp_is_wifi_unusable(struct mptses *mpte); +extern boolean_t mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts); extern void mptcp_ask_for_nat64(struct ifnet *ifp); -extern void mptcp_session_necp_cb(void *, int, struct necp_client_flow *); +extern void mptcp_session_necp_cb(void *, int, uint32_t, uint32_t, bool *); extern void mptcp_set_restrictions(struct socket *mp_so); extern int mptcp_freeq(struct mptcb *); extern void mptcp_set_cellicon(struct mptses *mpte); diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index 30c5e8e33..65f2d2a41 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -235,7 +235,7 @@ rip_input(struct mbuf *m, int iphlen) #if NECP if (n && !necp_socket_is_allowed_to_send_recv_v4(last, 0, 0, - 
&ip->ip_dst, &ip->ip_src, ifp, NULL, NULL)) { + &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL, NULL)) { m_freem(n); /* do not inject data to pcb */ skipit = 1; @@ -254,7 +254,8 @@ rip_input(struct mbuf *m, int iphlen) int error = 0; if ((last->inp_flags & INP_CONTROLOPTS) != 0 || (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || - (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { ret = ip_savecontrol(last, &opts, ip, n); if (ret != 0) { m_freem(n); @@ -288,7 +289,7 @@ rip_input(struct mbuf *m, int iphlen) skipit = 0; #if NECP if (last && !necp_socket_is_allowed_to_send_recv_v4(last, 0, 0, - &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL)) { + &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL, NULL)) { m_freem(m); OSAddAtomic(1, &ipstat.ips_delivered); /* do not inject data to pcb */ @@ -307,7 +308,8 @@ rip_input(struct mbuf *m, int iphlen) if (last) { if ((last->inp_flags & INP_CONTROLOPTS) != 0 || (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || - (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { ret = ip_savecontrol(last, &opts, ip, m); if (ret != 0) { m_freem(m); @@ -455,6 +457,7 @@ rip_output( #if NECP { necp_kernel_policy_id policy_id; + necp_kernel_policy_id skip_policy_id; u_int32_t route_rule_id; /* @@ -492,12 +495,12 @@ rip_output( } if (!necp_socket_is_allowed_to_send_recv_v4(inp, 0, 0, - &ip->ip_src, &ip->ip_dst, NULL, &policy_id, &route_rule_id)) { + &ip->ip_src, &ip->ip_dst, NULL, &policy_id, &route_rule_id, &skip_policy_id)) { m_freem(m); return(EHOSTUNREACH); } - necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id); + necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id); if (net_qos_policy_restricted != 0) { struct ifnet *rt_ifp = 
NULL; @@ -529,6 +532,12 @@ rip_output( m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_RAWSOCK); m->m_pkthdr.pkt_proto = inp->inp_ip_p; + m->m_pkthdr.tx_rawip_pid = so->last_pid; + m->m_pkthdr.tx_rawip_e_pid = so->e_pid; + if (so->so_flags & SOF_DELEGATED) + m->m_pkthdr.tx_rawip_e_pid = so->e_pid; + else + m->m_pkthdr.tx_rawip_e_pid = 0; #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index a51294468..b64798dd2 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -102,6 +102,7 @@ struct tcphdr { #define TH_ECE 0x40 #define TH_CWR 0x80 #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR) +#define TH_ACCEPT (TH_FIN|TH_SYN|TH_RST|TH_ACK) unsigned short th_win; /* window */ unsigned short th_sum; /* checksum */ diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 216814cff..c18f014e1 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -237,13 +237,6 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, autorcvbufmax, CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autorcvbuf_max, 512 * 1024, "Maximum receive socket buffer size"); -u_int32_t tcp_autorcvbuf_max_ca = 512 * 1024; -#if (DEBUG || DEVELOPMENT) -SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmaxca, - CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max_ca, 0, - "Maximum receive socket buffer size"); -#endif /* (DEBUG || DEVELOPMENT) */ - #if CONFIG_EMBEDDED int sw_lro = 1; #else @@ -290,6 +283,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_access_to_stats, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_disable_access_to_stats, 0, "Disable access to tcpstat"); +SYSCTL_SKMEM_TCP_INT(OID_AUTO, challengeack_limit, + CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_challengeack_limit, 10, + "Maximum number of challenge ACKs per connection per second"); + +SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961, + CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1, + "Enable/Disable full RFC 5961 compliance"); extern int 
tcp_TCPTV_MIN; extern int tcp_acc_iaj_high; @@ -551,6 +551,40 @@ void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj) } #endif /* TRAFFIC_MGT */ +/* + * Perform rate limit check per connection per second + * tp->t_challengeack_last is the last_time diff was greater than 1sec + * tp->t_challengeack_count is the number of ACKs sent (within 1sec) + * Return TRUE if we shouldn't send the ACK due to rate limitation + * Return FALSE if it is still ok to send challenge ACK + */ +static boolean_t +tcp_is_ack_ratelimited(struct tcpcb *tp) +{ + boolean_t ret = TRUE; + uint32_t now = tcp_now; + int32_t diff = 0; + + diff = timer_diff(now, 0, tp->t_challengeack_last, 0); + /* If it is first time or diff > 1000ms, + * update the challengeack_last and reset the + * current count of ACKs + */ + if (tp->t_challengeack_last == 0 || diff >= 1000) { + tp->t_challengeack_last = now; + tp->t_challengeack_count = 0; + ret = FALSE; + } else if (tp->t_challengeack_count < tcp_challengeack_limit) { + ret = FALSE; + } + + /* Careful about wrap-around */ + if (ret == FALSE && (tp->t_challengeack_count + 1 > 0)) + tp->t_challengeack_count++; + + return (ret); +} + /* Check if enough amount of data has been acknowledged since * bw measurement was started */ @@ -1815,7 +1849,7 @@ tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th, tp->max_sndwnd = tp->snd_wnd; if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW) - mptcp_update_window_fallback(tp); + mptcp_update_window_wakeup(tp); return (true); } return (false); @@ -2247,7 +2281,7 @@ tcp_input(struct mbuf *m, int off0) if (so->so_state & SS_ISCONNECTED) { // Connected TCP sockets have a fully-bound local and remote, // so the policy check doesn't need to override addresses - if (!necp_socket_is_allowed_to_send_recv(inp, NULL, NULL)) { + if (!necp_socket_is_allowed_to_send_recv(inp, NULL, NULL, NULL)) { IF_TCP_STATINC(ifp, badformat); goto drop; } @@ -2256,7 +2290,7 @@ tcp_input(struct mbuf *m, int off0) if (isipv6) { 
if (!necp_socket_is_allowed_to_send_recv_v6(inp, th->th_dport, th->th_sport, &ip6->ip6_dst, - &ip6->ip6_src, ifp, NULL, NULL)) { + &ip6->ip6_src, ifp, NULL, NULL, NULL)) { IF_TCP_STATINC(ifp, badformat); goto drop; } @@ -2265,7 +2299,7 @@ tcp_input(struct mbuf *m, int off0) { if (!necp_socket_is_allowed_to_send_recv_v4(inp, th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src, - ifp, NULL, NULL)) { + ifp, NULL, NULL, NULL)) { IF_TCP_STATINC(ifp, badformat); goto drop; } @@ -2282,6 +2316,10 @@ tcp_input(struct mbuf *m, int off0) if (tp->t_state == TCPS_CLOSED) goto drop; + /* If none of the FIN|SYN|RST|ACK flag is set, drop */ + if (tcp_do_rfc5961 && (thflags & TH_ACCEPT) == 0) + goto drop; + /* Unscale the window into a 32-bit value. */ if ((thflags & TH_SYN) == 0) tiwin = th->th_win << tp->snd_scale; @@ -2603,7 +2641,7 @@ tcp_input(struct mbuf *m, int off0) /* now drop the reference on the listener */ socket_unlock(oso, 1); - tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(ifp)); + tcp_set_max_rwinscale(tp, so, ifp); KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); } @@ -3212,6 +3250,7 @@ tcp_input(struct mbuf *m, int off0) * initialize CCsend and CCrecv. */ tp->snd_wnd = tiwin; /* initial send-window */ + tp->max_sndwnd = tp->snd_wnd; tp->t_flags |= TF_ACKNOW; tp->t_unacksegs = 0; DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, @@ -3317,6 +3356,7 @@ tcp_input(struct mbuf *m, int off0) if ((thflags & TH_SYN) == 0) goto drop; tp->snd_wnd = th->th_win; /* initial send window */ + tp->max_sndwnd = tp->snd_wnd; tp->irs = th->th_seq; tcp_rcvseqinit(tp); @@ -3524,11 +3564,20 @@ tcp_input(struct mbuf *m, int off0) /* Received a SYN while connection is already established. * This is a "half open connection and other anomalies" described * in RFC793 page 34, send an ACK so the remote reset the connection - * or recovers by adjusting its sequence numberering + * or recovers by adjusting its sequence numbering. 
Sending an ACK is + * in accordance with RFC 5961 Section 4.2 */ case TCPS_ESTABLISHED: - if (thflags & TH_SYN) - goto dropafterack; + if (thflags & TH_SYN) { + /* Drop the packet silently if we have reached the limit */ + if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) { + goto drop; + } else { + /* Send challenge ACK */ + tcpstat.tcps_synchallenge++; + goto dropafterack; + } + } break; } @@ -3566,6 +3615,11 @@ tcp_input(struct mbuf *m, int off0) * only accepting RSTs where the sequence number is equal to * last_ack_sent. In all other states (the states in which a * RST is more likely), the more permissive check is used. + * RFC 5961 Section 3.2: if the RST bit is set, sequence # is + * within the receive window and last_ack_sent == seq, + * then reset the connection. Otherwise if the seq doesn't + * match last_ack_sent, TCP must send challenge ACK. Perform + * rate limitation when sending the challenge ACK. * If we have multiple segments in flight, the intial reset * segment sequence numbers will be to the left of last_ack_sent, * but they will eventually catch up. 
@@ -3606,52 +3660,64 @@ tcp_input(struct mbuf *m, int off0) (tp->rcv_wnd == 0 && ((tp->last_ack_sent == th->th_seq) || ((tp->last_ack_sent -1) == th->th_seq)))) { - switch (tp->t_state) { + if (tcp_do_rfc5961 == 0 || tp->last_ack_sent == th->th_seq) { + switch (tp->t_state) { - case TCPS_SYN_RECEIVED: - IF_TCP_STATINC(ifp, rstinsynrcv); - so->so_error = ECONNREFUSED; - goto close; + case TCPS_SYN_RECEIVED: + IF_TCP_STATINC(ifp, rstinsynrcv); + so->so_error = ECONNREFUSED; + goto close; - case TCPS_ESTABLISHED: - if (tp->last_ack_sent != th->th_seq) { - tcpstat.tcps_badrst++; - goto drop; - } - if (TCP_ECN_ENABLED(tp) && - tp->snd_una == tp->iss + 1 && - SEQ_GT(tp->snd_max, tp->snd_una)) { + case TCPS_ESTABLISHED: + if (tcp_do_rfc5961 == 0 && tp->last_ack_sent != th->th_seq) { + tcpstat.tcps_badrst++; + goto drop; + } + if (TCP_ECN_ENABLED(tp) && + tp->snd_una == tp->iss + 1 && + SEQ_GT(tp->snd_max, tp->snd_una)) { + /* + * If the first data packet on an + * ECN connection, receives a RST + * increment the heuristic + */ + tcp_heuristic_ecn_droprst(tp); + } + case TCPS_FIN_WAIT_1: + case TCPS_CLOSE_WAIT: /* - * If the first data packet on an - * ECN connection, receives a RST - * increment the heuristic - */ - tcp_heuristic_ecn_droprst(tp); - } - case TCPS_FIN_WAIT_1: - case TCPS_CLOSE_WAIT: - /* - Drop through ... - */ - case TCPS_FIN_WAIT_2: - so->so_error = ECONNRESET; - close: - postevent(so, 0, EV_RESET); - soevent(so, - (SO_FILT_HINT_LOCKED | - SO_FILT_HINT_CONNRESET)); - - tcpstat.tcps_drops++; - tp = tcp_close(tp); - break; + Drop through ... 
+ */ + case TCPS_FIN_WAIT_2: + so->so_error = ECONNRESET; + close: + postevent(so, 0, EV_RESET); + soevent(so, + (SO_FILT_HINT_LOCKED | + SO_FILT_HINT_CONNRESET)); + + tcpstat.tcps_drops++; + tp = tcp_close(tp); + break; - case TCPS_CLOSING: - case TCPS_LAST_ACK: - tp = tcp_close(tp); - break; + case TCPS_CLOSING: + case TCPS_LAST_ACK: + tp = tcp_close(tp); + break; - case TCPS_TIME_WAIT: - break; + case TCPS_TIME_WAIT: + break; + } + } else if (tcp_do_rfc5961) { + tcpstat.tcps_badrst++; + /* Drop if we have reached the ACK limit */ + if (tcp_is_ack_ratelimited(tp)) { + goto drop; + } else { + /* Send challenge ACK */ + tcpstat.tcps_rstchallenge++; + goto dropafterack; + } } } goto drop; @@ -3728,9 +3794,16 @@ tcp_input(struct mbuf *m, int off0) goto dropwithreset; } + /* + * Check if there is old data at the beginning of the window + * i.e. the sequence number is before rcv_nxt + */ todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { + boolean_t is_syn_set = FALSE; + if (thflags & TH_SYN) { + is_syn_set = TRUE; thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) @@ -3741,6 +3814,8 @@ tcp_input(struct mbuf *m, int off0) } /* * Following if statement from Stevens, vol. 2, p. 960. + * The amount of duplicate data is greater than or equal + * to the size of the segment - entire segment is duplicate */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { @@ -3754,8 +3829,19 @@ tcp_input(struct mbuf *m, int off0) /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. + * + * If the SYN bit was originally set, then only send + * an ACK if we are not rate-limiting this connection. 
*/ - tp->t_flags |= TF_ACKNOW; + if (tcp_do_rfc5961 && is_syn_set) { + if (!tcp_is_ack_ratelimited(tp)) { + tcpstat.tcps_synchallenge++; + tp->t_flags |= TF_ACKNOW; + } + } else { + tp->t_flags |= TF_ACKNOW; + } + if (todrop == 1) { /* This could be a keepalive */ soevent(so, SO_FILT_HINT_LOCKED | @@ -3898,15 +3984,31 @@ tcp_input(struct mbuf *m, int off0) } /* - * If a SYN is in the window, then this is an + * Stevens: If a SYN is in the window, then this is an * error and we send an RST and drop the connection. + * + * RFC 5961 Section 4.2 + * Send challenge ACK for any SYN in synchronized state + * Perform rate limitation in doing so. */ if (thflags & TH_SYN) { - tp = tcp_drop(tp, ECONNRESET); - rstreason = BANDLIM_UNLIMITED; - postevent(so, 0, EV_RESET); - IF_TCP_STATINC(ifp, synwindow); - goto dropwithreset; + if (tcp_do_rfc5961) { + tcpstat.tcps_badsyn++; + /* Drop if we have reached ACK limit */ + if (tcp_is_ack_ratelimited(tp)) { + goto drop; + } else { + /* Send challenge ACK */ + tcpstat.tcps_synchallenge++; + goto dropafterack; + } + } else { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + postevent(so, 0, EV_RESET); + IF_TCP_STATINC(ifp, synwindow); + goto dropwithreset; + } } /* @@ -3969,6 +4071,7 @@ tcp_input(struct mbuf *m, int off0) tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = th->th_win << tp->snd_scale; + tp->max_sndwnd = tp->snd_wnd; tiwin = tp->snd_wnd; } /* @@ -4088,7 +4191,18 @@ tcp_input(struct mbuf *m, int off0) case TCPS_TIME_WAIT: if (SEQ_GT(th->th_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; - goto dropafterack; + if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) { + goto drop; + } else { + goto dropafterack; + } + } + if (tcp_do_rfc5961 && SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) { + if (tcp_is_ack_ratelimited(tp)) { + goto drop; + } else { + goto dropafterack; + } } if (SACK_ENABLED(tp) && to.to_nsacks > 0) { recvd_dsack = tcp_sack_process_dsack(tp, &to, 
th); @@ -5607,12 +5721,22 @@ static inline unsigned int tcp_maxmtu(struct rtentry *rt) { unsigned int maxmtu; + int interface_mtu = 0; RT_LOCK_ASSERT_HELD(rt); + interface_mtu = rt->rt_ifp->if_mtu; + + if (rt_key(rt)->sa_family == AF_INET && + INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) { + interface_mtu = IN6_LINKMTU(rt->rt_ifp); + /* Further adjust the size for CLAT46 expansion */ + interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD; + } + if (rt->rt_rmx.rmx_mtu == 0) - maxmtu = rt->rt_ifp->if_mtu; + maxmtu = interface_mtu; else - maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu); + maxmtu = MIN(rt->rt_rmx.rmx_mtu, interface_mtu); return (maxmtu); } @@ -6564,6 +6688,7 @@ tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen) return (0); } + SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 52884d357..75e8634c0 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -2287,13 +2287,13 @@ tcp_output(struct tcpcb *tp) * * Every time new data is sent PTO will get reset. 
*/ - if (tcp_enable_tlp && tp->t_state == TCPS_ESTABLISHED && - SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) - && tp->snd_nxt == tp->snd_max - && SEQ_GT(tp->snd_nxt, tp->snd_una) - && tp->t_rxtshift == 0 - && (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) { - u_int32_t pto, srtt, new_rto = 0; + if (tcp_enable_tlp && len != 0 && tp->t_state == TCPS_ESTABLISHED && + SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) && + tp->snd_nxt == tp->snd_max && + SEQ_GT(tp->snd_nxt, tp->snd_una) && + tp->t_rxtshift == 0 && + (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) { + u_int32_t pto, srtt; /* * Using SRTT alone to set PTO can cause spurious @@ -2311,21 +2311,9 @@ tcp_output(struct tcpcb *tp) pto = max(10, pto); /* if RTO is less than PTO, choose RTO instead */ - if (tp->t_rxtcur < pto) { - /* - * Schedule PTO instead of RTO in favor of - * fast recovery. - */ + if (tp->t_rxtcur < pto) pto = tp->t_rxtcur; - /* Reset the next RTO to be after PTO. */ - TCPT_RANGESET(new_rto, - (pto + TCP_REXMTVAL(tp)), - max(tp->t_rttmin, tp->t_rttcur + 2), - TCPTV_REXMTMAX, 0); - tp->t_timer[TCPT_REXMT] = - OFFSET_FROM_START(tp, new_rto); - } tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto); } } else { @@ -2412,13 +2400,14 @@ tcp_output(struct tcpcb *tp) #if NECP { necp_kernel_policy_id policy_id; + necp_kernel_policy_id skip_policy_id; u_int32_t route_rule_id; - if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id)) { + if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id, &skip_policy_id)) { m_freem(m); error = EHOSTUNREACH; goto out; } - necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id); + necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id); if (net_qos_policy_restricted != 0) { necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt, @@ -2445,6 +2434,11 @@ tcp_output(struct tcpcb *tp) m->m_pkthdr.pkt_flowid = inp->inp_flowhash; m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | 
PKTF_FLOW_ADV); m->m_pkthdr.pkt_proto = IPPROTO_TCP; + m->m_pkthdr.tx_tcp_pid = so->last_pid; + if (so->so_flags & SOF_DELEGATED) + m->m_pkthdr.tx_tcp_e_pid = so->e_pid; + else + m->m_pkthdr.tx_tcp_e_pid = 0; m->m_nextpkt = NULL; diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index 363dea99f..1c9b36da3 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -90,6 +90,7 @@ #include #include #include +#include #define tcp_minmssoverload fring #define _IP_VHL @@ -638,21 +639,11 @@ tcp_init(struct protosw *pp, struct domain *dp) * maximum allowed receive and send socket buffer size. */ if (nmbclusters > 30720) { - #if CONFIG_EMBEDDED - tcp_autorcvbuf_max = 2 * 1024 * 1024; - tcp_autosndbuf_max = 2 * 1024 * 1024; - #else - tcp_autorcvbuf_max = 1024 * 1024; - tcp_autosndbuf_max = 1024 * 1024; - #endif /* CONFIG_EMBEDDED */ + tcp_autorcvbuf_max = 2 * 1024 * 1024; + tcp_autosndbuf_max = 2 * 1024 * 1024; + SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max); SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max); - - /* - * Receive buffer max for cellular interfaces supporting - * Carrier Aggregation is higher - */ - tcp_autorcvbuf_max_ca = 2 * 1024 * 1024; } } @@ -925,7 +916,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #endif #if NECP - necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0); + necp_mark_packet_from_socket(m, tp ? 
tp->t_inpcb : NULL, 0, 0, 0); #endif /* NECP */ #if IPSEC @@ -950,6 +941,8 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash; m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV); m->m_pkthdr.pkt_proto = IPPROTO_TCP; + m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid; + m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid; } #if INET6 @@ -2138,17 +2131,29 @@ tcp_pcblist_n SYSCTL_HANDLER_ARGS SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, - tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); + +static int +tcp_progress_indicators SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + + return (ntstat_tcp_progress_indicators(req)); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0, + tcp_progress_indicators, "S", "Various items that indicate the current state of progress on the link"); __private_extern__ void tcp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags, bitstr_t *bitfield) { - inpcb_get_ports_used(ifindex, protocol, flags, bitfield, - &tcbinfo); - } + inpcb_get_ports_used(ifindex, protocol, flags, bitfield, + &tcbinfo); +} __private_extern__ uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags) @@ -2409,7 +2414,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp) } if (m == NULL || - (m->m_pkthdr.len < (int32_t) (off + offsetof(struct tcphdr, th_seq)))) + (m->m_pkthdr.len < (int32_t) (off + offsetof(struct tcphdr, th_ack)))) return; th = (struct tcphdr *)(void *)mtodo(m, off); @@ -2873,15 +2878,15 @@ tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope) if (inp->inp_last_outifp == NULL) { inp->inp_last_outifp 
= rt->rt_ifp; } - } - /* Note if the peer is local */ - if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) && - (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || - IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || - rt->rt_gateway->sa_family == AF_LINK || - in6_localaddr(&inp->in6p_faddr))) { - tp->t_flags |= TF_LOCAL; + /* Note if the peer is local */ + if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) && + (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || + IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || + rt->rt_gateway->sa_family == AF_LINK || + in6_localaddr(&inp->in6p_faddr))) { + tp->t_flags |= TF_LOCAL; + } } /* @@ -3311,15 +3316,25 @@ calculate_tcp_clock(void) * defined by the constant tcp_autorcvbuf_max. */ void -tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, - u_int32_t rcvbuf_max) +tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, struct ifnet *ifp) { - u_int32_t maxsockbufsize; + uint32_t maxsockbufsize; + uint32_t rcvbuf_max; + if (!tcp_do_rfc1323) { tp->request_r_scale = 0; return; } + /* + * When we start a connection and don't know about the interface, set + * the scaling factor simply to the max - we can always announce less. + */ + if (!ifp || (IFNET_IS_CELLULAR(ifp) && (ifp->if_eflags & IFEF_3CA))) + rcvbuf_max = (tcp_autorcvbuf_max << 1); + else + rcvbuf_max = tcp_autorcvbuf_max; + tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale); maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ? 
so->so_rcv.sb_hiwat : rcvbuf_max; @@ -3332,14 +3347,20 @@ tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, } int -tcp_notsent_lowat_check(struct socket *so) { +tcp_notsent_lowat_check(struct socket *so) +{ struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = NULL; int notsent = 0; + if (inp != NULL) { tp = intotcpcb(inp); } + if (tp == NULL) { + return (0); + } + notsent = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una); diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index 417b61b89..189603549 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -375,6 +375,7 @@ struct tcp_last_report_stats { u_int32_t tcps_mptcp_back_to_wifi; u_int32_t tcps_mptcp_wifi_proxy; u_int32_t tcps_mptcp_cell_proxy; + u_int32_t tcps_mptcp_triggered_cell; }; @@ -992,11 +993,9 @@ tcp_timers(struct tcpcb *tp, int timer) #if MPTCP if ((tp->t_rxtshift >= mptcp_fail_thresh) && (tp->t_state == TCPS_ESTABLISHED) && - (tp->t_mpflags & TMPF_MPTCP_TRUE)) { + (tp->t_mpflags & TMPF_MPTCP_TRUE)) mptcp_act_on_txfail(so); - } - if (so->so_flags & SOF_MP_SUBFLOW) { struct mptses *mpte = tptomptp(tp)->mpt_mpte; @@ -1126,7 +1125,7 @@ tcp_timers(struct tcpcb *tp, int timer) if (tp->t_maxopd > tcp_pmtud_black_hole_mss) { tp->t_maxopd = tcp_pmtud_black_hole_mss; } else { - tp->t_maxopd = /* use the default MSS */ + tp->t_maxopd = /* use the default MSS */ #if INET6 isipv6 ? 
tcp_v6mssdflt : #endif /* INET6 */ @@ -1135,9 +1134,9 @@ tcp_timers(struct tcpcb *tp, int timer) tp->t_maxseg = tp->t_maxopd - optlen; /* - * Reset the slow-start flight size + * Reset the slow-start flight size * as it may depend on the new MSS - */ + */ if (CC_ALGO(tp)->cwnd_init != NULL) CC_ALGO(tp)->cwnd_init(tp); tp->snd_cwnd = tp->t_maxseg; @@ -1300,7 +1299,7 @@ tcp_timers(struct tcpcb *tp, int timer) (tp->t_flagsext & TF_DETECT_READSTALL) || (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) && (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) { - if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp)) + if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response @@ -1489,9 +1488,10 @@ tcp_timers(struct tcpcb *tp, int timer) * send a probe */ if (tp->t_state != TCPS_ESTABLISHED || - (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING)) - || tp->snd_max == tp->snd_una || - !SACK_ENABLED(tp) || !TAILQ_EMPTY(&tp->snd_holes) || + (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING)) || + tp->snd_max == tp->snd_una || + !SACK_ENABLED(tp) || + !TAILQ_EMPTY(&tp->snd_holes) || IN_FASTRECOVERY(tp)) break; @@ -1522,6 +1522,15 @@ tcp_timers(struct tcpcb *tp, int timer) tp->t_tlpstart = tcp_now; tp->snd_cwnd += tp->t_maxseg; + + /* + * When tail-loss-probe fires, we reset the RTO timer, because + * a probe just got sent, so we are good to push out the timer. 
+ * + * Set to 0 to ensure that tcp_output() will reschedule it + */ + tp->t_timer[TCPT_REXMT] = 0; + (void )tcp_output(tp); tp->snd_cwnd -= tp->t_maxseg; @@ -2388,7 +2397,8 @@ tcp_report_stats(void) &prev.tcps_mptcp_wifi_proxy , &stat.mptcp_wifi_proxy); tcp_cumulative_stat(tcpstat.tcps_mptcp_cell_proxy, &prev.tcps_mptcp_cell_proxy , &stat.mptcp_cell_proxy); - + tcp_cumulative_stat(tcpstat.tcps_mptcp_triggered_cell, + &prev.tcps_mptcp_triggered_cell, &stat.mptcp_triggered_cell); nstat_sysinfo_send_data(&data); diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index 1af338ade..bea1e4c0d 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -414,12 +414,13 @@ tcp_connect_complete(struct socket *so) /* TFO delays the tcp_output until later, when the app calls write() */ if (so->so_flags1 & SOF1_PRECONNECT_DATA) { - if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL)) + if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL, NULL)) return (EHOSTUNREACH); /* Initialize enough state so that we can actually send data */ tcp_mss(tp, -1, IFSCOPE_NONE); tp->snd_wnd = tp->t_maxseg; + tp->max_sndwnd = tp->snd_wnd; } else { error = tcp_output(tp); } @@ -1068,6 +1069,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; + tp->max_sndwnd = tp->snd_wnd; tcp_mss(tp, -1, IFSCOPE_NONE); } @@ -1119,6 +1121,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; + tp->max_sndwnd = tp->snd_wnd; tcp_mss(tp, -1, IFSCOPE_NONE); } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; @@ -1380,7 +1383,7 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p) if (inp->inp_flowhash == 0) inp->inp_flowhash = inp_calc_flowhash(inp); - tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(outif)); + tcp_set_max_rwinscale(tp, so, outif); soisconnecting(so); tcpstat.tcps_connattempt++; @@ -1474,7 +1477,7 @@ 
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p) (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK); } - tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(outif)); + tcp_set_max_rwinscale(tp, so, outif); soisconnecting(so); tcpstat.tcps_connattempt++; diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 4fde35c90..49e3f9731 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -620,6 +620,8 @@ struct tcpcb { SLIST_HEAD(,tcp_notify_ack_marker) t_notify_ack; /* state for notifying data acknowledgements */ u_int32_t t_recv_throttle_ts; /* TS for start of recv throttle */ u_int32_t t_rxt_minimum_timeout; /* minimum retransmit timeout in ms */ + uint32_t t_challengeack_last; /* last time challenge ACK was sent per sec */ + uint32_t t_challengeack_count; /* # of challenge ACKs already sent per sec */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) @@ -718,9 +720,8 @@ extern int tcprexmtthresh; mptcp_reset_rexmit_state((_tp_)); \ } while(0); -#define TCP_AUTORCVBUF_MAX(_ifp_) (((_ifp_) != NULL && \ - ((_ifp_)->if_eflags & IFEF_3CA)) ? tcp_autorcvbuf_max_ca : \ - tcp_autorcvbuf_max) +#define TCP_AUTORCVBUF_MAX(_ifp_) (((_ifp_) != NULL && (IFNET_IS_CELLULAR((_ifp_))) && ((_ifp_)->if_eflags & IFEF_3CA)) ? \ + (tcp_autorcvbuf_max << 1) : tcp_autorcvbuf_max) enum tcp_cc_event { TCP_CC_CWND_INIT, /* 0 */ @@ -1003,9 +1004,12 @@ struct tcpstat { u_int32_t tcps_badsyn; /* bogus SYN, e.g. 
premature ACK */ u_int32_t tcps_mturesent; /* resends due to MTU discovery */ u_int32_t tcps_listendrop; /* listen queue overflows */ + u_int32_t tcps_synchallenge; /* challenge ACK due to bad SYN */ + u_int32_t tcps_rstchallenge; /* challenge ACK due to bad RST */ /* new stats from FreeBSD 5.4 sync up */ u_int32_t tcps_minmssdrops; /* average minmss too low drops */ + u_int32_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ u_int32_t tcps_badrst; /* ignored RSTs in the window */ @@ -1202,6 +1206,7 @@ struct tcpstat { u_int32_t tcps_mptcp_back_to_wifi; /* Total number of connections that succeed to move traffic away from cell (when starting on cell) */ u_int32_t tcps_mptcp_wifi_proxy; /* Total number of new subflows that fell back to regular TCP on cell */ u_int32_t tcps_mptcp_cell_proxy; /* Total number of new subflows that fell back to regular TCP on WiFi */ + u_int32_t tcps_mptcp_triggered_cell; /* Total number of times an MPTCP-connection triggered cell bringup */ }; @@ -1422,7 +1427,37 @@ struct xtcpcb_n { #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 4 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 4 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ - + + +/* + * TCP structure with information that gives insight into forward progress on an interface, + * exported to user-land via sysctl(3). 
+ */ +struct xtcpprogress_indicators { + u_int32_t xp_numflows; /* Total number of flows */ + u_int32_t xp_conn_probe_fails; /* Count of connection failures */ + u_int32_t xp_read_probe_fails; /* Count of read probe failures */ + u_int32_t xp_write_probe_fails; /* Count of write failures */ + u_int32_t xp_recentflows; /* Total of "recent" flows */ + u_int32_t xp_recentflows_unacked; /* Total of "recent" flows with unacknowledged data */ + u_int64_t xp_recentflows_rxbytes; /* Total of "recent" flows received bytes */ + u_int64_t xp_recentflows_txbytes; /* Total of "recent" flows transmitted bytes */ + u_int64_t xp_recentflows_rxooo; /* Total of "recent" flows received out of order bytes */ + u_int64_t xp_recentflows_rxdup; /* Total of "recent" flows received duplicate bytes */ + u_int64_t xp_recentflows_retx; /* Total of "recent" flows retransmitted bytes */ + u_int64_t xp_reserved1; /* Expansion */ + u_int64_t xp_reserved2; /* Expansion */ + u_int64_t xp_reserved3; /* Expansion */ + u_int64_t xp_reserved4; /* Expansion */ +}; + +struct tcpprogressreq { + u_int64_t ifindex; /* Interface index for progress indicators */ + u_int64_t recentflow_maxduration; /* In mach_absolute_time, max duration for flow to be counted as "recent" */ + u_int64_t xp_reserved1; /* Expansion */ + u_int64_t xp_reserved2; /* Expansion */ +}; + #endif /* PRIVATE */ #pragma pack() @@ -1505,7 +1540,6 @@ extern int tcp_ecn_outbound; extern int tcp_ecn_inbound; extern u_int32_t tcp_do_autorcvbuf; extern u_int32_t tcp_autorcvbuf_max; -extern u_int32_t tcp_autorcvbuf_max_ca; extern u_int32_t tcp_autorcvbuf_inc_shift; extern int tcp_recv_bg; @@ -1575,8 +1609,7 @@ void tcp_reset_stretch_ack(struct tcpcb *tp); extern void tcp_get_ports_used(u_int32_t, int, u_int32_t, bitstr_t *); uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags); uint32_t tcp_find_anypcb_byaddr(struct ifaddr *ifa); -void tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, - u_int32_t maxrcvbuf); +void 
tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, struct ifnet *ifp); struct bwmeas* tcp_bwmeas_alloc(struct tcpcb *tp); void tcp_bwmeas_free(struct tcpcb *tp); extern int32_t timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2); diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index a04bfcca4..f11f5a4a5 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -119,6 +119,10 @@ extern int esp_udp_encap_port; #include #endif /* FLOW_DIVERT */ +#if CONTENT_FILTER +#include +#endif /* CONTENT_FILTER */ + #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETUDP, 0) #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETUDP, 2) #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETUDP, 1) @@ -258,7 +262,11 @@ udp_init(struct protosw *pp, struct domain *dp) if (udp_initialized) return; udp_initialized = 1; - + uint32_t pool_size = (nmbclusters << MCLSHIFT) >> MBSHIFT; + if (pool_size >= 96) { + /* Improves 10GbE UDP performance. */ + udp_recvspace = 786896; + } LIST_INIT(&udb); udbinfo.ipi_listhead = &udb; udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, @@ -516,7 +524,7 @@ udp_input(struct mbuf *m, int iphlen) skipit = 0; if (!necp_socket_is_allowed_to_send_recv_v4(inp, uh->uh_dport, uh->uh_sport, &ip->ip_dst, - &ip->ip_src, ifp, NULL, NULL)) { + &ip->ip_src, ifp, NULL, NULL, NULL)) { /* do not inject data to pcb */ skipit = 1; } @@ -691,7 +699,7 @@ udp_input(struct mbuf *m, int iphlen) } #if NECP if (!necp_socket_is_allowed_to_send_recv_v4(inp, uh->uh_dport, - uh->uh_sport, &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL)) { + uh->uh_sport, &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL, NULL)) { udp_unlock(inp->inp_socket, 1, 0); IF_UDP_STATINC(ifp, badipsec); goto bad; @@ -706,7 +714,8 @@ udp_input(struct mbuf *m, int iphlen) udp_in.sin_addr = ip->ip_src; if ((inp->inp_flags & INP_CONTROLOPTS) != 0 || (inp->inp_socket->so_options & SO_TIMESTAMP) != 0 || - (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (inp->inp_socket->so_options 
& SO_TIMESTAMP_MONOTONIC) != 0 || + (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { #if INET6 if (inp->inp_vflag & INP_IPV6) { int savedflags; @@ -811,7 +820,8 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, #endif /* CONFIG_MACF_NET */ if ((last->inp_flags & INP_CONTROLOPTS) != 0 || (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || - (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { #if INET6 if (last->inp_vflag & INP_IPV6) { int savedflags; @@ -1309,9 +1319,9 @@ __private_extern__ void udp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags, bitstr_t *bitfield) { - inpcb_get_ports_used(ifindex, protocol, flags, bitfield, - &udbinfo); - } + inpcb_get_ports_used(ifindex, protocol, flags, bitfield, + &udbinfo); +} __private_extern__ uint32_t udp_count_opportunistic(unsigned int ifindex, u_int32_t flags) @@ -1415,6 +1425,13 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct ip_moptions *mopts; struct route ro; struct ip_out_args ipoa; +#if CONTENT_FILTER + struct m_tag *cfil_tag = NULL; + bool cfil_faddr_use = false; + uint32_t cfil_so_state_change_cnt = 0; + short cfil_so_options = 0; + struct sockaddr *cfil_faddr = NULL; +#endif bzero(&ipoa, sizeof(ipoa)); ipoa.ipoa_boundif = IFSCOPE_NONE; @@ -1434,6 +1451,35 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); socket_lock_assert_owned(so); + +#if CONTENT_FILTER + /* + * If socket is subject to UDP Content Filter and no addr is passed in, + * retrieve CFIL saved state from mbuf and use it if necessary. 
+ */ + if (so->so_cfil_db && !addr) { + cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr); + if (cfil_tag) { + sin = (struct sockaddr_in *)(void *)cfil_faddr; + if (inp && inp->inp_faddr.s_addr == INADDR_ANY) { + /* + * Socket is unconnected, simply use the saved faddr as 'addr' to go through + * the connect/disconnect logic. + */ + addr = (struct sockaddr *)cfil_faddr; + } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) && + (inp->inp_fport != sin->sin_port || + inp->inp_faddr.s_addr != sin->sin_addr.s_addr)) { + /* + * Socket is connected but socket state and dest addr/port changed. + * We need to use the saved faddr info. + */ + cfil_faddr_use = true; + } + } + } +#endif + if (control != NULL) { sotc = so_tc_from_control(control, &netsvctype); VERIFY(outif == NULL); @@ -1496,8 +1542,15 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * If there was a routing change, discard cached route and check * that we have a valid source address. Reacquire a new source * address if INADDR_ANY was specified. + * + * If we are using cfil saved state, go through this cache cleanup + * so that we can get a new route. 
*/ - if (ROUTE_UNUSABLE(&inp->inp_route)) { + if (ROUTE_UNUSABLE(&inp->inp_route) +#if CONTENT_FILTER + || cfil_faddr_use +#endif + ) { struct in_ifaddr *ia = NULL; ROUTE_RELEASE(&inp->inp_route); @@ -1551,6 +1604,14 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, lport = inp->inp_lport; fport = inp->inp_fport; +#if CONTENT_FILTER + if (cfil_faddr_use) + { + faddr = ((struct sockaddr_in *)(void *)cfil_faddr)->sin_addr; + fport = ((struct sockaddr_in *)(void *)cfil_faddr)->sin_port; + } +#endif + if (addr) { sin = (struct sockaddr_in *)(void *)addr; if (faddr.s_addr != INADDR_ANY) { @@ -1659,9 +1720,26 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, ui->ui_ulen = htons((u_short)len + sizeof (struct udphdr)); /* - * Set up checksum and output datagram. + * Set up checksum to pseudo header checksum and output datagram. + * + * Treat flows to be CLAT46'd as IPv6 flow and compute checksum + * no matter what, as IPv6 mandates checksum for UDP. + * + * Here we only compute the one's complement sum of the pseudo header. + * The payload computation and final complement is delayed to much later + * in IP processing to decide if remaining computation needs to be done + * through offload. + * + * That is communicated by setting CSUM_UDP in csum_flags. + * The offset of checksum from the start of ULP header is communicated + * through csum_data. + * + * Note since this already contains the pseudo checksum header, any + * later operation at IP layer that modify the values used here must + * update the checksum as well (for example NAT etc). 
*/ - if (udpcksum && !(inp->inp_flags & INP_UDP_NOCKSUM)) { + if ((inp->inp_flags2 & INP2_CLAT46_FLOW) || + (udpcksum && !(inp->inp_flags & INP_UDP_NOCKSUM))) { ui->ui_sum = in_pseudo(ui->ui_src.s_addr, ui->ui_dst.s_addr, htons((u_short)len + sizeof (struct udphdr) + IPPROTO_UDP)); m->m_pkthdr.csum_flags = (CSUM_UDP|CSUM_ZERO_INVERT); @@ -1680,6 +1758,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, #if NECP { necp_kernel_policy_id policy_id; + necp_kernel_policy_id skip_policy_id; u_int32_t route_rule_id; /* @@ -1715,12 +1794,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } if (!necp_socket_is_allowed_to_send_recv_v4(inp, lport, fport, - &laddr, &faddr, NULL, &policy_id, &route_rule_id)) { + &laddr, &faddr, NULL, &policy_id, &route_rule_id, &skip_policy_id)) { error = EHOSTUNREACH; goto abort; } - necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id); + necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id); if (net_qos_policy_restricted != 0) { necp_socket_update_qos_marking(inp, @@ -1739,7 +1818,13 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, #endif /* IPSEC */ inpopts = inp->inp_options; - soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); +#if CONTENT_FILTER + if (cfil_tag && (inp->inp_socket->so_options != cfil_so_options)) + soopts |= (cfil_so_options & (SO_DONTROUTE | SO_BROADCAST)); + else +#endif + soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); + mopts = inp->inp_moptions; if (mopts != NULL) { IMO_LOCK(mopts); @@ -1763,6 +1848,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC); if (flowadv) m->m_pkthdr.pkt_flags |= PKTF_FLOW_ADV; + m->m_pkthdr.tx_udp_pid = so->last_pid; + if (so->so_flags & SOF_DELEGATED) + m->m_pkthdr.tx_udp_e_pid = so->e_pid; + else + m->m_pkthdr.tx_udp_e_pid = 0; if (ipoa.ipoa_boundif != 
IFSCOPE_NONE) ipoa.ipoa_flags |= IPOAF_BOUND_IF; @@ -1826,6 +1916,15 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) rt = NULL; /* unusable */ + +#if CONTENT_FILTER + /* + * Discard temporary route for cfil case + */ + if (cfil_faddr_use) + rt = NULL; /* unusable */ +#endif + /* * Always discard if it is a multicast or broadcast route. */ @@ -1868,6 +1967,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (outif != NULL) ifnet_release(outif); +#if CONTENT_FILTER + if (cfil_tag) + m_tag_free(cfil_tag); +#endif + return (error); } diff --git a/bsd/netinet6/esp_chachapoly.c b/bsd/netinet6/esp_chachapoly.c index 0970f6983..2c68e6f07 100644 --- a/bsd/netinet6/esp_chachapoly.c +++ b/bsd/netinet6/esp_chachapoly.c @@ -79,11 +79,12 @@ typedef struct _esp_chachapoly_ctx { } \ } while (0) -#define ESP_CHECK_ARG(_arg) ESP_ASSERT(_arg != NULL, #_arg "is NULL") +#define ESP_CHECK_ARG(_arg) ESP_ASSERT(_arg != NULL, #_arg " is NULL") #define _esp_log(_level, _format, ...) \ log(_level, "%s:%d " _format, __FUNCTION__, __LINE__, ##__VA_ARGS__) #define esp_log_err(_format, ...) _esp_log(LOG_ERR, _format, ##__VA_ARGS__) +#define esp_log_default(_format, ...) _esp_log(LOG_NOTICE, _format, ##__VA_ARGS__) #define _esp_packet_log(_level, _format, ...) 
\ ipseclog((_level, "%s:%d " _format, __FUNCTION__, __LINE__, ##__VA_ARGS__)) @@ -97,38 +98,47 @@ esp_chachapoly_mature(struct secasvar *sav) ESP_CHECK_ARG(sav); if ((sav->flags & SADB_X_EXT_OLD) != 0) { - esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_OLD"); + esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_OLD, SPI 0x%08x", + ntohl(sav->spi)); return 1; } if ((sav->flags & SADB_X_EXT_DERIV) != 0) { - esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_DERIV"); + esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_DERIV, SPI 0x%08x", + ntohl(sav->spi)); return 1; } if (sav->alg_enc != SADB_X_EALG_CHACHA20POLY1305) { - esp_log_err("ChaChaPoly unsupported algorithm %d", - sav->alg_enc); + esp_log_err("ChaChaPoly unsupported algorithm %d, SPI 0x%08x", + sav->alg_enc, ntohl(sav->spi)); return 1; } if (sav->key_enc == NULL) { - esp_log_err("ChaChaPoly key is missing"); + esp_log_err("ChaChaPoly key is missing, SPI 0x%08x", + ntohl(sav->spi)); return 1; } algo = esp_algorithm_lookup(sav->alg_enc); if (algo == NULL) { - esp_log_err("ChaChaPoly lookup failed for algorithm %d", - sav->alg_enc); + esp_log_err("ChaChaPoly lookup failed for algorithm %d, SPI 0x%08x", + sav->alg_enc, ntohl(sav->spi)); return 1; } if (sav->key_enc->sadb_key_bits != ESP_CHACHAPOLY_KEYBITS_WITH_SALT) { - esp_log_err("ChaChaPoly invalid key length %d bits", - sav->key_enc->sadb_key_bits); + esp_log_err("ChaChaPoly invalid key length %d bits, SPI 0x%08x", + sav->key_enc->sadb_key_bits, ntohl(sav->spi)); return 1; } + esp_log_default("ChaChaPoly Mature SPI 0x%08x%s %s dir %u state %u mode %u", + ntohl(sav->spi), + (((sav->flags & SADB_X_EXT_IIV) != 0) ? " IIV" : ""), + ((sav->sah->ipsec_if != NULL) ? 
if_name(sav->sah->ipsec_if) : "NONE"), + sav->sah->dir, sav->sah->state, sav->sah->saidx.mode); + return 0; } @@ -146,22 +156,27 @@ esp_chachapoly_schedule(__unused const struct esp_algorithm *algo, int rc = 0; ESP_CHECK_ARG(sav); - if (sav->ivlen != ESP_CHACHAPOLY_IV_LEN) { - esp_log_err("Invalid ivlen %u", sav->ivlen); - return EINVAL; - } if (_KEYLEN(sav->key_enc) != ESP_CHACHAPOLY_KEY_LEN + ESP_CHACHAPOLY_SALT_LEN) { - esp_log_err("Invalid key len %u", _KEYLEN(sav->key_enc)); + esp_log_err("ChaChaPoly Invalid key len %u, SPI 0x%08x", + _KEYLEN(sav->key_enc), ntohl(sav->spi)); return EINVAL; } LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED); esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched; + esp_ccp_ctx->ccp_implicit_iv = ((sav->flags & SADB_X_EXT_IIV) != 0); + + if (sav->ivlen != (esp_ccp_ctx->ccp_implicit_iv ? 0 : ESP_CHACHAPOLY_IV_LEN)) { + esp_log_err("ChaChaPoly Invalid ivlen %u, SPI 0x%08x", + sav->ivlen, ntohl(sav->spi)); + return EINVAL; + } rc = chacha20poly1305_init(&esp_ccp_ctx->ccp_ctx, (const uint8_t *)_KEYBUF(sav->key_enc)); if (rc != 0) { - esp_log_err("chacha20poly1305_init returned %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_init failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } @@ -169,11 +184,30 @@ esp_chachapoly_schedule(__unused const struct esp_algorithm *algo, (const uint8_t *)_KEYBUF(sav->key_enc) + ESP_CHACHAPOLY_KEY_LEN, sizeof(esp_ccp_ctx->ccp_salt)); - esp_ccp_ctx->ccp_implicit_iv = ((sav->flags & SADB_X_EXT_IIV) != 0); + + esp_log_default("ChaChaPoly Schedule SPI 0x%08x%s %s dir %u state %u mode %u", + ntohl(sav->spi), (esp_ccp_ctx->ccp_implicit_iv ? " IIV" : ""), + ((sav->sah->ipsec_if != NULL) ? 
if_name(sav->sah->ipsec_if) : "NONE"), + sav->sah->dir, sav->sah->state, sav->sah->saidx.mode); return 0; } +int +esp_chachapoly_ivlen(const struct esp_algorithm *algo, + struct secasvar *sav) +{ + ESP_CHECK_ARG(algo); + + if (sav != NULL && + ((sav->sched != NULL && ((esp_chachapoly_ctx_t)sav->sched)->ccp_implicit_iv) || + ((sav->flags & SADB_X_EXT_IIV) != 0))) { + return 0; + } else { + return algo->ivlenval; + } +} + int esp_chachapoly_encrypt_finalize(struct secasvar *sav, unsigned char *tag, @@ -185,14 +219,16 @@ esp_chachapoly_encrypt_finalize(struct secasvar *sav, ESP_CHECK_ARG(sav); ESP_CHECK_ARG(tag); if (tag_bytes != ESP_CHACHAPOLY_ICV_LEN) { - esp_log_err("Invalid tag_bytes %u", tag_bytes); + esp_log_err("ChaChaPoly Invalid tag_bytes %u, SPI 0x%08x", + tag_bytes, ntohl(sav->spi)); return EINVAL; } esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched; rc = chacha20poly1305_finalize(&esp_ccp_ctx->ccp_ctx, tag); if (rc != 0) { - esp_log_err("chacha20poly1305_finalize returned %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_finalize failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } return 0; @@ -209,14 +245,16 @@ esp_chachapoly_decrypt_finalize(struct secasvar *sav, ESP_CHECK_ARG(sav); ESP_CHECK_ARG(tag); if (tag_bytes != ESP_CHACHAPOLY_ICV_LEN) { - esp_log_err("Invalid tag_bytes %u", tag_bytes); + esp_log_err("ChaChaPoly Invalid tag_bytes %u, SPI 0x%08x", + tag_bytes, ntohl(sav->spi)); return EINVAL; } esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched; rc = chacha20poly1305_verify(&esp_ccp_ctx->ccp_ctx, tag); if (rc != 0) { - esp_log_err("chacha20poly1305_finalize returned %d", rc); + esp_packet_log_err("ChaChaPoly chacha20poly1305_verify failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } return 0; @@ -236,35 +274,36 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain uint8_t *sp; // buffer of a given encryption round size_t len; // length of a given encryption round const int32_t ivoff = (int32_t)off + (int32_t)sizeof(struct 
newesp); // IV offset - int32_t bodyoff; // body offset + const int32_t bodyoff = ivoff + ivlen; // body offset int rc = 0; // return code of corecrypto operations struct newesp esp_hdr; // ESP header for AAD _Static_assert(sizeof(esp_hdr) == 8, "Bad size"); - uint8_t nonce[ESP_CHACHAPOLY_NONCE_LEN]; + uint32_t nonce[ESP_CHACHAPOLY_NONCE_LEN / 4]; // ensure 32bit alignment + _Static_assert(sizeof(nonce) == ESP_CHACHAPOLY_NONCE_LEN, "Bad nonce length"); esp_chachapoly_ctx_t esp_ccp_ctx; ESP_CHECK_ARG(m); ESP_CHECK_ARG(sav); - if (ivlen != ESP_CHACHAPOLY_IV_LEN) { + + esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched; + + if (ivlen != (esp_ccp_ctx->ccp_implicit_iv ? 0 : ESP_CHACHAPOLY_IV_LEN)) { m_freem(m); - esp_log_err("Invalid ivlen %u", ivlen); + esp_log_err("ChaChaPoly Invalid ivlen %u, SPI 0x%08x", + ivlen, ntohl(sav->spi)); return EINVAL; } - if (sav->ivlen != ESP_CHACHAPOLY_IV_LEN) { + if (sav->ivlen != ivlen) { m_freem(m); - esp_log_err("Invalid sav->ivlen %u", sav->ivlen); + esp_log_err("ChaChaPoly Invalid sav->ivlen %u, SPI 0x%08x", + sav->ivlen, ntohl(sav->spi)); return EINVAL; } - esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched; - if (esp_ccp_ctx->ccp_implicit_iv) { - bodyoff = ivoff; - } else { - bodyoff = ivoff + ivlen; - } // check if total packet length is enough to contain ESP + IV if (m->m_pkthdr.len < bodyoff) { - esp_log_err("Packet too short %d < %zu", m->m_pkthdr.len, bodyoff); + esp_log_err("ChaChaPoly Packet too short %d < %zu, SPI 0x%08x", + m->m_pkthdr.len, bodyoff, ntohl(sav->spi)); m_freem(m); return EINVAL; } @@ -272,45 +311,52 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain rc = chacha20poly1305_reset(&esp_ccp_ctx->ccp_ctx); if (rc != 0) { m_freem(m); - esp_log_err("chacha20poly1305_reset failed %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_reset failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } + // esp_hdr is used for nonce and AAD + m_copydata(m, (int)off, sizeof(esp_hdr), (void *)&esp_hdr); + // RFC 
7634 dictates that the 12 byte nonce must be // the 4 byte salt followed by the 8 byte IV. // The IV MUST be non-repeating but does not need to be unpredictable, // so we use 4 bytes of 0 followed by the 4 byte ESP sequence number. - // this allows us to use implicit IV -- draft-mglt-ipsecme-implicit-iv - memset(sav->iv, 0, 4); - memcpy(sav->iv + 4, &sav->seq, sizeof(sav->seq)); - _Static_assert(4 + sizeof(sav->seq) == ESP_CHACHAPOLY_IV_LEN, - "Bad IV length"); + // this allows us to use implicit IV -- draft-ietf-ipsecme-implicit-iv + // Note that sav->seq is zero here so we must get esp_seq from esp_hdr memcpy(nonce, esp_ccp_ctx->ccp_salt, ESP_CHACHAPOLY_SALT_LEN); - memcpy(nonce + ESP_CHACHAPOLY_SALT_LEN, sav->iv, ESP_CHACHAPOLY_IV_LEN); + memset(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN, 0, 4); + memcpy(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN + 4, + &esp_hdr.esp_seq, sizeof(esp_hdr.esp_seq)); + + _Static_assert(4 + sizeof(esp_hdr.esp_seq) == ESP_CHACHAPOLY_IV_LEN, + "Bad IV length"); _Static_assert(ESP_CHACHAPOLY_SALT_LEN + ESP_CHACHAPOLY_IV_LEN == sizeof(nonce), "Bad nonce length"); - rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, nonce); + rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, (uint8_t *)nonce); if (rc != 0) { m_freem(m); - esp_log_err("chacha20poly1305_setnonce failed %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_setnonce failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } if (!esp_ccp_ctx->ccp_implicit_iv) { + memcpy(sav->iv, ((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN, ESP_CHACHAPOLY_IV_LEN); m_copyback(m, ivoff, ivlen, sav->iv); } cc_clear(sizeof(nonce), nonce); // Set Additional Authentication Data (AAD) - m_copydata(m, (int)off, sizeof(esp_hdr), (void *)&esp_hdr); - rc = chacha20poly1305_aad(&esp_ccp_ctx->ccp_ctx, sizeof(esp_hdr), (void *)&esp_hdr); if (rc != 0) { m_freem(m); - esp_log_err("chacha20poly1305_aad failed %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_aad failed %d, SPI 0x%08x", + rc, 
ntohl(sav->spi)); return rc; } @@ -337,7 +383,8 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain len, sp, sp); if (rc != 0) { m_freem(m); - esp_log_err("chacha20poly1305_encrypt failed %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_encrypt failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } @@ -347,7 +394,8 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain } if (s == NULL && soff != m->m_pkthdr.len) { m_freem(m); - esp_log_err("not enough mbufs %d %d", soff, m->m_pkthdr.len); + esp_log_err("ChaChaPoly not enough mbufs %d %d, SPI 0x%08x", + soff, m->m_pkthdr.len, ntohl(sav->spi)); return EFBIG; } return 0; @@ -366,35 +414,36 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain uint8_t *sp; // buffer of a given encryption round size_t len; // length of a given encryption round const int32_t ivoff = (int32_t)off + (int32_t)sizeof(struct newesp); // IV offset - int32_t bodyoff; // body offset + const int32_t bodyoff = ivoff + ivlen; // body offset int rc = 0; // return code of corecrypto operations struct newesp esp_hdr; // ESP header for AAD _Static_assert(sizeof(esp_hdr) == 8, "Bad size"); - uint8_t nonce[ESP_CHACHAPOLY_NONCE_LEN]; + uint32_t nonce[ESP_CHACHAPOLY_NONCE_LEN / 4]; // ensure 32bit alignment + _Static_assert(sizeof(nonce) == ESP_CHACHAPOLY_NONCE_LEN, "Bad nonce length"); esp_chachapoly_ctx_t esp_ccp_ctx; ESP_CHECK_ARG(m); ESP_CHECK_ARG(sav); - if (ivlen != ESP_CHACHAPOLY_IV_LEN) { + + esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched; + + if (ivlen != (esp_ccp_ctx->ccp_implicit_iv ? 
0 : ESP_CHACHAPOLY_IV_LEN)) { m_freem(m); - esp_log_err("Invalid ivlen %u", ivlen); + esp_log_err("ChaChaPoly Invalid ivlen %u, SPI 0x%08x", + ivlen, ntohl(sav->spi)); return EINVAL; } - if (sav->ivlen != ESP_CHACHAPOLY_IV_LEN) { + if (sav->ivlen != ivlen) { m_freem(m); - esp_log_err("Invalid sav->ivlen %u", sav->ivlen); + esp_log_err("ChaChaPoly Invalid sav->ivlen %u, SPI 0x%08x", + sav->ivlen, ntohl(sav->spi)); return EINVAL; } - esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched; - if (esp_ccp_ctx->ccp_implicit_iv) { - bodyoff = ivoff; - } else { - bodyoff = ivoff + ivlen; - } // check if total packet length is enough to contain ESP + IV if (m->m_pkthdr.len < bodyoff) { - esp_packet_log_err("Packet too short %d < %zu", m->m_pkthdr.len, bodyoff); + esp_packet_log_err("ChaChaPoly Packet too short %d < %zu, SPI 0x%08x", + m->m_pkthdr.len, bodyoff, ntohl(sav->spi)); m_freem(m); return EINVAL; } @@ -402,7 +451,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain rc = chacha20poly1305_reset(&esp_ccp_ctx->ccp_ctx); if (rc != 0) { m_freem(m); - esp_log_err("chacha20poly1305_reset failed %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_reset failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } @@ -413,20 +463,22 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain memcpy(nonce, esp_ccp_ctx->ccp_salt, ESP_CHACHAPOLY_SALT_LEN); if (esp_ccp_ctx->ccp_implicit_iv) { // IV is implicit (4 zero bytes followed by the ESP sequence number) - memset(nonce + ESP_CHACHAPOLY_SALT_LEN, 0, 4); - memcpy(nonce + ESP_CHACHAPOLY_SALT_LEN + 4, &esp_hdr.esp_seq, sizeof(esp_hdr.esp_seq)); + memset(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN, 0, 4); + memcpy(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN + 4, + &esp_hdr.esp_seq, sizeof(esp_hdr.esp_seq)); _Static_assert(4 + sizeof(esp_hdr.esp_seq) == ESP_CHACHAPOLY_IV_LEN, "Bad IV length"); } else { // copy IV from packet - m_copydata(m, ivoff, ESP_CHACHAPOLY_IV_LEN, nonce + ESP_CHACHAPOLY_SALT_LEN); + m_copydata(m, 
ivoff, ESP_CHACHAPOLY_IV_LEN, ((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN); } _Static_assert(ESP_CHACHAPOLY_SALT_LEN + ESP_CHACHAPOLY_IV_LEN == sizeof(nonce), "Bad nonce length"); - rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, nonce); + rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, (uint8_t *)nonce); if (rc != 0) { m_freem(m); - esp_log_err("chacha20poly1305_setnonce failed %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_setnonce failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } cc_clear(sizeof(nonce), nonce); @@ -437,7 +489,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain (void *)&esp_hdr); if (rc != 0) { m_freem(m); - esp_log_err("chacha20poly1305_aad failed %d", rc); + esp_log_err("ChaChaPoly chacha20poly1305_aad failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } @@ -464,7 +517,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain len, sp, sp); if (rc != 0) { m_freem(m); - esp_packet_log_err("chacha20poly1305_decrypt failed %d", rc); + esp_packet_log_err("chacha20poly1305_decrypt failed %d, SPI 0x%08x", + rc, ntohl(sav->spi)); return rc; } @@ -474,7 +528,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain } if (s == NULL && soff != m->m_pkthdr.len) { m_freem(m); - esp_packet_log_err("not enough mbufs %d %d", soff, m->m_pkthdr.len); + esp_packet_log_err("not enough mbufs %d %d, SPI 0x%08x", + soff, m->m_pkthdr.len, ntohl(sav->spi)); return EFBIG; } return 0; diff --git a/bsd/netinet6/esp_chachapoly.h b/bsd/netinet6/esp_chachapoly.h index b98b77a40..8e3c58e4d 100644 --- a/bsd/netinet6/esp_chachapoly.h +++ b/bsd/netinet6/esp_chachapoly.h @@ -48,6 +48,7 @@ int esp_chachapoly_decrypt(struct mbuf *, size_t, struct secasvar *, int esp_chachapoly_encrypt_finalize(struct secasvar *, unsigned char *, unsigned int); int esp_chachapoly_decrypt_finalize(struct secasvar *, unsigned char *, unsigned int); int esp_chachapoly_mature(struct secasvar *); +int esp_chachapoly_ivlen(const struct 
esp_algorithm *, struct secasvar *); #endif /* _ESP_CHACHA_POLY_H_ */ #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/netinet6/esp_core.c b/bsd/netinet6/esp_core.c index a26873e45..03fdcd7f1 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -188,7 +188,7 @@ static const struct esp_algorithm chacha_poly = { ESP_CHACHAPOLY_PAD_BOUND, ESP_CHACHAPOLY_IV_LEN, esp_chachapoly_mature, ESP_CHACHAPOLY_KEYBITS_WITH_SALT, ESP_CHACHAPOLY_KEYBITS_WITH_SALT, esp_chachapoly_schedlen, - "chacha-poly", esp_common_ivlen, esp_chachapoly_decrypt, + "chacha-poly", esp_chachapoly_ivlen, esp_chachapoly_decrypt, esp_chachapoly_encrypt, esp_chachapoly_schedule, NULL, NULL, ESP_CHACHAPOLY_ICV_LEN, esp_chachapoly_decrypt_finalize, esp_chachapoly_encrypt_finalize}; @@ -268,6 +268,7 @@ esp_schedule(const struct esp_algorithm *algo, struct secasvar *sav) ipseclog((LOG_ERR, "esp_schedule %s: implicit IV not allowed\n", algo->name)); + lck_mtx_unlock(sadb_mutex); return EINVAL; } diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index 853d97702..58917f864 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -2068,7 +2068,8 @@ icmp6_rip6_input(struct mbuf **mp, int off) if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { if ((last->in6p_flags & INP_CONTROLOPTS) != 0 || (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || - (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { ret = ip6_savecontrol(last, n, &opts); if (ret != 0) { m_freem(n); @@ -2093,7 +2094,8 @@ icmp6_rip6_input(struct mbuf **mp, int off) if (last) { if ((last->in6p_flags & INP_CONTROLOPTS) != 0 || (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || - (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) 
{ ret = ip6_savecontrol(last, m, &opts); if (ret != 0) { goto error; @@ -2232,7 +2234,7 @@ icmp6_reflect(struct mbuf *m, size_t off) for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { IFA_LOCK(&ia->ia_ifa); if (IN6_ARE_ADDR_EQUAL(&t, &ia->ia_addr.sin6_addr) && - (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { + (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_CLAT46)) == 0) { IFA_UNLOCK(&ia->ia_ifa); src = &t; break; @@ -2651,8 +2653,8 @@ icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt) /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, - IN6_IFF_NOTREADY| - IN6_IFF_ANYCAST)) == NULL) + IN6_IFF_NOTREADY| + IN6_IFF_ANYCAST)) == NULL) goto fail; IFA_LOCK(&ia->ia_ifa); ifp_ll6 = ia->ia_addr.sin6_addr; diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index a76b74157..4f34af6ba 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2017 Apple Inc. All rights reserved. + * Copyright (c) 2003-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -262,7 +262,7 @@ static struct zone *in6ifa_zone; /* zone for in6_ifaddr */ #define IN6IFA_ZONE_NAME "in6_ifaddr" /* zone name */ struct eventhandler_lists_ctxt in6_evhdlr_ctxt; - +struct eventhandler_lists_ctxt in6_clat46_evhdlr_ctxt; /* * Subroutine for in6_ifaddloop() and in6_ifremloop(). * This routine does actual work. @@ -934,7 +934,7 @@ in6ctl_gifstat(struct ifnet *ifp, u_long cmd, struct in6_ifreq *ifr) /* N.B.: if_inet6data is never freed once set. */ if (IN6_IFEXTRA(ifp) == NULL) { /* return (EAFNOSUPPORT)? 
*/ - bzero(&ifr->ifr_ifru.ifru_stat, + bzero(&ifr->ifr_ifru.ifru_icmp6stat, sizeof (ifr->ifr_ifru.ifru_icmp6stat)); } else { bcopy(&IN6_IFEXTRA(ifp)->icmp6_ifstat, @@ -1070,6 +1070,88 @@ in6ctl_alifetime(struct in6_ifaddr *ia, u_long cmd, struct in6_ifreq *ifr, return (error); } +static int +in6ctl_clat46start(struct ifnet *ifp) +{ + struct nd_prefix *pr = NULL; + struct nd_prefix *next = NULL; + struct in6_ifaddr *ia6 = NULL; + int error = 0; + + if (ifp == lo_ifp) + return (EINVAL); + /* + * Traverse the list of prefixes and find the first non-linklocal + * prefix on the interface. + * For that found eligible prefix, configure a CLAT46 reserved address. + */ + lck_mtx_lock(nd6_mutex); + for (pr = nd_prefix.lh_first; pr; pr = next) { + next = pr->ndpr_next; + + NDPR_LOCK(pr); + if (pr->ndpr_ifp != ifp) { + NDPR_UNLOCK(pr); + continue; + } + + if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) { + NDPR_UNLOCK(pr); + continue; /* XXX */ + } + + if (pr->ndpr_raf_auto == 0) { + NDPR_UNLOCK(pr); + continue; + } + + if (pr->ndpr_stateflags & NDPRF_DEFUNCT) { + NDPR_UNLOCK(pr); + continue; + } + + if ((pr->ndpr_stateflags & NDPRF_CLAT46) == 0 + && pr->ndpr_vltime != 0) { + NDPR_ADDREF_LOCKED(pr); /* Take reference for rest of the processing */ + NDPR_UNLOCK(pr); + break; + } else { + NDPR_UNLOCK(pr); + continue; + } + } + lck_mtx_unlock(nd6_mutex); + + if (pr != NULL) { + if ((ia6 = in6_pfx_newpersistaddr(pr, FALSE, &error, TRUE)) == NULL) { + nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface " + "%s.\n", ifp->if_xname)); + } else { + IFA_LOCK(&ia6->ia_ifa); + NDPR_LOCK(pr); + ia6->ia6_ndpr = pr; + NDPR_ADDREF_LOCKED(pr); /* for addr reference */ + pr->ndpr_stateflags |= NDPRF_CLAT46; + pr->ndpr_addrcnt++; + VERIFY(pr->ndpr_addrcnt != 0); + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; + /* + * A newly added address might affect the status + * of other addresses, so we check and update it. 
+ * XXX: what if address duplication happens? + */ + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); + } + NDPR_REMREF(pr); + } + return (error); +} + #define ifa2ia6(ifa) ((struct in6_ifaddr *)(void *)(ifa)) /* @@ -1191,6 +1273,30 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, error = in6ctl_llstop(ifp); goto done; + case SIOCCLAT46_START: /* struct in6_ifreq */ + if (!privileged) { + error = EPERM; + goto done; + } + error = in6ctl_clat46start(ifp); + if (error == 0) + ifp->if_eflags |= IFEF_CLAT46; + goto done; + + case SIOCCLAT46_STOP: /* struct in6_ifreq */ + if (!privileged) { + error = EPERM; + goto done; + } + + /* + * Not much to be done here and it might not be needed + * It would usually be done when IPv6 configuration is being + * flushed. + * XXX Probably STOP equivalent is not needed here. + */ + ifp->if_eflags &= ~IFEF_CLAT46; + goto done; case SIOCSETROUTERMODE_IN6: /* struct in6_ifreq */ if (!privileged) { error = EPERM; @@ -2500,6 +2606,8 @@ in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) NDPR_LOCK(pr); VERIFY(pr->ndpr_addrcnt != 0); pr->ndpr_addrcnt--; + if (oia->ia6_flags & IN6_IFF_CLAT46) + pr->ndpr_stateflags &= ~NDPRF_CLAT46; NDPR_UNLOCK(pr); NDPR_REMREF(pr); /* release addr reference */ } @@ -2665,6 +2773,31 @@ in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) return ((struct in6_ifaddr *)ifa); } +struct in6_ifaddr * +in6ifa_ifpwithflag(struct ifnet * ifp, int flag) +{ + struct ifaddr *ifa; + + ifnet_lock_shared(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) + { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6 ) { + IFA_UNLOCK(ifa); + continue; + } + if ((((struct in6_ifaddr *)ifa)->ia6_flags & flag) == flag) { + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); + break; + } + IFA_UNLOCK(ifa); + } + ifnet_lock_done(ifp); + + return ((struct in6_ifaddr *)ifa); +} + /* * find the internet address corresponding to a given interface and address. 
*/ @@ -3010,7 +3143,7 @@ in6_ifawithscope(struct ifnet *oifp, struct in6_addr *dst) * nor a duplicated address. */ if (((struct in6_ifaddr *)ifa)->ia6_flags & - IN6_IFF_NOTREADY) { + (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) { IFA_UNLOCK(ifa); continue; } @@ -3294,7 +3427,7 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) IFA_UNLOCK(ifa); continue; /* XXX: is there any case to allow anycast? */ } - if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) { + if (ifa2ia6(ifa)->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) { IFA_UNLOCK(ifa); continue; /* don't use this interface */ } @@ -3364,7 +3497,7 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) IFA_UNLOCK(ifa); continue; /* XXX: is there any case to allow anycast? */ } - if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) { + if (ifa2ia6(ifa)->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) { IFA_UNLOCK(ifa); continue; /* don't use this interface */ } diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index 3fe1484e2..e057fd9eb 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2017 Apple Inc. All rights reserved. + * Copyright (c) 2008-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -99,7 +99,6 @@ #define _NETINET6_IN6_H_ #include #include - #include /* @@ -149,13 +148,13 @@ /* * IPv6 address */ -struct in6_addr { +typedef struct in6_addr { union { __uint8_t __u6_addr8[16]; __uint16_t __u6_addr16[8]; __uint32_t __u6_addr32[4]; } __u6_addr; /* 128-bit IP6 address */ -}; +} in6_addr_t; #define s6_addr __u6_addr.__u6_addr8 #ifdef KERNEL /* XXX nonstandard */ @@ -887,7 +886,6 @@ extern uint32_t in6_finalize_cksum(struct mbuf *, uint32_t, int32_t, /* IPv6 protocol events */ extern struct eventhandler_lists_ctxt in6_evhdlr_ctxt; - /* * XXX Avoid reordering the enum values below. 
* If the order is changed, please make sure @@ -923,7 +921,6 @@ struct in6_event2kev { const char *in6_event_str; }; extern struct in6_event2kev in6_event2kev_array[]; - extern void in6_eventhdlr_callback(struct eventhandler_entry_arg, in6_evhdlr_code_t, struct ifnet *, struct in6_addr *, uint32_t); extern void in6_event_enqueue_nwk_wq_entry(in6_evhdlr_code_t, @@ -934,6 +931,33 @@ typedef void (*in6_event_fn) (struct eventhandler_entry_arg, in6_evhdlr_code_t, EVENTHANDLER_DECLARE(in6_event, in6_event_fn); #endif /* BSD_KERNEL_PRIVATE */ +#ifdef PRIVATE +/* CLAT46 events */ +typedef enum in6_clat46_evhdlr_code_t { + IN6_CLAT46_EVENT_V4_FLOW, + IN6_CLAT46_EVENT_V6_ADDR_CONFFAIL, +} in6_clat46_evhdlr_code_t; + +struct kev_netevent_clat46_data { + in6_clat46_evhdlr_code_t clat46_event_code; + pid_t epid; + uuid_t euuid; +}; +#endif /* PRIVATE */ + +#ifdef BSD_KERNEL_PRIVATE +/* CLAT46 events */ +extern struct eventhandler_lists_ctxt in6_clat46_evhdlr_ctxt; +extern void in6_clat46_eventhdlr_callback(struct eventhandler_entry_arg, + in6_clat46_evhdlr_code_t, pid_t, uuid_t); +extern void in6_clat46_event_enqueue_nwk_wq_entry(in6_clat46_evhdlr_code_t, + pid_t, uuid_t); + +typedef void (*in6_clat46_event_fn) (struct eventhandler_entry_arg, in6_clat46_evhdlr_code_t, + pid_t, uuid_t); +EVENTHANDLER_DECLARE(in6_clat46_event, in6_clat46_event_fn); +#endif /* BSD_KERNEL_PRIVATE */ + #ifdef KERNEL_PRIVATE /* exporte for ApplicationFirewall */ extern int in6_localaddr(struct in6_addr *); diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index 86759c202..f19872e56 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2016 Apple Inc. All rights reserved. + * Copyright (c) 2003-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -746,10 +746,6 @@ in6_ifattach_prelim(struct ifnet *ifp) sizeof(IN6_IFEXTRA(ifp)->icmp6_ifstat)); bzero(&IN6_IFEXTRA(ifp)->in6_ifstat, sizeof(IN6_IFEXTRA(ifp)->in6_ifstat)); - IN6_IFEXTRA(ifp)->netsig_len = 0; - bzero(&IN6_IFEXTRA(ifp)->netsig, - sizeof(IN6_IFEXTRA(ifp)->netsig)); - bzero(IN6_IFEXTRA(ifp)->nat64_prefixes, sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes)); /* XXX TBD Purge the layer two table */ /* * XXX When recycling, nd_ifinfo gets initialized, other @@ -758,7 +754,7 @@ in6_ifattach_prelim(struct ifnet *ifp) } /* - * XXX Only initialize NDP ifinfo for the interface + * XXX Only initialize IPv6 configuration for the interface * if interface has not yet been configured with * link local IPv6 address. * Could possibly be optimized with an interface flag if need @@ -766,6 +762,11 @@ in6_ifattach_prelim(struct ifnet *ifp) */ ia6 = in6ifa_ifpforlinklocal(ifp, 0); if (ia6 == NULL) { + IN6_IFEXTRA(ifp)->netsig_len = 0; + bzero(&IN6_IFEXTRA(ifp)->netsig, + sizeof(IN6_IFEXTRA(ifp)->netsig)); + bzero(IN6_IFEXTRA(ifp)->nat64_prefixes, + sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes)); /* initialize NDP variables */ nd6_ifattach(ifp); } else { diff --git a/bsd/netinet6/in6_mcast.c b/bsd/netinet6/in6_mcast.c index 3bd00c8a2..467b3a164 100644 --- a/bsd/netinet6/in6_mcast.c +++ b/bsd/netinet6/in6_mcast.c @@ -2671,7 +2671,7 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (error) return (error); /* we never use msfr.msfr_srcs; */ - memcpy(&msfr, &msfr64, sizeof(msfr)); + memcpy(&msfr, &msfr64, sizeof(msfr64)); } else { error = sooptcopyin(sopt, &msfr32, sizeof(struct __msfilterreq32), @@ -2679,7 +2679,7 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (error) return (error); /* we never use msfr.msfr_srcs; */ - memcpy(&msfr, &msfr32, sizeof(msfr)); + memcpy(&msfr, &msfr32, sizeof(msfr32)); } if ((size_t) msfr.msfr_nsrcs > diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c 
index 3118b7bae..db72c5c35 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -270,8 +270,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) */ IFA_LOCK_SPIN(ifa); if (((struct in6_ifaddr *)ifa)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| - IN6_IFF_DETACHED)) { + (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY| + IN6_IFF_DETACHED | IN6_IFF_CLAT46)) { IFA_UNLOCK(ifa); IFA_REMREF(ifa); lck_rw_done(pcbinfo->ipi_lock); @@ -295,9 +295,9 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) struct inpcb *t; uid_t u; - /* GROSS */ #if !CONFIG_EMBEDDED - if (ntohs(lport) < IPV6PORT_RESERVED) { + if (ntohs(lport) < IPV6PORT_RESERVED && + !IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr)) { cred = kauth_cred_proc_ref(p); error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); @@ -533,6 +533,11 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) struct ifnet *outif = NULL; struct socket *so = inp->inp_socket; +#if CONTENT_FILTER + if (so) + so->so_state_change_cnt++; +#endif + if (so->so_proto->pr_protocol == IPPROTO_UDP && sin6->sin6_port == htons(53) && !(so->so_flags1 & SOF1_DNS_COUNTED)) { so->so_flags1 |= SOF1_DNS_COUNTED; @@ -598,6 +603,11 @@ in6_pcbdisconnect(struct inpcb *inp) { struct socket *so = inp->inp_socket; +#if CONTENT_FILTER + if (so) + so->so_state_change_cnt++; +#endif + if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) { /* lock inversion issue, mostly with udp multicast packets */ socket_unlock(so, 0); diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index e71714e69..cd8777c0a 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2015 Apple Inc. All rights reserved. + * Copyright (c) 2008-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,6 +104,7 @@ #include #include #include +#include #include #include @@ -630,6 +631,8 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXDYNROUTES, SYSCTL_INT(_net_inet6_ip6, OID_AUTO, only_allow_rfc4193_prefixes, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_only_allow_rfc4193_prefix, 0, ""); +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, + clat_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &clat_debug, 0, ""); /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, diff --git a/bsd/netinet6/in6_src.c b/bsd/netinet6/in6_src.c index ab987d2db..8af4dc7b1 100644 --- a/bsd/netinet6/in6_src.c +++ b/bsd/netinet6/in6_src.c @@ -326,6 +326,15 @@ in6_selectsrc_core(struct sockaddr_in6 *dstsock, uint32_t hint_mask, IFA_LOCK(&ia->ia_ifa); + /* + * Simply skip addresses reserved for CLAT46 + */ + if (ia->ia6_flags & IN6_IFF_CLAT46) { + SASEL_LOG("NEXT ia %s address on ifp1 %s skipped as it is " + "reserved for CLAT46", s_src, ifp1->if_xname); + goto next; + } + /* * XXX By default we are strong end system and will * limit candidate set of source address to the ones @@ -687,7 +696,7 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, goto done; } IFA_LOCK_SPIN(&ia6->ia_ifa); - if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) || + if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) || (inp && inp_restricted_send(inp, ia6->ia_ifa.ifa_ifp))) { IFA_UNLOCK(&ia6->ia_ifa); IFA_REMREF(&ia6->ia_ifa); @@ -1429,8 +1438,7 @@ in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct proc *p, bool found; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; kauth_cred_t cred; - - (void)laddr; +#pragma unused(laddr) if (!locked) { /* Make sure we don't run into a deadlock: 4052373 */ if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) { socket_unlock(inp->inp_socket, 0); diff --git a/bsd/netinet6/in6_var.h b/bsd/netinet6/in6_var.h index 8a08baa85..50cd3e9b9 100644 --- a/bsd/netinet6/in6_var.h +++ 
b/bsd/netinet6/in6_var.h @@ -723,9 +723,12 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *, uint8_t *mac); #define SIOCLL_CGASTART_32 _IOW('i', 160, struct in6_cgareq_32) #define SIOCLL_CGASTART_64 _IOW('i', 160, struct in6_cgareq_64) #endif + #define SIOCGIFCGAPREP_IN6 _IOWR('i', 187, struct in6_cgareq) #define SIOCSIFCGAPREP_IN6 _IOWR('i', 188, struct in6_cgareq) +#define SIOCCLAT46_START _IOWR('i', 189, struct in6_ifreq) +#define SIOCCLAT46_STOP _IOWR('i', 190, struct in6_ifreq) #endif /* PRIVATE */ #ifdef BSD_KERNEL_PRIVATE @@ -754,6 +757,7 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *, uint8_t *mac); #ifdef PRIVATE #define IN6_IFF_SWIFTDAD 0x0800 /* DAD with no delay */ #endif +#define IN6_IFF_CLAT46 0x1000 /* Address reserved for CLAT46 */ #define IN6_IFF_NOPFX 0x8000 /* Depreciated. Don't use. */ /* Duplicate Address Detection [DAD] in progress. */ @@ -1114,6 +1118,7 @@ extern void in6_setmaxmtu(void); extern void in6_restoremkludge(struct in6_ifaddr *, struct ifnet *); extern void in6_purgemkludge(struct ifnet *); extern struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int); +extern struct in6_ifaddr *in6ifa_ifpwithflag(struct ifnet *, int); extern struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, struct in6_addr *); extern struct in6_ifaddr *in6ifa_prproxyaddr(struct in6_addr *); extern void in6ifa_getlifetime(struct in6_ifaddr *, diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index 6ca8bba66..2e8eea64b 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2017 Apple Inc. All rights reserved. + * Copyright (c) 2003-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -315,6 +315,11 @@ ip6_init(struct ip6protosw *pp, struct domain *dp) in6_eventhdlr_callback, eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY); + eventhandler_lists_ctxt_init(&in6_clat46_evhdlr_ctxt); + (void)EVENTHANDLER_REGISTER(&in6_clat46_evhdlr_ctxt, in6_clat46_event, + in6_clat46_eventhdlr_callback, eventhandler_entry_dummy_arg, + EVENTHANDLER_PRI_ANY); + for (i = 0; i < IN6_EVENT_MAX; i++) VERIFY(in6_event2kev_array[i].in6_event_code == i); @@ -895,7 +900,7 @@ ip6_input(struct mbuf *m) * a lot of things in the address are set once and never * changed (e.g. ia_ifp.) */ - if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) { + if (!(ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) { /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; @@ -1613,6 +1618,15 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, if (*mp == NULL) return (NULL); } + if ((inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { + uint64_t time; + + time = mach_continuous_time(); + mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof (time), + SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, mp); + if (*mp == NULL) + return (NULL); + } if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { int tc = m_get_traffic_class(m); @@ -1622,13 +1636,43 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, return (NULL); } +#define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? 
(x) : (y)) if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { - if (v4only != NULL) + if (v4only != NULL) { *v4only = 1; + } + + // Send ECN flags for v4-mapped addresses + if ((inp->inp_flags & IN6P_TCLASS) != 0) { + struct ip *ip_header = mtod(m, struct ip *); + u_int8_t tos = (ip_header->ip_tos & IPTOS_ECN_MASK); + + mp = sbcreatecontrol_mbuf((caddr_t)&tos, sizeof(tos), + IPV6_TCLASS, IPPROTO_IPV6, mp); + if (*mp == NULL) + return (NULL); + } + + // Send IN6P_PKTINFO for v4-mapped address + if ((inp->inp_flags & IN6P_PKTINFO) != 0) { + struct in6_pktinfo pi6 = { + .ipi6_addr = IN6ADDR_V4MAPPED_INIT, + .ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0, + }; + + struct ip *ip_header = mtod(m, struct ip *); + bcopy(&ip_header->ip_dst, &pi6.ipi6_addr.s6_addr32[3], sizeof(struct in_addr)); + + mp = sbcreatecontrol_mbuf((caddr_t)&pi6, + sizeof (struct in6_pktinfo), + IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), + IPPROTO_IPV6, mp); + if (*mp == NULL) + return (NULL); + } return (mp); } -#define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 
5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 73a66159e..0720d6809 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -122,6 +122,7 @@ #include #include +#include #include #include #include @@ -2555,6 +2556,13 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt) optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof (optval), optp, uproto); + + if (optname == IPV6_TCLASS) { + // Add in the ECN flags + u_int8_t tos = (in6p->inp_ip_tos & ~IPTOS_ECN_MASK); + u_int8_t ecn = optval & IPTOS_ECN_MASK; + in6p->inp_ip_tos = tos | ecn; + } break; } diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index 67fcd97bc..23c510710 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -339,6 +339,32 @@ struct ip6stat { /* NECP policy related drop */ u_quad_t ip6s_necp_policy_drop; + + /* CLAT46 stats */ + u_quad_t ip6s_clat464_in_tooshort_drop; + u_quad_t ip6s_clat464_in_nov6addr_drop; + u_quad_t ip6s_clat464_in_nov4addr_drop; + u_quad_t ip6s_clat464_in_v4synthfail_drop; + u_quad_t ip6s_clat464_in_64transfail_drop; + u_quad_t ip6s_clat464_in_64proto_transfail_drop; + u_quad_t ip6s_clat464_in_64frag_transfail_drop; + u_quad_t ip6s_clat464_in_invalpbuf_drop; + u_quad_t ip6s_clat464_in_success; + u_quad_t ip6s_clat464_in_drop; + u_quad_t ip6s_clat464_in_v4_drop; + + u_quad_t ip6s_clat464_out_nov6addr_drop; + u_quad_t ip6s_clat464_out_v6synthfail_drop; + u_quad_t ip6s_clat464_out_46transfail_drop; + u_quad_t ip6s_clat464_out_46proto_transfail_drop; + u_quad_t ip6s_clat464_out_46frag_transfail_drop; + u_quad_t ip6s_clat464_out_invalpbuf_drop; + u_quad_t ip6s_clat464_out_success; + u_quad_t ip6s_clat464_out_drop; + + u_quad_t 
ip6s_clat464_v6addr_conffail; + u_quad_t ip6s_clat464_plat64_pfx_setfail; + u_quad_t ip6s_clat464_plat64_pfx_getfail; }; enum ip6s_sources_rule_index { @@ -421,6 +447,7 @@ struct ip6_out_args { #define IP6OAF_AWDL_UNRESTRICTED 0x00000040 /* privileged AWDL */ #define IP6OAF_QOSMARKING_ALLOWED 0x00000080 /* policy allows Fastlane DSCP marking */ #define IP6OAF_INTCOPROC_ALLOWED 0x00000100 /* access to internal coproc interfaces */ +#define IP6OAF_NO_LOW_POWER 0x00000200 /* skip low power */ u_int32_t ip6oa_retflags; /* IP6OARF return flags (see below) */ #define IP6OARF_IFDENIED 0x00000001 /* denied access to interface */ int ip6oa_sotc; /* traffic class for Fastlane DSCP mapping */ diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index 283158d58..5442bf7e8 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -2873,8 +2873,10 @@ ipsec_updatereplay(u_int32_t seq, struct secasvar *sav) wsizeb = replay->wsize << 3; /* sequence number of 0 is invalid */ - if (seq == 0) - return 1; + if (seq == 0) { + lck_mtx_unlock(sadb_mutex); + return 1; + } /* first time */ if (replay->count == 0) { @@ -3274,14 +3276,31 @@ ipsec4_interface_output(struct ipsec_output_state *state, ifnet_t interface) LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); - if (!state) + if (state == NULL) { panic("state == NULL in ipsec4_output"); - if (!state->m) + } + if (state->m == NULL) { panic("state->m == NULL in ipsec4_output"); - if (!state->dst) + } + if (state->dst == NULL) { panic("state->dst == NULL in ipsec4_output"); + } + + struct ip *ip = mtod(state->m, struct ip *); + + struct sockaddr_in src = {}; + src.sin_family = AF_INET; + src.sin_len = sizeof(src); + memcpy(&src.sin_addr, &ip->ip_src, sizeof(src.sin_addr)); + + struct sockaddr_in dst = {}; + dst.sin_family = AF_INET; + dst.sin_len = sizeof(dst); + memcpy(&dst.sin_addr, &ip->ip_dst, sizeof(dst.sin_addr)); - sav = key_alloc_outbound_sav_for_interface(interface, AF_INET); + sav = 
key_alloc_outbound_sav_for_interface(interface, AF_INET, + (struct sockaddr *)&src, + (struct sockaddr *)&dst); if (sav == NULL) { goto bad; } @@ -3291,13 +3310,15 @@ ipsec4_interface_output(struct ipsec_output_state *state, ifnet_t interface) } KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_END, 0,0,0,0,0); - if (sav) + if (sav) { key_freesav(sav, KEY_SADB_UNLOCKED); + } return 0; bad: - if (sav) + if (sav) { key_freesav(sav, KEY_SADB_UNLOCKED); + } m_freem(state->m); state->m = NULL; KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_END, error,0,0,0,0); @@ -4058,16 +4079,34 @@ ipsec6_interface_output(struct ipsec_output_state *state, ifnet_t interface, u_c LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); - if (!state) + if (state == NULL) { panic("state == NULL in ipsec6_output"); - if (!state->m) + } + if (state->m == NULL) { panic("state->m == NULL in ipsec6_output"); - if (!nexthdrp) + } + if (nexthdrp == NULL) { panic("nexthdrp == NULL in ipsec6_output"); - if (!mprev) + } + if (mprev == NULL) { panic("mprev == NULL in ipsec6_output"); - - sav = key_alloc_outbound_sav_for_interface(interface, AF_INET6); + } + + struct ip6_hdr *ip6 = mtod(state->m, struct ip6_hdr *); + + struct sockaddr_in6 src = {}; + src.sin6_family = AF_INET6; + src.sin6_len = sizeof(src); + memcpy(&src.sin6_addr, &ip6->ip6_src, sizeof(src.sin6_addr)); + + struct sockaddr_in6 dst = {}; + dst.sin6_family = AF_INET6; + dst.sin6_len = sizeof(dst); + memcpy(&dst.sin6_addr, &ip6->ip6_dst, sizeof(dst.sin6_addr)); + + sav = key_alloc_outbound_sav_for_interface(interface, AF_INET6, + (struct sockaddr *)&src, + (struct sockaddr *)&dst); if (sav == NULL) { goto bad; } @@ -4083,13 +4122,15 @@ ipsec6_interface_output(struct ipsec_output_state *state, ifnet_t interface, u_c } } - if (sav) + if (sav) { key_freesav(sav, KEY_SADB_UNLOCKED); + } return 0; bad: - if (sav) + if (sav) { key_freesav(sav, KEY_SADB_UNLOCKED); + } m_freem(state->m); state->m = NULL; return error; diff --git a/bsd/netinet6/nd6.c 
b/bsd/netinet6/nd6.c index 0b150814a..c0c9d9a56 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -531,6 +531,10 @@ nd6_ifattach(struct ifnet *ifp) nd6_ifreset(ifp); lck_mtx_unlock(&ndi->lock); nd6_setmtu(ifp); + + nd6log0((LOG_INFO, ": ", + "%s Reinit'd ND information for interface %s\n", + if_name(ifp))); return; } @@ -1390,7 +1394,7 @@ nd6_service(void *arg) if (pr->ndpr_expire != 0 && pr->ndpr_expire < timenow) { /* * address expiration and prefix expiration are - * separate. NEVER perform in6_purgeaddr here. + * separate. NEVER perform in6_purgeaddr here. */ pr->ndpr_stateflags |= NDPRF_PROCESSED_SERVICE; NDPR_ADDREF_LOCKED(pr); diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index 13bb3e963..04c90e17a 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -458,6 +458,7 @@ struct in6_ndifreq_64 { #define NDPRF_PROCESSED_ONLINK 0x08000 #define NDPRF_PROCESSED_SERVICE 0x10000 #define NDPRF_DEFUNCT 0x20000 +#define NDPRF_CLAT46 0x40000 #endif /* protocol constants */ @@ -871,6 +872,8 @@ extern void nd6_alt_node_present(struct ifnet *, struct sockaddr_in6 *, extern void nd6_alt_node_absent(struct ifnet *, struct sockaddr_in6 *); /* nd6_rtr.c */ +extern struct in6_ifaddr *in6_pfx_newpersistaddr(struct nd_prefix *, int, + int *, boolean_t); extern void nd6_rtr_init(void); extern void nd6_rs_input(struct mbuf *, int, int); extern void nd6_ra_input(struct mbuf *, int, int); diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index f54c20f54..453eec269 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -100,9 +100,6 @@ static struct nd_defrouter *defrtrlist_update_common(struct nd_defrouter *, boolean_t); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); -static struct in6_ifaddr *in6_pfx_newpersistaddr(struct nd_prefix *, int, - int *); - static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); @@ -2362,8 
+2359,7 @@ prelist_update( * No address matched and the valid lifetime is non-zero. * Create a new address. */ - - if ((ia6 = in6_pfx_newpersistaddr(new, mcast, &error)) + if ((ia6 = in6_pfx_newpersistaddr(new, mcast, &error, FALSE)) != NULL) { /* * note that we should use pr (not new) for reference. @@ -2401,6 +2397,46 @@ prelist_update( IFA_REMREF(&ia6->ia_ifa); ia6 = NULL; + /* + * If the interface is marked for CLAT46 configuration + * try and configure the reserved IPv6 address for + * stateless translation. + */ + if (IS_INTF_CLAT46(ifp)) { + if ((ia6 = in6_pfx_newpersistaddr(new, mcast,&error, TRUE)) != NULL) { + IFA_LOCK(&ia6->ia_ifa); + NDPR_LOCK(pr); + ia6->ia6_ndpr = pr; + NDPR_ADDREF_LOCKED(pr); /* for addr reference */ + pr->ndpr_addrcnt++; + VERIFY(pr->ndpr_addrcnt != 0); + pr->ndpr_stateflags |= NDPRF_CLAT46; + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; + } else if (error != EEXIST) { + uuid_t tmp_uuid = {}; + /* + * Only report the error if it is not + * EEXIST. + */ + ip6stat.ip6s_clat464_v6addr_conffail++; + in6_clat46_event_enqueue_nwk_wq_entry( + IN6_CLAT46_EVENT_V6_ADDR_CONFFAIL, + 0, + tmp_uuid); + nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface " + "%s.\n", ifp->if_xname)); + } + /* + * Reset the error as we do not want to + * treat failure of CLAT46 address configuration + * as complete failure in prelist update path. + */ + error = 0; + } + /* * A newly added address might affect the status * of other addresses, so we check and update it. 
@@ -2411,7 +2447,6 @@ prelist_update( lck_mtx_unlock(nd6_mutex); } } - end: if (pr != NULL) NDPR_REMREF(pr); @@ -3543,8 +3578,8 @@ nd6_prefix_offlink(struct nd_prefix *pr) return (error); } -static struct in6_ifaddr * -in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp) +struct in6_ifaddr * +in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t is_clat46) { struct in6_ifaddr *ia6 = NULL; struct ifnet *ifp = NULL; @@ -3619,7 +3654,7 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp) lck_mtx_unlock(&ndi->lock); NDPR_UNLOCK(pr); - if (notcga) { + if (notcga && !is_clat46) { ia6 = in6ifa_ifpforlinklocal(ifp, 0); if (ia6 == NULL) { error = EADDRNOTAVAIL; @@ -3644,22 +3679,43 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp) in6_cga_node_lock(); struct in6_cga_prepare local_cga_prepare; + /* + * XXX For now the collision count is not used in the classical + * way for secure addresses. + * Use a different collision count value to generate reserved + * address for stateless CLAT46 + */ if (ndi->cga_initialized) { bcopy(&(ndi->local_cga_modifier), &(local_cga_prepare.cga_modifier), sizeof(local_cga_prepare.cga_modifier)); - error = in6_cga_generate(&local_cga_prepare, 0, - &ifra.ifra_addr.sin6_addr); + if (!is_clat46) { + error = in6_cga_generate(&local_cga_prepare, 0, + &ifra.ifra_addr.sin6_addr); + } else { + error = in6_cga_generate(&local_cga_prepare, 1, + &ifra.ifra_addr.sin6_addr); + } } else { - error = in6_cga_generate(NULL, 0, - &ifra.ifra_addr.sin6_addr); + if (!is_clat46) + error = in6_cga_generate(NULL, 0, + &ifra.ifra_addr.sin6_addr); + else + error = in6_cga_generate(NULL, 1, + &ifra.ifra_addr.sin6_addr); } in6_cga_node_unlock(); - if (error == 0) + if (error == 0) { ifra.ifra_flags |= IN6_IFF_SECURED; - else { - nd6log((LOG_ERR, "%s: no CGA available (%s)\n", - __func__, if_name(ifp))); + if (is_clat46) + ifra.ifra_flags |= IN6_IFF_CLAT46; + } else { + if (!is_clat46) + 
nd6log((LOG_ERR, "%s: no CGA available (%s)\n", + __func__, if_name(ifp))); + else + nd6log((LOG_ERR, "%s: no CLAT46 available (%s)\n", + __func__, if_name(ifp))); goto done; } } @@ -3686,7 +3742,7 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp) */ if ((ia6 = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr)) != NULL) { - error = EADDRNOTAVAIL; + error = EEXIST; IFA_REMREF(&ia6->ia_ifa); ia6 = NULL; diff --git a/bsd/netinet6/nd6_send.c b/bsd/netinet6/nd6_send.c index cc7d35a5f..18a0b96e4 100644 --- a/bsd/netinet6/nd6_send.c +++ b/bsd/netinet6/nd6_send.c @@ -116,7 +116,7 @@ sysctl_cga_parameters SYSCTL_HANDLER_ARGS #endif MALLOC(buffer, char *, SYSCTL_CGA_PARAMETERS_BUFFER_SIZE, M_IP6CGA, - M_WAITOK); + M_WAITOK | M_ZERO); if (buffer == NULL) { log(LOG_ERR, "%s: could not allocate marshaling buffer.\n", __func__); diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index ec7f823fb..92ec475f4 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -197,7 +197,7 @@ rip6_input( #if NECP if (n && !necp_socket_is_allowed_to_send_recv_v6(in6p, 0, 0, - &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL)) { + &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL, NULL)) { m_freem(n); /* do not inject data into pcb */ } else @@ -205,7 +205,8 @@ rip6_input( if (n) { if ((last->in6p_flags & INP_CONTROLOPTS) != 0 || (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || - (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { ret = ip6_savecontrol(last, n, &opts); if (ret != 0) { m_freem(n); @@ -231,7 +232,7 @@ rip6_input( #if NECP if (last && !necp_socket_is_allowed_to_send_recv_v6(in6p, 0, 0, - &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL)) { + &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL, NULL)) { m_freem(m); ip6stat.ip6s_delivered--; /* do not inject data into pcb */ @@ -240,7 +241,8 @@ 
rip6_input( if (last) { if ((last->in6p_flags & INP_CONTROLOPTS) != 0 || (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || - (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { ret = ip6_savecontrol(last, m, &opts); if (ret != 0) { m_freem(m); @@ -568,6 +570,7 @@ rip6_output( #if NECP { necp_kernel_policy_id policy_id; + necp_kernel_policy_id skip_policy_id; u_int32_t route_rule_id; /* @@ -603,12 +606,12 @@ rip6_output( } if (!necp_socket_is_allowed_to_send_recv_v6(in6p, 0, 0, - &ip6->ip6_src, &ip6->ip6_dst, NULL, &policy_id, &route_rule_id)) { + &ip6->ip6_src, &ip6->ip6_dst, NULL, &policy_id, &route_rule_id, &skip_policy_id)) { error = EHOSTUNREACH; goto bad; } - necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id); + necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id, skip_policy_id); if (net_qos_policy_restricted != 0) { necp_socket_update_qos_marking(in6p, in6p->in6p_route.ro_rt, @@ -640,6 +643,11 @@ rip6_output( m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_RAWSOCK); m->m_pkthdr.pkt_proto = in6p->in6p_ip6_nxt; + m->m_pkthdr.tx_rawip_pid = so->last_pid; + if (so->so_flags & SOF_DELEGATED) + m->m_pkthdr.tx_rawip_e_pid = so->e_pid; + else + m->m_pkthdr.tx_rawip_e_pid = 0; if (im6o != NULL) IM6O_ADDREF(im6o); @@ -880,8 +888,8 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct proc *p) if (ifa != NULL) { IFA_LOCK(ifa); if (((struct in6_ifaddr *)ifa)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| - IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { + (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_CLAT46 | + IN6_IFF_DETACHED | IN6_IFF_DEPRECATED)) { IFA_UNLOCK(ifa); IFA_REMREF(ifa); return (EADDRNOTAVAIL); diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index fdda7e512..2e674c328 100644 --- a/bsd/netinet6/udp6_output.c +++ 
b/bsd/netinet6/udp6_output.c @@ -136,6 +136,10 @@ #include +#if CONTENT_FILTER +#include +#endif /* CONTENT_FILTER */ + /* * UDP protocol inplementation. * Per RFC 768, August, 1980. @@ -166,6 +170,13 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, struct socket *so = in6p->in6p_socket; struct route_in6 ro; int flowadv = 0; +#if CONTENT_FILTER + struct m_tag *cfil_tag = NULL; + bool cfil_faddr_use = false; + uint32_t cfil_so_state_change_cnt = 0; + struct sockaddr *cfil_faddr = NULL; + struct sockaddr_in6 *cfil_sin6 = NULL; +#endif bzero(&ip6oa, sizeof(ip6oa)); ip6oa.ip6oa_boundif = IFSCOPE_NONE; @@ -192,6 +203,28 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, if (INP_INTCOPROC_ALLOWED(in6p)) ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED; +#if CONTENT_FILTER + /* + * If socket is subject to UDP Content Filter and no addr is passed in, + * retrieve CFIL saved state from mbuf and use it if necessary. + */ + if (so->so_cfil_db && !addr6) { + cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr); + if (cfil_tag) { + cfil_sin6 = (struct sockaddr_in6 *)(void *)cfil_faddr; + if ((so->so_state_change_cnt != cfil_so_state_change_cnt) && + (in6p->in6p_fport != cfil_sin6->sin6_port || + !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &cfil_sin6->sin6_addr))) { + /* + * Socket is connected but socket state and dest addr/port changed. + * We need to use the saved faddr info. 
+ */ + cfil_faddr_use = true; + } + } + } +#endif + if (control) { sotc = so_tc_from_control(control, &netsvctype); if ((error = ip6_setpktopts(control, &opt, @@ -284,7 +317,20 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, error = ENOTCONN; goto release; } - if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) { + laddr = &in6p->in6p_laddr; + faddr = &in6p->in6p_faddr; + fport = in6p->in6p_fport; +#if CONTENT_FILTER + if (cfil_faddr_use) + { + faddr = &((struct sockaddr_in6 *)(void *)cfil_faddr)->sin6_addr; + fport = ((struct sockaddr_in6 *)(void *)cfil_faddr)->sin6_port; + + /* Do not use cached route */ + ROUTE_RELEASE(&in6p->in6p_route); + } +#endif + if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((in6p->in6p_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the @@ -300,9 +346,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, } else af = AF_INET; } - laddr = &in6p->in6p_laddr; - faddr = &in6p->in6p_faddr; - fport = in6p->in6p_fport; + } if (in6p->inp_flowhash == 0) @@ -374,6 +418,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, #if NECP { necp_kernel_policy_id policy_id; + necp_kernel_policy_id skip_policy_id; u_int32_t route_rule_id; /* @@ -408,12 +453,12 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, in6p->inp_policyresult.results.qos_marking_gencount = 0; } - if (!necp_socket_is_allowed_to_send_recv_v6(in6p, in6p->in6p_lport, fport, laddr, faddr, NULL, &policy_id, &route_rule_id)) { + if (!necp_socket_is_allowed_to_send_recv_v6(in6p, in6p->in6p_lport, fport, laddr, faddr, NULL, &policy_id, &route_rule_id, &skip_policy_id)) { error = EHOSTUNREACH; goto release; } - necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id); + necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id, skip_policy_id); if (net_qos_policy_restricted != 0) { necp_socket_update_qos_marking(in6p, in6p->in6p_route.ro_rt, @@ -447,6 +492,11 @@ 
udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC); if (flowadv) m->m_pkthdr.pkt_flags |= PKTF_FLOW_ADV; + m->m_pkthdr.tx_udp_pid = so->last_pid; + if (so->so_flags & SOF_DELEGATED) + m->m_pkthdr.tx_udp_e_pid = so->e_pid; + else + m->m_pkthdr.tx_udp_e_pid = 0; im6o = in6p->in6p_moptions; if (im6o != NULL) { @@ -523,6 +573,14 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, if (rt->rt_flags & RTF_MULTICAST) rt = NULL; /* unusable */ +#if CONTENT_FILTER + /* + * Discard temporary route for cfil case + */ + if (cfil_faddr_use) + rt = NULL; /* unusable */ +#endif + /* * Always discard the cached route for unconnected * socket or if it is a multicast route. @@ -574,5 +632,9 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6, ip6_clearpktopts(optp, -1); m_freem(control); } +#if CONTENT_FILTER + if (cfil_tag) + m_tag_free(cfil_tag); +#endif return (error); } diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index 8680560fc..325e3773d 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -145,6 +145,10 @@ extern int esp_udp_encap_port; #include #endif /* FLOW_DIVERT */ +#if CONTENT_FILTER +#include +#endif /* CONTENT_FILTER */ + /* * UDP protocol inplementation. * Per RFC 768, August, 1980. 
@@ -206,7 +210,8 @@ udp6_append(struct inpcb *last, struct ip6_hdr *ip6, #endif /* CONFIG_MACF_NET */ if ((last->in6p_flags & INP_CONTROLOPTS) != 0 || (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || - (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { ret = ip6_savecontrol(last, n, &opts); if (ret != 0) { m_freem(n); @@ -400,7 +405,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) skipit = 0; if (!necp_socket_is_allowed_to_send_recv_v6(in6p, uh->uh_dport, uh->uh_sport, &ip6->ip6_dst, - &ip6->ip6_src, ifp, NULL, NULL)) { + &ip6->ip6_src, ifp, NULL, NULL, NULL)) { /* do not inject data to pcb */ skipit = 1; } @@ -548,7 +553,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto) } #if NECP if (!necp_socket_is_allowed_to_send_recv_v6(in6p, uh->uh_dport, - uh->uh_sport, &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL)) { + uh->uh_sport, &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL, NULL)) { in_pcb_checkstate(in6p, WNT_RELEASE, 0); IF_UDP_STATINC(ifp, badipsec); goto bad; @@ -571,7 +576,8 @@ udp6_input(struct mbuf **mp, int *offp, int proto) udp_in6.sin6_port = uh->uh_sport; if ((in6p->in6p_flags & INP_CONTROLOPTS) != 0 || (in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0 || - (in6p->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + (in6p->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 || + (in6p->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) { ret = ip6_savecontrol(in6p, m, &opts); if (ret != 0) { udp_unlock(in6p->in6p_socket, 1, 0); @@ -943,6 +949,10 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, #if defined(NECP) && defined(FLOW_DIVERT) int should_use_flow_divert = 0; #endif /* defined(NECP) && defined(FLOW_DIVERT) */ +#if CONTENT_FILTER + struct m_tag *cfil_tag = NULL; + struct sockaddr *cfil_faddr = NULL; +#endif inp = sotoinpcb(so); if (inp 
== NULL) { @@ -950,6 +960,16 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, goto bad; } +#if CONTENT_FILTER + //If socket is subject to UDP Content Filter and unconnected, get addr from tag. + if (so->so_cfil_db && !addr && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + cfil_tag = cfil_udp_get_socket_state(m, NULL, NULL, &cfil_faddr); + if (cfil_tag) { + addr = (struct sockaddr *)cfil_faddr; + } + } +#endif + #if defined(NECP) && defined(FLOW_DIVERT) should_use_flow_divert = necp_socket_should_use_flow_divert(inp); #endif /* defined(NECP) && defined(FLOW_DIVERT) */ @@ -989,6 +1009,10 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, pru = ip_protox[IPPROTO_UDP]->pr_usrreqs; error = ((*pru->pru_send)(so, flags, m, addr, control, p)); +#if CONTENT_FILTER + if (cfil_tag) + m_tag_free(cfil_tag); +#endif /* addr will just be freed in sendit(). */ return (error); } @@ -998,11 +1022,21 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, do_flow_divert: if (should_use_flow_divert) { /* Implicit connect */ - return (flow_divert_implicit_data_out(so, flags, m, addr, control, p)); + error = flow_divert_implicit_data_out(so, flags, m, addr, control, p); +#if CONTENT_FILTER + if (cfil_tag) + m_tag_free(cfil_tag); +#endif + return error; } #endif /* defined(NECP) && defined(FLOW_DIVERT) */ - return (udp6_output(inp, m, addr, control, p)); + error = udp6_output(inp, m, addr, control, p); +#if CONTENT_FILTER + if (cfil_tag) + m_tag_free(cfil_tag); +#endif + return error; bad: VERIFY(error != 0); @@ -1011,7 +1045,10 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, m_freem(m); if (control != NULL) m_freem(control); - +#if CONTENT_FILTER + if (cfil_tag) + m_tag_free(cfil_tag); +#endif return (error); } diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 5272fa8df..f373441d0 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -838,7 +838,9 @@ 
key_gettunnel( return sp; } -struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int family) +struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int family, + struct sockaddr *src, + struct sockaddr *dst) { struct secashead *sah; struct secasvar *sav; @@ -848,47 +850,75 @@ struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int fam int arraysize; struct sockaddr_in *sin; u_int16_t dstport; + bool strict = true; - if (interface == NULL) + if (interface == NULL) { return NULL; + } LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(sadb_mutex); - - LIST_FOREACH(sah, &sahtree, chain) { - if (sah->state == SADB_SASTATE_DEAD) { - continue; - } - if (sah->ipsec_if == interface && - (family == AF_INET6 || family == AF_INET) && - sah->dir == IPSEC_DIR_OUTBOUND) { - /* This SAH is linked to the IPSec interface, and the right family. We found it! */ - if (key_preferred_oldsa) { - saorder_state_valid = saorder_state_valid_prefer_old; - arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); - } else { - saorder_state_valid = saorder_state_valid_prefer_new; - arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); + + do { + LIST_FOREACH(sah, &sahtree, chain) { + if (sah->state == SADB_SASTATE_DEAD) { + continue; } - - sin = (struct sockaddr_in *)&sah->saidx.dst; - dstport = sin->sin_port; - if (sah->saidx.mode == IPSEC_MODE_TRANSPORT) - sin->sin_port = IPSEC_PORT_ANY; - - for (stateidx = 0; stateidx < arraysize; stateidx++) { - state = saorder_state_valid[stateidx]; - sav = key_do_allocsa_policy(sah, state, dstport); - if (sav != NULL) { - lck_mtx_unlock(sadb_mutex); - return sav; + if (sah->ipsec_if == interface && + (family == AF_INET6 || family == AF_INET) && + sah->dir == IPSEC_DIR_OUTBOUND) { + + if (strict && + sah->saidx.mode == IPSEC_MODE_TRANSPORT && + src != NULL && dst != NULL) { + // Validate addresses for transport mode + if (key_sockaddrcmp((struct sockaddr *)&sah->saidx.src, src, 0) != 
0) { + // Source doesn't match + continue; + } + + if (key_sockaddrcmp((struct sockaddr *)&sah->saidx.dst, dst, 0) != 0) { + // Destination doesn't match + continue; + } } + + /* This SAH is linked to the IPSec interface, and the right family. We found it! */ + if (key_preferred_oldsa) { + saorder_state_valid = saorder_state_valid_prefer_old; + arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); + } else { + saorder_state_valid = saorder_state_valid_prefer_new; + arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); + } + + sin = (struct sockaddr_in *)&sah->saidx.dst; + dstport = sin->sin_port; + if (sah->saidx.mode == IPSEC_MODE_TRANSPORT) { + sin->sin_port = IPSEC_PORT_ANY; + } + + for (stateidx = 0; stateidx < arraysize; stateidx++) { + state = saorder_state_valid[stateidx]; + sav = key_do_allocsa_policy(sah, state, dstport); + if (sav != NULL) { + lck_mtx_unlock(sadb_mutex); + return sav; + } + } + + break; } - + } + if (strict) { + // If we didn't find anything, try again without strict + strict = false; + } else { + // We already were on the second try, bail break; } - } + } while (true); lck_mtx_unlock(sadb_mutex); return NULL; @@ -9232,7 +9262,7 @@ key_promisc( } } -static int (*key_typesw[])(struct socket *, struct mbuf *, +static int (*const key_typesw[])(struct socket *, struct mbuf *, const struct sadb_msghdr *) = { NULL, /* SADB_RESERVED */ key_getspi, /* SADB_GETSPI */ diff --git a/bsd/netkey/key.h b/bsd/netkey/key.h index c13c36947..c61f04f22 100644 --- a/bsd/netkey/key.h +++ b/bsd/netkey/key.h @@ -57,7 +57,9 @@ extern struct secpolicy *key_allocsp(struct secpolicyindex *, u_int); extern struct secasvar *key_allocsa_policy(struct secasindex *); extern struct secpolicy *key_gettunnel(struct sockaddr *, struct sockaddr *, struct sockaddr *, struct sockaddr *); -extern struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t, int); +extern struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int family, + struct sockaddr 
*src, + struct sockaddr *dst); extern int key_checkrequest(struct ipsecrequest *isr, struct secasindex *, struct secasvar **sav); extern struct secasvar *key_allocsa(u_int, caddr_t, caddr_t, diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index a9c78f193..02c121289 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -1598,13 +1598,15 @@ nfs_gss_clnt_ctx_callserver(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) FREE(cp->gss_clnt_handle, M_TEMP); cp->gss_clnt_handle = NULL; } - if (cp->gss_clnt_handle_len > 0) { + if (cp->gss_clnt_handle_len > 0 && cp->gss_clnt_handle_len < GSS_MAX_CTX_HANDLE_LEN) { MALLOC(cp->gss_clnt_handle, u_char *, cp->gss_clnt_handle_len, M_TEMP, M_WAITOK); if (cp->gss_clnt_handle == NULL) { error = ENOMEM; goto nfsmout; } nfsm_chain_get_opaque(error, &nmrep, cp->gss_clnt_handle_len, cp->gss_clnt_handle); + } else { + error = EBADRPC; } nfsm_chain_get_32(error, &nmrep, cp->gss_clnt_major); nfsm_chain_get_32(error, &nmrep, cp->gss_clnt_minor); @@ -1612,13 +1614,15 @@ nfs_gss_clnt_ctx_callserver(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) nfsm_chain_get_32(error, &nmrep, cp->gss_clnt_tokenlen); if (error) goto nfsmout; - if (cp->gss_clnt_tokenlen > 0) { + if (cp->gss_clnt_tokenlen > 0 && cp->gss_clnt_tokenlen < GSS_MAX_TOKEN_LEN) { MALLOC(cp->gss_clnt_token, u_char *, cp->gss_clnt_tokenlen, M_TEMP, M_WAITOK); if (cp->gss_clnt_token == NULL) { error = ENOMEM; goto nfsmout; } nfsm_chain_get_opaque(error, &nmrep, cp->gss_clnt_tokenlen, cp->gss_clnt_token); + } else { + error = EBADRPC; } /* @@ -3065,7 +3069,9 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) nmc_tmp = *nmc; nfsm_chain_adv(error, &nmc_tmp, arglen); nfsm_chain_get_32(error, &nmc_tmp, cksum.length); - MALLOC(cksum.value, void *, cksum.length, M_TEMP, M_WAITOK); + cksum.value = NULL; + if (cksum.length > 0 && cksum.length < GSS_MAX_MIC_LEN) + MALLOC(cksum.value, void *, cksum.length, M_TEMP, M_WAITOK); if (cksum.value == NULL) { error = EBADRPC; 
@@ -3354,11 +3360,9 @@ nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, struct nfsrv_sock *slp, mbuf_t * case RPCSEC_GSS_CONTINUE_INIT: /* Get the token from the request */ nfsm_chain_get_32(error, nmreq, cp->gss_svc_tokenlen); - if (cp->gss_svc_tokenlen == 0) { - autherr = RPCSEC_GSS_CREDPROBLEM; - break; - } - MALLOC(cp->gss_svc_token, u_char *, cp->gss_svc_tokenlen, M_TEMP, M_WAITOK); + cp->gss_svc_token = NULL; + if (cp->gss_svc_tokenlen > 0 && cp->gss_svc_tokenlen < GSS_MAX_TOKEN_LEN) + MALLOC(cp->gss_svc_token, u_char *, cp->gss_svc_tokenlen, M_TEMP, M_WAITOK); if (cp->gss_svc_token == NULL) { autherr = RPCSEC_GSS_CREDPROBLEM; break; diff --git a/bsd/nfs/nfs_gss.h b/bsd/nfs/nfs_gss.h index fe3db1893..5b6887f9b 100644 --- a/bsd/nfs/nfs_gss.h +++ b/bsd/nfs/nfs_gss.h @@ -54,6 +54,22 @@ enum rpcsec_gss_service { extern u_char krb5_mech_oid[11]; +/* + * RFC 2203 and friends don't define maximums for token lengths + * and context handles. We try to pick reasonable values here. + * + * N.B. Kerberos mech tokens can be quite large from the output + * of a gss_init_sec_context if it includes a large PAC. + */ + +#define GSS_MAX_CTX_HANDLE_LEN 256 +#define GSS_MAX_TOKEN_LEN 64*1024 + +/* + * Put a "reasonble" bound on MIC lengths + */ +#define GSS_MAX_MIC_LEN 2048 + #define GSS_MAXSEQ 0x80000000 // The biggest sequence number #define GSS_SVC_MAXCONTEXTS 500000 // Max contexts supported #define GSS_SVC_SEQWINDOW 256 // Server's sequence window diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index 9920f3c89..2514489c1 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -679,8 +679,8 @@ nfs3_lockd_request( * higher levels can resend the request. 
*/ msg->lm_flags &= ~LOCKD_MSG_CANCEL; - nfs_lockdmsg_dequeue(msgreq); error = NFSERR_DENIED; + /* Will dequeue msgreq after the following break at the end of this routine */ break; } diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index 027d7a5d8..06c11bb72 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -440,8 +440,26 @@ nfsrv_getattr( error = nfsrv_credcheck(nd, ctx, nx, nxo); nfsmerr_if(error); +#if CONFIG_MAC + if (mac_vnode_check_open(ctx, vp, FREAD)) + error = ESTALE; + nfsmerr_if(error); +#endif + nfsm_srv_vattr_init(&vattr, nd->nd_vers); error = vnode_getattr(vp, &vattr, ctx); + +#if CONFIG_MAC + /* XXXab: Comment in the VFS code makes it sound like + * some arguments can be filtered out, but not + * what it actually means. Hopefully not like + * they gonna set mtime to 0 or something. For + * now trust there are no shenanigans here. + */ + error = mac_vnode_check_getattr(ctx, NOCRED, vp, &vattr); + nfsmerr_if(error); +#endif + vnode_put(vp); vp = NULL; @@ -556,6 +574,9 @@ nfsrv_setattr( error = nfsrv_authorize(vp, NULL, action, ctx, nxo, 0); #if CONFIG_MACF + if (!error && mac_vnode_check_open(ctx, vp, FREAD|FWRITE)) + error = ESTALE; + if (!error) { /* chown case */ if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) { @@ -666,6 +687,18 @@ nfsrv_lookup( /* update active user stats */ nfsrv_update_user_stat(nx, nd, saved_uid, 1, 0, 0); } + if (!error && mac_vnode_check_open(ctx, ni.ni_vp, FREAD)) { + error = EACCES; + if (dirp) { + vnode_put(dirp); + dirp = NULL; + } + + if (ni.ni_vp) { + vnode_put(ni.ni_vp); + ni.ni_vp = NULL; + } + } } if (dirp) { @@ -788,6 +821,13 @@ nfsrv_readlink( if (!error) error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, ctx, nxo, 0); +#if CONFIG_MACF + if (mac_vnode_check_open(ctx, vp, FREAD)) + error = ESTALE; + nfsmerr_if(error); + if (!error) + error = mac_vnode_check_readlink(ctx, vp); +#endif if (!error) error = VNOP_READLINK(vp, auio, ctx); if (vp) { @@ -906,6 +946,21 @@ nfsrv_read( if 
((error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, ctx, nxo, 1))) error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_EXECUTE, ctx, nxo, 1); } +#if CONFIG_MACF + if (!error) { + error = mac_vnode_check_open(ctx, vp, FREAD); + if (error) { + error = EACCES; + } else { + /* XXXab: Do we need to do this?! */ + error = mac_vnode_check_read(ctx, vfs_context_ucred(ctx), vp); + if (error) + error = EACCES; + /* mac_vnode_check_exec() can't be done here. */ + } + } + nfsmerr_if(error); +#endif nfsm_srv_vattr_init(vap, nd->nd_vers); attrerr = vnode_getattr(vp, vap, ctx); if (!error) @@ -4073,6 +4128,15 @@ nfsrv_readdir( } if (!error) error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_LIST_DIRECTORY, ctx, nxo, 0); +#if CONFIG_MACF + if (!error) { + if (!error && mac_vnode_check_open(ctx, vp, FREAD)) + error = EACCES; + + if (!error) + error = mac_vnode_check_readdir(ctx, vp); + } +#endif nfsmerr_if(error); MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); @@ -4294,6 +4358,15 @@ nfsrv_readdirplus( error = NFSERR_BAD_COOKIE; if (!error) error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_LIST_DIRECTORY, ctx, nxo, 0); +#if CONFIG_MACF + if (!error) { + if (!error && mac_vnode_check_open(ctx, vp, FREAD)) + error = EACCES; + + if (!error) + error = mac_vnode_check_readdir(ctx, vp); + } +#endif nfsmerr_if(error); MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index f1123c328..8e3562c64 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -4749,6 +4749,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) int timeo, maxtime, finish_asyncio, error; struct timeval now; TAILQ_HEAD(nfs_mount_pokeq, nfsmount) nfs_mount_poke_queue; + TAILQ_INIT(&nfs_mount_poke_queue); restart: lck_mtx_lock(nfs_request_mutex); @@ -4760,7 +4761,6 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } nfs_reqbusy(req); - TAILQ_INIT(&nfs_mount_poke_queue); microuptime(&now); for ( ; req != NULL ; req = nfs_reqnext(req)) { diff 
--git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index ff5ae70b6..0702fbed9 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -1688,8 +1688,15 @@ nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper, int flags) * and return cached attributes. */ if (!nfs_use_cache(nmp)) { - timeo = nfs_attrcachetimeout(np); microuptime(&nowup); + if (np->n_attrstamp > nowup.tv_sec) { + printf("NFS: Attribute time stamp is in the future by %ld seconds. Invalidating cache\n", + np->n_attrstamp - nowup.tv_sec); + NATTRINVALIDATE(np); + NACCESSINVALIDATE(np); + return (ENOENT); + } + timeo = nfs_attrcachetimeout(np); if ((nowup.tv_sec - np->n_attrstamp) >= timeo) { FSDBG(528, np, 0, 0xffffff02, ENOENT); OSAddAtomic64(1, &nfsstats.attrcache_misses); diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index a5fc908b5..17c51b7da 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -2814,8 +2814,9 @@ mountnfs( xb_get_32(error, &xb, val); /* version */ xb_get_32(error, &xb, argslength); /* args length */ xb_get_32(error, &xb, val); /* XDR args version */ - if (val != NFS_XDRARGS_VERSION_0) + if (val != NFS_XDRARGS_VERSION_0 || argslength < ((4 + NFS_MATTR_BITMAP_LEN + 1) * XDRWORD)) { error = EINVAL; + } len = NFS_MATTR_BITMAP_LEN; xb_get_bitmap(error, &xb, mattrs, len); /* mount attribute bitmap */ attrslength = 0; @@ -4523,6 +4524,8 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags) /* Since we've drop the request mutex we can now safely unreference the request */ TAILQ_FOREACH_SAFE(req, &resendq, r_rchain, treq) { TAILQ_REMOVE(&resendq, req, r_rchain); + /* Make sure we don't try and remove again in nfs_request_destroy */ + req->r_rchain.tqe_next = NFSREQNOLIST; nfs_request_rele(req); } diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 0ca66d6e6..5753b6ea8 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -6925,7 +6925,7 @@ nfs_vnop_ioctl( vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; struct nfsmount *mp = 
VTONMP(vp); - struct user_nfs_gss_principal gprinc; + struct user_nfs_gss_principal gprinc = {}; uint32_t len; int error = ENOTTY; diff --git a/bsd/pgo/profile_runtime.c b/bsd/pgo/profile_runtime.c index ec24bfffe..4c115151b 100644 --- a/bsd/pgo/profile_runtime.c +++ b/bsd/pgo/profile_runtime.c @@ -113,7 +113,7 @@ kern_return_t do_pgo_reset_counters() static kern_return_t kextpgo_trap() { - return DebuggerTrapWithState(DBOP_RESET_PGO_COUNTERS, NULL, NULL, NULL, 0, FALSE, 0); + return DebuggerTrapWithState(DBOP_RESET_PGO_COUNTERS, NULL, NULL, NULL, 0, NULL, FALSE, 0); } static kern_return_t diff --git a/bsd/pthread/Makefile b/bsd/pthread/Makefile new file mode 100644 index 000000000..ef0643f8a --- /dev/null +++ b/bsd/pthread/Makefile @@ -0,0 +1,48 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +DATAFILES = \ + bsdthread_private.h \ + priority_private.h \ + workqueue_syscalls.h + +PRIVATE_DATAFILES = \ + bsdthread_private.h \ + priority_private.h \ + workqueue_syscalls.h + +KERNELFILES = \ + +PRIVATE_KERNELFILES = \ + +INTERNAL_KERNELFILES = \ + bsdthread_private.h \ + priority_private.h \ + workqueue_internal.h \ + workqueue_syscalls.h \ + workqueue_trace.h + +INSTALL_MI_DIR = pthread + +# /usr/local/include without PRIVATE stuff +# /System/Library/Frameworks/System.framework/PrivateHeaders +INCDIR = /usr/local/include +INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} + +# /System/Library/Frameworks/Kernel.framework/Headers +# /System/Library/Frameworks/Kernel.framework/PrivateHeaders +INSTALL_KF_MI_LIST = $(sort ${KERNELFILES}) +INSTALL_KF_MI_LCL_LIST = $(sort ${KERNELFILES} ${PRIVATE_KERNELFILES}) + +EXPORT_MI_LIST = $(sort ${KERNELFILES} ${PRIVATE_KERNELFILES} ${INTERNAL_KERNELFILES}) + +EXPORT_MI_DIR = ${INSTALL_MI_DIR} + 
+include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/bsd/pthread/bsdthread_private.h b/bsd/pthread/bsdthread_private.h new file mode 100644 index 000000000..af854feb5 --- /dev/null +++ b/bsd/pthread/bsdthread_private.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 Apple, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _PTHREAD_BSDTHREAD_PRIVATE_H_ +#define _PTHREAD_BSDTHREAD_PRIVATE_H_ + +#if XNU_KERNEL_PRIVATE && !defined(__PTHREAD_EXPOSE_INTERNALS__) +#define __PTHREAD_EXPOSE_INTERNALS__ 1 +#endif // XNU_KERNEL_PRIVATE + +#ifdef __PTHREAD_EXPOSE_INTERNALS__ + +/* pthread bsdthread_ctl sysctl commands */ +/* bsdthread_ctl(BSDTHREAD_CTL_SET_QOS, thread_port, tsd_entry_addr, 0) */ +#define BSDTHREAD_CTL_SET_QOS 0x10 +/* bsdthread_ctl(BSDTHREAD_CTL_GET_QOS, thread_port, 0, 0) */ +#define BSDTHREAD_CTL_GET_QOS 0x20 +/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_START, thread_port, priority, 0) */ +#define BSDTHREAD_CTL_QOS_OVERRIDE_START 0x40 +/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_END, thread_port, 0, 0) */ +#define BSDTHREAD_CTL_QOS_OVERRIDE_END 0x80 +/* bsdthread_ctl(BSDTHREAD_CTL_SET_SELF, priority, voucher, flags) */ +#define BSDTHREAD_CTL_SET_SELF 0x100 +/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_RESET, 0, 0, 0) */ +#define BSDTHREAD_CTL_QOS_OVERRIDE_RESET 0x200 +/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH, thread_port, priority, 0) */ +#define BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH 0x400 +/* bsdthread_ctl(BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD, thread_port, priority, resource) */ +#define BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD 0x401 +/* bsdthread_ctl(BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET, 0|1 (?reset_all), resource, 0) */ +#define BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET 0x402 +/* bsdthread_ctl(BSDTHREAD_CTL_QOS_MAX_PARALLELISM, priority, flags, 0) */ +#define BSDTHREAD_CTL_QOS_MAX_PARALLELISM 0x800 + +#define _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL 0x1 +#define _PTHREAD_QOS_PARALLELISM_REALTIME 0x2 + +#endif // __PTHREAD_EXPOSE_INTERNALS__ +#endif // _PTHREAD_BSDTHREAD_PRIVATE_H_ diff --git a/bsd/pthread/priority_private.h b/bsd/pthread/priority_private.h new file mode 100644 index 000000000..5d20e08b3 --- /dev/null +++ 
b/bsd/pthread/priority_private.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2000-2017 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _PTHREAD_PRIORITY_PRIVATE_H_ +#define _PTHREAD_PRIORITY_PRIVATE_H_ + +/*! + * @typedef pthread_priority_t + * + * @abstract + * pthread_priority_t is an opaque integer that is guaranteed to be ordered + * such that combinations of QoS classes and relative priorities are ordered + * numerically, according to their combined priority. + * + * xnu, pthread & libdispatch flags + * + * @const _PTHREAD_PRIORITY_OVERCOMMIT_FLAG + * The thread this priority is applied to is overcommit (affects the workqueue + * creation policy for this priority). 
+ * + * @const _PTHREAD_PRIORITY_FALLBACK_FLAG + * Indicates that this priority is used only when incoming events have no + * priority at all. It is merely used as a fallback (hence the name) instead of + * a floor. + * + * This is usually used with QOS_CLASS_DEFAULT and a 0 relative priority. + * + * @const _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG + * The event manager flag indicates that this thread/request is for an event + * manager thread. There can only ever be one event manager thread at a time + * and it is brought up at the highest of all event manager priorities pthread + * knows about. + * + * pthread & dispatch only flags + * + * @const _PTHREAD_PRIORITY_SCHED_PRI_FLAG + * @const _PTHREAD_PRIORITY_SCHED_PRI_MASK + * This flag indicates that the bits extracted using + * _PTHREAD_PRIORITY_SCHED_PRI_MASK represent a scheduler priority instead of + * a {qos, relative priority} pair. + * + * This flag is only used by the pthread kext to indicate libdispatch that the + * event manager queue priority is a scheduling priority and not a QoS. This + * flag is never used as an input by anything else and is why it can perform + * a double duty with _PTHREAD_PRIORITY_ROOTQUEUE_FLAG. + * + * @const _PTHREAD_PRIORITY_NEEDS_UNBIND_FLAG + * This flag is used for the priority of event delivery threads to indicate + * to libdispatch that this thread is bound to a kqueue. + * + * dispatch only flags + * + * @const _PTHREAD_PRIORITY_INHERIT_FLAG + * This flag is meaningful to libdispatch only and has no meaning for the + * kernel and/or pthread. + * + * @const _PTHREAD_PRIORITY_ROOTQUEUE_FLAG + * This flag is meaningful to libdispatch only and has no meaning for the + * kernel and/or pthread. + * + * @const _PTHREAD_PRIORITY_ENFORCE_FLAG + * This flag is used to indicate that this priority should be preferred for work + * submitted asynchronously over the intrinsic priority of the queue/thread the + * work is submitted to. 
+ * + * @const _PTHREAD_PRIORITY_OVERRIDE_FLAG + * No longer used + */ +typedef unsigned long pthread_priority_t; + +#define _PTHREAD_PRIORITY_FLAGS_MASK 0xff000000 +#define _PTHREAD_PRIORITY_FLAGS_SHIFT (24ull) + +#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 +#define _PTHREAD_PRIORITY_INHERIT_FLAG 0x40000000 /* dispatch only */ +#define _PTHREAD_PRIORITY_ROOTQUEUE_FLAG 0x20000000 /* dispatch only */ +#define _PTHREAD_PRIORITY_SCHED_PRI_FLAG 0x20000000 +#define _PTHREAD_PRIORITY_SCHED_PRI_MASK 0x0000ffff +#define _PTHREAD_PRIORITY_ENFORCE_FLAG 0x10000000 /* dispatch only */ +#define _PTHREAD_PRIORITY_OVERRIDE_FLAG 0x08000000 /* unused */ +#define _PTHREAD_PRIORITY_FALLBACK_FLAG 0x04000000 +#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 +#define _PTHREAD_PRIORITY_NEEDS_UNBIND_FLAG 0x01000000 +#define _PTHREAD_PRIORITY_DEFAULTQUEUE_FLAG _PTHREAD_PRIORITY_FALLBACK_FLAG // compat + +#define _PTHREAD_PRIORITY_ENCODING_MASK 0x00a00000 +#define _PTHREAD_PRIORITY_ENCODING_SHIFT (22ull) +#define _PTHREAD_PRIORITY_ENCODING_V0 0x00000000 +#define _PTHREAD_PRIORITY_ENCODING_V1 0x00400000 /* unused */ +#define _PTHREAD_PRIORITY_ENCODING_V2 0x00800000 /* unused */ +#define _PTHREAD_PRIORITY_ENCODING_V3 0x00a00000 /* unused */ + +#define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 +#define _PTHREAD_PRIORITY_VALID_QOS_CLASS_MASK 0x00003f00 +#define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT (8ull) + +#define _PTHREAD_PRIORITY_PRIORITY_MASK 0x000000ff +#define _PTHREAD_PRIORITY_PRIORITY_SHIFT (0) + +#if PRIVATE +#if XNU_KERNEL_PRIVATE && !defined(__PTHREAD_EXPOSE_INTERNALS__) +#define __PTHREAD_EXPOSE_INTERNALS__ 1 +#endif // XNU_KERNEL_PRIVATE +#ifdef __PTHREAD_EXPOSE_INTERNALS__ +/* + * This exposes the encoding used for pthread_priority_t + * and is meant to be used by pthread and XNU only + */ +#include // THREAD_QOS_* +#include + +__attribute__((always_inline, const)) +static inline bool +_pthread_priority_has_qos(pthread_priority_t pp) +{ + return (pp & 
(_PTHREAD_PRIORITY_SCHED_PRI_FLAG | + _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) == 0 && + (pp & (_PTHREAD_PRIORITY_QOS_CLASS_MASK & + ~_PTHREAD_PRIORITY_VALID_QOS_CLASS_MASK)) == 0 && + (pp & _PTHREAD_PRIORITY_VALID_QOS_CLASS_MASK) != 0; +} + +__attribute__((always_inline, const)) +static inline pthread_priority_t +_pthread_priority_make_from_thread_qos(thread_qos_t qos, int relpri, + unsigned long flags) +{ + pthread_priority_t pp = (flags & _PTHREAD_PRIORITY_FLAGS_MASK); + if (qos && qos < THREAD_QOS_LAST) { + pp |= (1 << (_PTHREAD_PRIORITY_QOS_CLASS_SHIFT + qos - 1)); + pp |= ((uint8_t)relpri - 1) & _PTHREAD_PRIORITY_PRIORITY_MASK; + } + return pp; +} + +__attribute__((always_inline, const)) +static inline pthread_priority_t +_pthread_event_manager_priority(void) +{ + return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; +} + +__attribute__((always_inline, const)) +static inline pthread_priority_t +_pthread_unspecified_priority(void) +{ + return _pthread_priority_make_from_thread_qos(THREAD_QOS_UNSPECIFIED, 0, 0); +} + +__attribute__((always_inline, const)) +static inline pthread_priority_t +_pthread_default_priority(unsigned long flags) +{ + return _pthread_priority_make_from_thread_qos(THREAD_QOS_LEGACY, 0, flags); +} + +__attribute__((always_inline, const)) +static inline thread_qos_t +_pthread_priority_thread_qos(pthread_priority_t pp) +{ + if (_pthread_priority_has_qos(pp)) { + pp &= _PTHREAD_PRIORITY_QOS_CLASS_MASK; + pp >>= _PTHREAD_PRIORITY_QOS_CLASS_SHIFT; + return (thread_qos_t)__builtin_ffs((int)pp); + } + return THREAD_QOS_UNSPECIFIED; +} + +__attribute__((always_inline, const)) +static inline int +_pthread_priority_relpri(pthread_priority_t pp) +{ + if (_pthread_priority_has_qos(pp)) { + pp &= _PTHREAD_PRIORITY_PRIORITY_MASK; + pp >>= _PTHREAD_PRIORITY_PRIORITY_SHIFT; + return (int8_t)pp + 1; + } + return 0; +} + +#if KERNEL +// Interfaces only used by the kernel and not implemented in userspace. 
+ +/* + * Keep managerness, overcommitness and fallback, discard other flags. + * Normalize and validate QoS/relpri + */ +__attribute__((const)) +pthread_priority_t +_pthread_priority_normalize(pthread_priority_t pp); + +/* + * Discard all flags, including managerness. + * Normalize and validate QoS/relpri + */ +__attribute__((const)) +pthread_priority_t +_pthread_priority_normalize_for_ipc(pthread_priority_t pp); + +/* + * Keep the flags from base_pp and return the priority with the maximum priority + * of base_pp and _pthread_priority_make_from_thread_qos(qos, 0, 0) + */ +__attribute__((const)) +pthread_priority_t +_pthread_priority_combine(pthread_priority_t base_pp, thread_qos_t qos); + +#endif // KERNEL +#endif // __PTHREAD_EXPOSE_INTERNALS__ +#endif // PRIVATE +#endif // _PTHREAD_PRIORITY_PRIVATE_H_ diff --git a/bsd/pthread/pthread_priority.c b/bsd/pthread/pthread_priority.c new file mode 100644 index 000000000..53cda953a --- /dev/null +++ b/bsd/pthread/pthread_priority.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + * + */ + +#include + +#ifndef QOS_MIN_RELATIVE_PRIORITY // from in userspace +#define QOS_MIN_RELATIVE_PRIORITY -15 +#endif + +pthread_priority_t +_pthread_priority_normalize(pthread_priority_t pp) +{ + if (pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) { + return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; + } + if (_pthread_priority_has_qos(pp)) { + int relpri = _pthread_priority_relpri(pp); + if (relpri > 0 || relpri < QOS_MIN_RELATIVE_PRIORITY) { + pp |= _PTHREAD_PRIORITY_PRIORITY_MASK; + } + return pp & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG | + _PTHREAD_PRIORITY_FALLBACK_FLAG | + _PTHREAD_PRIORITY_QOS_CLASS_MASK | + _PTHREAD_PRIORITY_PRIORITY_MASK); + } + return _pthread_unspecified_priority(); +} + +pthread_priority_t +_pthread_priority_normalize_for_ipc(pthread_priority_t pp) +{ + if (_pthread_priority_has_qos(pp)) { + int relpri = _pthread_priority_relpri(pp); + if (relpri > 0 || relpri < QOS_MIN_RELATIVE_PRIORITY) { + pp |= _PTHREAD_PRIORITY_PRIORITY_MASK; + } + return pp & (_PTHREAD_PRIORITY_QOS_CLASS_MASK | + _PTHREAD_PRIORITY_PRIORITY_MASK); + } + return _pthread_unspecified_priority(); +} + +pthread_priority_t +_pthread_priority_combine(pthread_priority_t base_pp, thread_qos_t qos) +{ + if (base_pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) { + return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; + } + + if (base_pp & _PTHREAD_PRIORITY_FALLBACK_FLAG) { + if (!qos) { + return base_pp; + } + } else if (qos < _pthread_priority_thread_qos(base_pp)) { + return 
base_pp; + } + + return _pthread_priority_make_from_thread_qos(qos, 0, + base_pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG); +} diff --git a/bsd/kern/pthread_shims.c b/bsd/pthread/pthread_shims.c similarity index 62% rename from bsd/kern/pthread_shims.c rename to bsd/pthread/pthread_shims.c index 66fa1d73e..b23487ec0 100644 --- a/bsd/kern/pthread_shims.c +++ b/bsd/pthread/pthread_shims.c @@ -2,7 +2,7 @@ * Copyright (c) 2012-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,10 +22,10 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - + #define PTHREAD_INTERNAL 1 #include @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include #include #include @@ -57,9 +59,9 @@ /* on arm, the callbacks function has two #ifdef arm ponters */ #if defined(__arm__) -#define PTHREAD_CALLBACK_MEMBER map_is_1gb +#define PTHREAD_CALLBACK_MEMBER __unused_was_map_is_1gb #else -#define PTHREAD_CALLBACK_MEMBER ml_get_max_cpus +#define PTHREAD_CALLBACK_MEMBER __unused_was_ml_get_max_cpus #endif /* compile time asserts to check the length of structures in pthread_shims.h */ @@ -69,6 +71,7 @@ static_assert((sizeof(struct pthread_callbacks_s) - offsetof(struct pthread_call /* old pthread code had definitions for these as they don't exist in headers */ extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t); extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t); +extern void thread_deallocate_safe(thread_t thread); #define PTHREAD_STRUCT_ACCESSOR(get, set, rettype, structtype, member) \ static rettype \ @@ -84,59 +87,34 @@ PTHREAD_STRUCT_ACCESSOR(proc_get_threadstart, proc_set_threadstart, user_addr_t, PTHREAD_STRUCT_ACCESSOR(proc_get_pthsize, proc_set_pthsize, int, struct proc*, p_pthsize); PTHREAD_STRUCT_ACCESSOR(proc_get_wqthread, proc_set_wqthread, user_addr_t, struct proc*, p_wqthread); PTHREAD_STRUCT_ACCESSOR(proc_get_stack_addr_hint, proc_set_stack_addr_hint, user_addr_t, struct proc *, p_stack_addr_hint); -PTHREAD_STRUCT_ACCESSOR(proc_get_dispatchqueue_offset, proc_set_dispatchqueue_offset, uint64_t, struct proc*, p_dispatchqueue_offset); -PTHREAD_STRUCT_ACCESSOR(proc_get_dispatchqueue_serialno_offset, proc_set_dispatchqueue_serialno_offset, uint64_t, struct proc*, p_dispatchqueue_serialno_offset); PTHREAD_STRUCT_ACCESSOR(proc_get_pthread_tsd_offset, proc_set_pthread_tsd_offset, uint32_t, struct proc *, p_pth_tsd_offset); PTHREAD_STRUCT_ACCESSOR(proc_get_mach_thread_self_tsd_offset, 
proc_set_mach_thread_self_tsd_offset, uint64_t, struct proc *, p_mach_thread_self_offset); PTHREAD_STRUCT_ACCESSOR(proc_get_pthhash, proc_set_pthhash, void*, struct proc*, p_pthhash); -PTHREAD_STRUCT_ACCESSOR(proc_get_return_to_kernel_offset, proc_set_return_to_kernel_offset, uint64_t, struct proc*, p_return_to_kernel_offset); -PTHREAD_STRUCT_ACCESSOR(proc_get_user_stack, proc_set_user_stack, user_addr_t, struct proc*, user_stack); - -PTHREAD_STRUCT_ACCESSOR(uthread_get_threadlist, uthread_set_threadlist, void*, struct uthread*, uu_threadlist); -PTHREAD_STRUCT_ACCESSOR(uthread_get_sigmask, uthread_set_sigmask, sigset_t, struct uthread*, uu_sigmask); -PTHREAD_STRUCT_ACCESSOR(uthread_get_returnval, uthread_set_returnval, int, struct uthread*, uu_rval[0]); #define WQPTR_IS_INITING_VALUE ((void *)~(uintptr_t)0) -static void * -proc_get_wqptr(struct proc *p) { - void *wqptr = p->p_wqptr; - return (wqptr == WQPTR_IS_INITING_VALUE) ? NULL : wqptr; -} static void -proc_set_wqptr(struct proc *p, void *y) { - proc_lock(p); - - assert(y == NULL || p->p_wqptr == WQPTR_IS_INITING_VALUE); - - p->p_wqptr = y; - - if (y != NULL){ - wakeup(&p->p_wqptr); - } - - proc_unlock(p); +proc_set_dispatchqueue_offset(struct proc *p, uint64_t offset) +{ + p->p_dispatchqueue_offset = offset; } -static boolean_t -proc_init_wqptr_or_wait(struct proc *p) { - proc_lock(p); - if (p->p_wqptr == NULL){ - p->p_wqptr = WQPTR_IS_INITING_VALUE; - proc_unlock(p); - - return TRUE; - } else if (p->p_wqptr == WQPTR_IS_INITING_VALUE){ - assert_wait(&p->p_wqptr, THREAD_UNINT); - proc_unlock(p); - thread_block(THREAD_CONTINUE_NULL); +static void +proc_set_return_to_kernel_offset(struct proc *p, uint64_t offset) +{ + p->p_return_to_kernel_offset = offset; +} - return FALSE; - } else { - proc_unlock(p); +static user_addr_t +proc_get_user_stack(struct proc *p) +{ + return p->user_stack; +} - return FALSE; - } +static void +uthread_set_returnval(struct uthread *uth, int retval) +{ + uth->uu_rval[0] = retval; } 
__attribute__((noreturn)) @@ -158,11 +136,6 @@ get_task_threadmax(void) { return task_threadmax; } -static task_t -proc_get_task(struct proc *p) { - return p->task; -} - static uint64_t proc_get_register(struct proc *p) { return (p->p_lflag & P_LREGISTER); @@ -176,7 +149,7 @@ proc_set_register(struct proc *p) { static void* uthread_get_uukwe(struct uthread *t) { - return &t->uu_kevent.uu_kwe; + return &t->uu_save.uus_kwe; } static int @@ -197,15 +170,6 @@ qos_main_thread_active(void) return TRUE; } -#if defined(__arm__) -/* On iOS, the stack placement depends on the address space size */ -static uint32_t -map_is_1gb(vm_map_t map) -{ - return ((!vm_map_is_64bit(map)) && (get_map_max(map) == ml_get_max_offset(FALSE, MACHINE_MAX_OFFSET_MIN))); -} -#endif - static int proc_usynch_get_requested_thread_qos(struct uthread *uth) { thread_t thread = uth ? uth->uu_thread : current_thread(); @@ -226,17 +190,6 @@ static int proc_usynch_get_requested_thread_qos(struct uthread *uth) return requested_qos; } -static int -proc_usynch_thread_qos_add_override_for_resource_check_owner(thread_t thread, - int override_qos, boolean_t first_override_for_resource, - user_addr_t resource, int resource_type, - user_addr_t user_lock_addr, mach_port_name_t user_lock_owner) -{ - return proc_thread_qos_add_override_check_owner(thread, override_qos, - first_override_for_resource, resource, resource_type, - user_lock_addr, user_lock_owner); -} - static boolean_t proc_usynch_thread_qos_add_override_for_resource(task_t task, struct uthread *uth, uint64_t tid, int override_qos, boolean_t first_override_for_resource, @@ -245,7 +198,7 @@ proc_usynch_thread_qos_add_override_for_resource(task_t task, struct uthread *ut thread_t thread = uth ? 
uth->uu_thread : THREAD_NULL; return proc_thread_qos_add_override(task, thread, tid, override_qos, - first_override_for_resource, resource, resource_type); + first_override_for_resource, resource, resource_type) == 0; } static boolean_t @@ -254,111 +207,112 @@ proc_usynch_thread_qos_remove_override_for_resource(task_t task, { thread_t thread = uth ? uth->uu_thread : THREAD_NULL; - return proc_thread_qos_remove_override(task, thread, tid, resource, resource_type); + return proc_thread_qos_remove_override(task, thread, tid, resource, + resource_type) == 0; } -static boolean_t -proc_usynch_thread_qos_reset_override_for_resource(task_t task, - struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type) + +static wait_result_t +psynch_wait_prepare(uintptr_t kwq, struct turnstile **tstore, + thread_t owner, block_hint_t block_hint, uint64_t deadline) { - thread_t thread = uth ? uth->uu_thread : THREAD_NULL; + struct turnstile *ts; + wait_result_t wr; - return proc_thread_qos_reset_override(task, thread, tid, resource, resource_type); -} + if (tstore) { + ts = turnstile_prepare(kwq, tstore, TURNSTILE_NULL, + TURNSTILE_PTHREAD_MUTEX); -static boolean_t -proc_usynch_thread_qos_squash_override_for_resource(thread_t thread, - user_addr_t resource, int resource_type) -{ - return proc_thread_qos_squash_override(thread, resource, resource_type); -} + turnstile_update_inheritor(ts, owner, + (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); -/* kernel (core) to kext shims */ + thread_set_pending_block_hint(current_thread(), block_hint); -void -pthread_init(void) -{ - if (!pthread_functions) { - panic("pthread kernel extension not loaded (function table is NULL)."); + wr = waitq_assert_wait64_leeway(&ts->ts_waitq, (event64_t)kwq, + THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL, deadline, 0); + } else { + thread_set_pending_block_hint(current_thread(), block_hint); + + wr = assert_wait_deadline_with_leeway((event_t)kwq, THREAD_ABORTSAFE, + 
TIMEOUT_URGENCY_USER_NORMAL, deadline, 0); } - pthread_functions->pthread_init(); + + return wr; } -int -fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo) +static void +psynch_wait_update_complete(struct turnstile *ts) { - return pthread_functions->fill_procworkqueue(p, pwqinfo); + assert(ts); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); } -/* - * Returns true if the workqueue flags are available, and will fill - * in exceeded_total and exceeded_constrained. - */ -boolean_t -workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total, - boolean_t *exceeded_constrained) +static void +psynch_wait_complete(uintptr_t kwq, struct turnstile **tstore) { - proc_t p = v; - struct proc_workqueueinfo pwqinfo; - int err; - - assert(p != NULL); - assert(exceeded_total != NULL); - assert(exceeded_constrained != NULL); + assert(tstore); + turnstile_complete(kwq, tstore, NULL); +} - err = fill_procworkqueue(p, &pwqinfo); - if (err) { - return FALSE; - } - if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) { - return FALSE; - } +static void +psynch_wait_update_owner(uintptr_t kwq, thread_t owner, + struct turnstile **tstore) +{ + struct turnstile *ts; - *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT); - *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT); + ts = turnstile_prepare(kwq, tstore, TURNSTILE_NULL, + TURNSTILE_PTHREAD_MUTEX); - return TRUE; + turnstile_update_inheritor(ts, owner, + (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + turnstile_complete(kwq, tstore, NULL); } -uint32_t -workqueue_get_pwq_state_kdp(void * v) +static void +psynch_wait_cleanup(void) { - static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) == kTaskWqExceededConstrainedThreadLimit); - static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) == kTaskWqExceededTotalThreadLimit); - static_assert((WQ_FLAGS_AVAILABLE << 17) == 
kTaskWqFlagsAvailable); - static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT | WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7); - proc_t p = v; - if (pthread_functions == NULL || pthread_functions->get_pwq_state_kdp == NULL) - return 0; - else - return pthread_functions->get_pwq_state_kdp(p); + turnstile_cleanup(); } -void -workqueue_exit(struct proc *p) +static kern_return_t +psynch_wait_wakeup(uintptr_t kwq, struct ksyn_waitq_element *kwe, + struct turnstile **tstore) { - pthread_functions->workqueue_exit(p); -} + struct uthread *uth; + struct turnstile *ts; + kern_return_t kr; -void -workqueue_mark_exiting(struct proc *p) -{ - pthread_functions->workqueue_mark_exiting(p); -} + uth = __container_of(kwe, struct uthread, uu_save.uus_kwe); + assert(uth); -void -workqueue_thread_yielded(void) -{ - pthread_functions->workqueue_thread_yielded(); + if (tstore) { + ts = turnstile_prepare(kwq, tstore, TURNSTILE_NULL, + TURNSTILE_PTHREAD_MUTEX); + turnstile_update_inheritor(ts, uth->uu_thread, + (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + kr = waitq_wakeup64_thread(&ts->ts_waitq, (event64_t)kwq, + uth->uu_thread, THREAD_AWAKENED); + + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + turnstile_complete(kwq, tstore, NULL); + } else { + kr = thread_wakeup_thread((event_t)kwq, uth->uu_thread); + } + + return kr; } -sched_call_t -workqueue_get_sched_callback(void) +/* kernel (core) to kext shims */ + +void +pthread_init(void) { - if (pthread_functions->workqueue_get_sched_callback) { - return pthread_functions->workqueue_get_sched_callback(); + if (!pthread_functions) { + panic("pthread kernel extension not loaded (function table is NULL)."); } - return NULL; + pthread_functions->pthread_init(); } void @@ -383,50 +337,41 @@ bsdthread_create(struct proc *p, struct bsdthread_create_args *uap, user_addr_t int bsdthread_register(struct proc *p, struct bsdthread_register_args *uap, __unused int32_t *retval) { + kern_return_t kr; 
+ static_assert(offsetof(struct bsdthread_register_args, threadstart) + sizeof(user_addr_t) == + offsetof(struct bsdthread_register_args, wqthread)); + kr = machine_thread_function_pointers_convert_from_user(current_thread(), &uap->threadstart, 2); + assert(kr == KERN_SUCCESS); + if (pthread_functions->version >= 1) { - return pthread_functions->bsdthread_register2(p, uap->threadstart, uap->wqthread, - uap->flags, uap->stack_addr_hint, - uap->targetconc_ptr, uap->dispatchqueue_offset, - uap->tsd_offset, retval); + return pthread_functions->bsdthread_register2(p, uap->threadstart, + uap->wqthread, uap->flags, uap->stack_addr_hint, + uap->targetconc_ptr, uap->dispatchqueue_offset, + uap->tsd_offset, retval); } else { - return pthread_functions->bsdthread_register(p, uap->threadstart, uap->wqthread, - uap->flags, uap->stack_addr_hint, - uap->targetconc_ptr, uap->dispatchqueue_offset, - retval); + return pthread_functions->bsdthread_register(p, uap->threadstart, + uap->wqthread, uap->flags, uap->stack_addr_hint, + uap->targetconc_ptr, uap->dispatchqueue_offset, + retval); } } int bsdthread_terminate(struct proc *p, struct bsdthread_terminate_args *uap, int32_t *retval) { + thread_t th = current_thread(); + if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) { + workq_thread_terminate(p, get_bsdthread_info(th)); + } return pthread_functions->bsdthread_terminate(p, uap->stackaddr, uap->freesize, uap->port, uap->sem, retval); } -int -bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval) -{ - return pthread_functions->bsdthread_ctl(p, uap->cmd, uap->arg1, uap->arg2, uap->arg3, retval); -} - - int thread_selfid(struct proc *p, __unused struct thread_selfid_args *uap, uint64_t *retval) { return pthread_functions->thread_selfid(p, retval); } -int -workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval) -{ - return pthread_functions->workq_kernreturn(p, uap->options, uap->item, uap->affinity, uap->prio, retval); -} - -int 
-workq_open(struct proc *p, __unused struct workq_open_args *uap, int32_t *retval) -{ - return pthread_functions->workq_open(p, retval); -} - /* pthread synchroniser syscalls */ int @@ -513,29 +458,6 @@ psynch_rw_downgrade(__unused proc_t p, __unused struct psynch_rw_downgrade_args return 0; } -int -thread_qos_from_pthread_priority(unsigned long priority, unsigned long *flags) -{ - return pthread_functions->thread_qos_from_pthread_priority(priority, flags); -} - -unsigned long -pthread_priority_canonicalize(unsigned long priority, boolean_t propagation) -{ - return pthread_functions->pthread_priority_canonicalize2(priority, propagation); -} - -boolean_t -workq_thread_has_been_unbound(thread_t th, int qos_class) -{ - if (pthread_functions->workq_thread_has_been_unbound) { - return pthread_functions->workq_thread_has_been_unbound(th, qos_class); - } else { - panic("pthread kext does not support workq_thread_has_been_unbound"); - return false; - } -} - void kdp_pthread_find_owner(thread_t thread, struct stackshot_thread_waitinfo *waitinfo) { @@ -552,7 +474,7 @@ kdp_pthread_get_thread_kwq(thread_t thread) return NULL; } -static void +void thread_will_park_or_terminate(thread_t thread) { if (thread_owned_workloops_count(thread)) { @@ -560,20 +482,6 @@ thread_will_park_or_terminate(thread_t thread) } } -#if defined(__arm64__) -static unsigned __int128 -atomic_fetch_add_128_relaxed(_Atomic unsigned __int128 *ptr, unsigned __int128 value) -{ - return atomic_fetch_add_explicit(ptr, value, memory_order_relaxed); -} - -static unsigned __int128 -atomic_load_128_relaxed(_Atomic unsigned __int128 *ptr) -{ - return atomic_load_explicit(ptr, memory_order_relaxed); -} -#endif - /* * The callbacks structure (defined in pthread_shims.h) contains a collection * of kernel functions that were not deemed sensible to expose as a KPI to all @@ -591,15 +499,9 @@ static const struct pthread_callbacks_s pthread_callbacks = { .proc_set_pthsize = proc_set_pthsize, .proc_get_wqthread = 
proc_get_wqthread, .proc_set_wqthread = proc_set_wqthread, - .proc_get_dispatchqueue_offset = proc_get_dispatchqueue_offset, .proc_set_dispatchqueue_offset = proc_set_dispatchqueue_offset, - .proc_get_wqptr = proc_get_wqptr, - .proc_set_wqptr = proc_set_wqptr, .proc_get_pthhash = proc_get_pthhash, .proc_set_pthhash = proc_set_pthhash, - .proc_get_task = proc_get_task, - .proc_lock = proc_lock, - .proc_unlock = proc_unlock, .proc_get_register = proc_get_register, .proc_set_register = proc_set_register, @@ -607,18 +509,12 @@ static const struct pthread_callbacks_s pthread_callbacks = { .ipc_port_copyout_send = ipc_port_copyout_send, .task_get_ipcspace = get_task_ipcspace, .vm_map_page_info = vm_map_page_info, - .vm_map_switch = vm_map_switch, .thread_set_wq_state32 = thread_set_wq_state32, #if !defined(__arm__) .thread_set_wq_state64 = thread_set_wq_state64, #endif - .uthread_get_threadlist = uthread_get_threadlist, - .uthread_set_threadlist = uthread_set_threadlist, - .uthread_get_sigmask = uthread_get_sigmask, - .uthread_set_sigmask = uthread_set_sigmask, .uthread_get_uukwe = uthread_get_uukwe, - .uthread_get_returnval = uthread_get_returnval, .uthread_set_returnval = uthread_set_returnval, .uthread_is_cancelled = uthread_is_cancelled, @@ -626,26 +522,9 @@ static const struct pthread_callbacks_s pthread_callbacks = { .thread_bootstrap_return = pthread_bootstrap_return, .unix_syscall_return = unix_syscall_return, - .absolutetime_to_microtime = absolutetime_to_microtime, - - .thread_set_workq_pri = thread_set_workq_pri, - .thread_set_workq_qos = thread_set_workq_qos, - .get_bsdthread_info = (void*)get_bsdthread_info, - .thread_sched_call = thread_sched_call, - .thread_static_param = thread_static_param, - .thread_create_workq = thread_create_workq, .thread_policy_set_internal = thread_policy_set_internal, .thread_policy_get = thread_policy_get, - .thread_set_voucher_name = thread_set_voucher_name, - - .thread_affinity_set = thread_affinity_set, - - .zalloc = zalloc, 
- .zfree = zfree, - .zinit = zinit, - - .workloop_fulfill_threadreq = workloop_fulfill_threadreq, .__pthread_testcancel = __pthread_testcancel, @@ -656,18 +535,6 @@ static const struct pthread_callbacks_s pthread_callbacks = { .thread_resume = thread_resume, .convert_thread_to_port = convert_thread_to_port, - .ml_get_max_cpus = (void*)ml_get_max_cpus, - -#if defined(__arm__) - .map_is_1gb = map_is_1gb, -#endif -#if defined(__arm64__) - .atomic_fetch_add_128_relaxed = atomic_fetch_add_128_relaxed, - .atomic_load_128_relaxed = atomic_load_128_relaxed, -#endif - - .proc_get_dispatchqueue_serialno_offset = proc_get_dispatchqueue_serialno_offset, - .proc_set_dispatchqueue_serialno_offset = proc_set_dispatchqueue_serialno_offset, .proc_get_stack_addr_hint = proc_get_stack_addr_hint, .proc_set_stack_addr_hint = proc_set_stack_addr_hint, @@ -681,29 +548,27 @@ static const struct pthread_callbacks_s pthread_callbacks = { .proc_usynch_get_requested_thread_qos = proc_usynch_get_requested_thread_qos, .qos_main_thread_active = qos_main_thread_active, + .thread_set_voucher_name = thread_set_voucher_name, - .proc_usynch_thread_qos_add_override_for_resource_check_owner = proc_usynch_thread_qos_add_override_for_resource_check_owner, .proc_usynch_thread_qos_add_override_for_resource = proc_usynch_thread_qos_add_override_for_resource, .proc_usynch_thread_qos_remove_override_for_resource = proc_usynch_thread_qos_remove_override_for_resource, - .proc_usynch_thread_qos_reset_override_for_resource = proc_usynch_thread_qos_reset_override_for_resource, - - .proc_init_wqptr_or_wait = proc_init_wqptr_or_wait, .thread_set_tag = thread_set_tag, .thread_get_tag = thread_get_tag, - .proc_usynch_thread_qos_squash_override_for_resource = proc_usynch_thread_qos_squash_override_for_resource, - .task_get_default_manager_qos = task_get_default_manager_qos, - .thread_create_workq_waiting = thread_create_workq_waiting, - - .proc_get_return_to_kernel_offset = proc_get_return_to_kernel_offset, 
.proc_set_return_to_kernel_offset = proc_set_return_to_kernel_offset, .thread_will_park_or_terminate = thread_will_park_or_terminate, - .qos_max_parallelism = qos_max_parallelism, - .proc_get_user_stack = proc_get_user_stack, - .proc_set_user_stack = proc_set_user_stack, + .task_findtid = task_findtid, + .thread_deallocate_safe = thread_deallocate_safe, + + .psynch_wait_prepare = psynch_wait_prepare, + .psynch_wait_update_complete = psynch_wait_update_complete, + .psynch_wait_complete = psynch_wait_complete, + .psynch_wait_cleanup = psynch_wait_cleanup, + .psynch_wait_wakeup = psynch_wait_wakeup, + .psynch_wait_update_owner = psynch_wait_update_owner, }; pthread_callbacks_t pthread_kern = &pthread_callbacks; diff --git a/bsd/pthread/pthread_workqueue.c b/bsd/pthread/pthread_workqueue.c new file mode 100644 index 000000000..0e8aee8cb --- /dev/null +++ b/bsd/pthread/pthread_workqueue.c @@ -0,0 +1,3467 @@ +/* + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995-2018 Apple, Inc. All Rights Reserved */ + +#include + +// panic() should be marked noreturn +extern void panic(const char *string, ...) __printflike(1,2) __dead2; + +#include +#include +#include +#include +#include +#include +#include +#include /* for thread_exception_return */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for thread_resume */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include /* for fill_procworkqueue */ +#include +#include +#include +#include +#include +#include +#include +#include /* for ulock_owner_value_to_port_name */ + +#include +#include +#include +#include + +#include + +extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h */ + +static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2; +static void workq_schedule_creator(proc_t p, struct workqueue *wq, int flags); + +static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth, + workq_threadreq_t req); + +static uint32_t workq_constrained_allowance(struct workqueue *wq, + thread_qos_t at_qos, struct uthread *uth, bool may_start_timer); + +static bool workq_thread_is_busy(uint64_t cur_ts, + _Atomic uint64_t *lastblocked_tsp); + +static int workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS; + +#pragma mark globals + +struct 
workq_usec_var { + uint32_t usecs; + uint64_t abstime; +}; + +#define WORKQ_SYSCTL_USECS(var, init) \ + static struct workq_usec_var var = { .usecs = init }; \ + SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \ + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &var, 0, \ + workq_sysctl_handle_usecs, "I", "") + +static lck_grp_t *workq_lck_grp; +static lck_attr_t *workq_lck_attr; +static lck_grp_attr_t *workq_lck_grp_attr; +os_refgrp_decl(static, workq_refgrp, "workq", NULL); + +static zone_t workq_zone_workqueue; +static zone_t workq_zone_threadreq; + +WORKQ_SYSCTL_USECS(wq_stalled_window, WQ_STALLED_WINDOW_USECS); +WORKQ_SYSCTL_USECS(wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS); +WORKQ_SYSCTL_USECS(wq_max_timer_interval, WQ_MAX_TIMER_INTERVAL_USECS); +static uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS; +static uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8; +static uint32_t wq_init_constrained_limit = 1; +static uint16_t wq_death_max_load; +static uint32_t wq_max_parallelism[WORKQ_NUM_QOS_BUCKETS]; + +#pragma mark sysctls + +static int +workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg2) + struct workq_usec_var *v = arg1; + int error = sysctl_handle_int(oidp, &v->usecs, 0, req); + if (error || !req->newptr) + return error; + clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC, + &v->abstime); + return 0; +} + +SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED, + &wq_max_threads, 0, ""); + +SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED, + &wq_max_constrained_threads, 0, ""); + +#pragma mark p_wqptr + +#define WQPTR_IS_INITING_VALUE ((struct workqueue *)~(uintptr_t)0) + +static struct workqueue * +proc_get_wqptr_fast(struct proc *p) +{ + return os_atomic_load(&p->p_wqptr, relaxed); +} + +static struct workqueue * +proc_get_wqptr(struct proc *p) +{ + struct workqueue *wq = proc_get_wqptr_fast(p); + return wq == WQPTR_IS_INITING_VALUE ? 
NULL : wq; +} + +static void +proc_set_wqptr(struct proc *p, struct workqueue *wq) +{ + wq = os_atomic_xchg(&p->p_wqptr, wq, release); + if (wq == WQPTR_IS_INITING_VALUE) { + proc_lock(p); + thread_wakeup(&p->p_wqptr); + proc_unlock(p); + } +} + +static bool +proc_init_wqptr_or_wait(struct proc *p) +{ + struct workqueue *wq; + + proc_lock(p); + wq = p->p_wqptr; + + if (wq == NULL) { + p->p_wqptr = WQPTR_IS_INITING_VALUE; + proc_unlock(p); + return true; + } + + if (wq == WQPTR_IS_INITING_VALUE) { + assert_wait(&p->p_wqptr, THREAD_UNINT); + proc_unlock(p); + thread_block(THREAD_CONTINUE_NULL); + } else { + proc_unlock(p); + } + return false; +} + +static inline event_t +workq_parked_wait_event(struct uthread *uth) +{ + return (event_t)&uth->uu_workq_stackaddr; +} + +static inline void +workq_thread_wakeup(struct uthread *uth) +{ + if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) { + thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread); + } +} + +#pragma mark wq_thactive + +#if defined(__LP64__) +// Layout is: +// 127 - 115 : 13 bits of zeroes +// 114 - 112 : best QoS among all pending constrained requests +// 111 - 0 : MGR, AUI, UI, IN, DF, UT, BG+MT buckets every 16 bits +#define WQ_THACTIVE_BUCKET_WIDTH 16 +#define WQ_THACTIVE_QOS_SHIFT (7 * WQ_THACTIVE_BUCKET_WIDTH) +#else +// Layout is: +// 63 - 61 : best QoS among all pending constrained requests +// 60 : Manager bucket (0 or 1) +// 59 - 0 : AUI, UI, IN, DF, UT, BG+MT buckets every 10 bits +#define WQ_THACTIVE_BUCKET_WIDTH 10 +#define WQ_THACTIVE_QOS_SHIFT (6 * WQ_THACTIVE_BUCKET_WIDTH + 1) +#endif +#define WQ_THACTIVE_BUCKET_MASK ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1) +#define WQ_THACTIVE_BUCKET_HALF (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1)) + +static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3, + "Make sure we have space to encode a QoS"); + +static inline wq_thactive_t +_wq_thactive(struct workqueue *wq) +{ + return os_atomic_load(&wq->wq_thactive, relaxed); +} + 
+static inline int +_wq_bucket(thread_qos_t qos) +{ + // Map both BG and MT to the same bucket by over-shifting down and + // clamping MT and BG together. + switch (qos) { + case THREAD_QOS_MAINTENANCE: + return 0; + default: + return qos - 2; + } +} + +#define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \ + ((tha) >> WQ_THACTIVE_QOS_SHIFT) + +static inline thread_qos_t +_wq_thactive_best_constrained_req_qos(struct workqueue *wq) +{ + // Avoid expensive atomic operations: the three bits we're loading are in + // a single byte, and always updated under the workqueue lock + wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive; + return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v); +} + +static void +_wq_thactive_refresh_best_constrained_req_qos(struct workqueue *wq) +{ + thread_qos_t old_qos, new_qos; + workq_threadreq_t req; + + req = priority_queue_max(&wq->wq_constrained_queue, + struct workq_threadreq_s, tr_entry); + new_qos = req ? req->tr_qos : THREAD_QOS_UNSPECIFIED; + old_qos = _wq_thactive_best_constrained_req_qos(wq); + if (old_qos != new_qos) { + long delta = (long)new_qos - (long)old_qos; + wq_thactive_t v = (wq_thactive_t)delta << WQ_THACTIVE_QOS_SHIFT; + /* + * We can do an atomic add relative to the initial load because updates + * to this qos are always serialized under the workqueue lock. 
+ */ + v = os_atomic_add(&wq->wq_thactive, v, relaxed); +#ifdef __LP64__ + WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, (uint64_t)v, + (uint64_t)(v >> 64), 0, 0); +#else + WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, v, 0, 0, 0); +#endif + } +} + +static inline wq_thactive_t +_wq_thactive_offset_for_qos(thread_qos_t qos) +{ + return (wq_thactive_t)1 << (_wq_bucket(qos) * WQ_THACTIVE_BUCKET_WIDTH); +} + +static inline wq_thactive_t +_wq_thactive_inc(struct workqueue *wq, thread_qos_t qos) +{ + wq_thactive_t v = _wq_thactive_offset_for_qos(qos); + return os_atomic_add_orig(&wq->wq_thactive, v, relaxed); +} + +static inline wq_thactive_t +_wq_thactive_dec(struct workqueue *wq, thread_qos_t qos) +{ + wq_thactive_t v = _wq_thactive_offset_for_qos(qos); + return os_atomic_sub_orig(&wq->wq_thactive, v, relaxed); +} + +static inline void +_wq_thactive_move(struct workqueue *wq, + thread_qos_t old_qos, thread_qos_t new_qos) +{ + wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) - + _wq_thactive_offset_for_qos(old_qos); + os_atomic_add_orig(&wq->wq_thactive, v, relaxed); + wq->wq_thscheduled_count[_wq_bucket(old_qos)]--; + wq->wq_thscheduled_count[_wq_bucket(new_qos)]++; +} + +static inline uint32_t +_wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v, + thread_qos_t qos, uint32_t *busycount, uint32_t *max_busycount) +{ + uint32_t count = 0, active; + uint64_t curtime; + + assert(WORKQ_THREAD_QOS_MIN <= qos && qos <= WORKQ_THREAD_QOS_MAX); + + if (busycount) { + curtime = mach_absolute_time(); + *busycount = 0; + } + if (max_busycount) { + *max_busycount = THREAD_QOS_LAST - qos; + } + + int i = _wq_bucket(qos); + v >>= i * WQ_THACTIVE_BUCKET_WIDTH; + for (; i < WORKQ_NUM_QOS_BUCKETS; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) { + active = v & WQ_THACTIVE_BUCKET_MASK; + count += active; + + if (busycount && wq->wq_thscheduled_count[i] > active) { + if (workq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) { + /* + * We only consider the last blocked thread 
for a given bucket + * as busy because we don't want to take the list lock in each + * sched callback. However this is an approximation that could + * contribute to thread creation storms. + */ + (*busycount)++; + } + } + } + + return count; +} + +#pragma mark wq_flags + +static inline uint32_t +_wq_flags(struct workqueue *wq) +{ + return os_atomic_load(&wq->wq_flags, relaxed); +} + +static inline bool +_wq_exiting(struct workqueue *wq) +{ + return _wq_flags(wq) & WQ_EXITING; +} + +bool +workq_is_exiting(struct proc *p) +{ + struct workqueue *wq = proc_get_wqptr(p); + return !wq || _wq_exiting(wq); +} + +struct turnstile * +workq_turnstile(struct proc *p) +{ + struct workqueue *wq = proc_get_wqptr(p); + return wq ? wq->wq_turnstile : TURNSTILE_NULL; +} + +#pragma mark workqueue lock + +static bool +workq_lock_spin_is_acquired_kdp(struct workqueue *wq) +{ + return kdp_lck_spin_is_acquired(&wq->wq_lock); +} + +static inline void +workq_lock_spin(struct workqueue *wq) +{ + lck_spin_lock(&wq->wq_lock); +} + +static inline void +workq_lock_held(__assert_only struct workqueue *wq) +{ + LCK_SPIN_ASSERT(&wq->wq_lock, LCK_ASSERT_OWNED); +} + +static inline bool +workq_lock_try(struct workqueue *wq) +{ + return lck_spin_try_lock(&wq->wq_lock); +} + +static inline void +workq_unlock(struct workqueue *wq) +{ + lck_spin_unlock(&wq->wq_lock); +} + +#pragma mark idle thread lists + +#define WORKQ_POLICY_INIT(qos) \ + (struct uu_workq_policy){ .qos_req = qos, .qos_bucket = qos } + +static inline thread_qos_t +workq_pri_bucket(struct uu_workq_policy req) +{ + return MAX(MAX(req.qos_req, req.qos_max), req.qos_override); +} + +static inline thread_qos_t +workq_pri_override(struct uu_workq_policy req) +{ + return MAX(workq_pri_bucket(req), req.qos_bucket); +} + +static inline bool +workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth) +{ + workq_threadreq_param_t cur_trp, req_trp = { }; + + cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params; + 
if (req->tr_flags & TR_FLAG_WL_PARAMS) { + req_trp = kqueue_threadreq_workloop_param(req); + } + + /* + * CPU percent flags are handled separately to policy changes, so ignore + * them for all of these checks. + */ + uint16_t cur_flags = (cur_trp.trp_flags & ~TRP_CPUPERCENT); + uint16_t req_flags = (req_trp.trp_flags & ~TRP_CPUPERCENT); + + if (!req_flags && !cur_flags) { + return false; + } + + if (req_flags != cur_flags) { + return true; + } + + if ((req_flags & TRP_PRIORITY) && req_trp.trp_pri != cur_trp.trp_pri) { + return true; + } + + if ((req_flags & TRP_POLICY) && req_trp.trp_pol != cur_trp.trp_pol) { + return true; + } + + return false; +} + +static inline bool +workq_thread_needs_priority_change(workq_threadreq_t req, struct uthread *uth) +{ + if (workq_thread_needs_params_change(req, uth)) { + return true; + } + + return req->tr_qos != workq_pri_override(uth->uu_workq_pri); +} + +static void +workq_thread_update_bucket(proc_t p, struct workqueue *wq, struct uthread *uth, + struct uu_workq_policy old_pri, struct uu_workq_policy new_pri, + bool force_run) +{ + thread_qos_t old_bucket = old_pri.qos_bucket; + thread_qos_t new_bucket = workq_pri_bucket(new_pri); + + if (old_bucket != new_bucket) { + _wq_thactive_move(wq, old_bucket, new_bucket); + } + + new_pri.qos_bucket = new_bucket; + uth->uu_workq_pri = new_pri; + + if (workq_pri_override(old_pri) != new_bucket) { + thread_set_workq_override(uth->uu_thread, new_bucket); + } + + if (wq->wq_reqcount && (old_bucket > new_bucket || force_run)) { + int flags = WORKQ_THREADREQ_CAN_CREATE_THREADS; + if (old_bucket > new_bucket) { + /* + * When lowering our bucket, we may unblock a thread request, + * but we can't drop our priority before we have evaluated + * whether this is the case, and if we ever drop the workqueue lock + * that would cause a priority inversion. + * + * We hence have to disallow thread creation in that case.
+ */ + flags = 0; + } + workq_schedule_creator(p, wq, flags); + } +} + +/* + * Sets/resets the cpu percent limits on the current thread. We can't set + * these limits from outside of the current thread, so this function needs + * to be called when we're executing on the intended thread (it asserts + * that `uth` is the current uthread). + * + * `req` may be NULL, in which case no workloop parameters apply and any + * limit previously installed by this function is only removed. + * + * NOTE(review): the refill period is computed as + * trp_refillms * NSEC_PER_SEC; if trp_refillms is in milliseconds, as its + * name suggests, NSEC_PER_MSEC would be expected -- confirm against the + * thread_set_cpulimit() contract. + */ +static void +workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth) +{ + assert(uth == current_uthread()); + workq_threadreq_param_t trp = { }; + + if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) { + trp = kqueue_threadreq_workloop_param(req); + } + + if (uth->uu_workq_flags & UT_WORKQ_CPUPERCENT) { + /* + * Going through disable when we have an existing CPU percent limit + * set will force the ledger to refill the token bucket of the current + * thread, removing any penalty applied by previous thread use. + */ + thread_set_cpulimit(THREAD_CPULIMIT_DISABLE, 0, 0); + uth->uu_workq_flags &= ~UT_WORKQ_CPUPERCENT; + } + + if (trp.trp_flags & TRP_CPUPERCENT) { + thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, trp.trp_cpupercent, + (uint64_t)trp.trp_refillms * NSEC_PER_SEC); + uth->uu_workq_flags |= UT_WORKQ_CPUPERCENT; + } +} + +static void +workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth, + workq_threadreq_t req) +{ + thread_t th = uth->uu_thread; + thread_qos_t qos = req ? 
req->tr_qos : WORKQ_THREAD_QOS_CLEANUP; + workq_threadreq_param_t trp = { }; + int priority = 31; + int policy = POLICY_TIMESHARE; + + if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) { + trp = kqueue_threadreq_workloop_param(req); + } + + uth->uu_workq_pri = WORKQ_POLICY_INIT(qos); + uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS; + uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value; + + // qos sent out to userspace (may differ from uu_workq_pri on param threads) + uth->uu_save.uus_workq_park_data.qos = qos; + + if (qos == WORKQ_THREAD_QOS_MANAGER) { + uint32_t mgr_pri = wq->wq_event_manager_priority; + assert(trp.trp_value == 0); // manager qos and thread policy don't mix + + if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) { + mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK; + thread_set_workq_pri(th, THREAD_QOS_UNSPECIFIED, mgr_pri, + POLICY_TIMESHARE); + return; + } + + qos = _pthread_priority_thread_qos(mgr_pri); + } else { + if (trp.trp_flags & TRP_PRIORITY) { + qos = THREAD_QOS_UNSPECIFIED; + priority = trp.trp_pri; + uth->uu_workq_flags |= UT_WORKQ_OUTSIDE_QOS; + } + + if (trp.trp_flags & TRP_POLICY) { + policy = trp.trp_pol; + } + } + + thread_set_workq_pri(th, qos, priority, policy); +} + +/* + * Called by kevent with the NOTE_WL_THREAD_REQUEST knote lock held, + * every time a servicer is being told about a new max QoS. 
+ */ +void +workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr) +{ + struct uu_workq_policy old_pri, new_pri; + struct uthread *uth = get_bsdthread_info(kqr->kqr_thread); + struct workqueue *wq = proc_get_wqptr_fast(p); + thread_qos_t qos = kqr->kqr_qos_index; + + if (uth->uu_workq_pri.qos_max == qos) + return; /* max QoS unchanged: nothing to update */ + + workq_lock_spin(wq); + old_pri = new_pri = uth->uu_workq_pri; + new_pri.qos_max = qos; + workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false); + workq_unlock(wq); +} + +#pragma mark idle threads accounting and handling + +static inline struct uthread * +workq_oldest_killable_idle_thread(struct workqueue *wq) +{ + struct uthread *uth = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head); + + if (uth && !uth->uu_save.uus_workq_park_data.has_stack) { /* tail entry has has_stack clear: use its predecessor instead */ + uth = TAILQ_PREV(uth, workq_uthread_head, uu_workq_entry); + if (uth) { + assert(uth->uu_save.uus_workq_park_data.has_stack); + } + } + return uth; +} + +static inline uint64_t +workq_kill_delay_for_idle_thread(struct workqueue *wq) +{ + uint64_t delay = wq_reduce_pool_window.abstime; + uint16_t idle = wq->wq_thidlecount; + + /* + * If we have at most wq_death_max_load idle threads, have a 5s timer. + * + * For the next wq_max_constrained_threads ones, decay linearly + * from 5s to 50ms. 
+ */ + if (idle <= wq_death_max_load) { + return delay; + } + + if (wq_max_constrained_threads > idle - wq_death_max_load) { + delay *= (wq_max_constrained_threads - (idle - wq_death_max_load)); + } + return delay / wq_max_constrained_threads; +} + +static inline bool +workq_should_kill_idle_thread(struct workqueue *wq, struct uthread *uth, + uint64_t now) +{ + uint64_t delay = workq_kill_delay_for_idle_thread(wq); + return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay; +} + +static void +workq_death_call_schedule(struct workqueue *wq, uint64_t deadline) +{ + uint32_t wq_flags = os_atomic_load(&wq->wq_flags, relaxed); + + if (wq_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) { + return; + } + os_atomic_or(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed); + + WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_NONE, wq, 1, 0, 0, 0); + + /* + * Due to how long term timers work, the leeway + * can't be too short, so use 500ms which is long enough that we will not + * wake up the CPU for killing threads, but short enough that it doesn't + * fall into long-term timer list shenanigans. + */ + thread_call_enter_delayed_with_leeway(wq->wq_death_call, NULL, deadline, + wq_reduce_pool_window.abstime / 10, + THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND); +} + +/* + * `decrement` is set to the number of threads that are no longer dying: + * - because they have been resuscitated just in time (workq_pop_idle_thread) + * - or have been killed (workq_thread_terminate). 
+ */ +static void +workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement) +{ + struct uthread *uth; + + assert(wq->wq_thdying_count >= decrement); + if ((wq->wq_thdying_count -= decrement) > 0) + return; + + if (wq->wq_thidlecount <= 1) + return; + + if ((uth = workq_oldest_killable_idle_thread(wq)) == NULL) + return; + + uint64_t now = mach_absolute_time(); + uint64_t delay = workq_kill_delay_for_idle_thread(wq); + + if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) { + WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START, + wq, wq->wq_thidlecount, 0, 0, 0); + wq->wq_thdying_count++; + uth->uu_workq_flags |= UT_WORKQ_DYING; + workq_thread_wakeup(uth); + return; + } + + workq_death_call_schedule(wq, + uth->uu_save.uus_workq_park_data.idle_stamp + delay); +} + +void +workq_thread_terminate(struct proc *p, struct uthread *uth) +{ + struct workqueue *wq = proc_get_wqptr_fast(p); + + workq_lock_spin(wq); + TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry); + if (uth->uu_workq_flags & UT_WORKQ_DYING) { + WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_END, + wq, wq->wq_thidlecount, 0, 0, 0); + workq_death_policy_evaluate(wq, 1); + } + if (wq->wq_nthreads-- == wq_max_threads) { + /* + * We got under the thread limit again, which may have prevented + * thread creation from happening, redrive if there are pending requests + */ + if (wq->wq_reqcount) { + workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS); + } + } + workq_unlock(wq); + + thread_deallocate(uth->uu_thread); +} + +static void +workq_kill_old_threads_call(void *param0, void *param1 __unused) +{ + struct workqueue *wq = param0; + + workq_lock_spin(wq); + WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0, 0); + os_atomic_and(&wq->wq_flags, ~WQ_DEATH_CALL_SCHEDULED, relaxed); + workq_death_policy_evaluate(wq, 0); + WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0, 0); + workq_unlock(wq); +} + +static struct uthread * 
+workq_pop_idle_thread(struct workqueue *wq) +{ + struct uthread *uth; + + if ((uth = TAILQ_FIRST(&wq->wq_thidlelist))) { + TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry); + } else { + uth = TAILQ_FIRST(&wq->wq_thnewlist); + TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry); + } + TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry); + + assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0); + uth->uu_workq_flags |= UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT; + wq->wq_threads_scheduled++; + wq->wq_thidlecount--; + + if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) { + uth->uu_workq_flags ^= UT_WORKQ_DYING; + workq_death_policy_evaluate(wq, 1); + } + return uth; +} + +/* + * Called by thread_create_workq_waiting() during thread initialization, before + * assert_wait, before the thread has been started. + */ +event_t +workq_thread_init_and_wq_lock(task_t task, thread_t th) +{ + struct uthread *uth = get_bsdthread_info(th); + + uth->uu_workq_flags = UT_WORKQ_NEW; + uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY); + uth->uu_workq_thport = MACH_PORT_NULL; + uth->uu_workq_stackaddr = 0; + + thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE); + thread_reset_workq_qos(th, THREAD_QOS_LEGACY); + + workq_lock_spin(proc_get_wqptr_fast(get_bsdtask_info(task))); + return workq_parked_wait_event(uth); +} + +/** + * Try to add a new workqueue thread. 
+ * + * - called with workq lock held + * - dropped and retaken around thread creation + * - return with workq lock held + */ +static bool +workq_add_new_idle_thread(proc_t p, struct workqueue *wq) +{ + mach_vm_offset_t th_stackaddr; + kern_return_t kret; + thread_t th; + + wq->wq_nthreads++; + + workq_unlock(wq); + + vm_map_t vmap = get_task_map(p->task); + + kret = pthread_functions->workq_create_threadstack(p, vmap, &th_stackaddr); + if (kret != KERN_SUCCESS) { + WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, + kret, 1, 0, 0); + goto out; + } + + kret = thread_create_workq_waiting(p->task, workq_unpark_continue, &th); + if (kret != KERN_SUCCESS) { + WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, + kret, 0, 0, 0); + pthread_functions->workq_destroy_threadstack(p, vmap, th_stackaddr); + goto out; + } + + // thread_create_workq_waiting() will return with the wq lock held + // on success, because it calls workq_thread_init_and_wq_lock() above + + struct uthread *uth = get_bsdthread_info(th); + + wq->wq_creations++; + wq->wq_thidlecount++; + uth->uu_workq_stackaddr = th_stackaddr; + TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry); + + WQ_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0); + return true; + +out: + workq_lock_spin(wq); + /* + * Do not redrive here if we went under wq_max_threads again, + * it is the responsibility of the callers of this function + * to do so when it fails. 
+ */ + wq->wq_nthreads--; + return false; +} + +#define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1 + +__attribute__((noreturn, noinline)) +static void +workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq, + struct uthread *uth, uint32_t death_flags) +{ + thread_qos_t qos = workq_pri_override(uth->uu_workq_pri); + bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW; + + if (qos > WORKQ_THREAD_QOS_CLEANUP) { + workq_thread_reset_pri(wq, uth, NULL); + qos = WORKQ_THREAD_QOS_CLEANUP; + } + + workq_thread_reset_cpupercent(NULL, uth); + + if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) { + wq->wq_thidlecount--; + if (first_use) { + TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry); + } else { + TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry); + } + } + TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry); + + workq_unlock(wq); + + uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS; + uint32_t setup_flags = WQ_SETUP_EXIT_THREAD; + thread_t th = uth->uu_thread; + vm_map_t vmap = get_task_map(p->task); + + if (!first_use) flags |= WQ_FLAG_THREAD_REUSE; + + pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr, + uth->uu_workq_thport, 0, setup_flags, flags); + __builtin_unreachable(); +} + +bool +workq_is_current_thread_updating_turnstile(struct workqueue *wq) +{ + return wq->wq_turnstile_updater == current_thread(); +} + +__attribute__((always_inline)) +static inline void +workq_perform_turnstile_operation_locked(struct workqueue *wq, + void (^operation)(void)) +{ + workq_lock_held(wq); + wq->wq_turnstile_updater = current_thread(); + operation(); + wq->wq_turnstile_updater = THREAD_NULL; +} + +static void +workq_turnstile_update_inheritor(struct workqueue *wq, + turnstile_inheritor_t inheritor, + turnstile_update_flags_t flags) +{ + workq_perform_turnstile_operation_locked(wq, ^{ + turnstile_update_inheritor(wq->wq_turnstile, inheritor, + flags | TURNSTILE_IMMEDIATE_UPDATE); + 
turnstile_update_inheritor_complete(wq->wq_turnstile, + TURNSTILE_INTERLOCK_HELD); + }); +} + +static void +workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth) +{ + uint64_t now = mach_absolute_time(); + + uth->uu_workq_flags &= ~UT_WORKQ_RUNNING; + if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) { + wq->wq_constrained_threads_scheduled--; + } + TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry); + wq->wq_threads_scheduled--; + + if (wq->wq_creator == uth) { + WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0, + uth->uu_save.uus_workq_park_data.yields, 0); + wq->wq_creator = NULL; + if (wq->wq_reqcount) { + workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ); + } else { + workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0); + } + if (uth->uu_workq_flags & UT_WORKQ_NEW) { + TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry); + wq->wq_thidlecount++; + return; + } + } else { + _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket); + wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--; + assert(!(uth->uu_workq_flags & UT_WORKQ_NEW)); + uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP; + } + + uth->uu_save.uus_workq_park_data.idle_stamp = now; + + struct uthread *oldest = workq_oldest_killable_idle_thread(wq); + uint16_t cur_idle = wq->wq_thidlecount; + + if (cur_idle >= wq_max_constrained_threads || + (wq->wq_thdying_count == 0 && oldest && + workq_should_kill_idle_thread(wq, oldest, now))) { + /* + * Immediately kill threads if we have too many of them. + * + * And swap "place" with the oldest one we'd have woken up. + * This is a relatively desperate situation where we really + * need to kill threads quickly and it's better to kill + * the one that's currently on core than to context switch. 
+ */ + if (oldest) { + oldest->uu_save.uus_workq_park_data.idle_stamp = now; + TAILQ_REMOVE(&wq->wq_thidlelist, oldest, uu_workq_entry); + TAILQ_INSERT_HEAD(&wq->wq_thidlelist, oldest, uu_workq_entry); + } + + WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START, + wq, cur_idle, 0, 0, 0); + wq->wq_thdying_count++; + uth->uu_workq_flags |= UT_WORKQ_DYING; + uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP; + workq_unpark_for_death_and_unlock(p, wq, uth, 0); + __builtin_unreachable(); + } + + struct uthread *tail = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head); + + cur_idle += 1; + wq->wq_thidlecount = cur_idle; + + if (cur_idle >= wq_death_max_load && tail && + tail->uu_save.uus_workq_park_data.has_stack) { + uth->uu_save.uus_workq_park_data.has_stack = false; + TAILQ_INSERT_TAIL(&wq->wq_thidlelist, uth, uu_workq_entry); + } else { + uth->uu_save.uus_workq_park_data.has_stack = true; + TAILQ_INSERT_HEAD(&wq->wq_thidlelist, uth, uu_workq_entry); + } + + if (!tail) { + uint64_t delay = workq_kill_delay_for_idle_thread(wq); + workq_death_call_schedule(wq, now + delay); + } +} + +#pragma mark thread requests + +static inline int +workq_priority_for_req(workq_threadreq_t req) +{ + thread_qos_t qos = req->tr_qos; + + if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) { + workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req); + assert(trp.trp_flags & TRP_PRIORITY); + return trp.trp_pri; + } + return thread_workq_pri_for_qos(qos); +} + +static inline struct priority_queue * +workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req) +{ + if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) { + return &wq->wq_special_queue; + } else if (req->tr_flags & TR_FLAG_OVERCOMMIT) { + return &wq->wq_overcommit_queue; + } else { + return &wq->wq_constrained_queue; + } +} + +/* + * returns true if the the enqueued request is the highest priority item + * in its priority queue. 
+ */ +static bool +workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req) +{ + assert(req->tr_state == TR_STATE_NEW); + + req->tr_state = TR_STATE_QUEUED; + wq->wq_reqcount += req->tr_count; + + if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) { + assert(wq->wq_event_manager_threadreq == NULL); + assert(req->tr_flags & TR_FLAG_KEVENT); + assert(req->tr_count == 1); + wq->wq_event_manager_threadreq = req; + return true; + } + if (priority_queue_insert(workq_priority_queue_for_req(wq, req), + &req->tr_entry, workq_priority_for_req(req), + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + _wq_thactive_refresh_best_constrained_req_qos(wq); + } + return true; + } + return false; +} + +/* + * Consumes one unit of the request: wq_reqcount and the request's tr_count + * are both decremented. The request is only taken out of its queue (or + * cleared as the event manager request) once tr_count reaches zero. + * + * Returns true if the dequeued request was the highest priority item + * in its priority queue (always true for the event manager request), and + * false otherwise, including when tr_count is still non-zero. + */ +static bool +workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req) +{ + wq->wq_reqcount--; + + if (--req->tr_count == 0) { + if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) { + assert(wq->wq_event_manager_threadreq == req); + assert(req->tr_count == 0); + wq->wq_event_manager_threadreq = NULL; + return true; + } + if (priority_queue_remove(workq_priority_queue_for_req(wq, req), + &req->tr_entry, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + _wq_thactive_refresh_best_constrained_req_qos(wq); + } + return true; + } + } + return false; +} + +static void +workq_threadreq_destroy(proc_t p, workq_threadreq_t req) +{ + req->tr_state = TR_STATE_IDLE; + if (req->tr_flags & (TR_FLAG_WORKLOOP | TR_FLAG_KEVENT)) { + kqueue_threadreq_cancel(p, req); + } else { + zfree(workq_zone_threadreq, req); + } +} + +/* + * Mark a thread request as complete. At this point, it is treated as owned by + * the submitting subsystem and you should assume it could be freed. + * + * Called with the workqueue lock held. 
+ */ +static void +workq_threadreq_bind_and_unlock(proc_t p, struct workqueue *wq, + workq_threadreq_t req, struct uthread *uth) +{ + uint8_t tr_flags = req->tr_flags; + bool needs_commit = false; + int creator_flags = 0; + + wq->wq_fulfilled++; + + if (req->tr_state == TR_STATE_QUEUED) { + workq_threadreq_dequeue(wq, req); + creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS; + } + + if (wq->wq_creator == uth) { + WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0, + uth->uu_save.uus_workq_park_data.yields, 0); + creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS | + WORKQ_THREADREQ_CREATOR_TRANSFER; + wq->wq_creator = NULL; + _wq_thactive_inc(wq, req->tr_qos); + wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++; + } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) { + _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos); + } + workq_thread_reset_pri(wq, uth, req); + + if (tr_flags & TR_FLAG_OVERCOMMIT) { + if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) { + uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT; + wq->wq_constrained_threads_scheduled--; + } + } else { + if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) { + uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT; + wq->wq_constrained_threads_scheduled++; + } + } + + if (tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP)) { + if (req->tr_state == TR_STATE_NEW) { + /* + * We're called from workq_kern_threadreq_initiate() + * due to an unbind, with the kq req held. 
+ */ + assert(!creator_flags); + req->tr_state = TR_STATE_IDLE; + kqueue_threadreq_bind(p, req, uth->uu_thread, 0); + } else { + assert(req->tr_count == 0); + workq_perform_turnstile_operation_locked(wq, ^{ + kqueue_threadreq_bind_prepost(p, req, uth->uu_thread); + }); + needs_commit = true; + } + req = NULL; + } else if (req->tr_count > 0) { + req = NULL; + } + + if (creator_flags) { + /* This can drop the workqueue lock, and take it again */ + workq_schedule_creator(p, wq, creator_flags); + } + + workq_unlock(wq); + + if (req) { + zfree(workq_zone_threadreq, req); + } + if (needs_commit) { + kqueue_threadreq_bind_commit(p, uth->uu_thread); + } + + /* + * Run Thread, Run! + */ + uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI; + if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) { + upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER; + } else if (tr_flags & TR_FLAG_OVERCOMMIT) { + upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; + } + if (tr_flags & TR_FLAG_KEVENT) { + upcall_flags |= WQ_FLAG_THREAD_KEVENT; + } + if (tr_flags & TR_FLAG_WORKLOOP) { + upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT; + } + uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags; +} + +#pragma mark workqueue thread creation thread calls + +static inline bool +workq_thread_call_prepost(struct workqueue *wq, uint32_t sched, uint32_t pend, + uint32_t fail_mask) +{ + uint32_t old_flags, new_flags; + + os_atomic_rmw_loop(&wq->wq_flags, old_flags, new_flags, acquire, { + if (__improbable(old_flags & (WQ_EXITING | sched | pend | fail_mask))) { + os_atomic_rmw_loop_give_up(return false); + } + if (__improbable(old_flags & WQ_PROC_SUSPENDED)) { + new_flags = old_flags | pend; + } else { + new_flags = old_flags | sched; + } + }); + + return (old_flags & WQ_PROC_SUSPENDED) == 0; +} + +#define WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART 0x1 + +static bool +workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags) +{ + assert(!preemption_enabled()); + + if 
(!workq_thread_call_prepost(wq, WQ_DELAYED_CALL_SCHEDULED, + WQ_DELAYED_CALL_PENDED, WQ_IMMEDIATE_CALL_PENDED | + WQ_IMMEDIATE_CALL_SCHEDULED)) { + return false; + } + + uint64_t now = mach_absolute_time(); + + if (flags & WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART) { + /* do not change the window */ + } else if (now - wq->wq_thread_call_last_run <= wq->wq_timer_interval) { + wq->wq_timer_interval *= 2; + if (wq->wq_timer_interval > wq_max_timer_interval.abstime) { + wq->wq_timer_interval = wq_max_timer_interval.abstime; + } + } else if (now - wq->wq_thread_call_last_run > 2 * wq->wq_timer_interval) { + wq->wq_timer_interval /= 2; + if (wq->wq_timer_interval < wq_stalled_window.abstime) { + wq->wq_timer_interval = wq_stalled_window.abstime; + } + } + + WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, + _wq_flags(wq), wq->wq_timer_interval, 0); + + thread_call_t call = wq->wq_delayed_call; + uintptr_t arg = WQ_DELAYED_CALL_SCHEDULED; + uint64_t deadline = now + wq->wq_timer_interval; + if (thread_call_enter1_delayed(call, (void *)arg, deadline)) { + panic("delayed_call was already enqueued"); + } + return true; +} + +static void +workq_schedule_immediate_thread_creation(struct workqueue *wq) +{ + assert(!preemption_enabled()); + + if (workq_thread_call_prepost(wq, WQ_IMMEDIATE_CALL_SCHEDULED, + WQ_IMMEDIATE_CALL_PENDED, 0)) { + WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, + _wq_flags(wq), 0, 0); + + uintptr_t arg = WQ_IMMEDIATE_CALL_SCHEDULED; + if (thread_call_enter1(wq->wq_immediate_call, (void *)arg)) { + panic("immediate_call was already enqueued"); + } + } +} + +void +workq_proc_suspended(struct proc *p) +{ + struct workqueue *wq = proc_get_wqptr(p); + + if (wq) os_atomic_or(&wq->wq_flags, WQ_PROC_SUSPENDED, relaxed); +} + +void +workq_proc_resumed(struct proc *p) +{ + struct workqueue *wq = proc_get_wqptr(p); + uint32_t wq_flags; + + if (!wq) return; + + wq_flags = os_atomic_and_orig(&wq->wq_flags, ~(WQ_PROC_SUSPENDED | + 
WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED), relaxed); + if ((wq_flags & WQ_EXITING) == 0) { + disable_preemption(); + if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) { + workq_schedule_immediate_thread_creation(wq); + } else if (wq_flags & WQ_DELAYED_CALL_PENDED) { + workq_schedule_delayed_thread_creation(wq, + WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART); + } + enable_preemption(); + } +} + +/** + * returns whether lastblocked_tsp is within wq_stalled_window usecs of now + */ +static bool +workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp) +{ + uint64_t lastblocked_ts = os_atomic_load(lastblocked_tsp, relaxed); + if (now <= lastblocked_ts) { + /* + * Because the update of the timestamp when a thread blocks + * isn't serialized against us looking at it (i.e. we don't hold + * the workq lock), it's possible to have a timestamp that matches + * the current time or that even looks to be in the future relative + * to when we grabbed the current time... + * + * Just treat this as a busy thread since it must have just blocked. + */ + return true; + } + return (now - lastblocked_ts) < wq_stalled_window.abstime; +} + +static void +workq_add_new_threads_call(void *_p, void *flags) +{ + proc_t p = _p; + struct workqueue *wq = proc_get_wqptr(p); + uint32_t my_flag = (uint32_t)(uintptr_t)flags; + + /* + * workq_exit() will set the workqueue to NULL before + * it cancels thread calls. 
+ */ + if (!wq) return; + + assert((my_flag == WQ_DELAYED_CALL_SCHEDULED) || + (my_flag == WQ_IMMEDIATE_CALL_SCHEDULED)); + + WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, _wq_flags(wq), + wq->wq_nthreads, wq->wq_thidlecount, 0); + + workq_lock_spin(wq); + + wq->wq_thread_call_last_run = mach_absolute_time(); + os_atomic_and(&wq->wq_flags, ~my_flag, release); + + /* This can drop the workqueue lock, and take it again */ + workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS); + + workq_unlock(wq); + + WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0, + wq->wq_nthreads, wq->wq_thidlecount, 0); +} + +#pragma mark thread state tracking + +static void +workq_sched_callback(int type, thread_t thread) +{ + struct uthread *uth = get_bsdthread_info(thread); + proc_t proc = get_bsdtask_info(get_threadtask(thread)); + struct workqueue *wq = proc_get_wqptr(proc); + thread_qos_t req_qos, qos = uth->uu_workq_pri.qos_bucket; + wq_thactive_t old_thactive; + bool start_timer = false; + + if (qos == WORKQ_THREAD_QOS_MANAGER) { + return; + } + + switch (type) { + case SCHED_CALL_BLOCK: + old_thactive = _wq_thactive_dec(wq, qos); + req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive); + + /* + * Remember the timestamp of the last thread that blocked in this + * bucket; it is used by admission checks to ignore one thread + * being inactive if this timestamp is recent enough. + * + * If we collide with another thread trying to update the + * last_blocked (really unlikely since another thread would have to + * get scheduled and then block after we start down this path), it's + * not a problem. Either timestamp is adequate, so no need to retry. + */ + os_atomic_store(&wq->wq_lastblocked_ts[_wq_bucket(qos)], + thread_last_run_time(thread), relaxed); + + if (req_qos == THREAD_QOS_UNSPECIFIED) { + /* + * No pending request at the moment we could unblock, move on. 
+ */ + } else if (qos < req_qos) { + /* + * The blocking thread is at a lower QoS than the highest currently + * pending constrained request, nothing has to be redriven + */ + } else { + uint32_t max_busycount, old_req_count; + old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive, + req_qos, NULL, &max_busycount); + /* + * If it is possible that may_start_constrained_thread had refused + * admission due to being over the max concurrency, we may need to + * spin up a new thread. + * + * We take into account the maximum number of busy threads + * that can affect may_start_constrained_thread as looking at the + * actual number may_start_constrained_thread will see is racy. + * + * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is + * between NCPU (4) and NCPU - 2 (2) we need to redrive. + */ + uint32_t conc = wq_max_parallelism[_wq_bucket(qos)]; + if (old_req_count <= conc && conc <= old_req_count + max_busycount) { + start_timer = workq_schedule_delayed_thread_creation(wq, 0); + } + } + if (__improbable(kdebug_enable)) { + __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq, + old_thactive, qos, NULL, NULL); + WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq, + old - 1, qos | (req_qos << 8), + wq->wq_reqcount << 1 | start_timer, 0); + } + break; + + case SCHED_CALL_UNBLOCK: + /* + * we cannot take the workqueue_lock here... + * an UNBLOCK can occur from a timer event which + * is run from an interrupt context... if the workqueue_lock + * is already held by this processor, we'll deadlock... 
+ * the thread lock for the thread being UNBLOCKED + * is also held + */ + old_thactive = _wq_thactive_inc(wq, qos); + if (__improbable(kdebug_enable)) { + __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq, + old_thactive, qos, NULL, NULL); + req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive); + WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq, + old + 1, qos | (req_qos << 8), + wq->wq_threads_scheduled, 0); + } + break; + } +} + +#pragma mark workq lifecycle + +void +workq_reference(struct workqueue *wq) +{ + os_ref_retain(&wq->wq_refcnt); +} + +void +workq_destroy(struct workqueue *wq) +{ + struct turnstile *ts; + + turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts); + assert(ts); + turnstile_cleanup(); + turnstile_deallocate(ts); + + lck_spin_destroy(&wq->wq_lock, workq_lck_grp); + zfree(workq_zone_workqueue, wq); +} + +static void +workq_deallocate(struct workqueue *wq) +{ + if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) { + workq_destroy(wq); + } +} + +void +workq_deallocate_safe(struct workqueue *wq) +{ + if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) { + workq_deallocate_enqueue(wq); + } +} + +/** + * Setup per-process state for the workqueue. 
+ */ +int +workq_open(struct proc *p, __unused struct workq_open_args *uap, + __unused int32_t *retval) +{ + struct workqueue *wq; + int error = 0; + + if ((p->p_lflag & P_LREGISTER) == 0) { + return EINVAL; + } + + if (wq_init_constrained_limit) { + uint32_t limit, num_cpus = ml_get_max_cpus(); + + /* + * set up the limit for the constrained pool + * this is a virtual pool in that we don't + * maintain it on a separate idle and run list + */ + limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR; + + if (limit > wq_max_constrained_threads) + wq_max_constrained_threads = limit; + + if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) { + wq_max_threads = WQ_THACTIVE_BUCKET_HALF; + } + if (wq_max_threads > CONFIG_THREAD_MAX - 20) { + wq_max_threads = CONFIG_THREAD_MAX - 20; + } + + wq_death_max_load = (uint16_t)fls(num_cpus) + 1; + + for (thread_qos_t qos = WORKQ_THREAD_QOS_MIN; qos <= WORKQ_THREAD_QOS_MAX; qos++) { + wq_max_parallelism[_wq_bucket(qos)] = + qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL); + } + + wq_init_constrained_limit = 0; + } + + if (proc_get_wqptr(p) == NULL) { + if (proc_init_wqptr_or_wait(p) == FALSE) { + assert(proc_get_wqptr(p) != NULL); + goto out; + } + + wq = (struct workqueue *)zalloc(workq_zone_workqueue); + bzero(wq, sizeof(struct workqueue)); + + os_ref_init_count(&wq->wq_refcnt, &workq_refgrp, 1); + + // Start the event manager at the priority hinted at by the policy engine + thread_qos_t mgr_priority_hint = task_get_default_manager_qos(current_task()); + pthread_priority_t pp = _pthread_priority_make_from_thread_qos(mgr_priority_hint, 0, 0); + wq->wq_event_manager_priority = (uint32_t)pp; + wq->wq_timer_interval = wq_stalled_window.abstime; + wq->wq_proc = p; + turnstile_prepare((uintptr_t)wq, &wq->wq_turnstile, turnstile_alloc(), + TURNSTILE_WORKQS); + + TAILQ_INIT(&wq->wq_thrunlist); + TAILQ_INIT(&wq->wq_thnewlist); + TAILQ_INIT(&wq->wq_thidlelist); + priority_queue_init(&wq->wq_overcommit_queue, + PRIORITY_QUEUE_BUILTIN_MAX_HEAP); 
+ priority_queue_init(&wq->wq_constrained_queue, + PRIORITY_QUEUE_BUILTIN_MAX_HEAP); + priority_queue_init(&wq->wq_special_queue, + PRIORITY_QUEUE_BUILTIN_MAX_HEAP); + + wq->wq_delayed_call = thread_call_allocate_with_options( + workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL, + THREAD_CALL_OPTIONS_ONCE); + wq->wq_immediate_call = thread_call_allocate_with_options( + workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL, + THREAD_CALL_OPTIONS_ONCE); + wq->wq_death_call = thread_call_allocate_with_options( + workq_kill_old_threads_call, wq, + THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE); + + lck_spin_init(&wq->wq_lock, workq_lck_grp, workq_lck_attr); + + WQ_TRACE_WQ(TRACE_wq_create | DBG_FUNC_NONE, wq, + VM_KERNEL_ADDRHIDE(wq), 0, 0, 0); + proc_set_wqptr(p, wq); + } +out: + + return error; +} + +/* + * Routine: workq_mark_exiting + * + * Function: Mark the work queue such that new threads will not be added to the + * work queue after we return. + * + * Conditions: Called against the current process. + */ +void +workq_mark_exiting(struct proc *p) +{ + struct workqueue *wq = proc_get_wqptr(p); + uint32_t wq_flags; + workq_threadreq_t mgr_req; + + if (!wq) return; + + WQ_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0); + + workq_lock_spin(wq); + + wq_flags = os_atomic_or_orig(&wq->wq_flags, WQ_EXITING, relaxed); + if (__improbable(wq_flags & WQ_EXITING)) { + panic("workq_mark_exiting called twice"); + } + + /* + * Opportunistically try to cancel thread calls that are likely in flight. + * workq_exit() will do the proper cleanup. 
+ */ + if (wq_flags & WQ_IMMEDIATE_CALL_SCHEDULED) { + thread_call_cancel(wq->wq_immediate_call); + } + if (wq_flags & WQ_DELAYED_CALL_SCHEDULED) { + thread_call_cancel(wq->wq_delayed_call); + } + if (wq_flags & WQ_DEATH_CALL_SCHEDULED) { + thread_call_cancel(wq->wq_death_call); + } + + mgr_req = wq->wq_event_manager_threadreq; + wq->wq_event_manager_threadreq = NULL; + wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */ + workq_turnstile_update_inheritor(wq, NULL, 0); + + workq_unlock(wq); + + if (mgr_req) { + kqueue_threadreq_cancel(p, mgr_req); + } + /* + * No one touches the priority queues once WQ_EXITING is set. + * It is hence safe to do the tear down without holding any lock. + */ + priority_queue_destroy(&wq->wq_overcommit_queue, + struct workq_threadreq_s, tr_entry, ^(void *e){ + workq_threadreq_destroy(p, e); + }); + priority_queue_destroy(&wq->wq_constrained_queue, + struct workq_threadreq_s, tr_entry, ^(void *e){ + workq_threadreq_destroy(p, e); + }); + priority_queue_destroy(&wq->wq_special_queue, + struct workq_threadreq_s, tr_entry, ^(void *e){ + workq_threadreq_destroy(p, e); + }); + + WQ_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0); +} + +/* + * Routine: workq_exit + * + * Function: clean up the work queue structure(s) now that there are no threads + * left running inside the work queue (except possibly current_thread). + * + * Conditions: Called by the last thread in the process. + * Called against current process. + */ +void +workq_exit(struct proc *p) +{ + struct workqueue *wq; + struct uthread *uth, *tmp; + + wq = os_atomic_xchg(&p->p_wqptr, NULL, relaxed); + if (wq != NULL) { + thread_t th = current_thread(); + + WQ_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0); + + if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) { + /* + * Make sure we will no longer call the + * sched call, if we ever block this thread, which the cancel_wait + * below can do. 
+ */ + thread_sched_call(th, NULL); + } + + /* + * Thread calls are always scheduled by the proc itself or under the + * workqueue spinlock if WQ_EXITING is not yet set. + * + * Either way, when this runs, the proc has no threads left beside + * the one running this very code, so we know no thread call can be + * dispatched anymore. + */ + thread_call_cancel_wait(wq->wq_delayed_call); + thread_call_cancel_wait(wq->wq_immediate_call); + thread_call_cancel_wait(wq->wq_death_call); + thread_call_free(wq->wq_delayed_call); + thread_call_free(wq->wq_immediate_call); + thread_call_free(wq->wq_death_call); + + /* + * Clean up workqueue data structures for threads that exited and + * didn't get a chance to clean up after themselves. + * + * idle/new threads should have been interrupted and died on their own + */ + TAILQ_FOREACH_SAFE(uth, &wq->wq_thrunlist, uu_workq_entry, tmp) { + thread_sched_call(uth->uu_thread, NULL); + thread_deallocate(uth->uu_thread); + } + assert(TAILQ_EMPTY(&wq->wq_thnewlist)); + assert(TAILQ_EMPTY(&wq->wq_thidlelist)); + + WQ_TRACE_WQ(TRACE_wq_destroy | DBG_FUNC_END, wq, + VM_KERNEL_ADDRHIDE(wq), 0, 0, 0); + + workq_deallocate(wq); + + WQ_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0); + } +} + + +#pragma mark bsd thread control + +static bool +_pthread_priority_to_policy(pthread_priority_t priority, + thread_qos_policy_data_t *data) +{ + data->qos_tier = _pthread_priority_thread_qos(priority); + data->tier_importance = _pthread_priority_relpri(priority); + if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 || + data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) { + return false; + } + return true; +} + +static int +bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority, + mach_port_name_t voucher, enum workq_set_self_flags flags) +{ + struct uthread *uth = get_bsdthread_info(th); + struct workqueue *wq = proc_get_wqptr(p); + + kern_return_t kr; + int unbind_rv = 0, qos_rv = 0, voucher_rv = 0, 
fixedpri_rv = 0; + bool is_wq_thread = (thread_get_tag(th) & THREAD_TAG_WORKQUEUE); + + if (flags & WORKQ_SET_SELF_WQ_KEVENT_UNBIND) { + if (!is_wq_thread) { + unbind_rv = EINVAL; + goto qos; + } + + if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) { + unbind_rv = EINVAL; + goto qos; + } + + struct kqrequest *kqr = uth->uu_kqr_bound; + if (kqr == NULL) { + unbind_rv = EALREADY; + goto qos; + } + + if (kqr->kqr_state & KQR_WORKLOOP) { + unbind_rv = EINVAL; + goto qos; + } + + kqueue_threadreq_unbind(p, uth->uu_kqr_bound); + } + +qos: + if (flags & WORKQ_SET_SELF_QOS_FLAG) { + thread_qos_policy_data_t new_policy; + + if (!_pthread_priority_to_policy(priority, &new_policy)) { + qos_rv = EINVAL; + goto voucher; + } + + if (!is_wq_thread) { + /* + * Threads opted out of QoS can't change QoS + */ + if (!thread_has_qos_policy(th)) { + qos_rv = EPERM; + goto voucher; + } + } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) { + /* + * Workqueue manager threads can't change QoS + */ + qos_rv = EINVAL; + goto voucher; + } else { + /* + * For workqueue threads, possibly adjust buckets and redrive thread + * requests. 
+ */ + bool old_overcommit = uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT; + bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG; + struct uu_workq_policy old_pri, new_pri; + bool force_run = false; + + workq_lock_spin(wq); + + if (old_overcommit != new_overcommit) { + uth->uu_workq_flags ^= UT_WORKQ_OVERCOMMIT; + if (old_overcommit) { + wq->wq_constrained_threads_scheduled++; + } else if (wq->wq_constrained_threads_scheduled-- == + wq_max_constrained_threads) { + force_run = true; + } + } + + old_pri = new_pri = uth->uu_workq_pri; + new_pri.qos_req = new_policy.qos_tier; + workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, force_run); + workq_unlock(wq); + } + + kr = thread_policy_set_internal(th, THREAD_QOS_POLICY, + (thread_policy_t)&new_policy, THREAD_QOS_POLICY_COUNT); + if (kr != KERN_SUCCESS) { + qos_rv = EINVAL; + } + } + +voucher: + if (flags & WORKQ_SET_SELF_VOUCHER_FLAG) { + kr = thread_set_voucher_name(voucher); + if (kr != KERN_SUCCESS) { + voucher_rv = ENOENT; + goto fixedpri; + } + } + +fixedpri: + if (qos_rv) goto done; + if (flags & WORKQ_SET_SELF_FIXEDPRIORITY_FLAG) { + thread_extended_policy_data_t extpol = {.timeshare = 0}; + + if (is_wq_thread) { + /* Not allowed on workqueue threads */ + fixedpri_rv = ENOTSUP; + goto done; + } + + kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, + (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT); + if (kr != KERN_SUCCESS) { + fixedpri_rv = EINVAL; + goto done; + } + } else if (flags & WORKQ_SET_SELF_TIMESHARE_FLAG) { + thread_extended_policy_data_t extpol = {.timeshare = 1}; + + if (is_wq_thread) { + /* Not allowed on workqueue threads */ + fixedpri_rv = ENOTSUP; + goto done; + } + + kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY, + (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT); + if (kr != KERN_SUCCESS) { + fixedpri_rv = EINVAL; + goto done; + } + } + +done: + if (qos_rv && voucher_rv) { + /* Both failed, give that a unique error. 
*/ + return EBADMSG; + } + + if (unbind_rv) { + return unbind_rv; + } + + if (qos_rv) { + return qos_rv; + } + + if (voucher_rv) { + return voucher_rv; + } + + if (fixedpri_rv) { + return fixedpri_rv; + } + + return 0; +} + +static int +bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport, + pthread_priority_t pp, user_addr_t resource) +{ + thread_qos_t qos = _pthread_priority_thread_qos(pp); + if (qos == THREAD_QOS_UNSPECIFIED) { + return EINVAL; + } + + thread_t th = port_name_to_thread(kport); + if (th == THREAD_NULL) { + return ESRCH; + } + + int rv = proc_thread_qos_add_override(p->task, th, 0, qos, TRUE, + resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE); + + thread_deallocate(th); + return rv; +} + +static int +bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport, + user_addr_t resource) +{ + thread_t th = port_name_to_thread(kport); + if (th == THREAD_NULL) { + return ESRCH; + } + + int rv = proc_thread_qos_remove_override(p->task, th, 0, resource, + THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE); + + thread_deallocate(th); + return rv; +} + +static int +workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport, + pthread_priority_t pp, user_addr_t ulock_addr) +{ + struct uu_workq_policy old_pri, new_pri; + struct workqueue *wq = proc_get_wqptr(p); + + thread_qos_t qos_override = _pthread_priority_thread_qos(pp); + if (qos_override == THREAD_QOS_UNSPECIFIED) { + return EINVAL; + } + + thread_t thread = port_name_to_thread(kport); + if (thread == THREAD_NULL) { + return ESRCH; + } + + struct uthread *uth = get_bsdthread_info(thread); + if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) { + thread_deallocate(thread); + return EPERM; + } + + WQ_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE, + wq, thread_tid(thread), 1, pp, 0); + + thread_mtx_lock(thread); + + if (ulock_addr) { + uint64_t val; + int rc; + /* + * Workaround lack of explicit support for 'no-fault copyin' + * , as disabling 
preemption prevents paging in + */ + disable_preemption(); + rc = copyin_word(ulock_addr, &val, sizeof(kport)); + enable_preemption(); + if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != kport) { + goto out; + } + } + + workq_lock_spin(wq); + + old_pri = uth->uu_workq_pri; + if (old_pri.qos_override >= qos_override) { + /* Nothing to do */ + } else if (thread == current_thread()) { + new_pri = old_pri; + new_pri.qos_override = qos_override; + workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false); + } else { + uth->uu_workq_pri.qos_override = qos_override; + if (qos_override > workq_pri_override(old_pri)) { + thread_set_workq_override(thread, qos_override); + } + } + + workq_unlock(wq); + +out: + thread_mtx_unlock(thread); + thread_deallocate(thread); + return 0; +} + +static int +workq_thread_reset_dispatch_override(proc_t p, thread_t thread) +{ + struct uu_workq_policy old_pri, new_pri; + struct workqueue *wq = proc_get_wqptr(p); + struct uthread *uth = get_bsdthread_info(thread); + + if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) { + return EPERM; + } + + WQ_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, wq, 0, 0, 0, 0); + + workq_lock_spin(wq); + old_pri = new_pri = uth->uu_workq_pri; + new_pri.qos_override = THREAD_QOS_UNSPECIFIED; + workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false); + workq_unlock(wq); + return 0; +} + +static int +bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags, + int *retval) +{ + static_assert(QOS_PARALLELISM_COUNT_LOGICAL == + _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical"); + static_assert(QOS_PARALLELISM_REALTIME == + _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime"); + + if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) { + return EINVAL; + } + + if (flags & QOS_PARALLELISM_REALTIME) { + if (qos) { + return EINVAL; + } + } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) { + return EINVAL; + } + + *retval = 
qos_max_parallelism(qos, flags); + return 0; +} + +#define ENSURE_UNUSED(arg) \ + ({ if ((arg) != 0) { return EINVAL; } }) + +int +bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval) +{ + switch (uap->cmd) { + case BSDTHREAD_CTL_QOS_OVERRIDE_START: + return bsdthread_add_explicit_override(p, (mach_port_name_t)uap->arg1, + (pthread_priority_t)uap->arg2, uap->arg3); + case BSDTHREAD_CTL_QOS_OVERRIDE_END: + ENSURE_UNUSED(uap->arg3); + return bsdthread_remove_explicit_override(p, (mach_port_name_t)uap->arg1, + (user_addr_t)uap->arg2); + + case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH: + return workq_thread_add_dispatch_override(p, (mach_port_name_t)uap->arg1, + (pthread_priority_t)uap->arg2, uap->arg3); + case BSDTHREAD_CTL_QOS_OVERRIDE_RESET: + return workq_thread_reset_dispatch_override(p, current_thread()); + + case BSDTHREAD_CTL_SET_SELF: + return bsdthread_set_self(p, current_thread(), + (pthread_priority_t)uap->arg1, (mach_port_name_t)uap->arg2, + (enum workq_set_self_flags)uap->arg3); + + case BSDTHREAD_CTL_QOS_MAX_PARALLELISM: + ENSURE_UNUSED(uap->arg3); + return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1, + (unsigned long)uap->arg2, retval); + + case BSDTHREAD_CTL_SET_QOS: + case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD: + case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET: + /* no longer supported */ + return ENOTSUP; + + default: + return EINVAL; + } +} + +#pragma mark workqueue thread manipulation + +static void __dead2 +workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, + struct uthread *uth); + +static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2; + +#if KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD +static inline uint64_t +workq_trace_req_id(workq_threadreq_t req) +{ + struct kqworkloop *kqwl; + if (req->tr_flags & TR_FLAG_WORKLOOP) { + kqwl = __container_of(req, struct kqworkloop, kqwl_request.kqr_req); + return kqwl->kqwl_dynamicid; + } + + return 
VM_KERNEL_ADDRHIDE(req); +} +#endif + +/** + * Entry point for libdispatch to ask for threads + */ +static int +workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp) +{ + thread_qos_t qos = _pthread_priority_thread_qos(pp); + struct workqueue *wq = proc_get_wqptr(p); + uint32_t unpaced, upcall_flags = WQ_FLAG_THREAD_NEWSPI; + + if (wq == NULL || reqcount <= 0 || reqcount > UINT16_MAX || + qos == THREAD_QOS_UNSPECIFIED) { + return EINVAL; + } + + WQ_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE, + wq, reqcount, pp, 0, 0); + + workq_threadreq_t req = zalloc(workq_zone_threadreq); + priority_queue_entry_init(&req->tr_entry); + req->tr_state = TR_STATE_NEW; + req->tr_flags = 0; + req->tr_qos = qos; + + if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) { + req->tr_flags |= TR_FLAG_OVERCOMMIT; + upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; + } + + WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, + wq, workq_trace_req_id(req), req->tr_qos, reqcount, 0); + + workq_lock_spin(wq); + do { + if (_wq_exiting(wq)) { + goto exiting; + } + + /* + * When userspace is asking for parallelism, wakeup up to (reqcount - 1) + * threads without pacing, to inform the scheduler of that workload. + * + * The last requests, or the ones that failed the admission checks are + * enqueued and go through the regular creator codepath. + * + * If there aren't enough threads, add one, but re-evaluate everything + * as conditions may now have changed. + */ + if (reqcount > 1 && (req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + unpaced = workq_constrained_allowance(wq, qos, NULL, false); + if (unpaced >= reqcount - 1) { + unpaced = reqcount - 1; + } + } else { + unpaced = reqcount - 1; + } + + /* + * This path does not currently handle custom workloop parameters + * when creating threads for parallelism. 
+ */ + assert(!(req->tr_flags & TR_FLAG_WL_PARAMS)); + + /* + * This is a trimmed down version of workq_threadreq_bind_and_unlock() + */ + while (unpaced > 0 && wq->wq_thidlecount) { + struct uthread *uth = workq_pop_idle_thread(wq); + + _wq_thactive_inc(wq, qos); + wq->wq_thscheduled_count[_wq_bucket(qos)]++; + workq_thread_reset_pri(wq, uth, req); + wq->wq_fulfilled++; + + uth->uu_workq_flags |= UT_WORKQ_EARLY_BOUND; + if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT; + wq->wq_constrained_threads_scheduled++; + } + uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags; + uth->uu_save.uus_workq_park_data.thread_request = req; + workq_thread_wakeup(uth); + unpaced--; + reqcount--; + } + } while (unpaced && wq->wq_nthreads < wq_max_threads && + workq_add_new_idle_thread(p, wq)); + + if (_wq_exiting(wq)) { + goto exiting; + } + + req->tr_count = reqcount; + if (workq_threadreq_enqueue(wq, req)) { + /* This can drop the workqueue lock, and take it again */ + workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS); + } + workq_unlock(wq); + return 0; + +exiting: + workq_unlock(wq); + zfree(workq_zone_threadreq, req); + return ECANCELED; +} + +bool +workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr, + struct turnstile *workloop_ts, thread_qos_t qos, int flags) +{ + struct workqueue *wq = proc_get_wqptr_fast(p); + workq_threadreq_t req = &kqr->kqr_req; + struct uthread *uth = NULL; + uint8_t tr_flags = 0; + + if (kqr->kqr_state & KQR_WORKLOOP) { + tr_flags = TR_FLAG_WORKLOOP; + + workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req); + if (trp.trp_flags & TRP_PRIORITY) { + tr_flags |= TR_FLAG_WL_OUTSIDE_QOS; + qos = thread_workq_qos_for_pri(trp.trp_pri); + if (qos == THREAD_QOS_UNSPECIFIED) { + qos = WORKQ_THREAD_QOS_ABOVEUI; + } + } + if (trp.trp_flags) { + tr_flags |= TR_FLAG_WL_PARAMS; + } + } else { + tr_flags = TR_FLAG_KEVENT; + } + if (qos != WORKQ_THREAD_QOS_MANAGER && + 
(kqr->kqr_state & KQR_THOVERCOMMIT)) { + tr_flags |= TR_FLAG_OVERCOMMIT; + } + + assert(req->tr_state == TR_STATE_IDLE); + priority_queue_entry_init(&req->tr_entry); + req->tr_count = 1; + req->tr_state = TR_STATE_NEW; + req->tr_flags = tr_flags; + req->tr_qos = qos; + + WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq, + workq_trace_req_id(req), qos, 1, 0); + + if (flags & WORKQ_THREADREQ_ATTEMPT_REBIND) { + /* + * we're called back synchronously from the context of + * kqueue_threadreq_unbind from within workq_thread_return() + * we can try to match up this thread with this request ! + */ + uth = current_uthread(); + assert(uth->uu_kqr_bound == NULL); + } + + workq_lock_spin(wq); + if (_wq_exiting(wq)) { + workq_unlock(wq); + return false; + } + + if (uth && workq_threadreq_admissible(wq, uth, req)) { + assert(uth != wq->wq_creator); + workq_threadreq_bind_and_unlock(p, wq, req, uth); + } else { + if (workloop_ts) { + workq_perform_turnstile_operation_locked(wq, ^{ + turnstile_update_inheritor(workloop_ts, wq->wq_turnstile, + TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE); + turnstile_update_inheritor_complete(workloop_ts, + TURNSTILE_INTERLOCK_HELD); + }); + } + if (workq_threadreq_enqueue(wq, req)) { + workq_schedule_creator(p, wq, flags); + } + workq_unlock(wq); + } + + return true; +} + +void +workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, + thread_qos_t qos, int flags) +{ + struct workqueue *wq = proc_get_wqptr_fast(p); + workq_threadreq_t req = &kqr->kqr_req; + bool change_overcommit = false; + + if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) { + /* Requests outside-of-QoS shouldn't accept modify operations */ + return; + } + + workq_lock_spin(wq); + + assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER); + assert(req->tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP)); + + if (req->tr_state == TR_STATE_BINDING) { + kqueue_threadreq_bind(p, req, req->tr_binding_thread, 0); + workq_unlock(wq); + return; + } + + 
change_overcommit = (bool)(kqr->kqr_state & KQR_THOVERCOMMIT) != + (bool)(req->tr_flags & TR_FLAG_OVERCOMMIT); + + if (_wq_exiting(wq) || (req->tr_qos == qos && !change_overcommit)) { + workq_unlock(wq); + return; + } + + assert(req->tr_count == 1); + if (req->tr_state != TR_STATE_QUEUED) { + panic("Invalid thread request (%p) state %d", req, req->tr_state); + } + + WQ_TRACE_WQ(TRACE_wq_thread_request_modify | DBG_FUNC_NONE, wq, + workq_trace_req_id(req), qos, 0, 0); + + struct priority_queue *pq = workq_priority_queue_for_req(wq, req); + workq_threadreq_t req_max; + + /* + * Stage 1: Dequeue the request from its priority queue. + * + * If we dequeue the root item of the constrained priority queue, + * maintain the best constrained request qos invariant. + */ + if (priority_queue_remove(pq, &req->tr_entry, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + _wq_thactive_refresh_best_constrained_req_qos(wq); + } + } + + /* + * Stage 2: Apply changes to the thread request + * + * If the item will not become the root of the priority queue it belongs to, + * then we need to wait in line, just enqueue and return quickly. + */ + if (__improbable(change_overcommit)) { + req->tr_flags ^= TR_FLAG_OVERCOMMIT; + pq = workq_priority_queue_for_req(wq, req); + } + req->tr_qos = qos; + + req_max = priority_queue_max(pq, struct workq_threadreq_s, tr_entry); + if (req_max && req_max->tr_qos >= qos) { + priority_queue_insert(pq, &req->tr_entry, workq_priority_for_req(req), + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + workq_unlock(wq); + return; + } + + /* + * Stage 3: Reevaluate whether we should run the thread request. + * + * Pretend the thread request is new again: + * - adjust wq_reqcount to not count it anymore. 
+ * - make its state TR_STATE_NEW (so that workq_threadreq_bind_and_unlock + * properly attempts a synchronous bind) + */ + wq->wq_reqcount--; + req->tr_state = TR_STATE_NEW; + if (workq_threadreq_enqueue(wq, req)) { + workq_schedule_creator(p, wq, flags); + } + workq_unlock(wq); +} + +void +workq_kern_threadreq_lock(struct proc *p) +{ + workq_lock_spin(proc_get_wqptr_fast(p)); +} + +void +workq_kern_threadreq_unlock(struct proc *p) +{ + workq_unlock(proc_get_wqptr_fast(p)); +} + +void +workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr, + thread_t owner, struct turnstile *wl_ts, + turnstile_update_flags_t flags) +{ + struct workqueue *wq = proc_get_wqptr_fast(p); + workq_threadreq_t req = &kqr->kqr_req; + turnstile_inheritor_t inheritor; + + assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER); + assert(req->tr_flags & TR_FLAG_WORKLOOP); + workq_lock_held(wq); + + if (req->tr_state == TR_STATE_BINDING) { + kqueue_threadreq_bind(p, req, req->tr_binding_thread, + KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE); + return; + } + + if (_wq_exiting(wq)) { + inheritor = TURNSTILE_INHERITOR_NULL; + } else { + if (req->tr_state != TR_STATE_QUEUED) { + panic("Invalid thread request (%p) state %d", req, req->tr_state); + } + + if (owner) { + inheritor = owner; + flags |= TURNSTILE_INHERITOR_THREAD; + } else { + inheritor = wq->wq_turnstile; + flags |= TURNSTILE_INHERITOR_TURNSTILE; + } + } + + workq_perform_turnstile_operation_locked(wq, ^{ + turnstile_update_inheritor(wl_ts, inheritor, flags); + }); +} + +void +workq_kern_threadreq_redrive(struct proc *p, int flags) +{ + struct workqueue *wq = proc_get_wqptr_fast(p); + + workq_lock_spin(wq); + workq_schedule_creator(p, wq, flags); + workq_unlock(wq); +} + +void +workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked) +{ + if (!locked) workq_lock_spin(wq); + workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_CREATOR_SYNC_UPDATE); + if (!locked) workq_unlock(wq); +} + +static int 
+workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap, + struct workqueue *wq) +{ + thread_t th = current_thread(); + struct uthread *uth = get_bsdthread_info(th); + struct kqrequest *kqr = uth->uu_kqr_bound; + workq_threadreq_param_t trp = { }; + int nevents = uap->affinity, error; + user_addr_t eventlist = uap->item; + + if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) || + (uth->uu_workq_flags & UT_WORKQ_DYING)) { + return EINVAL; + } + + if (eventlist && nevents && kqr == NULL) { + return EINVAL; + } + + /* reset signal mask on the workqueue thread to default state */ + if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) { + proc_lock(p); + uth->uu_sigmask = ~workq_threadmask; + proc_unlock(p); + } + + if (kqr && kqr->kqr_req.tr_flags & TR_FLAG_WL_PARAMS) { + /* + * Ensure we store the threadreq param before unbinding + * the kqr from this thread. + */ + trp = kqueue_threadreq_workloop_param(&kqr->kqr_req); + } + + if (kqr) { + uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE; + if (kqr->kqr_state & KQR_WORKLOOP) { + upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT; + } else { + upcall_flags |= WQ_FLAG_THREAD_KEVENT; + } + if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) { + upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER; + } else { + if (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) { + upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; + } + if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) { + upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS; + } else { + upcall_flags |= uth->uu_workq_pri.qos_req | + WQ_FLAG_THREAD_PRIO_QOS; + } + } + + error = pthread_functions->workq_handle_stack_events(p, th, + get_task_map(p->task), uth->uu_workq_stackaddr, + uth->uu_workq_thport, eventlist, nevents, upcall_flags); + if (error) return error; + + // pthread is supposed to pass KEVENT_FLAG_PARKING here + // which should cause the above call to either: + // - not return + // - return an error + // - return 0 and have 
unbound properly + assert(uth->uu_kqr_bound == NULL); + } + + WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, uap->options, 0, 0, 0); + + thread_sched_call(th, NULL); + thread_will_park_or_terminate(th); +#if CONFIG_WORKLOOP_DEBUG + UU_KEVENT_HISTORY_WRITE_ENTRY(uth, { .uu_error = -1, }); +#endif + + workq_lock_spin(wq); + WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0); + uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value; + workq_select_threadreq_or_park_and_unlock(p, wq, uth); + __builtin_unreachable(); +} + +/** + * Multiplexed call to interact with the workqueue mechanism + */ +int +workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval) +{ + int options = uap->options; + int arg2 = uap->affinity; + int arg3 = uap->prio; + struct workqueue *wq = proc_get_wqptr(p); + int error = 0; + + if ((p->p_lflag & P_LREGISTER) == 0) { + return EINVAL; + } + + switch (options) { + case WQOPS_QUEUE_NEWSPISUPP: { + /* + * arg2 = offset of serialno into dispatch queue + * arg3 = kevent support + */ + int offset = arg2; + if (arg3 & 0x01){ + // If we get here, then userspace has indicated support for kevent delivery. + } + + p->p_dispatchqueue_serialno_offset = (uint64_t)offset; + break; + } + case WQOPS_QUEUE_REQTHREADS: { + /* + * arg2 = number of threads to start + * arg3 = priority + */ + error = workq_reqthreads(p, arg2, arg3); + break; + } + case WQOPS_SET_EVENT_MANAGER_PRIORITY: { + /* + * arg2 = priority for the manager thread + * + * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set, + * the low bits of the value contains a scheduling priority + * instead of a QOS value + */ + pthread_priority_t pri = arg2; + + if (wq == NULL) { + error = EINVAL; + break; + } + + /* + * Normalize the incoming priority so that it is ordered numerically. 
+ */ + if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) { + pri &= (_PTHREAD_PRIORITY_SCHED_PRI_MASK | + _PTHREAD_PRIORITY_SCHED_PRI_FLAG); + } else { + thread_qos_t qos = _pthread_priority_thread_qos(pri); + int relpri = _pthread_priority_relpri(pri); + if (relpri > 0 || relpri < THREAD_QOS_MIN_TIER_IMPORTANCE || + qos == THREAD_QOS_UNSPECIFIED) { + error = EINVAL; + break; + } + pri &= ~_PTHREAD_PRIORITY_FLAGS_MASK; + } + + /* + * If userspace passes a scheduling priority, that wins over any QoS. + * Userspace should take care not to lower the priority this way. + */ + workq_lock_spin(wq); + if (wq->wq_event_manager_priority < (uint32_t)pri) { /* the stored value is only ever raised here */ + wq->wq_event_manager_priority = (uint32_t)pri; + } + workq_unlock(wq); + break; + } + case WQOPS_THREAD_KEVENT_RETURN: + case WQOPS_THREAD_WORKLOOP_RETURN: + case WQOPS_THREAD_RETURN: { + error = workq_thread_return(p, uap, wq); + break; + } + + case WQOPS_SHOULD_NARROW: { + /* + * arg2 = priority to test + * arg3 = unused + */ + thread_t th = current_thread(); + struct uthread *uth = get_bsdthread_info(th); + if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) || + (uth->uu_workq_flags & (UT_WORKQ_DYING|UT_WORKQ_OVERCOMMIT))) { + error = EINVAL; + break; + } + + thread_qos_t qos = _pthread_priority_thread_qos(arg2); + if (qos == THREAD_QOS_UNSPECIFIED) { + error = EINVAL; + break; + } + workq_lock_spin(wq); + bool should_narrow = !workq_constrained_allowance(wq, qos, uth, false); /* narrow when the allowance at this QoS is zero */ + workq_unlock(wq); + + *retval = should_narrow; + break; + } + default: + error = EINVAL; + break; + } + + return (error); +} + +/* + * We have no work to do, park ourselves on the idle list. + * + * Consumes the workqueue lock and does not return.
+ */ +__attribute__((noreturn, noinline)) +static void +workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth) +{ + assert(uth == current_uthread()); + assert(uth->uu_kqr_bound == NULL); + workq_push_idle_thread(p, wq, uth); // may not return + + workq_thread_reset_cpupercent(NULL, uth); + + if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) { + workq_unlock(wq); + + /* + * workq_push_idle_thread() will unset `has_stack` + * if it wants us to free the stack before parking. + */ + if (!uth->uu_save.uus_workq_park_data.has_stack) { + pthread_functions->workq_markfree_threadstack(p, uth->uu_thread, + get_task_map(p->task), uth->uu_workq_stackaddr); + } + + /* + * When we remove the voucher from the thread, we may lose our importance + * causing us to get preempted, so we do this after putting the thread on + * the idle list. Then, when we get our importance back we'll be able to + * use this thread from e.g. the kevent call out to deliver a boosting + * message. + */ + __assert_only kern_return_t kr; + kr = thread_set_voucher_name(MACH_PORT_NULL); + assert(kr == KERN_SUCCESS); + + workq_lock_spin(wq); + uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP; + } + + if (uth->uu_workq_flags & UT_WORKQ_RUNNING) { + /* + * While we'd dropped the lock to unset our voucher, someone came + * around and made us runnable. But because we weren't waiting on the + * event their thread_wakeup() was ineffectual. To correct for that, + * we just run the continuation ourselves. 
+ */ + WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0); + workq_select_threadreq_or_park_and_unlock(p, wq, uth); + __builtin_unreachable(); + } + + if (uth->uu_workq_flags & UT_WORKQ_DYING) { + workq_unpark_for_death_and_unlock(p, wq, uth, + WORKQ_UNPARK_FOR_DEATH_WAS_IDLE); + __builtin_unreachable(); + } + + thread_set_pending_block_hint(uth->uu_thread, kThreadWaitParkedWorkQueue); + assert_wait(workq_parked_wait_event(uth), THREAD_INTERRUPTIBLE); + workq_unlock(wq); + WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0); + thread_block(workq_unpark_continue); + __builtin_unreachable(); +} + +static inline bool +workq_may_start_event_mgr_thread(struct workqueue *wq, struct uthread *uth) +{ + /* + * There's an event manager request and either: + * - no event manager currently running + * - we are re-using the event manager + */ + return wq->wq_thscheduled_count[_wq_bucket(WORKQ_THREAD_QOS_MANAGER)] == 0 || + (uth && uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER); +} + +/* + * Returns how many more constrained (non-overcommit) threads may be admitted + * at `at_qos`, or 0 if none can be. + * + * The result is bounded by the room left in the constrained pool + * (wq_max_constrained_threads minus the constrained threads already + * scheduled) and by wq_max_parallelism for the bucket minus the active and + * busy threads. `uth`, when non-NULL, is the current thread and is discounted + * from those counts where it would otherwise be counted against itself. + * When `may_start_timer` is true and busy threads contributed to a refusal, + * delayed thread creation is re-armed. + */ +static uint32_t +workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos, + struct uthread *uth, bool may_start_timer) +{ + assert(at_qos != WORKQ_THREAD_QOS_MANAGER); + uint32_t count = 0; + + uint32_t max_count = wq->wq_constrained_threads_scheduled; + if (uth && (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) { + /* + * don't count the current thread as scheduled + */ + assert(max_count > 0); + max_count--; + } + if (max_count >= wq_max_constrained_threads) { + WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1, + wq->wq_constrained_threads_scheduled, + wq_max_constrained_threads, 0); + /* + * we need 1 or more constrained threads to return to the kernel before + * we can dispatch additional work + */ + return 0; + } + /* + * max_count < wq_max_constrained_threads here: turn it into the room + * left in the constrained pool. Subtracting the other way around wraps + * the unsigned value and defeats the MIN() cap applied below. + */ + max_count = wq_max_constrained_threads - max_count; + + /* + * Compute a metric for how many threads are active.
We find the + * highest priority request outstanding and then add up the number of + * active threads in that and all higher-priority buckets. We'll also add + * any "busy" threads which are not active but blocked recently enough that + * we can't be sure they've gone idle yet. We'll then compare this metric + * to our max concurrency to decide whether to add a new thread. + */ + + uint32_t busycount, thactive_count; + + thactive_count = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq), + at_qos, &busycount, NULL); + + if (uth && uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER && + at_qos <= uth->uu_workq_pri.qos_bucket) { + /* + * Don't count this thread as currently active, but only if it's not + * a manager thread, as _wq_thactive_aggregate_downto_qos ignores active + * managers. + */ + assert(thactive_count > 0); + thactive_count--; + } + + count = wq_max_parallelism[_wq_bucket(at_qos)]; + if (count > thactive_count + busycount) { + count -= thactive_count + busycount; + WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2, + thactive_count, busycount, 0); + return MIN(count, max_count); + } else { + WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3, + thactive_count, busycount, 0); + } + + if (busycount && may_start_timer) { + /* + * If this is called from the add timer, we won't have another timer + * fire when the thread exits the "busy" state, so rearm the timer. 
+ */ + workq_schedule_delayed_thread_creation(wq, 0); + } + + return 0; +} + +static bool +workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth, + workq_threadreq_t req) +{ + if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) { + return workq_may_start_event_mgr_thread(wq, uth); + } + if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) { + return workq_constrained_allowance(wq, req->tr_qos, uth, true); + } + return true; +} + +static workq_threadreq_t +workq_threadreq_select_for_creator(struct workqueue *wq) +{ + workq_threadreq_t req_qos, req_pri, req_tmp; + thread_qos_t qos = THREAD_QOS_UNSPECIFIED; + uint8_t pri = 0; + + req_tmp = wq->wq_event_manager_threadreq; + if (req_tmp && workq_may_start_event_mgr_thread(wq, NULL)) { + return req_tmp; + } + + /* + * Compute the best priority request, and ignore the turnstile for now + */ + + req_pri = priority_queue_max(&wq->wq_special_queue, + struct workq_threadreq_s, tr_entry); + if (req_pri) { + pri = priority_queue_entry_key(&wq->wq_special_queue, &req_pri->tr_entry); + } + + /* + * Compute the best QoS Request, and check whether it beats the "pri" one + */ + + req_qos = priority_queue_max(&wq->wq_overcommit_queue, + struct workq_threadreq_s, tr_entry); + if (req_qos) { + qos = req_qos->tr_qos; + } + + req_tmp = priority_queue_max(&wq->wq_constrained_queue, + struct workq_threadreq_s, tr_entry); + + if (req_tmp && qos < req_tmp->tr_qos) { + if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) { + return req_pri; + } + + if (workq_constrained_allowance(wq, req_tmp->tr_qos, NULL, true)) { + /* + * If the constrained thread request is the best one and passes + * the admission check, pick it. + */ + return req_tmp; + } + } + + if (pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) { + return req_pri; + } + + if (req_qos) { + return req_qos; + } + + /* + * If we had no eligible request but we have a turnstile push, + * it must be a non overcommit thread request that failed + * the admission check. 
+ * + * Just fake a BG thread request so that if the push stops the creator + * priority just drops to 4. + */ + if (turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, NULL)) { + static struct workq_threadreq_s workq_sync_push_fake_req = { + .tr_qos = THREAD_QOS_BACKGROUND, + }; + + return &workq_sync_push_fake_req; + } + + return NULL; +} + +static workq_threadreq_t +workq_threadreq_select(struct workqueue *wq, struct uthread *uth) +{ + workq_threadreq_t req_qos, req_pri, req_tmp; + uintptr_t proprietor; + thread_qos_t qos = THREAD_QOS_UNSPECIFIED; + uint8_t pri = 0; + + if (uth == wq->wq_creator) uth = NULL; + + req_tmp = wq->wq_event_manager_threadreq; + if (req_tmp && workq_may_start_event_mgr_thread(wq, uth)) { + return req_tmp; + } + + /* + * Compute the best priority request (special or turnstile) + */ + + pri = turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, + &proprietor); + if (pri) { + struct kqworkloop *kqwl = (struct kqworkloop *)proprietor; + req_pri = &kqwl->kqwl_request.kqr_req; + if (req_pri->tr_state != TR_STATE_QUEUED) { + panic("Invalid thread request (%p) state %d", + req_pri, req_pri->tr_state); + } + } else { + req_pri = NULL; + } + + req_tmp = priority_queue_max(&wq->wq_special_queue, + struct workq_threadreq_s, tr_entry); + if (req_tmp && pri < priority_queue_entry_key(&wq->wq_special_queue, + &req_tmp->tr_entry)) { + req_pri = req_tmp; + pri = priority_queue_entry_key(&wq->wq_special_queue, &req_tmp->tr_entry); + } + + /* + * Compute the best QoS Request, and check whether it beats the "pri" one + */ + + req_qos = priority_queue_max(&wq->wq_overcommit_queue, + struct workq_threadreq_s, tr_entry); + if (req_qos) { + qos = req_qos->tr_qos; + } + + req_tmp = priority_queue_max(&wq->wq_constrained_queue, + struct workq_threadreq_s, tr_entry); + + if (req_tmp && qos < req_tmp->tr_qos) { + if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) { + return req_pri; + } + + if (workq_constrained_allowance(wq, 
req_tmp->tr_qos, uth, true)) { + /* + * If the constrained thread request is the best one and passes + * the admission check, pick it. + */ + return req_tmp; + } + } + + if (req_pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) { + return req_pri; + } + + return req_qos; +} + +/* + * The creator is an anonymous thread that is counted as scheduled, + * but otherwise without its scheduler callback set or tracked as active + * that is used to make other threads. + * + * When more requests are added or an existing one is hurried along, + * a creator is elected and set up, or the existing one overridden accordingly. + * + * While this creator is in flight, because no request has been dequeued, + * already running threads have a chance at stealing thread requests avoiding + * useless context switches, and the creator once scheduled may not find any + * work to do and will then just park again. + * + * The creator serves the dual purpose of informing the scheduler of work that + * hasn't been materialized as threads yet, and also as a natural pacing mechanism + * for thread creation. + * + * By being anonymous (and not bound to anything) it means that thread requests + * can be stolen from this creator by threads already on core yielding more + * efficient scheduling and reduced context switches.
+ */ +static void +workq_schedule_creator(proc_t p, struct workqueue *wq, int flags) +{ + workq_threadreq_t req; + struct uthread *uth; + + workq_lock_held(wq); + assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0); + +again: + uth = wq->wq_creator; + + if (!wq->wq_reqcount) { + if (uth == NULL) { + workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0); + } + return; + } + + req = workq_threadreq_select_for_creator(wq); + if (req == NULL) { + if (flags & WORKQ_THREADREQ_CREATOR_SYNC_UPDATE) { + assert((flags & WORKQ_THREADREQ_CREATOR_TRANSFER) == 0); + /* + * turnstile propagation code is reaching out to us, + * and we still don't want to do anything, do not recurse. + */ + } else { + workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ); + } + return; + } + + if (uth) { + /* + * We need to maybe override the creator we already have + */ + if (workq_thread_needs_priority_change(req, uth)) { + WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE, + wq, 1, thread_tid(uth->uu_thread), req->tr_qos, 0); + workq_thread_reset_pri(wq, uth, req); + } + } else if (wq->wq_thidlecount) { + /* + * We need to unpark a creator thread + */ + wq->wq_creator = uth = workq_pop_idle_thread(wq); + if (workq_thread_needs_priority_change(req, uth)) { + workq_thread_reset_pri(wq, uth, req); + } + workq_turnstile_update_inheritor(wq, uth->uu_thread, + TURNSTILE_INHERITOR_THREAD); + WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE, + wq, 2, thread_tid(uth->uu_thread), req->tr_qos, 0); + uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled; + uth->uu_save.uus_workq_park_data.yields = 0; + workq_thread_wakeup(uth); + } else { + /* + * We need to allocate a thread... 
+ */ + if (__improbable(wq->wq_nthreads >= wq_max_threads)) { + /* out of threads, just go away */ + } else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) { + act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ); + } else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) { + /* This can drop the workqueue lock, and take it again */ + workq_schedule_immediate_thread_creation(wq); + } else if (workq_add_new_idle_thread(p, wq)) { + goto again; + } else { + workq_schedule_delayed_thread_creation(wq, 0); + } + + if (flags & WORKQ_THREADREQ_CREATOR_TRANSFER) { + /* + * workq_schedule_creator() failed at creating a thread, + * and the responsibility of redriving is now with a thread-call. + * + * We still need to tell the turnstile the previous creator is gone. + */ + workq_turnstile_update_inheritor(wq, NULL, 0); + } + } +} + +/** + * Runs a thread request on a thread + * + * - if thread is THREAD_NULL, will find a thread and run the request there. + * Otherwise, the thread must be the current thread. + * + * - if req is NULL, will find the highest priority request and run that. If + * it is not NULL, it must be a threadreq object in state NEW. If it can not + * be run immediately, it will be enqueued and moved to state QUEUED. + * + * Either way, the thread request object serviced will be moved to state + * BINDING and attached to the uthread. + * + * Should be called with the workqueue lock held. Will drop it. + */ +__attribute__((noreturn, noinline)) +static void +workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq, + struct uthread *uth) +{ + uint32_t setup_flags = 0; + workq_threadreq_t req; + + if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) { + if (uth->uu_workq_flags & UT_WORKQ_NEW) { + setup_flags |= WQ_SETUP_FIRST_USE; + } + uth->uu_workq_flags &= ~(UT_WORKQ_NEW | UT_WORKQ_EARLY_BOUND); + /* + * This pointer is possibly freed and only used for tracing purposes. 
+ */ + req = uth->uu_save.uus_workq_park_data.thread_request; + workq_unlock(wq); + WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq, + VM_KERNEL_ADDRHIDE(req), 0, 0, 0); + goto run; + } else if (_wq_exiting(wq)) { + WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0); + } else if (wq->wq_reqcount == 0) { + WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0); + } else if ((req = workq_threadreq_select(wq, uth)) == NULL) { + WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0); + } else { + WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq, + workq_trace_req_id(req), 0, 0, 0); + if (uth->uu_workq_flags & UT_WORKQ_NEW) { + uth->uu_workq_flags ^= UT_WORKQ_NEW; + setup_flags |= WQ_SETUP_FIRST_USE; + } + workq_thread_reset_cpupercent(req, uth); + workq_threadreq_bind_and_unlock(p, wq, req, uth); +run: + workq_setup_and_run(p, uth, setup_flags); + __builtin_unreachable(); + } + + workq_park_and_unlock(p, wq, uth); + __builtin_unreachable(); +} + +static bool +workq_creator_should_yield(struct workqueue *wq, struct uthread *uth) +{ + thread_qos_t qos = workq_pri_override(uth->uu_workq_pri); + + if (qos >= THREAD_QOS_USER_INTERACTIVE) { + return false; + } + + uint32_t snapshot = uth->uu_save.uus_workq_park_data.fulfilled_snapshot; + if (wq->wq_fulfilled == snapshot) { + return false; + } + + uint32_t cnt = 0, conc = wq_max_parallelism[_wq_bucket(qos)]; + if (wq->wq_fulfilled - snapshot > conc) { + /* we fulfilled more than NCPU requests since being dispatched */ + WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 1, + wq->wq_fulfilled, snapshot, 0); + return true; + } + + for (int i = _wq_bucket(qos); i < WORKQ_NUM_QOS_BUCKETS; i++) { + cnt += wq->wq_thscheduled_count[i]; + } + if (conc <= cnt) { + /* We fulfilled requests and have more than NCPU scheduled threads */ + WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 2, + wq->wq_fulfilled, snapshot, 0); + return true; + } + + return false; +} + +/** + * parked 
thread wakes up + */ +__attribute__((noreturn, noinline)) +static void +workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused) +{ + struct uthread *uth = current_uthread(); + proc_t p = current_proc(); + struct workqueue *wq = proc_get_wqptr_fast(p); + + workq_lock_spin(wq); + + if (wq->wq_creator == uth && workq_creator_should_yield(wq, uth)) { + /* + * If the number of threads we have out are able to keep up with the + * demand, then we should avoid sending this creator thread to + * userspace. + */ + uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled; + uth->uu_save.uus_workq_park_data.yields++; + workq_unlock(wq); + thread_yield_with_continuation(workq_unpark_continue, NULL); + __builtin_unreachable(); + } + + if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) { + workq_select_threadreq_or_park_and_unlock(p, wq, uth); + __builtin_unreachable(); + } + + if (__probable(wr == THREAD_AWAKENED)) { + /* + * We were set running, but for the purposes of dying. + */ + assert(uth->uu_workq_flags & UT_WORKQ_DYING); + assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0); + } else { + /* + * workaround for , + * in case we do hit userspace, make sure calling + * workq_thread_terminate() does the right thing here, + * and if we never call it, that workq_exit() will too because it sees + * this thread on the runlist. + */ + assert(wr == THREAD_INTERRUPTED); + wq->wq_thdying_count++; + uth->uu_workq_flags |= UT_WORKQ_DYING; + } + + workq_unpark_for_death_and_unlock(p, wq, uth, + WORKQ_UNPARK_FOR_DEATH_WAS_IDLE); + __builtin_unreachable(); +} + +__attribute__((noreturn, noinline)) +static void +workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags) +{ + thread_t th = uth->uu_thread; + vm_map_t vmap = get_task_map(p->task); + + if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) { + /* + * For preemption reasons, we want to reset the voucher as late as + * possible, so we do it in two places: + * - Just before parking (i.e. 
in workq_park_and_unlock()) + * - Prior to doing the setup for the next workitem (i.e. here) + * + * Those two places are sufficient to ensure we always reset it before + * it goes back out to user space, but be careful to not break that + * guarantee. + */ + __assert_only kern_return_t kr; + kr = thread_set_voucher_name(MACH_PORT_NULL); + assert(kr == KERN_SUCCESS); + } + + uint32_t upcall_flags = uth->uu_save.uus_workq_park_data.upcall_flags; + if (!(setup_flags & WQ_SETUP_FIRST_USE)) { + upcall_flags |= WQ_FLAG_THREAD_REUSE; + } + + if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) { + /* + * For threads that have an outside-of-QoS thread priority, indicate + * to userspace that setting QoS should only affect the TSD and not + * change QOS in the kernel. + */ + upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS; + } else { + /* + * Put the QoS class value into the lower bits of the reuse_thread + * register, this is where the thread priority used to be stored + * anyway. + */ + upcall_flags |= uth->uu_save.uus_workq_park_data.qos | + WQ_FLAG_THREAD_PRIO_QOS; + } + + if (uth->uu_workq_thport == MACH_PORT_NULL) { + /* convert_thread_to_port() consumes a reference */ + thread_reference(th); + ipc_port_t port = convert_thread_to_port(th); + uth->uu_workq_thport = ipc_port_copyout_send(port, get_task_ipcspace(p->task)); + } + + /* + * Call out to pthread, this sets up the thread, pulls in kevent structs + * onto the stack, sets up the thread state and then returns to userspace. 
+ */ + WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START, + proc_get_wqptr_fast(p), 0, 0, 0, 0); + thread_sched_call(th, workq_sched_callback); + pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr, + uth->uu_workq_thport, 0, setup_flags, upcall_flags); + + __builtin_unreachable(); +} + +#pragma mark misc + +int +fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo) +{ + struct workqueue *wq = proc_get_wqptr(p); + int error = 0; + int activecount; + + if (wq == NULL) { + return EINVAL; + } + + /* + * This is sometimes called from interrupt context by the kperf sampler. + * In that case, it's not safe to spin trying to take the lock since we + * might already hold it. So, we just try-lock it and error out if it's + * already held. Since this is just a debugging aid, and all our callers + * are able to handle an error, that's fine. + */ + bool locked = workq_lock_try(wq); + if (!locked) { + return EBUSY; + } + + wq_thactive_t act = _wq_thactive(wq); + activecount = _wq_thactive_aggregate_downto_qos(wq, act, + WORKQ_THREAD_QOS_MIN, NULL, NULL); + if (act & _wq_thactive_offset_for_qos(WORKQ_THREAD_QOS_MANAGER)) { + activecount++; + } + pwqinfo->pwq_nthreads = wq->wq_nthreads; + pwqinfo->pwq_runthreads = activecount; + pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount; + pwqinfo->pwq_state = 0; + + if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { + pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + } + + if (wq->wq_nthreads >= wq_max_threads) { + pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT; + } + + workq_unlock(wq); + return error; +} + +boolean_t +workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total, + boolean_t *exceeded_constrained) +{ + proc_t p = v; + struct proc_workqueueinfo pwqinfo; + int err; + + assert(p != NULL); + assert(exceeded_total != NULL); + assert(exceeded_constrained != NULL); + + err = fill_procworkqueue(p, &pwqinfo); + if (err) { + return 
FALSE; + } + if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) { + return FALSE; + } + + *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT); + *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT); + + return TRUE; +} + +uint32_t +workqueue_get_pwq_state_kdp(void * v) +{ + static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) == + kTaskWqExceededConstrainedThreadLimit); + static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) == + kTaskWqExceededTotalThreadLimit); + static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable); + static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT | + WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7); + + if (v == NULL) { + return 0; + } + + proc_t p = v; + struct workqueue *wq = proc_get_wqptr(p); + + if (wq == NULL || workq_lock_spin_is_acquired_kdp(wq)) { + return 0; + } + + uint32_t pwq_state = WQ_FLAGS_AVAILABLE; + + if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { + pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + } + + if (wq->wq_nthreads >= wq_max_threads) { + pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT; + } + + return pwq_state; +} + +void +workq_init(void) +{ + workq_lck_grp_attr = lck_grp_attr_alloc_init(); + workq_lck_attr = lck_attr_alloc_init(); + workq_lck_grp = lck_grp_alloc_init("workq", workq_lck_grp_attr); + + workq_zone_workqueue = zinit(sizeof(struct workqueue), + 1024 * sizeof(struct workqueue), 8192, "workq.wq"); + workq_zone_threadreq = zinit(sizeof(struct workq_threadreq_s), + 1024 * sizeof(struct workq_threadreq_s), 8192, "workq.threadreq"); + + clock_interval_to_absolutetime_interval(wq_stalled_window.usecs, + NSEC_PER_USEC, &wq_stalled_window.abstime); + clock_interval_to_absolutetime_interval(wq_reduce_pool_window.usecs, + NSEC_PER_USEC, &wq_reduce_pool_window.abstime); + clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs, + NSEC_PER_USEC, &wq_max_timer_interval.abstime); +} diff --git 
a/bsd/pthread/workqueue_internal.h b/bsd/pthread/workqueue_internal.h new file mode 100644 index 000000000..a072d35ef --- /dev/null +++ b/bsd/pthread/workqueue_internal.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2014 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _WORKQUEUE_INTERNAL_H_ +#define _WORKQUEUE_INTERNAL_H_ + +// Sometimes something gets passed a bucket number and we need a way to express +// that it's actually the event manager. Use the (0)th bucket for that. 
+#define WORKQ_THREAD_QOS_MIN (THREAD_QOS_MAINTENANCE) +#define WORKQ_THREAD_QOS_MAX (THREAD_QOS_LAST) +#define WORKQ_THREAD_QOS_CLEANUP (THREAD_QOS_LEGACY) +#define WORKQ_THREAD_QOS_ABOVEUI (THREAD_QOS_LAST) +#define WORKQ_THREAD_QOS_MANAGER (THREAD_QOS_LAST + 1) // outside of MIN/MAX + +#define WORKQ_NUM_QOS_BUCKETS (WORKQ_THREAD_QOS_MAX - 1) // MT/BG shared +#define WORKQ_NUM_BUCKETS (WORKQ_NUM_QOS_BUCKETS + 1) // + mgr + +/* These definitions are only available to the kext, to avoid bleeding + * constants and types across the boundary to the userspace library. + */ +#ifdef KERNEL +#pragma mark wq structs + +/* These defines come from kern/thread.h but are XNU_KERNEL_PRIVATE so do not get + * exported to kernel extensions. + */ +#define SCHED_CALL_BLOCK 0x1 +#define SCHED_CALL_UNBLOCK 0x2 + +/* old workq priority scheme */ + +#define WORKQUEUE_HIGH_PRIOQUEUE 0 /* high priority queue */ +#define WORKQUEUE_DEFAULT_PRIOQUEUE 1 /* default priority queue */ +#define WORKQUEUE_LOW_PRIOQUEUE 2 /* low priority queue */ +#define WORKQUEUE_BG_PRIOQUEUE 3 /* background priority queue */ + +/* wq_max_constrained_threads = max(64, N_CPU * WORKQUEUE_CONSTRAINED_FACTOR) + * This used to be WORKQ_NUM_BUCKETS + 1 when NUM_BUCKETS was 4, yielding + * N_CPU * 5. When NUM_BUCKETS changed, we decided that the limit should + * not change. So the factor is now always 5. + */ +#define WORKQUEUE_CONSTRAINED_FACTOR 5 + +#if BSD_KERNEL_PRIVATE +#include +#include +#include +#include +#include +#include + +/* struct uthread::uu_workq_flags */ +#define UT_WORKQ_NEW 0x01 /* First return to userspace */ +#define UT_WORKQ_RUNNING 0x02 /* On thrunlist, not parked. */ +#define UT_WORKQ_DYING 0x04 /* Thread is being killed */ +#define UT_WORKQ_OVERCOMMIT 0x08 /* Overcommit thread. 
*/ +#define UT_WORKQ_OUTSIDE_QOS 0x10 /* Thread should avoid send QoS changes to kernel */ +#define UT_WORKQ_IDLE_CLEANUP 0x20 /* Thread is removing its voucher or stack */ +#define UT_WORKQ_EARLY_BOUND 0x40 /* Thread has been bound early */ +#define UT_WORKQ_CPUPERCENT 0x80 /* Thread has CPU percent policy active */ + +typedef union workq_threadreq_param_s { + struct { + uint16_t trp_flags; + uint8_t trp_pri; + uint8_t trp_pol; + uint32_t trp_cpupercent: 8, + trp_refillms: 24; + }; + uint64_t trp_value; +} workq_threadreq_param_t; + +#define TRP_PRIORITY 0x1 +#define TRP_POLICY 0x2 +#define TRP_CPUPERCENT 0x4 +#define TRP_RELEASED 0x8000 + +typedef struct workq_threadreq_s { + union { + struct priority_queue_entry tr_entry; + thread_t tr_binding_thread; + }; + uint32_t tr_flags; + uint8_t tr_state; + thread_qos_t tr_qos; + uint16_t tr_count; +} *workq_threadreq_t; + +TAILQ_HEAD(threadreq_head, workq_threadreq_s); + +#define TR_STATE_IDLE 0 /* request isn't in flight */ +#define TR_STATE_NEW 1 /* request is being initiated */ +#define TR_STATE_QUEUED 2 /* request is being queued */ +#define TR_STATE_BINDING 4 /* request is preposted for bind */ + +#define TR_FLAG_KEVENT 0x01 +#define TR_FLAG_WORKLOOP 0x02 +#define TR_FLAG_OVERCOMMIT 0x04 +#define TR_FLAG_WL_PARAMS 0x08 +#define TR_FLAG_WL_OUTSIDE_QOS 0x10 + +#if defined(__LP64__) +typedef unsigned __int128 wq_thactive_t; +#else +typedef uint64_t wq_thactive_t; +#endif + +typedef enum { + WQ_EXITING = 0x0001, + WQ_PROC_SUSPENDED = 0x0002, + WQ_DEATH_CALL_SCHEDULED = 0x0004, + + WQ_DELAYED_CALL_SCHEDULED = 0x0010, + WQ_DELAYED_CALL_PENDED = 0x0020, + WQ_IMMEDIATE_CALL_SCHEDULED = 0x0040, + WQ_IMMEDIATE_CALL_PENDED = 0x0080, +} workq_state_flags_t; + +TAILQ_HEAD(workq_uthread_head, uthread); + +struct workqueue { + thread_call_t wq_delayed_call; + thread_call_t wq_immediate_call; + thread_call_t wq_death_call; + struct turnstile *wq_turnstile; + + lck_spin_t wq_lock; + + uint64_t wq_thread_call_last_run; + struct 
os_refcnt wq_refcnt; + workq_state_flags_t _Atomic wq_flags; + uint32_t wq_fulfilled; + uint32_t wq_creations; + uint32_t wq_timer_interval; + uint32_t wq_event_manager_priority; + uint32_t wq_reqcount; /* number of elements on the wq_*_reqlists */ + uint16_t wq_thdying_count; + uint16_t wq_threads_scheduled; + uint16_t wq_constrained_threads_scheduled; + uint16_t wq_nthreads; + uint16_t wq_thidlecount; + uint16_t wq_thscheduled_count[WORKQ_NUM_BUCKETS]; // incl. manager + + _Atomic wq_thactive_t wq_thactive; + _Atomic uint64_t wq_lastblocked_ts[WORKQ_NUM_QOS_BUCKETS]; + + struct proc *wq_proc; + struct uthread *wq_creator; + thread_t wq_turnstile_updater; // thread doing a turnstile_update_ineritor + struct workq_uthread_head wq_thrunlist; + struct workq_uthread_head wq_thnewlist; + struct workq_uthread_head wq_thidlelist; + + struct priority_queue wq_overcommit_queue; + struct priority_queue wq_constrained_queue; + struct priority_queue wq_special_queue; + workq_threadreq_t wq_event_manager_threadreq; +}; + +static_assert(offsetof(struct workqueue, wq_lock) >= sizeof(struct queue_entry), + "Make sure workq_deallocate_enqueue can cast the workqueue"); + +#define WORKQUEUE_MAXTHREADS 512 +#define WQ_STALLED_WINDOW_USECS 200 +#define WQ_REDUCE_POOL_WINDOW_USECS 5000000 +#define WQ_MAX_TIMER_INTERVAL_USECS 50000 + +#pragma mark definitions + +struct kqrequest; +uint32_t _get_pwq_state_kdp(proc_t p); + +void workq_exit(struct proc *p); +void workq_mark_exiting(struct proc *p); + +bool workq_is_exiting(struct proc *p); + +struct turnstile *workq_turnstile(struct proc *p); + +void workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr); + +void workq_thread_terminate(struct proc *p, struct uthread *uth); + +#define WORKQ_THREADREQ_SET_AST_ON_FAILURE 0x01 +#define WORKQ_THREADREQ_ATTEMPT_REBIND 0x02 +#define WORKQ_THREADREQ_CAN_CREATE_THREADS 0x04 +#define WORKQ_THREADREQ_CREATOR_TRANSFER 0x08 +#define WORKQ_THREADREQ_CREATOR_SYNC_UPDATE 0x10 + +// called with 
the kq req lock held +bool workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr, + struct turnstile *ts, thread_qos_t qos, int flags); + +// called with the kq req lock held +void workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr, + thread_qos_t qos, int flags); + +// called with the kq req lock held +void workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr, + thread_t owner, struct turnstile *ts, turnstile_update_flags_t flags); + +void workq_kern_threadreq_lock(struct proc *p); +void workq_kern_threadreq_unlock(struct proc *p); + +void workq_kern_threadreq_redrive(struct proc *p, int flags); + +enum workq_set_self_flags { + WORKQ_SET_SELF_QOS_FLAG = 0x1, + WORKQ_SET_SELF_VOUCHER_FLAG = 0x2, + WORKQ_SET_SELF_FIXEDPRIORITY_FLAG = 0x4, + WORKQ_SET_SELF_TIMESHARE_FLAG = 0x8, + WORKQ_SET_SELF_WQ_KEVENT_UNBIND = 0x10, +}; + +void workq_proc_suspended(struct proc *p); +void workq_proc_resumed(struct proc *p); + +#endif // BSD_KERNEL_PRIVATE + +void workq_init(void); + +#endif // KERNEL + +#endif // _WORKQUEUE_INTERNAL_H_ diff --git a/bsd/pthread/workqueue_syscalls.h b/bsd/pthread/workqueue_syscalls.h new file mode 100644 index 000000000..e8604193e --- /dev/null +++ b/bsd/pthread/workqueue_syscalls.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017 Apple, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _PTHREAD_WORKQUEUE_PRIVATE_H_ +#define _PTHREAD_WORKQUEUE_PRIVATE_H_ + +#if XNU_KERNEL_PRIVATE && !defined(__PTHREAD_EXPOSE_INTERNALS__) +#define __PTHREAD_EXPOSE_INTERNALS__ 1 +#endif // XNU_KERNEL_PRIVATE + +#ifdef __PTHREAD_EXPOSE_INTERNALS__ +/* workq_kernreturn commands */ +#define WQOPS_THREAD_RETURN 0x04 /* parks the thread back into the kernel */ +#define WQOPS_QUEUE_NEWSPISUPP 0x10 /* this is to check for newer SPI support */ +#define WQOPS_QUEUE_REQTHREADS 0x20 /* request number of threads of a prio */ +#define WQOPS_QUEUE_REQTHREADS2 0x30 /* request a number of threads in a given priority bucket */ +#define WQOPS_THREAD_KEVENT_RETURN 0x40 /* parks the thread after delivering the passed kevent array */ +#define WQOPS_SET_EVENT_MANAGER_PRIORITY 0x80 /* max() the provided priority into the priority of the event manager */ +#define WQOPS_THREAD_WORKLOOP_RETURN 0x100 /* parks the thread after delivering the passed kevent array */ +#define WQOPS_SHOULD_NARROW 0x200 /* checks whether we should narrow our concurrency */ + +/* flag values for upcall flags field, only 8 bits per struct threadlist */ +#define WQ_FLAG_THREAD_PRIO_SCHED 0x00008000 +#define WQ_FLAG_THREAD_PRIO_QOS 0x00004000 +#define WQ_FLAG_THREAD_PRIO_MASK 0x00000fff + +#define WQ_FLAG_THREAD_OVERCOMMIT 0x00010000 /* thread is with overcommit
prio */ +#define WQ_FLAG_THREAD_REUSE 0x00020000 /* thread is being reused */ +#define WQ_FLAG_THREAD_NEWSPI 0x00040000 /* the call is with new SPIs */ +#define WQ_FLAG_THREAD_KEVENT 0x00080000 /* thread is response to kevent req */ +#define WQ_FLAG_THREAD_EVENT_MANAGER 0x00100000 /* event manager thread */ +#define WQ_FLAG_THREAD_TSD_BASE_SET 0x00200000 /* tsd base has already been set */ +#define WQ_FLAG_THREAD_WORKLOOP 0x00400000 /* workloop thread */ +#define WQ_FLAG_THREAD_OUTSIDEQOS 0x00800000 /* thread qos changes should not be sent to kernel */ + +#define WQ_KEVENT_LIST_LEN 16 // WORKQ_KEVENT_EVENT_BUFFER_LEN +#define WQ_KEVENT_DATA_SIZE (32 * 1024) + +/* kqueue_workloop_ctl commands */ +#define KQ_WORKLOOP_CREATE 0x01 +#define KQ_WORKLOOP_DESTROY 0x02 + +/* indicate which fields of kq_workloop_create params are valid */ +#define KQ_WORKLOOP_CREATE_SCHED_PRI 0x01 +#define KQ_WORKLOOP_CREATE_SCHED_POL 0x02 +#define KQ_WORKLOOP_CREATE_CPU_PERCENT 0x04 + +struct kqueue_workloop_params { + int kqwlp_version; + int kqwlp_flags; + uint64_t kqwlp_id; + int kqwlp_sched_pri; + int kqwlp_sched_pol; + int kqwlp_cpu_percent; + int kqwlp_cpu_refillms; +} __attribute__((packed)); + +_Static_assert(offsetof(struct kqueue_workloop_params, kqwlp_version) == 0, + "kqwlp_version should be first"); + +int +__workq_open(void); + +int +__workq_kernreturn(int op, void *arg2, int arg3, int arg4); + +int +__kqueue_workloop_ctl(uintptr_t cmd, uint64_t options, void *addr, size_t sz); + +/* SPI flags between WQ and workq_setup_thread in pthread.kext */ +#define WQ_SETUP_FIRST_USE 1 +#define WQ_SETUP_CLEAR_VOUCHER 2 +// was WQ_SETUP_SET_SCHED_CALL 4 +#define WQ_SETUP_EXIT_THREAD 8 + +#endif // __PTHREAD_EXPOSE_INTERNALS__ +#endif // _PTHREAD_WORKQUEUE_PRIVATE_H_ diff --git a/bsd/pthread/workqueue_trace.h b/bsd/pthread/workqueue_trace.h new file mode 100644 index 000000000..6625798c4 --- /dev/null +++ b/bsd/pthread/workqueue_trace.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2017 Apple, 
Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _WORKQUEUE_TRACE_H_ +#define _WORKQUEUE_TRACE_H_ + +// General workqueue tracepoints, mostly for debugging +#define WQ_TRACE_WORKQUEUE_SUBCLASS 1 +// Workqueue request scheduling tracepoints +#define WQ_TRACE_REQUESTS_SUBCLASS 2 +// Generic pthread tracepoints +#define WQ_TRACE_BSDTHREAD_SUBCLASS 16 + +#define TRACE_wq_pthread_exit \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x01) +#define TRACE_wq_workqueue_exit \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x02) +#define TRACE_wq_runthread \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x03) +#define TRACE_wq_death_call \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x05) +#define TRACE_wq_thread_block \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x09) +#define TRACE_wq_thactive_update \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x0a) +#define TRACE_wq_add_timer \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x0b) +#define TRACE_wq_start_add_timer \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x0c) +#define TRACE_wq_override_dispatch \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x14) +#define TRACE_wq_override_reset \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x15) +#define TRACE_wq_thread_create_failed \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x1d) +#define TRACE_wq_thread_terminate \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x1e) +#define TRACE_wq_thread_create \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x1f) +#define TRACE_wq_select_threadreq \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x20) +#define TRACE_wq_creator_select \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x23) +#define TRACE_wq_creator_yield \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x24) +#define TRACE_wq_constrained_admission \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x25) +#define 
TRACE_wq_wqops_reqthreads \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x26) + +#define TRACE_wq_create \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x01) +#define TRACE_wq_destroy \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x02) +#define TRACE_wq_thread_logical_run \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x03) +#define TRACE_wq_thread_request_initiate \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x05) +#define TRACE_wq_thread_request_modify \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x06) +#define TRACE_wq_thread_request_fulfill \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x08) + +#define TRACE_bsdthread_set_qos_self \ + KDBG_CODE(DBG_PTHREAD, WQ_TRACE_BSDTHREAD_SUBCLASS, 0x1) + +#define WQ_TRACE(x,a,b,c,d,e) \ + ({ KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e); }) +#define WQ_TRACE_WQ(x,wq,b,c,d,e) \ + ({ KERNEL_DEBUG_CONSTANT(x, (wq)->wq_proc->p_pid, b, c, d, e); }) + +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define __wq_trace_only +#else // (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) +#define __wq_trace_only __unused +#endif // (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) + +#endif // _WORKQUEUE_TRACE_H_ diff --git a/bsd/security/audit/audit.c b/bsd/security/audit/audit.c index 830c1fc33..e91adedf4 100644 --- a/bsd/security/audit/audit.c +++ b/bsd/security/audit/audit.c @@ -100,6 +100,13 @@ int audit_suspended; int audit_syscalls; au_class_t audit_kevent_mask; +/* + * The audit control mode is used to ensure configuration settings are only + * accepted from appropriate sources based on the current mode. + */ +au_ctlmode_t audit_ctl_mode; +au_expire_after_t audit_expire_after; + /* * Flags controlling behavior in low storage situations. Should we panic if * a write fails? Should we fail stop if we're out of disk space? 
@@ -274,6 +281,7 @@ audit_record_dtor(struct kaudit_record *ar) free(ar->k_ar.ar_arg_argv, M_AUDITTEXT); if (ar->k_ar.ar_arg_envv != NULL) free(ar->k_ar.ar_arg_envv, M_AUDITTEXT); + audit_identity_info_destruct(&ar->k_ar.ar_arg_identity); } /* @@ -294,6 +302,10 @@ audit_init(void) audit_in_failure = 0; audit_argv = 0; audit_arge = 0; + audit_ctl_mode = AUDIT_CTLMODE_NORMAL; + audit_expire_after.age = 0; + audit_expire_after.size = 0; + audit_expire_after.op_type = AUDIT_EXPIRE_OP_AND; audit_fstat.af_filesz = 0; /* '0' means unset, unbounded. */ audit_fstat.af_currsz = 0; @@ -610,7 +622,7 @@ audit_syscall_enter(unsigned int code, proc_t proc, struct uthread *uthread) * the syscall table(s). This table is generated by makesyscalls.sh * from syscalls.master and stored in audit_kevents.c. */ - if (code > nsysent) + if (code >= nsysent) return; event = sys_au_event[code]; if (event == AUE_NULL) @@ -668,6 +680,14 @@ audit_syscall_enter(unsigned int code, proc_t proc, struct uthread *uthread) uthread->uu_ar = audit_new(event, proc, uthread); } + /* + * All audited events will contain an identity + * + * Note: Identity should be obtained prior to the syscall implementation + * being called to handle cases like execve(2) where the process changes + */ + AUDIT_ARG(identity); + out: kauth_cred_unref(&cred); } diff --git a/bsd/security/audit/audit.h b/bsd/security/audit/audit.h index 61e818f60..6a60b36c6 100644 --- a/bsd/security/audit/audit.h +++ b/bsd/security/audit/audit.h @@ -130,6 +130,7 @@ extern int audit_syscalls; #define ARG_DATA 0x0010000000000000ULL /* darwin-only */ #define ARG_ADDR64 0x0020000000000000ULL /* darwin-only */ #define ARG_FD2 0x0040000000000000ULL /* darwin-only */ +#define ARG_IDENTITY 0x0080000000000000ULL /* darwin-only */ #define ARG_NONE 0x0000000000000000ULL #define ARG_ALL 0xFFFFFFFFFFFFFFFFULL @@ -242,6 +243,7 @@ void audit_arg_argv(struct kaudit_record *ar, char *argv, int argc, int length); void audit_arg_envv(struct kaudit_record *ar, char 
*envv, int envc, int length); +void audit_arg_identity(struct kaudit_record *ar); void audit_arg_mach_port1(struct kaudit_record *ar, mach_port_name_t port); void audit_arg_mach_port2(struct kaudit_record *ar, mach_port_name_t port); @@ -323,7 +325,7 @@ extern au_event_t sys_au_event[]; if (AUDIT_SYSCALLS()) { \ struct kaudit_record *__ar = AUDIT_RECORD(); \ if (AUDIT_AUDITING(__ar)) \ - audit_arg_ ## op (__ar, args); \ + audit_arg_ ## op (__ar, ## args); \ } \ } while (0) diff --git a/bsd/security/audit/audit_arg.c b/bsd/security/audit/audit_arg.c index 7e338fd2b..950d1f49f 100644 --- a/bsd/security/audit/audit_arg.c +++ b/bsd/security/audit/audit_arg.c @@ -59,6 +59,8 @@ #include #include #include +#include +#include #include #include @@ -900,4 +902,91 @@ audit_sysclose(struct kaudit_record *ar, proc_t p, int fd) fp_drop(p, fd, fp, 0); } +void +audit_identity_info_destruct(struct au_identity_info *id_info) +{ + if (!id_info) { + return; + } + + if (id_info->signing_id != NULL) { + free(id_info->signing_id, M_AUDITTEXT); + id_info->signing_id = NULL; + } + + if (id_info->team_id != NULL) { + free(id_info->team_id, M_AUDITTEXT); + id_info->team_id = NULL; + } + + if (id_info->cdhash != NULL) { + free(id_info->cdhash, M_AUDITDATA); + id_info->cdhash = NULL; + } +} + +void +audit_identity_info_construct(struct au_identity_info *id_info) +{ + struct proc *p; + struct cs_blob *blob; + unsigned int signer_type = 0; + const char *signing_id = NULL; + const char* team_id = NULL; + const uint8_t *cdhash = NULL; + size_t src_len = 0; + + p = current_proc(); + blob = csproc_get_blob(p); + if (blob) { + signing_id = csblob_get_identity(blob); + cdhash = csblob_get_cdhash(blob); + team_id = csblob_get_teamid(blob); + signer_type = csblob_get_platform_binary(blob) ? 
1 : 0; + } + + id_info->signer_type = signer_type; + + if (id_info->signing_id == NULL && signing_id != NULL) { + id_info->signing_id = malloc( MAX_AU_IDENTITY_SIGNING_ID_LENGTH, + M_AUDITTEXT, M_WAITOK); + if (id_info->signing_id != NULL) { + src_len = strlcpy(id_info->signing_id, + signing_id, MAX_AU_IDENTITY_SIGNING_ID_LENGTH); + + if (src_len >= MAX_AU_IDENTITY_SIGNING_ID_LENGTH) { + id_info->signing_id_trunc = 1; + } + } + } + + if (id_info->team_id == NULL && team_id != NULL) { + id_info->team_id = malloc(MAX_AU_IDENTITY_TEAM_ID_LENGTH, + M_AUDITTEXT, M_WAITOK); + if (id_info->team_id != NULL) { + src_len = strlcpy(id_info->team_id, team_id, + MAX_AU_IDENTITY_TEAM_ID_LENGTH); + + if (src_len >= MAX_AU_IDENTITY_TEAM_ID_LENGTH) { + id_info->team_id_trunc = 1; + } + } + } + + if (id_info->cdhash == NULL && cdhash != NULL) { + id_info->cdhash = malloc(CS_CDHASH_LEN, M_AUDITDATA, M_WAITOK); + if (id_info->cdhash != NULL) { + memcpy(id_info->cdhash, cdhash, CS_CDHASH_LEN); + id_info->cdhash_len = CS_CDHASH_LEN; + } + } +} + +void +audit_arg_identity(struct kaudit_record *ar) +{ + audit_identity_info_construct(&ar->k_ar.ar_arg_identity); + ARG_SET_VALID(ar, ARG_IDENTITY); +} + #endif /* CONFIG_AUDIT */ diff --git a/bsd/security/audit/audit_bsd.c b/bsd/security/audit/audit_bsd.c index 64b4e9f05..5a6ea3750 100644 --- a/bsd/security/audit/audit_bsd.c +++ b/bsd/security/audit/audit_bsd.c @@ -234,8 +234,9 @@ _audit_free(void *addr, __unused au_malloc_type_t *type) return; hdr = addr; hdr--; - KASSERT(hdr->mh_magic == AUDIT_MHMAGIC, - ("_audit_free(): hdr->mh_magic != AUDIT_MHMAGIC")); + if (hdr->mh_magic != AUDIT_MHMAGIC) { + panic("_audit_free(): hdr->mh_magic (%lx) != AUDIT_MHMAGIC", hdr->mh_magic); + } #if AUDIT_MALLOC_DEBUG if (type != NULL) { diff --git a/bsd/security/audit/audit_bsm.c b/bsd/security/audit/audit_bsm.c index edebfd61b..60c8dbf14 100644 --- a/bsd/security/audit/audit_bsm.c +++ b/bsd/security/audit/audit_bsm.c @@ -263,6 +263,18 @@ kau_free(struct 
au_record *rec) } \ } while (0) +#define VNODE2_PATH_TOKENS do { \ + if (ARG_IS_VALID(kar, ARG_KPATH2)) { \ + tok = au_to_path(ar->ar_arg_kpath2); \ + kau_write(rec, tok); \ + } \ + if (ARG_IS_VALID(kar, ARG_VNODE2)) { \ + tok = au_to_attr32(&ar->ar_arg_vnode2); \ + kau_write(rec, tok); \ + MAC_VNODE2_LABEL_TOKEN; \ + } \ +} while (0) + #define FD_VNODE1_TOKENS do { \ if (ARG_IS_VALID(kar, ARG_VNODE1)) { \ if (ARG_IS_VALID(kar, ARG_KPATH1)) { \ @@ -983,6 +995,12 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) kau_write(rec, tok); } UPATH1_VNODE1_TOKENS; + VNODE2_PATH_TOKENS; + if (ARG_IS_VALID(kar, ARG_DATA)) { + tok = au_to_data(AUP_HEX, ar->ar_arg_data_type, + ar->ar_arg_data_count, ar->ar_arg_data); + kau_write(rec, tok); + } break; case AUE_FCHMOD_EXTENDED: @@ -2020,6 +2038,14 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) tok = au_to_return32(au_errno_to_bsm(ar->ar_errno), ar->ar_retval); kau_write(rec, tok); /* Every record gets a return token */ + if (ARG_IS_VALID(kar, ARG_IDENTITY)) { + struct au_identity_info *id = &ar->ar_arg_identity; + tok = au_to_identity(id->signer_type, id->signing_id, + id->signing_id_trunc, id->team_id, id->team_id_trunc, + id->cdhash, id->cdhash_len); + kau_write(rec, tok); + } + kau_close(rec, &ar->ar_endtime, ar->ar_event); *pau = rec; @@ -2027,25 +2053,47 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) } /* - * Verify that a record is a valid BSM record. This verification is simple - * now, but may be expanded on sometime in the future. Return 1 if the + * Verify that a record is a valid BSM record. Return 1 if the * record is good, 0 otherwise. */ int -bsm_rec_verify(void *rec) +bsm_rec_verify(void *rec, int length) { - char c = *(char *)rec; + /* Used to partially deserialize the buffer */ + struct hdr_tok_partial *hdr; + struct trl_tok_partial *trl; - /* - * Check the token ID of the first token; it has to be a header - * token. 
- * - * XXXAUDIT There needs to be a token structure to map a token. - * XXXAUDIT 'Shouldn't be simply looking at the first char. - */ - if ((c != AUT_HEADER32) && (c != AUT_HEADER32_EX) && - (c != AUT_HEADER64) && (c != AUT_HEADER64_EX)) + /* A record requires a complete header and trailer token */ + if (length < (AUDIT_HEADER_SIZE + AUDIT_TRAILER_SIZE)) { + return (0); + } + + hdr = (struct hdr_tok_partial*)rec; + + /* Ensure the provided length matches what the record shows */ + if ((uint32_t)length != ntohl(hdr->len)) { + return (0); + } + + trl = (struct trl_tok_partial*)(rec + (length - AUDIT_TRAILER_SIZE)); + + /* Ensure the buffer contains what look like header and trailer tokens */ + if (((hdr->type != AUT_HEADER32) && (hdr->type != AUT_HEADER32_EX) && + (hdr->type != AUT_HEADER64) && (hdr->type != AUT_HEADER64_EX)) || + (trl->type != AUT_TRAILER)) { return (0); + } + + /* Ensure the header and trailer agree on the length */ + if (hdr->len != trl->len) { + return (0); + } + + /* Ensure the trailer token has a proper magic value */ + if (ntohs(trl->magic) != AUT_TRAILER_MAGIC) { + return (0); + } + return (1); } #endif /* CONFIG_AUDIT */ diff --git a/bsd/security/audit/audit_bsm_klib.c b/bsd/security/audit/audit_bsm_klib.c index 4a8187f44..30787a14c 100644 --- a/bsd/security/audit/audit_bsm_klib.c +++ b/bsd/security/audit/audit_bsm_klib.c @@ -53,6 +53,7 @@ #include #include #include +#include #if CONFIG_AUDIT /* @@ -103,10 +104,48 @@ au_event_class(au_event_t event) return (class); } +/* + * Return a new class mask that allows changing the reserved class bit + * only if the current task is entitled to do so or if this is being done + * from the kernel task. If the current task is not allowed to make the + * change, the reserved bit is reverted to its previous state and the rest + * of the mask is left intact. 
+ */ +static au_class_t +au_class_protect(au_class_t old_class, au_class_t new_class) +{ + au_class_t result = new_class; + + /* Check if the reserved class bit has been flipped */ + if ((old_class & AU_CLASS_MASK_RESERVED) != + (new_class & AU_CLASS_MASK_RESERVED)) { + + task_t task = current_task(); + if (task != kernel_task && + !IOTaskHasEntitlement(task, AU_CLASS_RESERVED_ENTITLEMENT)) { + /* + * If the caller isn't entitled, revert the class bit: + * - First remove the reserved bit from the new_class mask + * - Next get the state of the old_class mask's reserved bit + * - Finally, OR the result from the first two operations + */ + result = (new_class & ~AU_CLASS_MASK_RESERVED) | + (old_class & AU_CLASS_MASK_RESERVED); + } + } + + return result; +} + /* * Insert a event to class mapping. If the event already exists in the * mapping, then replace the mapping with the new one. * + * IMPORTANT: This function should only be called from the kernel during + * initialization (e.g. during au_evclassmap_init). Calling afterwards can + * have adverse effects on other system components that rely on event/class + * map state. + * * XXX There is currently no constraints placed on the number of mappings. * May want to either limit to a number, or in terms of memory usage. */ @@ -135,7 +174,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class) evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { - evc->class = class; + evc->class = au_class_protect(evc->class, class); EVCLASS_WUNLOCK(); free(evc_new, M_AUDITEVCLASS); return; @@ -143,7 +182,11 @@ au_evclassmap_insert(au_event_t event, au_class_t class) } evc = evc_new; evc->event = event; - evc->class = class; + /* + * Mappings that require a new element must use 0 as the "old_class" since + * there is no previous state. 
+ */ + evc->class = au_class_protect(0, class); LIST_INSERT_HEAD(&evcl->head, evc, entry); EVCLASS_WUNLOCK(); } diff --git a/bsd/security/audit/audit_bsm_token.c b/bsd/security/audit/audit_bsm_token.c index cf0b781a8..487349595 100644 --- a/bsd/security/audit/audit_bsm_token.c +++ b/bsd/security/audit/audit_bsm_token.c @@ -1158,7 +1158,6 @@ au_to_exec_strings(const char *strs, int count, u_char type) token_t * au_to_exec_args(char *args, int argc) { - return (au_to_exec_strings(args, argc, AUT_EXEC_ARGS)); } @@ -1170,9 +1169,30 @@ au_to_exec_args(char *args, int argc) token_t * au_to_exec_env(char *envs, int envc) { - return (au_to_exec_strings(envs, envc, AUT_EXEC_ENV)); } + +/* + * token ID 1 byte + * count 4 bytes + * text count null-terminated strings + */ +token_t * +au_to_certificate_hash(char *hashes, int hashc) +{ + return (au_to_exec_strings(hashes, hashc, AUT_CERT_HASH)); +} + +/* + * token ID 1 byte + * count 4 bytes + * text count null-terminated strings + */ +token_t * +au_to_krb5_principal(char *principals, int princ) +{ + return (au_to_exec_strings(principals, princ, AUT_KRB5_PRINCIPAL)); +} #else /* * token ID 1 byte @@ -1273,6 +1293,69 @@ au_to_exec_env(char **envp) } #endif /* !(defined(_KERNEL) || defined(KERNEL)) */ +/* + * token ID 1 byte + * signer type 4 bytes + * signer id length 2 bytes + * signer id n bytes + * signer id truncated 1 byte + * team id length 2 bytes + * team id n bytes + * team id truncated 1 byte + * cdhash length 2 bytes + * cdhash n bytes + */ +token_t* +au_to_identity(uint32_t signer_type, const char* signing_id, + u_char signing_id_trunc, const char* team_id, u_char team_id_trunc, + uint8_t* cdhash, uint16_t cdhash_len) +{ + token_t *t = NULL; + u_char *dptr = NULL; + size_t signing_id_len = 0; + size_t team_id_len = 0; + size_t totlen = 0; + + if (signing_id) { + signing_id_len = strlen(signing_id); + } + + if (team_id) { + team_id_len = strlen(team_id); + } + + totlen = + sizeof(u_char) + // token id + 
sizeof(uint32_t) + // signer type + sizeof(uint16_t) + // signing id length + signing_id_len + // length of signing id to copy + sizeof(u_char) + // null terminator for signing id + sizeof(u_char) + // if signing id truncated + sizeof(uint16_t) + // team id length + team_id_len + // length of team id to copy + sizeof(u_char) + // null terminator for team id + sizeof(u_char) + // if team id truncated + sizeof(uint16_t) + // cdhash length + cdhash_len; // cdhash buffer + + GET_TOKEN_AREA(t, dptr, totlen); + + ADD_U_CHAR(dptr, AUT_IDENTITY); // token id + ADD_U_INT32(dptr, signer_type); // signer type + ADD_U_INT16(dptr, signing_id_len + 1); // signing id length+null + ADD_STRING(dptr, signing_id, signing_id_len); // truncated signing id + ADD_U_CHAR(dptr, 0); // null terminator byte + ADD_U_CHAR(dptr, signing_id_trunc); // if signing id is trunc + ADD_U_INT16(dptr, team_id_len + 1); // team id length+null + ADD_STRING(dptr, team_id, team_id_len); // truncated team id + ADD_U_CHAR(dptr, 0); // null terminator byte + ADD_U_CHAR(dptr, team_id_trunc); // if team id is trunc + ADD_U_INT16(dptr, cdhash_len); // cdhash length + ADD_MEM(dptr, cdhash, cdhash_len); // cdhash + + return (t); +} + /* * token ID 1 byte * record byte count 4 bytes diff --git a/bsd/security/audit/audit_private.h b/bsd/security/audit/audit_private.h index 8a5a556d8..8b58b79c1 100644 --- a/bsd/security/audit/audit_private.h +++ b/bsd/security/audit/audit_private.h @@ -71,6 +71,8 @@ extern int audit_panic_on_write_fail; extern int audit_fail_stop; extern int audit_argv; extern int audit_arge; +extern au_ctlmode_t audit_ctl_mode; +extern au_expire_after_t audit_expire_after; /* * Kernel mask that is used to check to see if system calls need to be audited.
@@ -182,6 +184,8 @@ union auditon_udata { au_stat_t au_stat; au_fstat_t au_fstat; auditinfo_addr_t au_kau_info; + au_ctlmode_t au_ctl_mode; + au_expire_after_t au_expire_after; }; struct posix_ipc_perm { @@ -190,6 +194,16 @@ struct posix_ipc_perm { mode_t pipc_mode; }; +struct au_identity_info { + u_int32_t signer_type; + char *signing_id; + u_char signing_id_trunc; + char *team_id; + u_char team_id_trunc; + u_int8_t *cdhash; + u_int16_t cdhash_len; +}; + struct audit_record { /* Audit record header. */ u_int32_t ar_magic; @@ -285,6 +299,7 @@ struct audit_record { LIST_HEAD(mac_audit_record_list_t, mac_audit_record) *ar_mac_records; int ar_forced_by_mac; #endif + struct au_identity_info ar_arg_identity; }; /* @@ -333,7 +348,7 @@ struct kaudit_record *audit_new(int event, proc_t p, struct uthread *td); */ struct au_record; int kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau); -int bsm_rec_verify(void *rec); +int bsm_rec_verify(void *rec, int length); /* * Kernel versions of the libbsm audit record functions. @@ -421,6 +436,10 @@ void audit_free(struct kaudit_record *ar); void audit_rotate_vnode(struct ucred *cred, struct vnode *vp); void audit_worker_init(void); +void audit_identity_info_construct( + struct au_identity_info *id_info); +void audit_identity_info_destruct( + struct au_identity_info *id_info); /* * Audit pipe functions. @@ -459,6 +478,36 @@ int audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia); #define ASSIGNED_ASID_MIN (PID_MAX + 1) #define ASSIGNED_ASID_MAX (0xFFFFFFFF - 1) +/* + * Entitlement required to control various audit subsystem settings + */ +#define AU_CLASS_RESERVED_ENTITLEMENT "com.apple.private.dz.audit" + +/* + * Entitlement required to control auditctl sys call + */ +#define AU_AUDITCTL_RESERVED_ENTITLEMENT "com.apple.private.protected-audit-control" + +/* + * Max sizes used by the kernel for signing id and team id values of the + * identity tokens. These lengths include space for the null terminator. 
+ */ +#define MAX_AU_IDENTITY_SIGNING_ID_LENGTH 129 +#define MAX_AU_IDENTITY_TEAM_ID_LENGTH 17 + +struct __attribute__((__packed__)) hdr_tok_partial { + u_char type; + uint32_t len; +}; +static_assert(sizeof(struct hdr_tok_partial) == 5); + +struct __attribute__((__packed__)) trl_tok_partial { + u_char type; + uint16_t magic; + uint32_t len; +}; +static_assert(sizeof(struct trl_tok_partial) == 7); + #endif /* defined(KERNEL) || defined(_KERNEL) */ #endif /* ! _SECURITY_AUDIT_PRIVATE_H_ */ diff --git a/bsd/security/audit/audit_syscalls.c b/bsd/security/audit/audit_syscalls.c index 2a46a579d..191596b5f 100644 --- a/bsd/security/audit/audit_syscalls.c +++ b/bsd/security/audit/audit_syscalls.c @@ -60,6 +60,7 @@ #include #include +#include #include #include @@ -87,6 +88,8 @@ #include #include +#include + #if CONFIG_AUDIT #define IS_NOT_VALID_PID(p) ((p) < 1 || (p) > PID_MAX) @@ -147,22 +150,44 @@ int audit(proc_t p, struct audit_args *uap, __unused int32_t *retval) { - int error; - void * rec; - struct kaudit_record *ar; - struct uthread *uthr; + int error = 0; + void * rec = NULL; + void * full_rec = NULL; + struct kaudit_record *ar = NULL; + struct uthread *uthr = NULL; + int add_identity_token = 1; + int max_record_length = MAX_AUDIT_RECORD_SIZE; + void *udata = NULL; + u_int ulen = 0; + struct au_identity_info id_info = {0, NULL, 0, NULL, 0, NULL, 0}; + token_t *id_tok = NULL; error = suser(kauth_cred_get(), &p->p_acflag); - if (error) - return (error); + if (error) { + goto free_out; + } mtx_lock(&audit_mtx); - if ((uap->length <= 0) || (uap->length > (int)audit_qctrl.aq_bufsz)) { - mtx_unlock(&audit_mtx); - return (EINVAL); - } + max_record_length = MIN(audit_qctrl.aq_bufsz, MAX_AUDIT_RECORD_SIZE); mtx_unlock(&audit_mtx); + if (IOTaskHasEntitlement(current_task(), + AU_CLASS_RESERVED_ENTITLEMENT)) { + /* Entitled tasks are trusted to add appropriate identity info */ + add_identity_token = 0; + } else { + /* + * If the caller is unentitled, an identity token will 
be added and + * the space must be accounted for + */ + max_record_length -= MAX_AUDIT_IDENTITY_SIZE; + } + + if ((uap->length <= 0) || (uap->length > max_record_length)) { + error = EINVAL; + goto free_out; + } + ar = currecord(); /* @@ -171,8 +196,11 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval) */ if (ar == NULL) { uthr = curthread(); - if (uthr == NULL) /* can this happen? */ - return (ENOTSUP); + if (uthr == NULL) { + /* can this happen? */ + error = ENOTSUP; + goto free_out; + } /* * This is not very efficient; we're required to allocate a @@ -180,32 +208,88 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval) * tag along. */ uthr->uu_ar = audit_new(AUE_NULL, p, uthr); - if (uthr->uu_ar == NULL) - return (ENOTSUP); + if (uthr->uu_ar == NULL) { + error = ENOTSUP; + goto free_out; + } ar = uthr->uu_ar; } - if (uap->length > MAX_AUDIT_RECORD_SIZE) - return (EINVAL); - rec = malloc(uap->length, M_AUDITDATA, M_WAITOK); + if (!rec) { + error = ENOMEM; + goto free_out; + } error = copyin(uap->record, rec, uap->length); - if (error) + if (error) { goto free_out; + } #if CONFIG_MACF error = mac_system_check_audit(kauth_cred_get(), rec, uap->length); - if (error) + if (error) { goto free_out; + } #endif /* Verify the record. 
*/ - if (bsm_rec_verify(rec) == 0) { + if (bsm_rec_verify(rec, uap->length) == 0) { error = EINVAL; goto free_out; } + if (add_identity_token) { + struct hdr_tok_partial *hdr; + struct trl_tok_partial *trl; + int bytes_copied = 0; + + /* Create a new identity token for this buffer */ + audit_identity_info_construct(&id_info); + id_tok = au_to_identity(id_info.signer_type, id_info.signing_id, + id_info.signing_id_trunc, id_info.team_id, id_info.team_id_trunc, + id_info.cdhash, id_info.cdhash_len); + if (!id_tok) { + error = ENOMEM; + goto free_out; + } + + /* Splice the record together using a new buffer */ + full_rec = malloc(uap->length + id_tok->len, M_AUDITDATA, M_WAITOK); + if (!full_rec) { + error = ENOMEM; + goto free_out; + } + + /* Copy the original buffer up to but not including the trailer */ + memcpy(full_rec, rec, uap->length - AUDIT_TRAILER_SIZE); + bytes_copied = uap->length - AUDIT_TRAILER_SIZE; + + /* Copy the identity token */ + memcpy(full_rec + bytes_copied, id_tok->t_data, id_tok->len); + bytes_copied += id_tok->len; + + /* Copy the old trailer */ + memcpy(full_rec + bytes_copied, + rec + (uap->length - AUDIT_TRAILER_SIZE), AUDIT_TRAILER_SIZE); + bytes_copied += AUDIT_TRAILER_SIZE; + + /* Fix the record size stored in the header token */ + hdr = (struct hdr_tok_partial*)full_rec; + hdr->len = htonl(bytes_copied); + + /* Fix the record size stored in the trailer token */ + trl = (struct trl_tok_partial*) + (full_rec + bytes_copied - AUDIT_TRAILER_SIZE); + trl->len = htonl(bytes_copied); + + udata = full_rec; + ulen = bytes_copied; + } else { + udata = rec; + ulen = uap->length; + } + /* * Attach the user audit record to the kernel audit record. Because * this system call is an auditable event, we will write the user @@ -214,8 +298,8 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval) * XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen, * k_ar_commit & AR_COMMIT_USER? 
*/ - ar->k_udata = rec; - ar->k_ulen = uap->length; + ar->k_udata = udata; + ar->k_ulen = ulen; ar->k_ar_commit |= AR_COMMIT_USER; /* @@ -225,14 +309,30 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval) * want to setup kernel based preselection. */ ar->k_ar_commit |= (AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE); - return (0); free_out: /* - * audit_syscall_exit() will free the audit record on the thread even - * if we allocated it above. + * If rec was allocated, it must be freed if an identity token was added + * (since full_rec will be used) OR there was an error (since nothing + * will be attached to the kernel structure). */ - free(rec, M_AUDITDATA); + if (rec && (add_identity_token || error)) { + free(rec, M_AUDITDATA); + } + + /* Only free full_rec if an error occurred */ + if (full_rec && error) { + free(full_rec, M_AUDITDATA); + } + + audit_identity_info_destruct(&id_info); + if (id_tok) { + if (id_tok->t_data) { + free(id_tok->t_data, M_AUDITBSM); + } + free(id_tok, M_AUDITBSM); + } + return (error); } @@ -288,6 +388,8 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) case A_GETSINFO_ADDR: case A_GETSFLAGS: case A_SETSFLAGS: + case A_SETCTLMODE: + case A_SETEXPAFTER: error = copyin(uap->data, (void *)&udata, uap->length); if (error) return (error); @@ -319,6 +421,13 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) * control implemented in audit_session_setaia(). 
*/ break; + case A_SETCTLMODE: + case A_SETEXPAFTER: + if (!IOTaskHasEntitlement(current_task(), + AU_CLASS_RESERVED_ENTITLEMENT)) { + error = EPERM; + } + break; default: error = suser(kauth_cred_get(), &p->p_acflag); break; @@ -326,6 +435,26 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) if (error) return (error); + /* + * If the audit subsystem is in external control mode, additional + * privilege checks are required for a subset of auditon commands + */ + if (audit_ctl_mode == AUDIT_CTLMODE_EXTERNAL) { + switch (uap->cmd) { + case A_SETCOND: + case A_SETFSIZE: + case A_SETPOLICY: + case A_SETQCTRL: + if (!IOTaskHasEntitlement(current_task(), + AU_CLASS_RESERVED_ENTITLEMENT)) { + error = EPERM; + } + break; + } + if (error) + return (error); + } + /* * XXX Need to implement these commands by accessing the global * values associated with the commands. @@ -698,6 +827,56 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) return (error); break; + case A_GETCTLMODE: + if (sizeof(udata.au_ctl_mode) != uap->length) { + return (EINVAL); + } + mtx_lock(&audit_mtx); + udata.au_ctl_mode = audit_ctl_mode; + mtx_unlock(&audit_mtx); + break; + + case A_SETCTLMODE: + if (sizeof(udata.au_ctl_mode) != uap->length) { + return (EINVAL); + } + + mtx_lock(&audit_mtx); + + if (udata.au_ctl_mode == AUDIT_CTLMODE_NORMAL) { + audit_ctl_mode = AUDIT_CTLMODE_NORMAL; + } else if (udata.au_ctl_mode == AUDIT_CTLMODE_EXTERNAL) { + audit_ctl_mode = AUDIT_CTLMODE_EXTERNAL; + } else { + mtx_unlock(&audit_mtx); + return (EINVAL); + } + + mtx_unlock(&audit_mtx); + break; + + case A_GETEXPAFTER: + if (sizeof(udata.au_expire_after) != uap->length) { + return (EINVAL); + } + mtx_lock(&audit_mtx); + udata.au_expire_after.age = audit_expire_after.age; + udata.au_expire_after.size = audit_expire_after.size; + udata.au_expire_after.op_type = audit_expire_after.op_type; + mtx_unlock(&audit_mtx); + break; + + case A_SETEXPAFTER: + if
(sizeof(udata.au_expire_after) != uap->length) { + return (EINVAL); + } + mtx_lock(&audit_mtx); + audit_expire_after.age = udata.au_expire_after.age; + audit_expire_after.size = udata.au_expire_after.size; + audit_expire_after.op_type = udata.au_expire_after.op_type; + mtx_unlock(&audit_mtx); + break; + default: return (EINVAL); } @@ -723,6 +902,8 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) case A_GETKAUDIT: case A_GETSINFO_ADDR: case A_GETSFLAGS: + case A_GETCTLMODE: + case A_GETEXPAFTER: error = copyout((void *)&udata, uap->data, uap->length); if (error) return (ENOSYS); @@ -906,11 +1087,22 @@ auditctl(proc_t p, struct auditctl_args *uap, __unused int32_t *retval) kauth_cred_t cred; struct vnode *vp; int error = 0; + au_ctlmode_t ctlmode; error = suser(kauth_cred_get(), &p->p_acflag); if (error) return (error); + ctlmode = audit_ctl_mode; + + /* + * Do not allow setting of a path when auditing is in reserved mode + */ + if (ctlmode == AUDIT_CTLMODE_EXTERNAL && + !IOTaskHasEntitlement(current_task(), AU_AUDITCTL_RESERVED_ENTITLEMENT)) { + return (EPERM); + } + vp = NULL; cred = NULL; diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index 88c3a51c4..e2ad05581 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -169,6 +169,25 @@ PRIVATE_KERNELFILES = \ fsevents.h \ work_interval.h \ +XNU_ONLY_EXPORTS = \ + bsdtask_info.h \ + file_internal.h \ + filedesc.h \ + guarded.h \ + linker_set.h \ + mount_internal.h \ + munge.h \ + pipe.h \ + proc_internal.h \ + pthread_internal.h \ + resourcevar.h \ + semaphore.h \ + tree.h \ + uio_internal.h \ + ulock.h \ + ux_exception.h \ + vnode_internal.h + # /usr/include INSTALL_MI_LIST = ${DATAFILES} @@ -176,9 +195,7 @@ INSTALL_MI_GEN_LIST = syscall.h _posix_availability.h _symbol_aliasing.h INSTALL_MI_DIR = sys -EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h pthread_internal.h filedesc.h pipe.h resourcevar.h semaphore.h \ - vnode_internal.h proc_internal.h 
file_internal.h mount_internal.h \ - uio_internal.h tree.h munge.h guarded.h ulock.h +EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} ${XNU_ONLY_EXPORTS} EXPORT_MI_GEN_LIST = syscall.h sysproto.h kdebugevents.h diff --git a/bsd/sys/_types/_user64_timex.h b/bsd/sys/_types/_user64_timex.h index 2547592f4..eb1422e5e 100644 --- a/bsd/sys/_types/_user64_timex.h +++ b/bsd/sys/_types/_user64_timex.h @@ -30,19 +30,19 @@ #define _STRUCT_USER64_TIMEX struct user64_timex _STRUCT_USER64_TIMEX { - u_int64_t modes; + u_int32_t modes; user64_long_t offset; user64_long_t freq; user64_long_t maxerror; user64_long_t esterror; - __int64_t status; + __int32_t status; user64_long_t constant; user64_long_t precision; user64_long_t tolerance; user64_long_t ppsfreq; user64_long_t jitter; - __int64_t shift; + __int32_t shift; user64_long_t stabil; user64_long_t jitcnt; user64_long_t calcnt; diff --git a/bsd/sys/bsdtask_info.h b/bsd/sys/bsdtask_info.h index 74c747bfe..7f2edccd2 100644 --- a/bsd/sys/bsdtask_info.h +++ b/bsd/sys/bsdtask_info.h @@ -111,8 +111,8 @@ extern struct vnode *vnode_mountdevvp(struct vnode *); extern int fill_procregioninfo(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid); extern int fill_procregioninfo_onlymappedvnodes(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid); void fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo); -int fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *); -int fill_taskthreadlist(task_t task, void * buffer, int thcount); +int fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *); +int fill_taskthreadlist(task_t task, void * buffer, int thcount, bool thuniqueid); int get_numthreads(task_t); boolean_t bsd_hasthreadname(void *uth); void bsd_getthreadname(void *uth, char* buffer); diff --git 
a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index 4339caf7b..a7f6639e0 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -389,6 +389,27 @@ #define __DEQUALIFY(type, var) __CAST_AWAY_QUALIFIER(var, const volatile, type) #endif +/* + * __alloc_size can be used to label function arguments that represent the + * size of memory that the function allocates and returns. The one-argument + * form labels a single argument that gives the allocation size (where the + * arguments are numbered from 1): + * + * void *malloc(size_t __size) __alloc_size(1); + * + * The two-argument form handles the case where the size is calculated as the + * product of two arguments: + * + * void *calloc(size_t __count, size_t __size) __alloc_size(1,2); + */ +#ifndef __alloc_size +#if __has_attribute(alloc_size) +#define __alloc_size(...) __attribute__((alloc_size(__VA_ARGS__))) +#else +#define __alloc_size(...) +#endif +#endif // __alloc_size + /* * COMPILATION ENVIRONMENTS -- see compat(5) for additional detail * @@ -873,4 +894,24 @@ _Pragma("clang diagnostic pop") #endif +#if defined(PRIVATE) || defined(KERNEL) +/* + * Check if __probable and __improbable have already been defined elsewhere. + * These macros inform the compiler (and humans) about which branches are likely + * to be taken. 
+ */ +#if !defined(__probable) && !defined(__improbable) +#define __probable(x) __builtin_expect(!!(x), 1) +#define __improbable(x) __builtin_expect(!!(x), 0) +#endif /* !defined(__probable) && !defined(__improbable) */ + +#define __container_of(ptr, type, field) ({ \ + const typeof(((type *)0)->field) *__ptr = (ptr); \ + (type *)((uintptr_t)__ptr - offsetof(type, field)); \ + }) + +#endif /* KERNEL || PRIVATE */ + +#define __compiler_barrier() __asm__ __volatile__("" ::: "memory") + #endif /* !_CDEFS_H_ */ diff --git a/bsd/sys/codesign.h b/bsd/sys/codesign.h index f56878e73..069725c7e4 100644 --- a/bsd/sys/codesign.h +++ b/bsd/sys/codesign.h @@ -55,6 +55,7 @@ #define CS_OPS_IDENTITY 11 /* get codesign identity */ #define CS_OPS_CLEARINSTALLER 12 /* clear INSTALLER flag */ #define CS_OPS_CLEARPLATFORM 13 /* clear platform binary status (DEVELOPMENT-only) */ +#define CS_OPS_TEAMID 14 /* get team id */ #define CS_MAX_TEAMID_LEN 64 @@ -71,6 +72,7 @@ __END_DECLS #else /* !KERNEL */ +#include #include #include @@ -82,8 +84,11 @@ struct fileglob; __BEGIN_DECLS int cs_valid(struct proc *); -int cs_enforcement(struct proc *); +int cs_process_enforcement(struct proc *); +int cs_process_global_enforcement(void); +int cs_system_enforcement(void); int cs_require_lv(struct proc *); +int csproc_forced_lv(struct proc* p); int cs_system_require_lv(void); uint32_t cs_entitlement_flags(struct proc *p); int cs_entitlements_blob_get(struct proc *, void **, size_t *); @@ -108,6 +113,11 @@ unsigned int csblob_get_signer_type(struct cs_blob *); void csproc_clear_platform_binary(struct proc *); #endif +void csproc_disable_enforcement(struct proc* p); +void csproc_mark_invalid_allowed(struct proc* p); +int csproc_check_invalid_allowed(struct proc* p); +int csproc_hardened_runtime(struct proc* p); + int csblob_get_entitlements(struct cs_blob *, void **, size_t *); const CS_GenericBlob * @@ -139,6 +149,14 @@ uint8_t csvnode_get_platform_identifier(struct vnode *, off_t); uint8_t 
csproc_get_platform_identifier(struct proc *); extern int cs_debug; +extern int cs_debug_fail_on_unsigned_code; +extern unsigned int cs_debug_unsigned_exec_failures; +extern unsigned int cs_debug_unsigned_mmap_failures; + +int cs_blob_create_validated(vm_address_t* addr, vm_size_t size, + struct cs_blob ** ret_blob, CS_CodeDirectory const **ret_cd); + +void cs_blob_free(struct cs_blob *blob); #ifdef XNU_KERNEL_PRIVATE diff --git a/bsd/sys/csr.h b/bsd/sys/csr.h index 4c7f51ece..9b6c0d0ca 100644 --- a/bsd/sys/csr.h +++ b/bsd/sys/csr.h @@ -50,6 +50,7 @@ typedef uint32_t csr_op_t; #define CSR_ALLOW_DEVICE_CONFIGURATION (1 << 7) #define CSR_ALLOW_ANY_RECOVERY_OS (1 << 8) #define CSR_ALLOW_UNAPPROVED_KEXTS (1 << 9) +#define CSR_ALLOW_EXECUTABLE_POLICY_OVERRIDE (1 << 10) #define CSR_VALID_FLAGS (CSR_ALLOW_UNTRUSTED_KEXTS | \ CSR_ALLOW_UNRESTRICTED_FS | \ @@ -60,7 +61,8 @@ typedef uint32_t csr_op_t; CSR_ALLOW_UNRESTRICTED_NVRAM | \ CSR_ALLOW_DEVICE_CONFIGURATION | \ CSR_ALLOW_ANY_RECOVERY_OS | \ - CSR_ALLOW_UNAPPROVED_KEXTS) + CSR_ALLOW_UNAPPROVED_KEXTS | \ + CSR_ALLOW_EXECUTABLE_POLICY_OVERRIDE) #define CSR_ALWAYS_ENFORCED_FLAGS (CSR_ALLOW_DEVICE_CONFIGURATION | CSR_ALLOW_ANY_RECOVERY_OS) diff --git a/bsd/sys/decmpfs.h b/bsd/sys/decmpfs.h index f30a6decc..1f57e93bf 100644 --- a/bsd/sys/decmpfs.h +++ b/bsd/sys/decmpfs.h @@ -28,10 +28,44 @@ #ifndef _SYS_DECMPFS_H_ #define _SYS_DECMPFS_H_ 1 -#include #include +#include +#include #include +/* + * Please switch on @DECMPFS_ENABLE_KDEBUG_TRACES to enable tracepoints. + * Tracepoints are compiled out by default to eliminate any overhead due to + * kernel tracing. + * + * #define DECMPFS_ENABLE_KDEBUG_TRACES 1 + */ +#if DECMPFS_ENABLE_KDEBUG_TRACES +#define DECMPFS_EMIT_TRACE_ENTRY(D, ...)\ + KDBG_FILTERED((D) | DBG_FUNC_START, ## __VA_ARGS__) +#define DECMPFS_EMIT_TRACE_RETURN(D, ...)\ + KDBG_FILTERED((D) | DBG_FUNC_END, ##__VA_ARGS__) +#else +#define DECMPFS_EMIT_TRACE_ENTRY(D, ...) 
do {} while (0) +#define DECMPFS_EMIT_TRACE_RETURN(D, ...) do {} while (0) +#endif /* DECMPFS_ENABLE_KDEBUG_TRACES */ + +/* + * KERNEL_DEBUG related definitions for decmpfs. + * + * Please NOTE: The Class DBG_FSYSTEM = 3, and Subclass DBG_DECMP = 0x12, so + * these debug codes are of the form 0x0312nnnn. + */ +#define DECMPDBG_CODE(code) FSDBG_CODE(DBG_DECMP, code) + +enum { + DECMPDBG_DECOMPRESS_FILE = DECMPDBG_CODE(0), /* 0x03120000 */ + DECMPDBG_FETCH_COMPRESSED_HEADER = DECMPDBG_CODE(1), /* 0x03120004 */ + DECMPDBG_FETCH_UNCOMPRESSED_DATA = DECMPDBG_CODE(2), /* 0x03120008 */ + DECMPDBG_FREE_COMPRESSED_DATA = DECMPDBG_CODE(4), /* 0x03120010 */ + DECMPDBG_FILE_IS_COMPRESSED = DECMPDBG_CODE(5), /* 0x03120014 */ +}; + #define MAX_DECMPFS_XATTR_SIZE 3802 /* diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index f046e7f76..f1c4c821e 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -78,6 +78,8 @@ * DKIOCGETIOMINSATURATIONBYTECOUNT get minimum byte count to saturate storage bandwidth * * DKIOCGETERRORDESCRIPTION get description of any drive error + * + * DKIOCGETMAXSWAPWRITE get maximum swap file write per day in bytes */ #define DK_FEATURE_BARRIER 0x00000002 @@ -339,6 +341,18 @@ typedef enum { #define DKIOCGETAPFSFLAVOUR _IOR('d', 91, dk_apfs_flavour_t) +// Extent's offset and length returned in bytes +typedef struct dk_apfs_wbc_range { + dev_t dev; // Physical device for extents + uint32_t count; // Number of extents + dk_extent_t extents[2]; // Addresses are relative to device we return +} dk_apfs_wbc_range_t; + +#define DKIOCAPFSGETWBCRANGE _IOR('d', 92, dk_apfs_wbc_range_t) +#define DKIOCAPFSRELEASEWBCRANGE _IO('d', 93) + +#define DKIOCGETMAXSWAPWRITE _IOR('d', 94, uint64_t) + #endif /* PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/sys/dtrace_glue.h b/bsd/sys/dtrace_glue.h index 077e1f3d9..bf6a94092 100644 --- a/bsd/sys/dtrace_glue.h +++ b/bsd/sys/dtrace_glue.h @@ -50,11 +50,8 @@ /* * cmn_err */ -#define CE_CONT 0 /* continuation */ #define CE_NOTE 1 /* notice */ 
#define CE_WARN 2 /* warning */ -#define CE_PANIC 3 /* panic */ -#define CE_IGNORE 4 /* print nothing */ extern void cmn_err( int, const char *, ... ); @@ -69,6 +66,9 @@ extern void cmn_err( int, const char *, ... ); proc_t* sprlock(pid_t pid); void sprunlock(proc_t *p); +void dtrace_sprlock(proc_t *p); +void dtrace_sprunlock(proc_t *p); + /* * uread/uwrite */ @@ -85,15 +85,11 @@ int fuword16(user_addr_t, uint16_t *); int fuword32(user_addr_t, uint32_t *); int fuword64(user_addr_t, uint64_t *); -void fuword8_noerr(user_addr_t, uint8_t *); -void fuword16_noerr(user_addr_t, uint16_t *); void fuword32_noerr(user_addr_t, uint32_t *); void fuword64_noerr(user_addr_t, uint64_t *); int suword64(user_addr_t, uint64_t value); int suword32(user_addr_t, uint32_t value); -int suword16(user_addr_t, uint16_t value); -int suword8(user_addr_t, uint8_t value); /* * cpuvar @@ -233,7 +229,8 @@ typedef struct modctl { #define MODCTL_FBT_PRIVATE_PROBES_PROVIDED 0x80 // fbt private probes have been provided #define MODCTL_FBT_PROVIDE_PRIVATE_PROBES 0x100 // fbt provider must provide private probes #define MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES 0x200 // fbt provider must provide blacklisted probes -#define MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED 0x400 // fbt blacklisted probes have been provided +#define MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED 0x400 // fbt blacklisted probes have been provided +#define MODCTL_IS_STATIC_KEXT 0x800 // module is a static kext /* Simple/singular mod_flags accessors */ #define MOD_IS_MACH_KERNEL(mod) (mod->mod_flags & MODCTL_IS_MACH_KERNEL) @@ -248,6 +245,7 @@ typedef struct modctl { #define MOD_FBT_PROVIDE_PRIVATE_PROBES(mod) (mod->mod_flags & MODCTL_FBT_PROVIDE_PRIVATE_PROBES) #define MOD_FBT_BLACKLISTED_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED) #define MOD_FBT_PROVIDE_BLACKLISTED_PROBES(mod) (mod->mod_flags & MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES) +#define MOD_IS_STATIC_KEXT(mod) (mod->mod_flags & MODCTL_IS_STATIC_KEXT) /* 
Compound accessors */ #define MOD_FBT_PRIVATE_PROBES_DONE(mod) (MOD_FBT_PRIVATE_PROBES_PROVIDED(mod) || !MOD_FBT_PROVIDE_PRIVATE_PROBES(mod)) @@ -258,6 +256,8 @@ typedef struct modctl { extern modctl_t *dtrace_modctl_list; +extern int dtrace_addr_in_module(void*, struct modctl*); + /* * cred_t */ @@ -280,20 +280,14 @@ extern cred_t *dtrace_CRED(void); /* Safe to call from probe context. */ #define CRED() kauth_cred_get() /* Can't be called from probe context! */ extern int PRIV_POLICY_CHOICE(void *, int, int); extern int PRIV_POLICY_ONLY(void *, int, int); -extern gid_t crgetgid(const cred_t *); extern uid_t crgetuid(const cred_t *); #define crgetzoneid(x) ((zoneid_t)0) -#define crhold(a) {} -#define crfree(a) {} - /* * "cyclic" */ #define CY_LOW_LEVEL 0 -#define CY_LOCK_LEVEL 1 #define CY_HIGH_LEVEL 2 -#define CY_SOFT_LEVELS 2 #define CY_LEVELS 3 typedef uintptr_t cyclic_id_t; @@ -338,18 +332,8 @@ extern void cyclic_timer_remove(cyclic_id_t); #define DDI_SUCCESS 0 #define DDI_FAILURE -1 -#define DDI_DEV_T_NONE ((dev_t)-1) -#define DDI_DEV_T_ANY ((dev_t)-2) -#define DDI_MAJOR_T_UNKNOWN ((major_t)0) - #define DDI_PSEUDO "ddi_pseudo" -typedef enum { - DDI_ATTACH = 0, - DDI_RESUME = 1, - DDI_PM_RESUME = 2 -} ddi_attach_cmd_t; - typedef enum { DDI_DETACH = 0, DDI_SUSPEND = 1, @@ -365,10 +349,6 @@ typedef uint_t minor_t; typedef struct __dev_info *dev_info_t; -extern void ddi_report_dev(dev_info_t *); - -int ddi_getprop(dev_t dev, dev_info_t *dip, int flags, const char *name, int defvalue); - extern int ddi_driver_major(dev_info_t *); extern int ddi_create_minor_node(dev_info_t *, const char *, int, minor_t, const char *, int); @@ -377,43 +357,15 @@ extern void ddi_remove_minor_node(dev_info_t *, char *); extern major_t getemajor(dev_t); extern minor_t getminor(dev_t); -extern dev_t makedevice(major_t, minor_t); - /* * Kernel Debug Interface */ - -typedef enum kdi_dtrace_set { - KDI_DTSET_DTRACE_ACTIVATE, - KDI_DTSET_DTRACE_DEACTIVATE, - KDI_DTSET_KMDB_BPT_ACTIVATE, - 
KDI_DTSET_KMDB_BPT_DEACTIVATE -} kdi_dtrace_set_t; - -extern int kdi_dtrace_set(kdi_dtrace_set_t); extern void debug_enter(char *); /* * DTrace specific zone allocation */ -/* - * To break dtrace memory usage out in a trackable - * fashion, uncomment the #define below. This will - * enable emulation of the general kalloc.XXX zones - * for most dtrace allocations. (kalloc.large is not - * emulated) - * - * #define DTRACE_MEMORY_ZONES 1 - * - */ - -#if defined(DTRACE_MEMORY_ZONES) -void dtrace_alloc_init(void); -void *dtrace_alloc(vm_size_t); -void dtrace_free(void *, vm_size_t); -#endif - /* * kmem */ @@ -424,15 +376,32 @@ void dtrace_free(void *, vm_size_t); typedef struct vmem vmem_t; typedef struct kmem_cache kmem_cache_t; -#define kmem_alloc dt_kmem_alloc /* Avoid clash with Darwin's kmem_alloc */ #define kmem_free dt_kmem_free /* Avoid clash with Darwin's kmem_free */ -#define kmem_zalloc dt_kmem_zalloc /* Avoid clash with Darwin's kmem_zalloc */ -extern void *dt_kmem_alloc(size_t, int); +#define kmem_free_aligned dt_kmem_free_aligned + +#define kmem_alloc(size, kmflag) \ + ({ VM_ALLOC_SITE_STATIC(0, 0); \ + dt_kmem_alloc_site(size, kmflag, &site); }) + +extern void *dt_kmem_alloc_site(size_t, int, vm_allocation_site_t*); extern void dt_kmem_free(void *, size_t); -extern void *dt_kmem_zalloc(size_t, int); -extern void *dt_kmem_alloc_aligned(size_t, size_t, int); -extern void *dt_kmem_zalloc_aligned(size_t, size_t, int); +#define kmem_zalloc(size, kmflag) \ + ({ VM_ALLOC_SITE_STATIC(0, 0); \ + dt_kmem_zalloc_site(size, kmflag, &site); }) + +extern void *dt_kmem_zalloc_site(size_t, int, vm_allocation_site_t*); + +#define kmem_alloc_aligned(size, align, kmflag) \ + ({ VM_ALLOC_SITE_STATIC(0, 0); \ + dt_kmem_alloc_aligned_site(size, align, kmflag, &site); }) +extern void *dt_kmem_alloc_aligned_site(size_t, size_t, int, vm_allocation_site_t*); + +#define kmem_zalloc_aligned(size, align, kmflag) \ + ({ VM_ALLOC_SITE_STATIC(0, 0); \ + dt_kmem_zalloc_aligned_site(size, 
align, kmflag, &site); }) +extern void *dt_kmem_zalloc_aligned_site(size_t, size_t, int, vm_allocation_site_t*); + extern void dt_kmem_free_aligned(void*, size_t); extern kmem_cache_t * @@ -452,7 +421,6 @@ typedef struct _kthread kthread_t; /* For dtrace_vtime_switch(), dtrace_panicked * proc */ -#define DATAMODEL_MASK 0x0FF00000 #define DATAMODEL_ILP32 0x00100000 #define DATAMODEL_LP64 0x00200000 @@ -467,23 +435,6 @@ typedef struct _kthread kthread_t; /* For dtrace_vtime_switch(), dtrace_panicked typedef unsigned int model_t; /* For dtrace_instr_size_isa() prototype in */ -/* - * taskq - */ - -#define TQ_SLEEP 0x00 /* Can block for memory */ - -typedef uint_t pri_t; -typedef struct taskq taskq_t; -typedef void (task_func_t)(void *); -typedef uintptr_t taskqid_t; - -extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); -extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); -extern void taskq_destroy(taskq_t *); - -extern pri_t maxclsyspri; - /* * vmem */ @@ -569,6 +520,7 @@ extern hrtime_t dtrace_abs_to_nano(uint64_t); __private_extern__ const char * strstr(const char *, const char *); const void* bsearch(const void*, const void*, size_t, size_t, int (*compar)(const void *, const void *)); +int dtrace_copy_maxsize(void); int dtrace_buffer_copyout(const void*, user_addr_t, vm_size_t); diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index 125293dbf..f463b49e3 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -110,6 +110,8 @@ struct dtrace_probe { char *dtpr_mod; /* probe's module name */ char *dtpr_func; /* probe's function name */ char *dtpr_name; /* probe's name */ + dtrace_probe_t *dtpr_nextprov; /* next in provider hash */ + dtrace_probe_t *dtpr_prevprov; /* previous in provider hash */ dtrace_probe_t *dtpr_nextmod; /* next in module hash */ dtrace_probe_t *dtpr_prevmod; /* previous in module hash */ dtrace_probe_t *dtpr_nextfunc; /* next in function hash */ @@ -135,18 +137,21 @@ typedef struct 
dtrace_probekey { typedef struct dtrace_hashbucket { struct dtrace_hashbucket *dthb_next; /* next on hash chain */ - dtrace_probe_t *dthb_chain; /* chain of probes */ + void *dthb_chain; /* chain of elements */ int dthb_len; /* number of probes here */ } dtrace_hashbucket_t; +typedef const char* dtrace_strkey_f(void*, uintptr_t); + typedef struct dtrace_hash { - dtrace_hashbucket_t **dth_tab; /* hash table */ - int dth_size; /* size of hash table */ - int dth_mask; /* mask to index into table */ - int dth_nbuckets; /* total number of buckets */ - uintptr_t dth_nextoffs; /* offset of next in probe */ - uintptr_t dth_prevoffs; /* offset of prev in probe */ - uintptr_t dth_stroffs; /* offset of str in probe */ + dtrace_hashbucket_t **dth_tab; /* hash table */ + int dth_size; /* size of hash table */ + int dth_mask; /* mask to index into table */ + int dth_nbuckets; /* total number of buckets */ + uintptr_t dth_nextoffs; /* offset of next in element */ + uintptr_t dth_prevoffs; /* offset of prev in element */ + dtrace_strkey_f *dth_getstr; /* func to retrieve str in element */ + uintptr_t dth_stroffs; /* offset of str in element */ } dtrace_hash_t; /* @@ -1310,6 +1315,16 @@ typedef struct dtrace_errhash { #endif /* DTRACE_ERRDEBUG */ + +typedef struct dtrace_string dtrace_string_t; + +typedef struct dtrace_string { + dtrace_string_t *dtst_next; + dtrace_string_t *dtst_prev; + uint32_t dtst_refcount; + char dtst_str[]; +} dtrace_string_t; + /** * DTrace Matching pre-conditions * @@ -1374,6 +1389,8 @@ extern int dtrace_attached(void); extern hrtime_t dtrace_gethrestime(void); extern void dtrace_isa_init(void); +extern void dtrace_flush_caches(void); + extern void dtrace_copy(uintptr_t, uintptr_t, size_t); extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *); diff --git a/bsd/sys/dtrace_ptss.h b/bsd/sys/dtrace_ptss.h index e7d8d9b0c..93382d71d 100644 --- a/bsd/sys/dtrace_ptss.h +++ b/bsd/sys/dtrace_ptss.h @@ -74,19 +74,17 @@ extern "C" { #define 
DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD (64) -#define DTRACE_PTSS_ENTRIES_PER_PAGE (PAGE_SIZE / DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD) +#define DTRACE_PTSS_ENTRIES_PER_PAGE (PAGE_MAX_SIZE / DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD) struct dtrace_ptss_page_entry { struct dtrace_ptss_page_entry* next; user_addr_t addr; -#if CONFIG_EMBEDDED user_addr_t write_addr; -#endif }; struct dtrace_ptss_page { struct dtrace_ptss_page* next; - struct dtrace_ptss_page_entry entries[PAGE_MAX_SIZE / DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD]; + struct dtrace_ptss_page_entry entries[DTRACE_PTSS_ENTRIES_PER_PAGE]; }; struct dtrace_ptss_page_entry* dtrace_ptss_claim_entry(struct proc* p); /* sprlock not held */ diff --git a/bsd/sys/event.h b/bsd/sys/event.h index 04385bc6c..60eee50ab 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -2,7 +2,7 @@ * Copyright (c) 2003-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /*- @@ -189,9 +189,9 @@ typedef uint64_t kqueue_id_t; /* kevent system call flags */ -#define KEVENT_FLAG_NONE 0x000 /* no flag value */ -#define KEVENT_FLAG_IMMEDIATE 0x001 /* immediate timeout */ -#define KEVENT_FLAG_ERROR_EVENTS 0x002 /* output events only include change errors */ +#define KEVENT_FLAG_NONE 0x000000 /* no flag value */ +#define KEVENT_FLAG_IMMEDIATE 0x000001 /* immediate timeout */ +#define KEVENT_FLAG_ERROR_EVENTS 0x000002 /* output events only include change errors */ #ifdef PRIVATE @@ -201,34 +201,36 @@ typedef uint64_t kqueue_id_t; * instead. */ -#define KEVENT_FLAG_STACK_EVENTS 0x004 /* output events treated as stack (grows down) */ -#define KEVENT_FLAG_STACK_DATA 0x008 /* output data allocated as stack (grows down) */ -#define KEVENT_FLAG_UNBIND_CHECK_FLAGS 0x010 /* check the flags passed to kevent_qos_internal_unbind */ -#define KEVENT_FLAG_WORKQ 0x020 /* interact with the default workq kq */ -#define KEVENT_FLAG_WORKQ_MANAGER 0x200 /* current thread is the workq manager */ -#define KEVENT_FLAG_WORKLOOP 0x400 /* interact with the specified workloop kq */ -#define KEVENT_FLAG_SYNCHRONOUS_BIND 0x800 /* synchronous bind callback */ - -#define KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH 0x8000 /* attach current thread to workloop */ -#define KEVENT_FLAG_WORKLOOP_SERVICER_DETACH 0x10000 /* unbind current thread from workloop */ -#define KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST 0x20000 /* kq lookup by id must exist */ -#define KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST 0x40000 /* kq lookup by id must not exist */ -#define KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD 0x80000 /* do not create workqueue threads for this worloop */ +#define KEVENT_FLAG_STACK_EVENTS 0x000004 /* output events treated as stack (grows down) */ +#define KEVENT_FLAG_STACK_DATA 0x000008 /* output data allocated as stack (grows down) */ +// 0x000010 +#define KEVENT_FLAG_WORKQ 0x000020 /* interact with the default workq kq */ +// KEVENT_FLAG_LEGACY32 
0x000040 +// KEVENT_FLAG_LEGACY64 0x000080 +// 0x000100 +#define KEVENT_FLAG_WORKQ_MANAGER 0x000200 /* obsolete */ +#define KEVENT_FLAG_WORKLOOP 0x000400 /* interact with the specified workloop kq */ +#define KEVENT_FLAG_PARKING 0x000800 /* workq thread is parking */ +// KEVENT_FLAG_KERNEL 0x001000 +// KEVENT_FLAG_DYNAMIC_KQUEUE 0x002000 +// 0x004000 +#define KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH 0x008000 /* obsolete */ +#define KEVENT_FLAG_WORKLOOP_SERVICER_DETACH 0x010000 /* obsolete */ +#define KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST 0x020000 /* kq lookup by id must exist */ +#define KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST 0x040000 /* kq lookup by id must not exist */ +#define KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD 0x080000 /* obsolete */ #ifdef XNU_KERNEL_PRIVATE -#define KEVENT_FLAG_LEGACY32 0x040 /* event data in legacy 32-bit format */ -#define KEVENT_FLAG_LEGACY64 0x080 /* event data in legacy 64-bit format */ +#define KEVENT_FLAG_LEGACY32 0x0040 /* event data in legacy 32-bit format */ +#define KEVENT_FLAG_LEGACY64 0x0080 /* event data in legacy 64-bit format */ #define KEVENT_FLAG_KERNEL 0x1000 /* caller is in-kernel */ #define KEVENT_FLAG_DYNAMIC_KQUEUE 0x2000 /* kqueue is dynamically allocated */ -#define KEVENT_FLAG_WORKLOOP_CANCELED 0x4000 /* workloop bind was cancelled */ #define KEVENT_FLAG_USER (KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS | \ - KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \ - KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \ - KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH | KEVENT_FLAG_WORKLOOP_SERVICER_DETACH | \ - KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST | \ - KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD) + KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \ + KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \ + KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) /* * Since some filter ops are not part of the standard sysfilt_ops, we use @@ -260,48 +262,48 @@ typedef uint64_t kqueue_id_t; #endif /* 
PRIVATE */ /* actions */ -#define EV_ADD 0x0001 /* add event to kq (implies enable) */ -#define EV_DELETE 0x0002 /* delete event from kq */ -#define EV_ENABLE 0x0004 /* enable event */ -#define EV_DISABLE 0x0008 /* disable event (not reported) */ +#define EV_ADD 0x0001 /* add event to kq (implies enable) */ +#define EV_DELETE 0x0002 /* delete event from kq */ +#define EV_ENABLE 0x0004 /* enable event */ +#define EV_DISABLE 0x0008 /* disable event (not reported) */ /* flags */ -#define EV_ONESHOT 0x0010 /* only report one occurrence */ -#define EV_CLEAR 0x0020 /* clear event state after reporting */ -#define EV_RECEIPT 0x0040 /* force immediate event output */ - /* ... with or without EV_ERROR */ - /* ... use KEVENT_FLAG_ERROR_EVENTS */ - /* on syscalls supporting flags */ +#define EV_ONESHOT 0x0010 /* only report one occurrence */ +#define EV_CLEAR 0x0020 /* clear event state after reporting */ +#define EV_RECEIPT 0x0040 /* force immediate event output */ + /* ... with or without EV_ERROR */ + /* ... use KEVENT_FLAG_ERROR_EVENTS */ + /* on syscalls supporting flags */ -#define EV_DISPATCH 0x0080 /* disable event after reporting */ -#define EV_UDATA_SPECIFIC 0x0100 /* unique kevent per udata value */ +#define EV_DISPATCH 0x0080 /* disable event after reporting */ +#define EV_UDATA_SPECIFIC 0x0100 /* unique kevent per udata value */ #define EV_DISPATCH2 (EV_DISPATCH | EV_UDATA_SPECIFIC) - /* ... in combination with EV_DELETE */ - /* will defer delete until udata-specific */ - /* event enabled. EINPROGRESS will be */ - /* returned to indicate the deferral */ + /* ... in combination with EV_DELETE */ + /* will defer delete until udata-specific */ + /* event enabled. EINPROGRESS will be */ + /* returned to indicate the deferral */ -#define EV_VANISHED 0x0200 /* report that source has vanished */ - /* ... only valid with EV_DISPATCH2 */ +#define EV_VANISHED 0x0200 /* report that source has vanished */ + /* ... 
only valid with EV_DISPATCH2 */ -#define EV_SYSFLAGS 0xF000 /* reserved by system */ -#define EV_FLAG0 0x1000 /* filter-specific flag */ -#define EV_FLAG1 0x2000 /* filter-specific flag */ +#define EV_SYSFLAGS 0xF000 /* reserved by system */ +#define EV_FLAG0 0x1000 /* filter-specific flag */ +#define EV_FLAG1 0x2000 /* filter-specific flag */ /* returned values */ -#define EV_EOF 0x8000 /* EOF detected */ -#define EV_ERROR 0x4000 /* error, data contains errno */ +#define EV_EOF 0x8000 /* EOF detected */ +#define EV_ERROR 0x4000 /* error, data contains errno */ /* * Filter specific flags for EVFILT_READ * * The default behavior for EVFILT_READ is to make the "read" determination - * relative to the current file descriptor read pointer. + * relative to the current file descriptor read pointer. * * The EV_POLL flag indicates the determination should be made via poll(2) * semantics. These semantics dictate always returning true for regular files, - * regardless of the amount of unread data in the file. + * regardless of the amount of unread data in the file. * * On input, EV_OOBAND specifies that filter should actively return in the * presence of OOB on the descriptor. It implies that filter will return @@ -331,7 +333,7 @@ typedef uint64_t kqueue_id_t; #define NOTE_TRIGGER 0x01000000 /* - * On input, the top two bits of fflags specifies how the lower twenty four + * On input, the top two bits of fflags specifies how the lower twenty four * bits should be applied to the stored value of fflags. 
* * On output, the top two bits will always be set to NOTE_FFNOP and the @@ -342,7 +344,7 @@ typedef uint64_t kqueue_id_t; #define NOTE_FFOR 0x80000000 /* or fflags */ #define NOTE_FFCOPY 0xc0000000 /* copy fflags */ #define NOTE_FFCTRLMASK 0xc0000000 /* mask for operations */ -#define NOTE_FFLAGSMASK 0x00ffffff +#define NOTE_FFLAGSMASK 0x00ffffff #ifdef PRIVATE /* @@ -434,13 +436,13 @@ typedef uint64_t kqueue_id_t; /* * data/hint fflags for EVFILT_VNODE, shared with userspace */ -#define NOTE_DELETE 0x00000001 /* vnode was removed */ -#define NOTE_WRITE 0x00000002 /* data contents changed */ -#define NOTE_EXTEND 0x00000004 /* size increased */ -#define NOTE_ATTRIB 0x00000008 /* attributes changed */ -#define NOTE_LINK 0x00000010 /* link count changed */ -#define NOTE_RENAME 0x00000020 /* vnode was renamed */ -#define NOTE_REVOKE 0x00000040 /* vnode access was revoked */ +#define NOTE_DELETE 0x00000001 /* vnode was removed */ +#define NOTE_WRITE 0x00000002 /* data contents changed */ +#define NOTE_EXTEND 0x00000004 /* size increased */ +#define NOTE_ATTRIB 0x00000008 /* attributes changed */ +#define NOTE_LINK 0x00000010 /* link count changed */ +#define NOTE_RENAME 0x00000020 /* vnode was renamed */ +#define NOTE_REVOKE 0x00000040 /* vnode access was revoked */ #define NOTE_NONE 0x00000080 /* No specific vnode event: to test for EVFILT_READ activation*/ #define NOTE_FUNLOCK 0x00000100 /* vnode was unlocked by flock(2) */ @@ -458,22 +460,22 @@ enum { eNoteReapDeprecated __deprecated_enum_msg("This kqueue(2) EVFILT_PROC flag is deprecated") = 0x10000000 }; -#define NOTE_EXIT 0x80000000 /* process exited */ -#define NOTE_FORK 0x40000000 /* process forked */ -#define NOTE_EXEC 0x20000000 /* process exec'd */ -#define NOTE_REAP ((unsigned int)eNoteReapDeprecated /* 0x10000000 */) /* process reaped */ -#define NOTE_SIGNAL 0x08000000 /* shared with EVFILT_SIGNAL */ -#define NOTE_EXITSTATUS 0x04000000 /* exit status to be returned, valid for child process only */ -#define 
NOTE_EXIT_DETAIL 0x02000000 /* provide details on reasons for exit */ +#define NOTE_EXIT 0x80000000 /* process exited */ +#define NOTE_FORK 0x40000000 /* process forked */ +#define NOTE_EXEC 0x20000000 /* process exec'd */ +#define NOTE_REAP ((unsigned int)eNoteReapDeprecated /* 0x10000000 */) /* process reaped */ +#define NOTE_SIGNAL 0x08000000 /* shared with EVFILT_SIGNAL */ +#define NOTE_EXITSTATUS 0x04000000 /* exit status to be returned, valid for child process only */ +#define NOTE_EXIT_DETAIL 0x02000000 /* provide details on reasons for exit */ -#define NOTE_PDATAMASK 0x000fffff /* mask for signal & exit status */ -#define NOTE_PCTRLMASK (~NOTE_PDATAMASK) +#define NOTE_PDATAMASK 0x000fffff /* mask for signal & exit status */ +#define NOTE_PCTRLMASK (~NOTE_PDATAMASK) /* * If NOTE_EXITSTATUS is present, provide additional info about exiting process. */ enum { - eNoteExitReparentedDeprecated __deprecated_enum_msg("This kqueue(2) EVFILT_PROC flag is no longer sent") = 0x00080000 + eNoteExitReparentedDeprecated __deprecated_enum_msg("This kqueue(2) EVFILT_PROC flag is no longer sent") = 0x00080000 }; #define NOTE_EXIT_REPARENTED ((unsigned int)eNoteExitReparentedDeprecated) /* exited while reparented */ @@ -481,8 +483,8 @@ enum { * If NOTE_EXIT_DETAIL is present, these bits indicate specific reasons for exiting. 
*/ #define NOTE_EXIT_DETAIL_MASK 0x00070000 -#define NOTE_EXIT_DECRYPTFAIL 0x00010000 -#define NOTE_EXIT_MEMORY 0x00020000 +#define NOTE_EXIT_DECRYPTFAIL 0x00010000 +#define NOTE_EXIT_MEMORY 0x00020000 #define NOTE_EXIT_CSERROR 0x00040000 #ifdef PRIVATE @@ -536,15 +538,15 @@ enum { */ #define EVFILT_MEMORYSTATUS_ALL_MASK \ (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP | \ - NOTE_MEMORYSTATUS_PROC_LIMIT_WARN | NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL | NOTE_MEMORYSTATUS_MSL_STATUS) + NOTE_MEMORYSTATUS_PROC_LIMIT_WARN | NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL | NOTE_MEMORYSTATUS_MSL_STATUS) #endif /* KERNEL_PRIVATE */ typedef enum vm_pressure_level { - kVMPressureNormal = 0, - kVMPressureWarning = 1, - kVMPressureUrgent = 2, - kVMPressureCritical = 3, + kVMPressureNormal = 0, + kVMPressureWarning = 1, + kVMPressureUrgent = 2, + kVMPressureCritical = 3, } vm_pressure_level_t; #endif /* PRIVATE */ @@ -561,7 +563,7 @@ typedef enum vm_pressure_level { #define NOTE_NSECONDS 0x00000004 /* data is nanoseconds */ #define NOTE_ABSOLUTE 0x00000008 /* absolute timeout */ /* ... implicit EV_ONESHOT, timeout uses the gettimeofday epoch */ -#define NOTE_LEEWAY 0x00000010 /* ext[1] holds leeway for power aware timers */ +#define NOTE_LEEWAY 0x00000010 /* ext[1] holds leeway for power aware timers */ #define NOTE_CRITICAL 0x00000020 /* system does minimal timer coalescing */ #define NOTE_BACKGROUND 0x00000040 /* system does maximum timer coalescing */ #define NOTE_MACH_CONTINUOUS_TIME 0x00000080 @@ -580,27 +582,32 @@ typedef enum vm_pressure_level { * data/hint fflags for EVFILT_SOCK, shared with userspace. 
* */ -#define NOTE_CONNRESET 0x00000001 /* Received RST */ -#define NOTE_READCLOSED 0x00000002 /* Read side is shutdown */ -#define NOTE_WRITECLOSED 0x00000004 /* Write side is shutdown */ -#define NOTE_TIMEOUT 0x00000008 /* timeout: rexmt, keep-alive or persist */ -#define NOTE_NOSRCADDR 0x00000010 /* source address not available */ -#define NOTE_IFDENIED 0x00000020 /* interface denied connection */ -#define NOTE_SUSPEND 0x00000040 /* output queue suspended */ -#define NOTE_RESUME 0x00000080 /* output queue resumed */ +#define NOTE_CONNRESET 0x00000001 /* Received RST */ +#define NOTE_READCLOSED 0x00000002 /* Read side is shutdown */ +#define NOTE_WRITECLOSED 0x00000004 /* Write side is shutdown */ +#define NOTE_TIMEOUT 0x00000008 /* timeout: rexmt, keep-alive or persist */ +#define NOTE_NOSRCADDR 0x00000010 /* source address not available */ +#define NOTE_IFDENIED 0x00000020 /* interface denied connection */ +#define NOTE_SUSPEND 0x00000040 /* output queue suspended */ +#define NOTE_RESUME 0x00000080 /* output queue resumed */ #define NOTE_KEEPALIVE 0x00000100 /* TCP Keepalive received */ #define NOTE_ADAPTIVE_WTIMO 0x00000200 /* TCP adaptive write timeout */ #define NOTE_ADAPTIVE_RTIMO 0x00000400 /* TCP adaptive read timeout */ -#define NOTE_CONNECTED 0x00000800 /* socket is connected */ -#define NOTE_DISCONNECTED 0x00001000 /* socket is disconnected */ -#define NOTE_CONNINFO_UPDATED 0x00002000 /* connection info was updated */ -#define NOTE_NOTIFY_ACK 0x00004000 /* notify acknowledgement */ +#define NOTE_CONNECTED 0x00000800 /* socket is connected */ +#define NOTE_DISCONNECTED 0x00001000 /* socket is disconnected */ +#define NOTE_CONNINFO_UPDATED 0x00002000 /* connection info was updated */ +#define NOTE_NOTIFY_ACK 0x00004000 /* notify acknowledgement */ -#define EVFILT_SOCK_LEVEL_TRIGGER_MASK \ - (NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_SUSPEND | NOTE_RESUME | NOTE_CONNECTED | NOTE_DISCONNECTED) +#define EVFILT_SOCK_LEVEL_TRIGGER_MASK \ + (NOTE_READCLOSED | 
NOTE_WRITECLOSED | NOTE_SUSPEND | NOTE_RESUME | \ + NOTE_CONNECTED | NOTE_DISCONNECTED) #define EVFILT_SOCK_ALL_MASK \ - (NOTE_CONNRESET | NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_TIMEOUT | NOTE_NOSRCADDR | NOTE_IFDENIED | NOTE_SUSPEND | NOTE_RESUME | NOTE_KEEPALIVE | NOTE_ADAPTIVE_WTIMO | NOTE_ADAPTIVE_RTIMO | NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNINFO_UPDATED | NOTE_NOTIFY_ACK) + (NOTE_CONNRESET | NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_TIMEOUT | \ + NOTE_NOSRCADDR | NOTE_IFDENIED | NOTE_SUSPEND | NOTE_RESUME | \ + NOTE_KEEPALIVE | NOTE_ADAPTIVE_WTIMO | NOTE_ADAPTIVE_RTIMO | \ + NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNINFO_UPDATED | \ + NOTE_NOTIFY_ACK) #endif /* PRIVATE */ @@ -623,7 +630,7 @@ typedef enum vm_pressure_level { * system call argument specifying an ouput area (kevent_qos) will be consulted. If * the system call specified an output data area, the user-space address * of the received message is carved from that provided output data area (if enough - * space remains there). The address and length of each received message is + * space remains there). The address and length of each received message is * returned in the ext[0] and ext[1] fields (respectively) of the corresponding kevent. 
* * IF_MACH_RCV_VOUCHER_CONTENT is specified, the contents of the message voucher is @@ -642,9 +649,9 @@ typedef enum vm_pressure_level { * NOTE_TRACK, NOTE_TRACKERR, and NOTE_CHILD are no longer supported as of 10.5 */ /* additional flags for EVFILT_PROC */ -#define NOTE_TRACK 0x00000001 /* follow across forks */ -#define NOTE_TRACKERR 0x00000002 /* could not track child */ -#define NOTE_CHILD 0x00000004 /* am a child process */ +#define NOTE_TRACK 0x00000001 /* follow across forks */ +#define NOTE_TRACKERR 0x00000002 /* could not track child */ +#define NOTE_CHILD 0x00000004 /* am a child process */ #ifdef PRIVATE @@ -652,7 +659,7 @@ typedef enum vm_pressure_level { #ifndef KERNEL /* Temporay solution for BootX to use inode.h till kqueue moves to vfs layer */ -#include +#include struct knote; SLIST_HEAD(klist, knote); #endif @@ -660,10 +667,11 @@ SLIST_HEAD(klist, knote); #ifdef KERNEL #ifdef XNU_KERNEL_PRIVATE -#include +#include #include #include /* FREAD, FWRITE */ #include /* panic */ +#include #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_KQUEUE); @@ -671,58 +679,61 @@ MALLOC_DECLARE(M_KQUEUE); TAILQ_HEAD(kqtailq, knote); /* a list of "queued" events */ -/* Bit size for packed field within knote */ -#define KNOTE_KQ_BITSIZE 40 - - /* index into various kq queues */ -typedef uint8_t kq_index_t; +typedef uint8_t kq_index_t; typedef uint16_t kn_status_t; -#define KN_ACTIVE 0x0001 /* event has been triggered */ -#define KN_QUEUED 0x0002 /* event is on queue */ -#define KN_DISABLED 0x0004 /* event is disabled */ -#define KN_DROPPING 0x0008 /* knote is being dropped */ -#define KN_USEWAIT 0x0010 /* wait for knote use */ -#define KN_ATTACHING 0x0020 /* event is pending attach */ -#define KN_STAYACTIVE 0x0040 /* force event to stay active */ -#define KN_DEFERDELETE 0x0080 /* defer delete until re-enabled */ -#define KN_ATTACHED 0x0100 /* currently attached to source */ -#define KN_DISPATCH 0x0200 /* disables as part of deliver */ -#define KN_UDATA_SPECIFIC 0x0400 /* udata 
is part of matching */ -#define KN_SUPPRESSED 0x0800 /* event is suppressed during delivery */ -#define KN_STOLENDROP 0x1000 /* someone stole the drop privilege */ -#define KN_REQVANISH 0x2000 /* requested EV_VANISH */ -#define KN_VANISHED 0x4000 /* has vanished */ - +#define KN_ACTIVE 0x0001 /* event has been triggered */ +#define KN_QUEUED 0x0002 /* event is on queue */ +#define KN_DISABLED 0x0004 /* event is disabled */ +#define KN_DROPPING 0x0008 /* knote is being dropped */ +#define KN_LOCKED 0x0010 /* knote is locked (kq_knlocks) */ +#define KN_ATTACHING 0x0020 /* event is pending attach */ +#define KN_STAYACTIVE 0x0040 /* force event to stay active */ +#define KN_DEFERDELETE 0x0080 /* defer delete until re-enabled */ +#define KN_ATTACHED 0x0100 /* currently attached to source */ +#define KN_DISPATCH 0x0200 /* disables as part of deliver */ +#define KN_UDATA_SPECIFIC 0x0400 /* udata is part of matching */ +#define KN_SUPPRESSED 0x0800 /* event is suppressed during delivery */ +#define KN_MERGE_QOS 0x1000 /* f_event() / f_* ran concurrently and + overrides must merge */ +#define KN_REQVANISH 0x2000 /* requested EV_VANISH */ +#define KN_VANISHED 0x4000 /* has vanished */ +// 0x8000 + +/* combination defines deferred-delete mode enabled */ #define KN_DISPATCH2 (KN_DISPATCH | KN_UDATA_SPECIFIC) - /* combination defines deferred-delete mode enabled */ +#define KNOTE_KQ_BITSIZE 42 +_Static_assert(KNOTE_KQ_BITSIZE >= VM_KERNEL_POINTER_SIGNIFICANT_BITS, + "Make sure sign extending kn_kq_packed is legit"); + +struct kqueue; struct knote { TAILQ_ENTRY(knote) kn_tqe; /* linkage for tail queue */ SLIST_ENTRY(knote) kn_link; /* linkage for search list */ SLIST_ENTRY(knote) kn_selnext; /* klist element chain */ - union { - struct fileproc *p_fp; /* file data pointer */ - struct proc *p_proc; /* proc pointer */ - struct ipc_mqueue *p_mqueue; /* pset pointer */ - } kn_ptr; - uint64_t kn_req_index:3, /* requested qos index */ - kn_qos_index:3, /* in-use qos index */ - 
kn_qos_override:3, /* qos override index */ - kn_qos_sync_override:3, /* qos sync override index */ - kn_vnode_kqok:1, - kn_vnode_use_ofst:1, - kn_qos_override_is_sync:1, /* qos override index is a sync override */ - kn_reserved:1, /* reserved bits */ - kn_filtid:8, /* filter id to index filter ops */ - kn_kq_packed:KNOTE_KQ_BITSIZE; /* packed pointer for kq */ - + uintptr_t kn_filtid:8, /* filter id to index filter ops */ + kn_req_index:4, /* requested qos index */ + kn_qos_index:4, /* in-use qos index */ + kn_qos_override:4, /* qos override index */ + kn_vnode_kqok:1, + kn_vnode_use_ofst:1; +#if __LP64__ + intptr_t kn_kq_packed : KNOTE_KQ_BITSIZE; +#else + intptr_t kn_kq_packed; +#endif union { void *kn_hook; uint64_t kn_hook_data; }; int64_t kn_sdata; /* saved data field */ + union { + struct fileproc *p_fp; /* file data pointer */ + struct proc *p_proc; /* proc pointer */ + struct ipc_mqueue *p_mqueue; /* pset pointer */ + } kn_ptr; struct kevent_internal_s kn_kevent; int kn_sfflags; /* saved filter flags */ int kn_hookid; @@ -741,28 +752,16 @@ struct knote { #define kn_fp kn_ptr.p_fp }; -static inline struct kqueue *knote_get_kq(struct knote *kn) -{ - if (!(kn->kn_kq_packed)) - return 0; - else - return (struct kqueue *)((uintptr_t)(kn->kn_kq_packed) + (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS); -} - -static inline void knote_set_kq(struct knote *kn, void *kq) +static inline struct kqueue * +knote_get_kq(struct knote *kn) { - if (!kq) - kn->kn_kq_packed = 0; - else { - uint64_t offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS); - kn->kn_kq_packed = offset; - } + return (struct kqueue *)kn->kn_kq_packed; } static inline int knote_get_seltype(struct knote *kn) { switch (kn->kn_filter) { - case EVFILT_READ: + case EVFILT_READ: return FREAD; case EVFILT_WRITE: return FWRITE; @@ -792,8 +791,20 @@ typedef struct filt_process_s *filt_process_data_t; * Filter operators * * These routines, provided by each filter, are called to attach, detach, 
deliver events, - * change/update filter registration and process/deliver events. They are called with the - * with a use-count referenced knote, with the kq unlocked. Here are more details: + * change/update filter registration and process/deliver events: + * + * - the f_attach, f_touch, f_process, f_peek and f_detach callbacks are always + * serialized with respect to each other for the same knote. + * + * - the f_event routine is called with a use-count taken on the knote to + * prolong its lifetime and protect against drop, but is not otherwise + * serialized with other routine calls. + * + * - the f_detach routine is always called last, and is serialized with all + * other callbacks, including f_event calls. + * + * + * Here are more details: * * f_isfd - * identifies if the "ident" field in the kevent structure is a file-descriptor. @@ -808,17 +819,17 @@ typedef struct filt_process_s *filt_process_data_t; * f_adjusts_qos - * identifies if the filter can adjust its QoS during its lifetime. * - * Currently, EVFILT_MAACHPORT is the only filter using this facility. + * Filters using this facility should request the new overrides they want + * using the appropriate FILTER_{RESET,ADJUST}_EVENT_QOS extended codes. * - * f_needs_boost - - * [OPTIONAL] used by filters to communicate they need to hold a boost - * while holding a usecount on this knote. This is called with the kqlock - * held. + * Currently, EVFILT_MACHPORT is the only filter using this facility. * - * This is only used by EVFILT_WORKLOOP currently. + * f_extended_codes - + * identifies if the filter returns extended codes from its routines + * (see FILTER_ACTIVE, ...) or 0 / 1 values. 
* * f_attach - - * called to attach the knote to the underlying object that will be delivering events + * called to attach the knote to the underlying object that will be delivering events * through it when EV_ADD is supplied and no existing matching event is found * * provided a knote that is pre-attached to the fd or hashed (see above) but is @@ -836,21 +847,9 @@ typedef struct filt_process_s *filt_process_data_t; * The return value indicates if the knote should already be considered "activated" at * the time of attach (one or more of the interest events has already occured). * - * f_post_attach - - * [OPTIONAL] called after a successful attach, with the kqueue lock held, - * returns lock held, may drop and re-acquire - * - * If this function is non-null, then it indicates that the filter wants - * to perform an action after a successful ATTACH of a knote. - * - * Currently, EVFILT_WORKLOOP is the only filter using this facility. - * - * The return value indicates an error to report to userland. - * - * * f_detach - * called to disassociate the knote from the underlying object delivering events - * the filter should not attempt to deliver events through this knote after this + * the filter should not attempt to deliver events through this knote after this * operation returns control to the kq system. * * f_event - @@ -864,24 +863,8 @@ typedef struct filt_process_s *filt_process_data_t; * The return value indicates if the knote should already be considered "activated" at * the time of attach (one or more of the interest events has already occured). * - * f_drop_and_unlock - - * [OPTIONAL] called with the kqueue locked, and has to unlock - * - * If this function is non-null, then it indicates that the filter - * wants to handle EV_DELETE events. This is necessary if a particular - * filter needs to synchronize knote deletion with its own filter lock. - * Currently, EVFILT_WORKLOOP is the only filter using this facility. 
- * - * The return value indicates an error during the knote drop, i.e., the - * knote still exists and user space should re-drive the EV_DELETE. - * - * If the return value is ERESTART, kevent_register() is called from - * scratch again (useful to wait for usecounts to drop and then - * reevaluate the relevance of that drop) - * - * * f_process - - * called when attempting to deliver triggered events to user-space. + * called when attempting to deliver triggered events to user-space. * * If the knote was previously activated, this operator will be called when a * thread is trying to deliver events to user-space. The filter gets one last @@ -912,47 +895,148 @@ typedef struct filt_process_s *filt_process_data_t; * Unless one of the special output flags was set in the output kevent, a non- * zero return value ALSO indicates that the knote should be re-activated * for future event processing (in case it delivers level-based or a multi-edge - * type events like message queues that already exist). + * type events like message queues that already exist). * * NOTE: In the future, the boolean may change to an enum that allows more * explicit indication of just delivering a current event vs delivering * an event with more events still pending. * * f_touch - - * called to update the knote with new state from the user during EVFILT_ADD/ENABLE/DISABLE - * on an already-attached knote. + * called to update the knote with new state from the user during + * EVFILT_ADD/ENABLE/DISABLE on an already-attached knote. * * f_touch should copy relevant new data from the kevent into the knote. - * (if KN_UDATA_SPECIFIC is not set, you may need to update the udata too) * - * operator must lock against concurrent f_event and f_process operations. + * operator must lock against concurrent f_event operations. * - * A return value of 1 indicates that the knote should now be considered 'activated'. + * A return value of 1 indicates that the knote should now be considered + * 'activated'. 
* - * f_touch can set EV_ERROR with specific error in the data field to return an error to the client. - * You should return 1 to indicate that the kevent needs to be activated and processed. + * f_touch can set EV_ERROR with specific error in the data field to + * return an error to the client. You should return 1 to indicate that + * the kevent needs to be activated and processed. * * f_peek - - * For knotes marked KN_STAYACTIVE, indicate if the knote is truly active at - * the moment (not used for event delivery, but for status checks). + * For knotes marked KN_STAYACTIVE, indicate if the knote is truly active + * at the moment (not used for event delivery, but for status checks). + * + * f_allow_drop - + * + * [OPTIONAL] If this function is non-null, then it indicates that the + * filter wants to validate EV_DELETE events. This is necessary if + * a particular filter needs to synchronize knote deletion with its own + * filter lock. + * + * When true is returned, the EV_DELETE is allowed and can proceed. + * + * If false is returned, the EV_DELETE doesn't proceed, and the passed in + * kevent is used for the copyout to userspace. + * + * Currently, EVFILT_WORKLOOP is the only filter using this facility. + * + * f_post_register_wait - + * [OPTIONAL] called when attach or touch return the FILTER_REGISTER_WAIT + * extended code bit. It is possible to use this facility when the last + * register command wants to wait. + * + * Currently, EVFILT_WORKLOOP is the only filter using this facility. 
*/ +struct _kevent_register; +struct knote_lock_ctx; +struct proc; +struct uthread; +struct waitq; + struct filterops { - bool f_isfd; /* true if ident == filedescriptor */ - bool f_adjusts_qos; /* true if the filter can override the knote */ - bool (*f_needs_boost)(struct kevent_internal_s *kev); + bool f_isfd; /* true if ident == filedescriptor */ + bool f_adjusts_qos; /* true if the filter can override the knote */ + bool f_extended_codes; /* hooks return extended codes */ + int (*f_attach)(struct knote *kn, struct kevent_internal_s *kev); - int (*f_post_attach)(struct knote *kn, struct kevent_internal_s *kev); void (*f_detach)(struct knote *kn); int (*f_event)(struct knote *kn, long hint); int (*f_touch)(struct knote *kn, struct kevent_internal_s *kev); - int (*f_drop_and_unlock)(struct knote *kn, struct kevent_internal_s *kev); int (*f_process)(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); - unsigned (*f_peek)(struct knote *kn); + int (*f_peek)(struct knote *kn); + + /* optional & advanced */ + bool (*f_allow_drop)(struct knote *kn, struct kevent_internal_s *kev); + void (*f_post_register_wait)(struct uthread *uth, struct knote_lock_ctx *ctx, + struct _kevent_register *ss_kr); }; -struct proc; -struct waitq; +/* + * Extended codes returned by filter routines when f_extended_codes is set. + * + * FILTER_ACTIVE + * The filter is active and a call to f_process() may return an event. + * + * For f_process() the meaning is slightly different: the knote will be + * activated again as long as f_process returns FILTER_ACTIVE, unless + * EV_CLEAR is set, which require a new f_event to reactivate the knote. + * + * Valid: f_attach, f_event, f_touch, f_process, f_peek + * Implicit: - + * Ignored: - + * + * FILTER_REGISTER_WAIT + * The filter wants its f_post_register_wait() to be called. + * + * Note: It is only valid to ask for this behavior for a workloop kqueue, + * and is really only meant to be used by EVFILT_WORKLOOP. 
+ * + * Valid: f_attach, f_touch + * Implicit: - + * Ignored: f_event, f_process, f_peek + * + * FILTER_UPDATE_REQ_QOS + * The filter wants the passed in QoS to be updated as the new intrinsic qos + * for this knote. If the kevent `qos` field is 0, no update is performed. + * + * This also will reset the event QoS, so FILTER_ADJUST_EVENT_QOS() must + * also be used if an override should be maintained. + * + * Valid: f_touch + * Implicit: f_attach + * Ignored: f_event, f_process, f_peek + * + * FILTER_RESET_EVENT_QOS + * FILTER_ADJUST_EVENT_QOS(qos) + * The filter wants the QoS of the next event delivery to be overridden + * at the specified QoS. This allows for the next event QoS to be elevated + * from the knote requested qos (See FILTER_UPDATE_REQ_QOS). + * + * Event QoS Overrides are reset when a particular knote is no longer + * active. Hence this is ignored if FILTER_ACTIVE isn't also returned. + * + * Races between an f_event() and any other f_* routine asking for + * a specific QoS override are handled generically and the filters do not + * have to worry about them. + * + * To use this facility, filters MUST set their f_adjusts_qos bit to true. + * + * It is expected that filters will return the new QoS they expect to be + * applied from any f_* callback except for f_process() where no specific + * information should be provided. Filters should not try to hide no-ops, + * kevent will already optimize these away. 
+ * + * Valid: f_touch, f_attach, f_event, f_process + * Implicit: - + * Ignored: f_peek + */ +#define FILTER_ACTIVE 0x00000001 +#define FILTER_REGISTER_WAIT 0x00000002 +#define FILTER_UPDATE_REQ_QOS 0x00000004 +#define FILTER_ADJUST_EVENT_QOS_BIT 0x00000008 +#define FILTER_ADJUST_EVENT_QOS_MASK 0x00000070 +#define FILTER_ADJUST_EVENT_QOS_SHIFT 4 +#define FILTER_ADJUST_EVENT_QOS(qos) \ + (((qos) << FILTER_ADJUST_EVENT_QOS_SHIFT) | FILTER_ADJUST_EVENT_QOS_BIT) +#define FILTER_RESET_EVENT_QOS FILTER_ADJUST_EVENT_QOS_BIT + +#define filter_call(_ops, call) \ + ((_ops)->f_extended_codes ? (_ops)->call : !!((_ops)->call)) SLIST_HEAD(klist, knote); extern void knote_init(void); @@ -965,17 +1049,20 @@ extern void klist_init(struct klist *list); extern void knote(struct klist *list, long hint); extern int knote_attach(struct klist *list, struct knote *kn); extern int knote_detach(struct klist *list, struct knote *kn); -extern void knote_vanish(struct klist *list); +extern void knote_vanish(struct klist *list); +extern void knote_link_waitqset_lazy_alloc(struct knote *kn); +extern boolean_t knote_link_waitqset_should_lazy_alloc(struct knote *kn); extern int knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link); extern int knote_unlink_waitq(struct knote *kn, struct waitq *wq); -extern void knote_fdclose(struct proc *p, int fd, int force); +extern void knote_fdclose(struct proc *p, int fd); extern void knote_markstayactive(struct knote *kn); extern void knote_clearstayactive(struct knote *kn); -extern void knote_adjust_qos(struct knote *kn, int qos, int override, kq_index_t sync_override_index); -extern void knote_adjust_sync_qos(struct knote *kn, kq_index_t sync_qos, boolean_t lock_kq); extern const struct filterops *knote_fops(struct knote *kn); extern void knote_set_error(struct knote *kn, int error); +extern struct turnstile *kqueue_turnstile(struct kqueue *); +extern struct turnstile *kqueue_alloc_turnstile(struct kqueue *); + int 
kevent_exit_on_workloop_ownership_leak(thread_t thread); int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize); int kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, @@ -987,7 +1074,7 @@ int kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf, #elif defined(KERNEL_PRIVATE) /* !XNU_KERNEL_PRIVATE: kexts still need a klist structure definition */ -#include +#include struct proc; struct knote; SLIST_HEAD(klist, knote); @@ -998,17 +1085,12 @@ SLIST_HEAD(klist, knote); #ifdef PRIVATE /* make these private functions available to the pthread kext */ -extern int kevent_qos_internal(struct proc *p, int fd, +extern int kevent_qos_internal(struct proc *p, int fd, user_addr_t changelist, int nchanges, user_addr_t eventlist, int nevents, user_addr_t data_out, user_size_t *data_available, unsigned int flags, int32_t *retval); -extern int kevent_qos_internal_bind(struct proc *p, - int qos, thread_t thread, unsigned int flags); -extern int kevent_qos_internal_unbind(struct proc *p, - int qos, thread_t thread, unsigned int flags); - extern int kevent_id_internal(struct proc *p, kqueue_id_t *id, user_addr_t changelist, int nchanges, user_addr_t eventlist, int nevents, @@ -1018,7 +1100,7 @@ extern int kevent_id_internal(struct proc *p, kqueue_id_t *id, #endif /* PRIVATE */ #endif /* KERNEL_PRIVATE */ -#else /* KERNEL */ +#else /* KERNEL */ #include @@ -1026,24 +1108,24 @@ struct timespec; __BEGIN_DECLS int kqueue(void); -int kevent(int kq, +int kevent(int kq, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); -int kevent64(int kq, +int kevent64(int kq, const struct kevent64_s *changelist, int nchanges, struct kevent64_s *eventlist, int nevents, - unsigned int flags, + unsigned int flags, const struct timespec *timeout); #ifdef PRIVATE -int kevent_qos(int kq, +int kevent_qos(int kq, const struct kevent_qos_s *changelist, int nchanges, struct kevent_qos_s *eventlist, int 
nevents, void *data_out, size_t *data_available, unsigned int flags); -int kevent_id(kqueue_id_t id, +int kevent_id(kqueue_id_t id, const struct kevent_qos_s *changelist, int nchanges, struct kevent_qos_s *eventlist, int nevents, void *data_out, size_t *data_available, @@ -1063,5 +1145,4 @@ __END_DECLS #endif /* PRIVATE */ - #endif /* !_SYS_EVENT_H_ */ diff --git a/bsd/sys/eventhandler.h b/bsd/sys/eventhandler.h index 79ed93512..e5b717bfe 100644 --- a/bsd/sys/eventhandler.h +++ b/bsd/sys/eventhandler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Apple Inc. All rights reserved. + * Copyright (c) 2016-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -131,8 +131,7 @@ typedef struct eventhandler_entry *eventhandler_tag; EHL_LOCK_SPIN((list)); \ } \ } \ - KASSERT((list)->el_runcount > 0, \ - ("eventhandler_invoke: runcount underflow")); \ + VERIFY((list)->el_runcount > 0); \ (list)->el_runcount--; \ if ((list)->el_runcount == 0) { \ EHL_LOCK_CONVERT((list)); \ diff --git a/bsd/sys/eventvar.h b/bsd/sys/eventvar.h index 82323625f..e60eaeb86 100644 --- a/bsd/sys/eventvar.h +++ b/bsd/sys/eventvar.h @@ -2,7 +2,7 @@ * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * + * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,10 +22,10 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/*- +/* * Copyright (c) 1999,2000 Jonathan Lemon * All rights reserved. * @@ -68,8 +68,8 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int); #include #include -#include #include +#include /* * Lock ordering: @@ -100,6 +100,40 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int); #define KQEXTENT 256 /* linear growth by this amount */ +struct knote_lock_ctx { + struct knote *knlc_knote; + thread_t knlc_thread; + // TODO: knlc_turnstile + TAILQ_HEAD(, knote_lock_ctx) knlc_head; + union { + LIST_ENTRY(knote_lock_ctx) knlc_le; + TAILQ_ENTRY(knote_lock_ctx) knlc_tqe; + }; +#if DEBUG || DEVELOPMENT +#define KNOTE_LOCK_CTX_UNLOCKED 0 +#define KNOTE_LOCK_CTX_LOCKED 1 +#define KNOTE_LOCK_CTX_WAITING 2 + int knlc_state; +#endif +}; +LIST_HEAD(knote_locks, knote_lock_ctx); + +#if DEBUG || DEVELOPMENT +/* + * KNOTE_LOCK_CTX(name) is a convenience macro to define a knote lock context on + * the stack named `name`. In development kernels, it uses tricks to make sure + * no lock was still held when exiting the C-scope that contains this context. 
+ */ +__attribute__((noinline,not_tail_called)) +void knote_lock_ctx_chk(struct knote_lock_ctx *ctx); +#define KNOTE_LOCK_CTX(n) \ + struct knote_lock_ctx n __attribute__((cleanup(knote_lock_ctx_chk))); \ + n.knlc_state = KNOTE_LOCK_CTX_UNLOCKED +#else +#define KNOTE_LOCK_CTX(n) \ + struct knote_lock_ctx n +#endif + /* * kqueue - common core definition of a kqueue * @@ -108,13 +142,17 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int); * derived from this definition. */ struct kqueue { - struct waitq_set kq_wqs; /* private waitq set */ - lck_spin_t kq_lock; /* kqueue lock */ - uint16_t kq_state; /* state of the kq */ - uint16_t kq_level; /* nesting level of the kq */ - uint32_t kq_count; /* number of queued events */ - struct proc *kq_p; /* process containing kqueue */ - struct kqtailq kq_queue[1]; /* variable array of kqtailq structs */ + struct { + struct waitq_set kq_wqs; /* private waitq set */ + lck_spin_t kq_lock; /* kqueue lock */ + uint16_t kq_state; /* state of the kq */ + uint16_t kq_level; /* nesting level of the kq */ + uint32_t kq_count; /* number of queued events */ + struct proc *kq_p; /* process containing kqueue */ + struct knote_locks kq_knlocks; /* list of knote locks held */ + lck_spin_t kq_reqlock; /* kqueue request lock */ + }; /* make sure struct padding is put before kq_queue */ + struct kqtailq kq_queue[0]; /* variable array of queues */ }; #define KQ_SEL 0x001 /* select was recorded for kq */ @@ -129,7 +167,6 @@ struct kqueue { #define KQ_DRAIN 0x200 /* kq is draining */ #define KQ_WAKEUP 0x400 /* kq awakened while processing */ #define KQ_DYNAMIC 0x800 /* kqueue is dynamically managed */ -#define KQ_NO_WQ_THREAD 0x1000 /* kq will not have workqueue threads dynamically created */ /* * kqfile - definition of a typical kqueue opened as a file descriptor * via the kqueue() system call. 
@@ -139,6 +176,7 @@ struct kqueue { */ struct kqfile { struct kqueue kqf_kqueue; /* common kqueue core */ + struct kqtailq kqf_queue; /* queue of woken up knotes */ struct kqtailq kqf_suppressed; /* suppression queue */ struct selinfo kqf_sel; /* parent select/kqueue info */ }; @@ -149,57 +187,33 @@ struct kqfile { #define kqf_level kqf_kqueue.kq_level #define kqf_count kqf_kqueue.kq_count #define kqf_p kqf_kqueue.kq_p -#define kqf_queue kqf_kqueue.kq_queue #define QOS_INDEX_KQFILE 0 /* number of qos levels in a file kq */ -struct kqr_bound { - struct kqtailq kqrb_suppressed; /* Per-QoS suppression queues */ - thread_t kqrb_thread; /* thread to satisfy request */ -}; - /* * kqrequest - per-QoS thread request status */ struct kqrequest { -#if 0 - union { - struct kqr_bound kqru_bound; /* used when thread is bound */ - struct workq_threadreq_s kqru_req; /* used when request oustanding */ - } kqr_u; -#define kqr_suppressed kqr_u.kqru_bound.kqrb_suppressed -#define kqr_thread kqr_u.kqru_bound.kqrb_thread -#define kqr_req kqr_u.kqru_req -#else - struct kqr_bound kqr_bound; /* used when thread is bound */ struct workq_threadreq_s kqr_req; /* used when request oustanding */ -#define kqr_suppressed kqr_bound.kqrb_suppressed -#define kqr_thread kqr_bound.kqrb_thread -#endif - uint8_t kqr_state; /* KQ/workq interaction state */ - uint8_t kqr_wakeup_indexes; /* QoS/override levels that woke */ - uint16_t kqr_dsync_waiters:13, /* number of dispatch sync waiters */ - kqr_dsync_owner_qos:3; /* Qos override on dispatch sync owner */ - uint16_t kqr_sync_suppress_count; /* number of suppressed sync ipc knotes */ - kq_index_t kqr_stayactive_qos:3, /* max QoS of statyactive knotes */ - kqr_owner_override_is_sync:1, /* sync owner has sync ipc override */ - kqr_override_index:3, /* highest wakeup override index */ - kqr_has_sync_override:1; /* Qos/override at UI is sync ipc override */ - - /* set under both the kqlock and the filt_wllock */ - kq_index_t :0; /* prevent bitfields 
coalescing */ - kq_index_t kqr_qos_index:4, /* QoS for the thread request */ - kqr_dsync_waiters_qos:4; /* override from dispatch sync waiters */ + struct kqtailq kqr_suppressed; /* Per-QoS suppression queues */ + thread_t kqr_thread; /* thread to satisfy request */ + uint8_t kqr_state; /* KQ/workq interaction state */ +#define KQWL_STAYACTIVE_FIRED_BIT (1 << 0) + uint8_t kqr_wakeup_indexes; /* QoS/override levels that woke */ + uint16_t kqr_dsync_waiters; /* number of dispatch sync waiters */ + kq_index_t kqr_stayactive_qos; /* max QoS of statyactive knotes */ + kq_index_t kqr_override_index; /* highest wakeup override index */ + kq_index_t kqr_qos_index; /* QoS for the thread request */ }; -#define KQR_PROCESSING 0x01 /* requested thread is running the q */ +#define KQR_WORKLOOP 0x01 /* owner is a workloop */ #define KQR_THREQUESTED 0x02 /* thread has been requested from workq */ #define KQR_WAKEUP 0x04 /* wakeup called during processing */ -#define KQR_BOUND 0x08 /* servicing thread is bound */ -#define KQR_THOVERCOMMIT 0x20 /* overcommit needed for thread requests */ -#define KQR_DRAIN 0x40 /* cancel initiated - drain fulfill */ -#define KQR_R2K_NOTIF_ARMED 0x80 /* ast notifications armed */ +#define KQR_THOVERCOMMIT 0x08 /* overcommit needed for thread requests */ +#define KQR_R2K_NOTIF_ARMED 0x10 /* ast notifications armed */ +#define KQR_ALLOCATED_TURNSTILE 0x20 /* kqwl_turnstile is allocated */ + /* * WorkQ kqueues need to request threads to service the triggered * knotes in the queue. These threads are brought up on a @@ -213,40 +227,8 @@ struct kqrequest { #define KQWQ_QOS_MANAGER (THREAD_QOS_LAST) #endif -#if !defined(KQWQ_NQOS) -#define KQWQ_NQOS (KQWQ_QOS_MANAGER + 1) -#endif - -/* - * Workq thread start out a particular effective-requested-QoS, but - * additional events processed by the filters may represent - * backlogged events that may themselves have a higher requested-QoS. 
- * To represent this, the filter may apply an override to a knote's - * requested QoS. - * - * We further segregate these overridden knotes into different buckets - * by grouping. This allows easy matching of - * knotes to process vs. the highest workq thread override applied. - * - * Only certain override patterns need to be supported. A knote - * cannot have an effective-requested-QoS of UNSPECIFIED - because - * the kevent->qos (when canonicalized) will always be above that - * or indicate manager. And we don't allow an override to specify - * manager. This results in the following buckets being needed: - * - * Effective-Requested QoS - * MAINT BG UTIL DEFAULT UINIT UINTER MANAGER - * override: - * MAINT 0 - * BG 1 6 - * UTILITY 2 7 11 - * DEFAULT 3 8 12 15 - * UINIT 4 9 13 16 18 - * UINTER 5 10 14 17 19 20 - * 21 - */ #if !defined(KQWQ_NBUCKETS) -#define KQWQ_NBUCKETS 22 +#define KQWQ_NBUCKETS (KQWQ_QOS_MANAGER + 1) #endif /* @@ -259,9 +241,8 @@ struct kqrequest { */ struct kqworkq { struct kqueue kqwq_kqueue; - struct kqtailq kqwq_queuecont[KQWQ_NBUCKETS-1]; /* continue array of queues */ - struct kqrequest kqwq_request[KQWQ_NQOS]; /* per-QoS request states */ - lck_spin_t kqwq_reqlock; /* kqueue request lock */ + struct kqtailq kqwq_queue[KQWQ_NBUCKETS]; /* array of queues */ + struct kqrequest kqwq_request[KQWQ_NBUCKETS]; /* per-QoS request states */ }; #define kqwq_wqs kqwq_kqueue.kq_wqs @@ -270,13 +251,6 @@ struct kqworkq { #define kqwq_level kqwq_kqueue.kq_level #define kqwq_count kqwq_kqueue.kq_count #define kqwq_p kqwq_kqueue.kq_p -#define kqwq_queue kqwq_kqueue.kq_queue - -#define kqwq_req_lock(kqwq) lck_spin_lock(&kqwq->kqwq_reqlock) -#define kqwq_req_unlock(kqwq) lck_spin_unlock(&kqwq->kqwq_reqlock) -#define kqwq_req_held(kqwq) LCK_SPIN_ASSERT(&kqwq->kqwq_reqlock, LCK_ASSERT_OWNED) - -#define KQWQ_THMANAGER 0x10 /* expect manager thread to run the queue */ /* * WorkLoop kqueues need to request a thread to service the triggered @@ -319,16 +293,49 @@ 
struct kqworkq { */ struct kqworkloop { struct kqueue kqwl_kqueue; /* queue of events */ - struct kqtailq kqwl_queuecont[KQWL_NBUCKETS-1]; /* continue array of queues */ + struct kqtailq kqwl_queue[KQWL_NBUCKETS]; /* array of queues */ struct kqrequest kqwl_request; /* thread request state */ - lck_spin_t kqwl_reqlock; /* kqueue request lock */ lck_mtx_t kqwl_statelock; /* state/debounce lock */ thread_t kqwl_owner; /* current [sync] owner thread */ uint32_t kqwl_retains; /* retain references */ kqueue_id_t kqwl_dynamicid; /* dynamic identity */ + uint64_t kqwl_params; /* additional parameters */ + struct turnstile *kqwl_turnstile; /* turnstile for sync IPC/waiters */ SLIST_ENTRY(kqworkloop) kqwl_hashlink; /* linkage for search list */ +#if CONFIG_WORKLOOP_DEBUG +#define KQWL_HISTORY_COUNT 32 +#define KQWL_HISTORY_WRITE_ENTRY(kqwl, ...) ({ \ + struct kqworkloop *__kqwl = (kqwl); \ + unsigned int __index = os_atomic_inc_orig(&__kqwl->kqwl_index, relaxed); \ + __kqwl->kqwl_history[__index % KQWL_HISTORY_COUNT] = \ + (struct kqwl_history)__VA_ARGS__; \ + }) + struct kqwl_history { + thread_t updater; /* Note: updates can be reordered */ + thread_t servicer; + thread_t old_owner; + thread_t new_owner; + + uint64_t kev_ident; + int16_t error; + uint16_t kev_flags; + uint32_t kev_fflags; + + uint64_t kev_mask; + uint64_t kev_value; + uint64_t in_value; + } kqwl_history[KQWL_HISTORY_COUNT]; + unsigned int kqwl_index; +#endif // CONFIG_WORKLOOP_DEBUG }; +typedef union { + struct kqueue *kq; + struct kqworkq *kqwq; + struct kqfile *kqf; + struct kqworkloop *kqwl; +} __attribute__((transparent_union)) kqueue_t; + SLIST_HEAD(kqlist, kqworkloop); #define kqwl_wqs kqwl_kqueue.kq_wqs @@ -337,30 +344,39 @@ SLIST_HEAD(kqlist, kqworkloop); #define kqwl_level kqwl_kqueue.kq_level #define kqwl_count kqwl_kqueue.kq_count #define kqwl_p kqwl_kqueue.kq_p -#define kqwl_queue kqwl_kqueue.kq_queue - -#define kqwl_req_lock(kqwl) lck_spin_lock(&kqwl->kqwl_reqlock) -#define 
kqwl_req_unlock(kqwl) lck_spin_unlock(&kqwl->kqwl_reqlock) -#define kqwl_req_held(kqwl) LCK_SPIN_ASSERT(&kqwl->kqwl_reqlock, LCK_ASSERT_OWNED) #define KQ_WORKLOOP_RETAINS_MAX UINT32_MAX -extern int workloop_fulfill_threadreq(struct proc *p, workq_threadreq_t req, thread_t thread, int flags); +extern void kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr); + +// called with the kq req held +#define KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE 0x1 +extern void kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, + thread_t thread, unsigned int flags); + +// called with the wq lock held +extern void kqueue_threadreq_bind_prepost(struct proc *p, workq_threadreq_t req, thread_t thread); + +// called with no lock held +extern void kqueue_threadreq_bind_commit(struct proc *p, thread_t thread); + +extern void kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req); + +// lock not held as kqwl_params is immutable after creation +extern workq_threadreq_param_t kqueue_threadreq_workloop_param(workq_threadreq_t req); extern struct kqueue *kqueue_alloc(struct proc *, unsigned int); extern void kqueue_dealloc(struct kqueue *); extern void knotes_dealloc(struct proc *); +extern void kqworkloops_dealloc(struct proc *); -extern void kevent_register(struct kqueue *, struct kevent_internal_s *, struct proc *); +extern int kevent_register(struct kqueue *, struct kevent_internal_s *, + struct knote_lock_ctx *); extern int kqueue_scan(struct kqueue *, kevent_callback_t, kqueue_continue_t, - void *, struct filt_process_s *, struct timeval *, struct proc *); + void *, struct filt_process_s *, struct timeval *, struct proc *); extern int kqueue_stat(struct kqueue *, void *, int, proc_t); #endif /* XNU_KERNEL_PRIVATE */ #endif /* !_SYS_EVENTVAR_H_ */ - - - - diff --git a/bsd/sys/fasttrap_impl.h b/bsd/sys/fasttrap_impl.h index 1ca389cb6..863e6037e 100644 --- a/bsd/sys/fasttrap_impl.h +++ b/bsd/sys/fasttrap_impl.h @@ -190,11 +190,6 @@ extern fasttrap_hash_t 
fasttrap_tpoints; #define FASTTRAP_TPOINTS_INDEX(pid, pc) \ (((pc) / sizeof (fasttrap_instr_t) + (pid)) & fasttrap_tpoints.fth_mask) - -#ifdef CONFIG_EMBEDDED -#define FASTTRAP_ASYNC_REMOVE -#endif - extern void fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp); /* diff --git a/bsd/sys/fbt.h b/bsd/sys/fbt.h index fdda0b161..a6411a57f 100644 --- a/bsd/sys/fbt.h +++ b/bsd/sys/fbt.h @@ -69,7 +69,5 @@ extern int fbt_enable (void *arg, dtrace_id_t id, void *parg); extern int fbt_module_excluded(struct modctl*); extern int fbt_excluded(const char *); -extern void fbt_provide_probe(struct modctl *ctl, uintptr_t instr_low, uintptr_t instr_high, char *modname, char* symbol_name, machine_inst_t* symbol_start); - -extern void fbt_provide_module_kernel_syms(struct modctl *ctl); +extern void fbt_provide_probe(struct modctl *ctl, const char *modname, const char *name, machine_inst_t *instr, machine_inst_t *limit); #endif /* _FBT_H */ diff --git a/bsd/sys/filedesc.h b/bsd/sys/filedesc.h index b79440b48..16e33533a 100644 --- a/bsd/sys/filedesc.h +++ b/bsd/sys/filedesc.h @@ -118,6 +118,9 @@ struct filedesc { /* if we're force unmounted and unable to */ /* take a vnode_ref on fd_rdir during a fork */ +#define FD_WORKLOOP 0x02 /* process has created a kqworkloop that */ + /* requires manual cleanup on exit */ + /* * Per-process open flags. 
*/ diff --git a/bsd/sys/fsctl.h b/bsd/sys/fsctl.h index 8a3624d3b..eafcb9b45 100644 --- a/bsd/sys/fsctl.h +++ b/bsd/sys/fsctl.h @@ -251,6 +251,13 @@ typedef struct disk_conditioner_info { uint64_t read_throughput_mbps; // maximum throughput for reads uint64_t write_throughput_mbps; // maximum throughput for writes int is_ssd; // behave like an SSD + + /* revision 2 */ + uint32_t ioqueue_depth; + uint32_t maxreadcnt; + uint32_t maxwritecnt; + uint32_t segreadcnt; + uint32_t segwritecnt; } disk_conditioner_info; #define FSCTL_SYNC_FULLSYNC (1<<0) /* Flush the data fully to disk, if supported by the filesystem */ @@ -328,6 +335,14 @@ typedef struct disk_conditioner_info { #define SPOTLIGHT_IOC_GET_LAST_MTIME _IOR('h', 19, u_int32_t) #define SPOTLIGHT_FSCTL_GET_LAST_MTIME IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME) +/* Mark file's extents as "frozen" because someone has references to physical address */ +#define FSIOC_FREEZE_EXTENTS _IO('h', 20) +#define FSCTL_FREEZE_EXTENTS IOCBASECMD(FSIOC_FREEZE_EXTENTS) + +/* Clear the "frozen" status of file's extents */ +#define FSIOC_THAW_EXTENTS _IO('h', 21) +#define FSCTL_THAW_EXTENTS IOCBASECMD(FSIOC_THAW_EXTENTS) + #ifndef KERNEL #include diff --git a/bsd/sys/guarded.h b/bsd/sys/guarded.h index ae1ec05f9..f445d4fd6 100644 --- a/bsd/sys/guarded.h +++ b/bsd/sys/guarded.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Apple Inc. All rights reserved. + * Copyright (c) 2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,8 +150,6 @@ enum guard_vn_exception_codes { kGUARD_EXC_EXCHDATA = VNG_EXCHDATA, }; -#if defined(KERNEL) - /* Guard violation behaviors: not all combinations make sense */ #define kVNG_POLICY_LOGMSG (1u << 0) @@ -159,7 +157,9 @@ enum guard_vn_exception_codes { #define kVNG_POLICY_EXC (1u << 2) #define kVNG_POLICY_EXC_CORPSE (1u << 3) #define kVNG_POLICY_SIGKILL (1u << 4) +#define kVNG_POLICY_UPRINTMSG (1u << 5) +#if defined(KERNEL) extern int vnguard_exceptions_active(void); extern void vnguard_policy_init(void); #endif /* KERNEL */ diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index c1fbde252..80344fce5 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -127,16 +127,18 @@ struct image_params { /* * Image flags */ -#define IMGPF_NONE 0x00000000 /* No flags */ -#define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ -#define IMGPF_RESERVED 0x00000002 -#define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ -#define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ -#define IMGPF_SPAWN 0x00000010 /* spawn (without setexec) */ -#define IMGPF_DISABLE_ASLR 0x00000020 /* disable ASLR */ +#define IMGPF_NONE 0x00000000 /* No flags */ +#define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ +#define IMGPF_RESERVED 0x00000002 +#define IMGPF_WAS_64BIT_ADDR 0x00000004 /* exec from a 64Bit address space */ +#define IMGPF_IS_64BIT_ADDR 0x00000008 /* exec to a 64Bit address space */ +#define IMGPF_SPAWN 0x00000010 /* spawn (without setexec) */ +#define IMGPF_DISABLE_ASLR 0x00000020 /* disable ASLR */ #define IMGPF_ALLOW_DATA_EXEC 0x00000040 /* forcibly disallow data execution */ -#define IMGPF_VFORK_EXEC 0x00000080 /* vfork followed by exec */ -#define IMGPF_EXEC 0x00000100 /* exec */ +#define IMGPF_VFORK_EXEC 0x00000080 /* vfork followed by exec */ +#define IMGPF_EXEC 0x00000100 /* exec */ #define IMGPF_HIGH_BITS_ASLR 0x00000200 /* randomize high bits of ASLR slide */ +#define 
IMGPF_IS_64BIT_DATA 0x00000400 /* exec to a 64Bit register state */ + #endif /* !_SYS_IMGACT */ diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h index 6d46a5afb..48ae2b3f2 100644 --- a/bsd/sys/kauth.h +++ b/bsd/sys/kauth.h @@ -582,6 +582,7 @@ __END_DECLS #define KAUTH_FILEOP_LINK 5 #define KAUTH_FILEOP_EXEC 6 #define KAUTH_FILEOP_DELETE 7 +#define KAUTH_FILEOP_WILL_RENAME 8 /* * arguments passed to KAUTH_FILEOP_OPEN listeners @@ -591,6 +592,10 @@ __END_DECLS * arg0 is pointer to vnode (vnode *) for file to be closed. * arg1 is pointer to path (char *) of file to be closed. * arg2 is close flags. + * arguments passed to KAUTH_FILEOP_WILL_RENAME listeners + * arg0 is pointer to vnode (vnode *) of the file being renamed + * arg1 is pointer to the "from" path (char *) + * arg2 is pointer to the "to" path (char *) * arguments passed to KAUTH_FILEOP_RENAME listeners * arg0 is pointer to "from" path (char *). * arg1 is pointer to "to" path (char *). diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 132698775..7d5f89cf8 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -47,10 +47,6 @@ __BEGIN_DECLS #include #endif -#ifdef XNU_KERNEL_PRIVATE -#include /* __improbable */ -#endif - /* * Kdebug is a facility for tracing events occurring on a system. 
* @@ -190,7 +186,7 @@ extern void kernel_debug_enter( #define DBG_DRIVERS 6 #define DBG_TRACE 7 #define DBG_DLIL 8 -#define DBG_WORKQUEUE 9 +#define DBG_PTHREAD 9 #define DBG_CORESTORAGE 10 #define DBG_CG 11 #define DBG_MONOTONIC 12 @@ -211,6 +207,7 @@ extern void kernel_debug_enter( #define DBG_DISPATCH 46 #define DBG_IMG 49 #define DBG_UMALLOC 51 +#define DBG_TURNSTILE 53 #define DBG_MIG 255 @@ -389,12 +386,14 @@ extern void kdebug_reset(void); #define DBG_MACH_ZALLOC 0xA5 /* Zone allocator */ #define DBG_MACH_THREAD_GROUP 0xA6 /* Thread groups */ #define DBG_MACH_COALITION 0xA7 /* Coalitions */ +#define DBG_MACH_SHAREDREGION 0xA8 /* Shared region */ /* Interrupt type bits for DBG_MACH_EXCP_INTR */ #define DBG_INTR_TYPE_UNKNOWN 0x0 /* default/unknown interrupt */ #define DBG_INTR_TYPE_IPI 0x1 /* interprocessor interrupt */ #define DBG_INTR_TYPE_TIMER 0x2 /* timer interrupt */ #define DBG_INTR_TYPE_OTHER 0x3 /* other (usually external) interrupt */ +#define DBG_INTR_TYPE_PMI 0x4 /* performance monitor interrupt */ /* Codes for Scheduler (DBG_MACH_SCHED) */ #define MACH_SCHED 0x0 /* Scheduler */ @@ -404,8 +403,8 @@ extern void kdebug_reset(void); #define MACH_CALLOUT 0x4 /* callouts */ #define MACH_STACK_DETACH 0x5 #define MACH_MAKE_RUNNABLE 0x6 /* make thread runnable */ -#define MACH_PROMOTE 0x7 /* promoted due to resource */ -#define MACH_DEMOTE 0x8 /* promotion undone */ +#define MACH_PROMOTE 0x7 /* promoted due to resource (replaced by MACH_PROMOTED) */ +#define MACH_DEMOTE 0x8 /* promotion undone (replaced by MACH_UNPROMOTED) */ #define MACH_IDLE 0x9 /* processor idling */ #define MACH_STACK_DEPTH 0xa /* stack depth at switch */ #define MACH_MOVED 0xb /* did not use original scheduling decision */ @@ -447,6 +446,11 @@ extern void kdebug_reset(void); #define MACH_EXEC_DEMOTE 0x31 /* Thread demoted from exec boost */ #define MACH_AMP_SIGNAL_SPILL 0x32 /* AMP spill signal sent to cpuid */ #define MACH_AMP_STEAL 0x33 /* AMP thread stolen or spilled */ +#define 
MACH_SCHED_LOAD_EFFECTIVE 0x34 /* Effective scheduler load */ +#define MACH_PROMOTED 0x35 /* thread promoted due to mutex priority promotion */ +#define MACH_UNPROMOTED 0x36 /* thread unpromoted due to mutex priority promotion */ +#define MACH_PROMOTED_UPDATE 0x37 /* thread already promoted, but promotion priority changed */ +#define MACH_QUIESCENT_COUNTER 0x38 /* quiescent counter tick */ /* Variants for MACH_MULTIQ_DEQUEUE */ #define MACH_MULTIQ_BOUND 1 @@ -478,6 +482,7 @@ extern void kdebug_reset(void); #define MACH_IPC_VOUCHER_DESTROY 0x9 /* Voucher removed from global voucher hashtable */ #define MACH_IPC_KMSG_INFO 0xa /* Send/Receive info for a kmsg */ #define MACH_IPC_KMSG_LINK 0xb /* link a kernel kmsg pointer to user mach_msg_header_t */ +#define MACH_IPC_PORT_ENTRY_MODIFY 0xc /* A port space gained or lost a port right (reference) */ /* Codes for thread groups (DBG_MACH_THREAD_GROUP) */ #define MACH_THREAD_GROUP_NEW 0x0 @@ -513,6 +518,9 @@ extern void kdebug_reset(void); #define PMAP__FLUSH_TLBS_TO 0xf #define PMAP__FLUSH_EPT 0x10 #define PMAP__FAST_FAULT 0x11 +#define PMAP__SWITCH 0x12 +#define PMAP__TTE 0x13 +#define PMAP__SWITCH_USER_TTB 0x14 /* Codes for clock (DBG_MACH_CLOCK) */ #define MACH_EPOCH_CHANGE 0x0 /* wake epoch change */ @@ -661,6 +669,7 @@ extern void kdebug_reset(void); #define DBG_DRVSSM 24 /* System State Manager(AppleSSM) */ #define DBG_DRVSMC 25 /* System Management Controller */ #define DBG_DRVMACEFIMANAGER 26 /* Mac EFI Manager */ +#define DBG_DRVANE 27 /* ANE */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ @@ -674,7 +683,11 @@ extern void kdebug_reset(void); #define DBG_DLIL_IF_FLT 5 /* DLIL Interface FIlter */ -/* The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) */ +/* + * The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) + * + * Please NOTE: sub class values 0xC and 0xD are currently unused. 
+ */ #define DBG_FSRW 0x1 /* reads and writes to the filesystem */ #define DBG_DKRW 0x2 /* reads and writes to the disk */ #define DBG_FSVN 0x3 /* vnode operations (inc. locking/unlocking) */ @@ -690,6 +703,7 @@ extern void kdebug_reset(void); #define DBG_MSDOS 0xF /* FAT-specific events; see the msdosfs project */ #define DBG_ACFS 0x10 /* Xsan-specific events; see the XsanFS project */ #define DBG_THROTTLE 0x11 /* I/O Throttling events */ +#define DBG_DECMP 0x12 /* Decmpfs-specific events */ #define DBG_CONTENT_PROT 0xCF /* Content Protection Events: see bsd/sys/cprotect.h */ /* @@ -736,7 +750,9 @@ extern void kdebug_reset(void); #ifdef PRIVATE #define BSD_MEMSTAT_GRP_SET_PROP 12 /* set group properties */ #define BSD_MEMSTAT_DO_KILL 13 /* memorystatus kills */ +#define BSD_MEMSTAT_CHANGE_PRIORITY 14 /* priority changed */ #endif /* PRIVATE */ +#define BSD_MEMSTAT_FAST_JETSAM 15 /* Aggressive jetsam ("clear-the-deck") */ /* Codes for BSD subcode class DBG_BSD_KEVENT */ #define BSD_KEVENT_KQ_PROCESS_BEGIN 1 @@ -760,6 +776,7 @@ extern void kdebug_reset(void); #define BSD_KEVENT_KQWL_BIND 19 #define BSD_KEVENT_KQWL_UNBIND 20 #define BSD_KEVENT_KNOTE_ENABLE 21 +#define BSD_KEVENT_KNOTE_VANISHED 22 /* The Kernel Debug Sub Classes for DBG_TRACE */ #define DBG_TRACE_DATA 0 @@ -795,12 +812,14 @@ extern void kdebug_reset(void); /* The Kernel Debug Sub Classes for DBG_MONOTONIC */ #define DBG_MT_INSTRS_CYCLES 1 +#define DBG_MT_DEBUG 2 #define DBG_MT_TMPTH 0xfe #define DBG_MT_TMPCPU 0xff /* The Kernel Debug Sub Classes for DBG_MISC */ -#define DBG_EVENT 0x10 -#define DBG_BUFFER 0x20 +#define DBG_EVENT 0x10 +#define DBG_MISC_LAYOUT 0x1a +#define DBG_BUFFER 0x20 /* The Kernel Debug Sub Classes for DBG_DYLD */ #define DBG_DYLD_UUID (5) @@ -841,7 +860,9 @@ extern void kdebug_reset(void); #define DBG_APP_SYSTEMUI 0x05 #define DBG_APP_SIGNPOST 0x0A #define DBG_APP_APPKIT 0x0C +#define DBG_APP_UIKIT 0x0D #define DBG_APP_DFR 0x0E +#define DBG_APP_LAYOUT 0x0F #define DBG_APP_SAMBA 
0x80 #define DBG_APP_EOSSUPPORT 0x81 #define DBG_APP_MACEFIMANAGER 0x82 @@ -898,6 +919,32 @@ extern void kdebug_reset(void); #define IMP_SYNC_IPC_QOS_OVERFLOW 0x2 #define IMP_SYNC_IPC_QOS_UNDERFLOW 0x3 +/* Subclasses for Turnstiles (DBG_TURNSTILE) */ +#define TURNSTILE_HEAP_OPERATIONS 0x10 +#define TURNSTILE_PRIORITY_OPERATIONS 0x20 +#define TURNSTILE_FREELIST_OPERATIONS 0x30 + +/* Codes for TURNSTILE_HEAP_OPERATIONS */ +#define THREAD_ADDED_TO_TURNSTILE_WAITQ 0x1 +#define THREAD_REMOVED_FROM_TURNSTILE_WAITQ 0x2 +#define THREAD_MOVED_IN_TURNSTILE_WAITQ 0x3 +#define TURNSTILE_ADDED_TO_TURNSTILE_HEAP 0x4 +#define TURNSTILE_REMOVED_FROM_TURNSTILE_HEAP 0x5 +#define TURNSTILE_MOVED_IN_TURNSTILE_HEAP 0x6 +#define TURNSTILE_ADDED_TO_THREAD_HEAP 0x7 +#define TURNSTILE_REMOVED_FROM_THREAD_HEAP 0x8 +#define TURNSTILE_MOVED_IN_THREAD_HEAP 0x9 +#define TURNSTILE_UPDATE_STOPPED_BY_LIMIT 0xa +#define THREAD_NOT_WAITING_ON_TURNSTILE 0xb + +/* Codes for TURNSTILE_PRIORITY_OPERATIONS */ +#define TURNSTILE_PRIORITY_CHANGE 0x1 +#define THREAD_USER_PROMOTION_CHANGE 0x2 + +/* Codes for TURNSTILE_FREELIST_OPERATIONS */ +#define TURNSTILE_PREPARE 0x1 +#define TURNSTILE_COMPLETE 0x2 + /* Subclasses for MACH Bank Voucher Attribute Manager (DBG_BANK) */ #define BANK_ACCOUNT_INFO 0x10 /* Trace points related to bank account struct */ #define BANK_TASK_INFO 0x11 /* Trace points related to bank task struct */ @@ -968,6 +1015,7 @@ extern void kdebug_reset(void); #define IMPORTANCE_CODE(SubClass, code) KDBG_CODE(DBG_IMPORTANCE, (SubClass), (code)) #define BANK_CODE(SubClass, code) KDBG_CODE(DBG_BANK, (SubClass), (code)) #define ATM_CODE(SubClass, code) KDBG_CODE(DBG_ATM, (SubClass), (code)) +#define TURNSTILE_CODE(SubClass, code) KDBG_CODE(DBG_TURNSTILE, (SubClass), (code)) /* Kernel Debug Macros for specific daemons */ #define COREDUETDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_COREDUET, code) @@ -1002,16 +1050,23 @@ extern void kdebug_reset(void); */ /* - * Traced on debug and development (and 
release OS X) kernels. + * Traced on debug and development (and release macOS) kernels. */ #define KDBG(x, ...) KDBG_(, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) /* - * Traced on debug and development (and release OS X) kernels if explicitly + * Traced on debug and development (and release macOS) kernels if explicitly * requested. Omitted from tracing without a typefilter. */ #define KDBG_FILTERED(x, ...) KDBG_(_FILTERED, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) +/* + * Traced on debug and development (and release macOS) kernels, even if the + * process filter would reject it. + */ +#define KDBG_RELEASE_NOPROCFILT(x, ...) \ + KDBG_(_RELEASE_NOPROCFILT, x, ## __VA_ARGS__, 4, 3, 2, 1, 0) + /* * Traced on debug, development, and release kernels. * @@ -1096,17 +1151,31 @@ extern unsigned int kdebug_enable; * tracing without a typefilter. */ #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) -#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...) \ - do { \ - if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ - kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b), \ - (uintptr_t)(c), (uintptr_t)(d)); \ - } \ +#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d)); \ + } \ } while (0) #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ #define KERNEL_DEBUG_CONSTANT_FILTERED(type, x, a, b, c, d, ...) do {} while (0) #endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */ +#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) +#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \ + do { \ + if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) { \ + kernel_debug_flags((x), (uintptr_t)(a), (uintptr_t)(b), \ + (uintptr_t)(c), (uintptr_t)(d), KDBG_FLAG_NOPROCFILT); \ + } \ + } while (0) +#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ +#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) 
\ + do { } while (0) +#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ + + #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) #define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e) \ do { \ @@ -1227,6 +1296,17 @@ extern void kernel_debug1( uintptr_t arg4, uintptr_t arg5); +#define KDBG_FLAG_FILTERED 0x01 +#define KDBG_FLAG_NOPROCFILT 0x02 + +extern void kernel_debug_flags( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uint64_t flags); + extern void kernel_debug_filtered( uint32_t debugid, uintptr_t arg1, @@ -1398,7 +1478,15 @@ boolean_t kdebug_debugid_enabled(uint32_t debugid); boolean_t kdebug_debugid_explicitly_enabled(uint32_t debugid); uint32_t kdebug_commpage_state(void); -void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t lookup); + +#define KDBG_VFS_LOOKUP_FLAG_LOOKUP 0x01 +#define KDBG_VFS_LOOKUP_FLAG_NOPROCFILT 0x02 +void kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp, + uint32_t flags); + +void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, + boolean_t lookup); + void kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid); void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4); @@ -1409,8 +1497,6 @@ void kdebug_trace_start(unsigned int n_events, const char *filterdesc, boolean_t wrapping, boolean_t at_wake); void kdebug_free_early_buf(void); struct task; -boolean_t disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags); -void enable_wrap(uint32_t old_slowcheck, boolean_t lostevents); void release_storage_unit(int cpu, uint32_t storage_unit); int allocate_storage_unit(int cpu); @@ -1427,78 +1513,92 @@ __END_DECLS * private kernel_debug definitions */ +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. 
+ */ +#if defined(__arm64__) +typedef uint64_t kd_buf_argtype; +#else +typedef uintptr_t kd_buf_argtype; +#endif + typedef struct { uint64_t timestamp; - uintptr_t arg1; - uintptr_t arg2; - uintptr_t arg3; - uintptr_t arg4; - uintptr_t arg5; /* the thread ID */ + kd_buf_argtype arg1; + kd_buf_argtype arg2; + kd_buf_argtype arg3; + kd_buf_argtype arg4; + kd_buf_argtype arg5; /* the thread ID */ uint32_t debugid; -#if defined(__LP64__) +/* + * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf + * structure. + */ +#if defined(__LP64__) || defined(__arm64__) uint32_t cpuid; - uintptr_t unused; + kd_buf_argtype unused; #endif } kd_buf; -#if !defined(__LP64__) -#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL -#define KDBG_CPU_MASK 0xff00000000000000ULL -#define KDBG_CPU_SHIFT 56 +#if defined(__LP64__) || defined(__arm64__) +#define KDBG_TIMESTAMP_MASK 0xffffffffffffffffULL static inline void kdbg_set_cpu(kd_buf *kp, int cpu) { - kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) | - (((uint64_t) cpu) << KDBG_CPU_SHIFT); + kp->cpuid = (unsigned int)cpu; } static inline int kdbg_get_cpu(kd_buf *kp) { - return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT); + return (int)kp->cpuid; } static inline void kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) { - kp->timestamp = thetime & KDBG_TIMESTAMP_MASK; + kp->timestamp = thetime; } static inline uint64_t kdbg_get_timestamp(kd_buf *kp) { - return kp->timestamp & KDBG_TIMESTAMP_MASK; + return kp->timestamp; } static inline void kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) { - kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) | - (((uint64_t) cpu) << KDBG_CPU_SHIFT); + kdbg_set_timestamp(kp, thetime); + kdbg_set_cpu(kp, cpu); } #else -#define KDBG_TIMESTAMP_MASK 0xffffffffffffffffULL +#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL +#define KDBG_CPU_MASK 0xff00000000000000ULL +#define KDBG_CPU_SHIFT 56 static inline void kdbg_set_cpu(kd_buf *kp, int cpu) { - kp->cpuid = 
(unsigned int)cpu; + kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) | + (((uint64_t) cpu) << KDBG_CPU_SHIFT); } static inline int kdbg_get_cpu(kd_buf *kp) { - return (int)kp->cpuid; + return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT); } static inline void kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) { - kp->timestamp = thetime; + kp->timestamp = thetime & KDBG_TIMESTAMP_MASK; } static inline uint64_t kdbg_get_timestamp(kd_buf *kp) { - return kp->timestamp; + return kp->timestamp & KDBG_TIMESTAMP_MASK; } static inline void kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) { - kdbg_set_timestamp(kp, thetime); - kdbg_set_cpu(kp, cpu); + kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) | + (((uint64_t) cpu) << KDBG_CPU_SHIFT); } #endif @@ -1570,7 +1670,11 @@ typedef struct { typedef struct { /* the thread ID */ +#if defined(__arm64__) + uint64_t thread; +#else uintptr_t thread; +#endif /* 0 for invalid, otherwise the PID (or 1 for kernel_task) */ int valid; /* the name of the process owning the thread */ diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index 52bce789c..c3f5cec27 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,6 +96,31 @@ typedef struct memorystatus_priority_entry { uint32_t state; } memorystatus_priority_entry_t; +/* + * This should be the structure to specify different properties + * for processes (group or single) from user-space. Unfortunately, + * we can't move to it completely because the priority_entry structure + * above has been in use for a while now. We'll have to deprecate it. + * + * To support new fields/properties, we will add a new structure with a + * new version and a new size. 
+ */ +#define MEMORYSTATUS_MPE_VERSION_1 1 + +#define MEMORYSTATUS_MPE_VERSION_1_SIZE sizeof(struct memorystatus_properties_entry_v1) + +typedef struct memorystatus_properties_entry_v1 { + int version; + pid_t pid; + int32_t priority; + int use_probability; + uint64_t user_data; + int32_t limit; /* MB */ + uint32_t state; + char proc_name[MAXCOMLEN+1]; + char __pad1[3]; +} memorystatus_properties_entry_v1_t; + typedef struct memorystatus_kernel_stats { uint32_t free_pages; uint32_t active_pages; @@ -131,7 +156,6 @@ typedef struct jetsam_snapshot_entry { uint64_t user_data; uint64_t killed; uint64_t pages; - uint64_t max_pages; uint64_t max_pages_lifetime; uint64_t purgeable_pages; uint64_t jse_internal_pages; @@ -148,7 +172,8 @@ typedef struct jetsam_snapshot_entry { uint64_t jse_killtime; /* absolute time when jetsam chooses to kill a process */ uint64_t jse_idle_delta; /* time spent in idle band */ uint64_t jse_coalition_jetsam_id; /* we only expose coalition id for COALITION_TYPE_JETSAM */ - struct timeval cpu_time; + struct timeval64 cpu_time; + uint64_t jse_thaw_count; } memorystatus_jetsam_snapshot_entry_t; typedef struct jetsam_snapshot { @@ -185,19 +210,21 @@ typedef struct memorystatus_freeze_entry { * kMemorystatusKilled... 
Cause enum * memorystatus_kill_cause_name[] */ -#define JETSAM_REASON_INVALID 0 -#define JETSAM_REASON_GENERIC 1 -#define JETSAM_REASON_MEMORY_HIGHWATER 2 -#define JETSAM_REASON_VNODE 3 -#define JETSAM_REASON_MEMORY_VMPAGESHORTAGE 4 -#define JETSAM_REASON_MEMORY_VMTHRASHING 5 -#define JETSAM_REASON_MEMORY_FCTHRASHING 6 -#define JETSAM_REASON_MEMORY_PERPROCESSLIMIT 7 -#define JETSAM_REASON_MEMORY_DIAGNOSTIC 8 -#define JETSAM_REASON_MEMORY_IDLE_EXIT 9 -#define JETSAM_REASON_ZONE_MAP_EXHAUSTION 10 - -#define JETSAM_REASON_MEMORYSTATUS_MAX JETSAM_REASON_ZONE_MAP_EXHAUSTION +#define JETSAM_REASON_INVALID 0 +#define JETSAM_REASON_GENERIC 1 +#define JETSAM_REASON_MEMORY_HIGHWATER 2 +#define JETSAM_REASON_VNODE 3 +#define JETSAM_REASON_MEMORY_VMPAGESHORTAGE 4 +#define JETSAM_REASON_MEMORY_PROCTHRASHING 5 +#define JETSAM_REASON_MEMORY_FCTHRASHING 6 +#define JETSAM_REASON_MEMORY_PERPROCESSLIMIT 7 +#define JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE 8 +#define JETSAM_REASON_MEMORY_IDLE_EXIT 9 +#define JETSAM_REASON_ZONE_MAP_EXHAUSTION 10 +#define JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING 11 +#define JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE 12 + +#define JETSAM_REASON_MEMORYSTATUS_MAX JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE /* * Jetsam exit reason definitions - not related to memorystatus @@ -206,19 +233,26 @@ typedef struct memorystatus_freeze_entry { /* Cause */ enum { - kMemorystatusInvalid = JETSAM_REASON_INVALID, - kMemorystatusKilled = JETSAM_REASON_GENERIC, - kMemorystatusKilledHiwat = JETSAM_REASON_MEMORY_HIGHWATER, - kMemorystatusKilledVnodes = JETSAM_REASON_VNODE, - kMemorystatusKilledVMPageShortage = JETSAM_REASON_MEMORY_VMPAGESHORTAGE, - kMemorystatusKilledVMThrashing = JETSAM_REASON_MEMORY_VMTHRASHING, - kMemorystatusKilledFCThrashing = JETSAM_REASON_MEMORY_FCTHRASHING, - kMemorystatusKilledPerProcessLimit = JETSAM_REASON_MEMORY_PERPROCESSLIMIT, - kMemorystatusKilledDiagnostic = JETSAM_REASON_MEMORY_DIAGNOSTIC, - kMemorystatusKilledIdleExit = 
JETSAM_REASON_MEMORY_IDLE_EXIT, - kMemorystatusKilledZoneMapExhaustion = JETSAM_REASON_ZONE_MAP_EXHAUSTION + kMemorystatusInvalid = JETSAM_REASON_INVALID, + kMemorystatusKilled = JETSAM_REASON_GENERIC, + kMemorystatusKilledHiwat = JETSAM_REASON_MEMORY_HIGHWATER, + kMemorystatusKilledVnodes = JETSAM_REASON_VNODE, + kMemorystatusKilledVMPageShortage = JETSAM_REASON_MEMORY_VMPAGESHORTAGE, + kMemorystatusKilledProcThrashing = JETSAM_REASON_MEMORY_PROCTHRASHING, + kMemorystatusKilledFCThrashing = JETSAM_REASON_MEMORY_FCTHRASHING, + kMemorystatusKilledPerProcessLimit = JETSAM_REASON_MEMORY_PERPROCESSLIMIT, + kMemorystatusKilledDiskSpaceShortage = JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE, + kMemorystatusKilledIdleExit = JETSAM_REASON_MEMORY_IDLE_EXIT, + kMemorystatusKilledZoneMapExhaustion = JETSAM_REASON_ZONE_MAP_EXHAUSTION, + kMemorystatusKilledVMCompressorThrashing = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING, + kMemorystatusKilledVMCompressorSpaceShortage = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE, }; +/* For backwards compatibility */ +#define kMemorystatusKilledDiagnostic kMemorystatusKilledDiskSpaceShortage +#define kMemorystatusKilledVMThrashing kMemorystatusKilledVMCompressorThrashing +#define JETSAM_REASON_MEMORY_VMTHRASHING JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING + /* Memorystatus control */ #define MEMORYSTATUS_BUFFERSIZE_MAX 65536 @@ -241,8 +275,19 @@ int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags, void *bu #define MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE 11 /* Enable the 'lenient' mode for aggressive jetsam. See comments in kern_memorystatus.c near the top. */ #define MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE 12 /* Disable the 'lenient' mode for aggressive jetsam. 
*/ #define MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS 13 /* Compute how much a process's phys_footprint exceeds inactive memory limit */ -#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE 14 -#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE 15 +#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE 14 /* Set the inactive jetsam band for a process to JETSAM_PRIORITY_ELEVATED_INACTIVE */ +#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE 15 /* Reset the inactive jetsam band for a process to the default band (0)*/ +#define MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED 16 /* (Re-)Set state on a process that marks it as (un-)managed by a system entity e.g. assertiond */ +#define MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED 17 /* Return the 'managed' status of a process */ +#define MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE 18 /* Is the process eligible for freezing? Apps and extensions can pass in FALSE to opt out of freezing, i.e., + if they would prefer being jetsam'ed in the idle band to being frozen in an elevated band. */ +#define MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE 19 /* Return the freezable state of a process. 
*/ + +#if CONFIG_FREEZE +#if DEVELOPMENT || DEBUG +#define MEMORYSTATUS_CMD_FREEZER_CONTROL 20 +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_FREEZE */ /* Commands that act on a group of processes */ #define MEMORYSTATUS_CMD_GRP_SET_PROPERTIES 100 @@ -268,6 +313,14 @@ typedef struct memorystatus_jetsam_panic_options { #endif /* PRIVATE */ +/* memorystatus_control() flags */ + +#define MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND 0x1 /* A populated snapshot buffer is returned on demand */ +#define MEMORYSTATUS_FLAGS_SNAPSHOT_AT_BOOT 0x2 /* Returns a snapshot with memstats collected at boot */ +#define MEMORYSTATUS_FLAGS_SNAPSHOT_COPY 0x4 /* Returns the previously populated snapshot created by the system */ +#define MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY 0x8 /* Set jetsam priorities for a group of pids */ +#define MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY 0x10 /* Set probability of use for a group of processes */ + /* * For use with memorystatus_control: * MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT @@ -287,14 +340,18 @@ typedef struct memorystatus_jetsam_panic_options { * stats do not change. In this mode, * the snapshot entry_count is always 0. * + * Copy mode - this returns the previous snapshot + * collected by the system. The current snapshot + * might be only half populated. + * * Snapshots are inherently racey between request * for buffer size and actual data compilation. 
*/ -/* Flags */ -#define MEMORYSTATUS_SNAPSHOT_ON_DEMAND 0x1 /* A populated snapshot buffer is returned on demand */ -#define MEMORYSTATUS_SNAPSHOT_AT_BOOT 0x2 /* Returns a snapshot with memstats collected at boot */ - +/* These definitions are required for backwards compatibility */ +#define MEMORYSTATUS_SNAPSHOT_ON_DEMAND MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND +#define MEMORYSTATUS_SNAPSHOT_AT_BOOT MEMORYSTATUS_FLAGS_SNAPSHOT_AT_BOOT +#define MEMORYSTATUS_SNAPSHOT_COPY MEMORYSTATUS_FLAGS_SNAPSHOT_COPY /* * For use with memorystatus_control: @@ -348,19 +405,19 @@ typedef struct memorystatus_memlimit_properties { * - in kernel process state and memlimit state */ -#define P_MEMSTAT_SUSPENDED 0x00000001 -#define P_MEMSTAT_FROZEN 0x00000002 -#define P_MEMSTAT_NORECLAIM 0x00000004 -#define P_MEMSTAT_ERROR 0x00000008 -#define P_MEMSTAT_LOCKED 0x00000010 -#define P_MEMSTAT_TERMINATED 0x00000020 -#define P_MEMSTAT_NOTFIED 0x00000040 -#define P_MEMSTAT_PRIORITYUPDATED 0x00000080 -#define P_MEMSTAT_FOREGROUND 0x00000100 -#define P_MEMSTAT_DIAG_SUSPENDED 0x00000200 -#define P_MEMSTAT_PRIOR_THAW 0x00000400 -/* unused 0x00000800 */ -#define P_MEMSTAT_INTERNAL 0x00001000 +#define P_MEMSTAT_SUSPENDED 0x00000001 /* Process is suspended and likely in the IDLE band */ +#define P_MEMSTAT_FROZEN 0x00000002 /* Process has some state on disk. It should be suspended */ +#define P_MEMSTAT_FREEZE_DISABLED 0x00000004 /* Process isn't freeze-eligible and will not be frozen */ +#define P_MEMSTAT_ERROR 0x00000008 /* Process couldn't be jetsammed for some reason. 
Transient state so jetsam can skip it next time it sees it */ +#define P_MEMSTAT_LOCKED 0x00000010 /* Process is being actively worked on behind the proc_list_lock */ +#define P_MEMSTAT_TERMINATED 0x00000020 /* Process is exiting */ +#define P_MEMSTAT_FREEZE_IGNORE 0x00000040 /* Process was evaluated by freezer and will be ignored till the next time it goes active and does something */ +#define P_MEMSTAT_PRIORITYUPDATED 0x00000080 /* Process had its jetsam priority updated */ +#define P_MEMSTAT_FOREGROUND 0x00000100 /* Process is in the FG jetsam band...unused??? */ +#define P_MEMSTAT_DIAG_SUSPENDED 0x00000200 /* ...unused??? */ +#define P_MEMSTAT_REFREEZE_ELIGIBLE 0x00000400 /* Process was once thawed i.e. its state was brought back from disk. It is now refreeze eligible.*/ +#define P_MEMSTAT_MANAGED 0x00000800 /* Process is managed by assertiond i.e. is either application or extension */ +#define P_MEMSTAT_INTERNAL 0x00001000 /* Process is a system-critical-not-be-jetsammed process i.e. 
launchd */ #define P_MEMSTAT_FATAL_MEMLIMIT 0x00002000 /* current fatal state of the process's memlimit */ #define P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL 0x00004000 /* if set, exceeding limit is fatal when the process is active */ #define P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL 0x00008000 /* if set, exceeding limit is fatal when the process is inactive */ @@ -378,7 +435,7 @@ extern int memorystatus_update(proc_t p, int priority, uint64_t user_data, boole extern int memorystatus_remove(proc_t p, boolean_t locked); -int memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t opflags, boolean_t effective_now); +int memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t opflags, int priority, boolean_t effective_now); extern int memorystatus_dirty_track(proc_t p, uint32_t pcontrol); @@ -405,7 +462,7 @@ void memorystatus_knote_unregister(struct knote *kn); void memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal); void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal); void proc_memstat_terminated(proc_t p, boolean_t set); -boolean_t memorystatus_proc_is_dirty_unsafe(void *v); +void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); #endif /* CONFIG_MEMORYSTATUS */ int memorystatus_get_pressure_status_kdp(void); @@ -420,19 +477,18 @@ typedef enum memorystatus_policy { kPolicyDiagnoseActive = (kPolicyDiagnoseAll | kPolicyDiagnoseFirst), } memorystatus_policy_t; -extern int memorystatus_jetsam_wakeup; -extern unsigned int memorystatus_jetsam_running; - boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async); boolean_t memorystatus_kill_on_FC_thrashing(boolean_t async); +boolean_t memorystatus_kill_on_VM_compressor_thrashing(boolean_t async); boolean_t memorystatus_kill_on_vnode_limit(void); void jetsam_on_ledger_cpulimit_exceeded(void); +void 
memorystatus_fast_jetsam_override(boolean_t enable_override); #endif /* CONFIG_JETSAM */ boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid); -boolean_t memorystatus_kill_on_VM_thrashing(boolean_t async); +boolean_t memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async); void memorystatus_pages_update(unsigned int pages_avail); boolean_t memorystatus_idle_exit_from_VM(void); @@ -440,13 +496,19 @@ boolean_t memorystatus_idle_exit_from_VM(void); #ifdef CONFIG_FREEZE -#define FREEZE_PAGES_MIN ( 1 * 1024 * 1024 / PAGE_SIZE) -#define FREEZE_PAGES_MAX (16 * 1024 * 1024 / PAGE_SIZE) +#define FREEZE_PAGES_MIN ( 8 * 1024 * 1024 / PAGE_SIZE) +#define FREEZE_PAGES_MAX (32 * 1024 * 1024 / PAGE_SIZE) -#define FREEZE_SUSPENDED_THRESHOLD_LOW 2 #define FREEZE_SUSPENDED_THRESHOLD_DEFAULT 4 +#define FREEZE_PROCESSES_MAX 20 #define FREEZE_DAILY_MB_MAX_DEFAULT 1024 +#define FREEZE_DEGRADATION_BUDGET_THRESHOLD 25 //degraded perf. when the daily budget left falls below this threshold percentage + +#define MAX_FROZEN_SHARED_MB_PERCENT 10 /* max shared MB calculated as percent of system task limit. */ +#define MAX_FROZEN_PROCESS_DEMOTIONS 2 /* max demotions of frozen processes into IDLE band done daily. */ +#define MIN_THAW_DEMOTION_THRESHOLD 5 /* min # of thaws required for a process to be safe from demotion. */ +#define MIN_THAW_REFREEZE_THRESHOLD 3 /* min # of global thaws needed for us to consider refreezing these processes. 
*/ typedef struct throttle_interval_t { uint32_t mins; @@ -454,7 +516,6 @@ typedef struct throttle_interval_t { uint32_t pageouts; uint32_t max_pageouts; mach_timespec_t ts; - boolean_t throttle; } throttle_interval_t; extern boolean_t memorystatus_freeze_enabled; @@ -462,6 +523,11 @@ extern int memorystatus_freeze_wakeup; extern void memorystatus_freeze_init(void) __attribute__((section("__TEXT, initcode"))); extern int memorystatus_freeze_process_sync(proc_t p); + +#if DEVELOPMENT || DEBUG +#define FREEZER_CONTROL_GET_STATUS (1) +#endif /* DEVELOPMENT || DEBUG */ + #endif /* CONFIG_FREEZE */ #if VM_PRESSURE_EVENTS diff --git a/bsd/sys/kern_overrides.h b/bsd/sys/kern_overrides.h index b1b486528..e2212a3c3 100644 --- a/bsd/sys/kern_overrides.h +++ b/bsd/sys/kern_overrides.h @@ -36,13 +36,42 @@ __BEGIN_DECLS +/* + * system_override() system call + * + * The system_override() syscall is used to modify some kernel performance mechanisms. + * The system call needs a special entitlement and should be used with extreme caution. + * A misuse of this syscall could lead to severe performance and battery life issues. + * + * The caller needs to specify the mask for the specific mechanisms to modify and a + * timeout. The implementation of this system call blocks the thread in the syscall + * for the duration specified in the call. Blocking a thread in the system call allows + * the kernel to revert the modification in case the calling process dies. It also + * makes the change of behavior extremely obvious due to the backtrace of the calling + * thread. + * + * Multiple agents are allowed to call this interface at the same time. The behavior + * change is effective from the time the first call is made (for a specific mechanism) + * until the longest timeout specified by any agent. 
If the caller wishes to disable + * the behavior change caused by itself, it can call the same interface with the + * SYS_OVERRIDE_DISABLE flag and the mechanism mask from another thread in the same + * process. Note that this does not break out the original thread from the block + * immediately. It simply undoes the mechanism change underneath. + * + * The currently supported overrides are: + * - SYS_OVERRIDE_IO_THROTTLE: Modifies I/O throttling behavior + * - SYS_OVERRIDE_CPU_THROTTLE: Modifies background stepper throttling mechanism + * - SYS_OVERRIDE_FAST_JETSAM: Modifies jetsam behavior to use aggressive parallel jetsam + * + */ + /* System Overrides Flags */ -#define SYS_OVERRIDE_DISABLE 0x0 +#define SYS_OVERRIDE_DISABLE (~(~0ull >> 1)) #define SYS_OVERRIDE_IO_THROTTLE 0x1 #define SYS_OVERRIDE_CPU_THROTTLE 0x2 +#define SYS_OVERRIDE_FAST_JETSAM 0x4 - -#define SYS_OVERRIDE_FLAGS_MASK (SYS_OVERRIDE_DISABLE | SYS_OVERRIDE_IO_THROTTLE | SYS_OVERRIDE_CPU_THROTTLE) +#define SYS_OVERRIDE_FLAGS_MASK (SYS_OVERRIDE_DISABLE | SYS_OVERRIDE_IO_THROTTLE | SYS_OVERRIDE_CPU_THROTTLE | SYS_OVERRIDE_FAST_JETSAM) #ifdef BSD_KERNEL_PRIVATE void init_system_override(void); diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index d877f0974..9ecef199c 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -1535,6 +1535,10 @@ extern errno_t mbuf_get_traffic_class_index(mbuf_traffic_class_t tc, medium loss tolerant, elastic flow, constant packet interval, variable rate & size. This level corresponds to WMM access class "VI" or MBUF_TC_VI. + @constant MBUF_SC_SIG "Signaling", low delay tolerant, low loss + tolerant, inelastic flow, jitter tolerant, rate is bursty but + short, variable size. e.g. SIP. This level corresponds to WMM + access class "VI" or MBUF_TC_VI. @constant MBUF_SC_VO "Interactive Voice", low delay tolerant, low loss tolerant, inelastic flow, constant packet rate, somewhat fixed size. 
This level corresponds to WMM access class "VO" or @@ -1556,6 +1560,7 @@ typedef enum { MBUF_SC_AV = 0x00280120, MBUF_SC_RV = 0x00300110, MBUF_SC_VI = 0x00380100, + MBUF_SC_SIG = 0x00380130, MBUF_SC_VO = 0x00400180, MBUF_SC_CTL = 0x00480190, /* highest class */ diff --git a/bsd/sys/linker_set.h b/bsd/sys/linker_set.h index 8fd29dbb9..4ebdcec61 100644 --- a/bsd/sys/linker_set.h +++ b/bsd/sys/linker_set.h @@ -146,15 +146,8 @@ struct linker_set_entry { * Iterates over the members of _set within _object. Since the set contains * pointers to its elements, for a set of elements of type etyp, _pvar must * be (etyp **). - * set_member_type **LINKER_SET_OBJECT_ITEM(_object, _set, _i) - * Returns a pointer to the _i'th element of _set within _object. - * - * void **LINKER_SET_BEGIN(_set) - * void **LINKER_SET_LIMINT(_set) * LINKER_SET_FOREACH((set_member_type **)_pvar, _cast, _set) - * set_member_type **LINKER_SET_ITEM(_set, _i) - * These versions implicitly reference the kernel/application object. - * + * * Example of _cast: For the _pvar "struct sysctl_oid **oidpp", _cast would be * "struct sysctl_oid **" * @@ -168,17 +161,11 @@ struct linker_set_entry { _pvar < (_cast) LINKER_SET_OBJECT_LIMIT(_object, _set); \ _pvar++) -#define LINKER_SET_OBJECT_ITEM(_object, _set, _i) \ - ((LINKER_SET_OBJECT_BEGIN(_object, _set))[_i]) +#define LINKER_SET_OBJECT_ITEM(_object, _cast, _set, _i) \ + (((_cast)(LINKER_SET_OBJECT_BEGIN(_object, _set)))[_i]) -#define LINKER_SET_BEGIN(_set) \ - LINKER_SET_OBJECT_BEGIN((kernel_mach_header_t *)&_mh_execute_header, _set) -#define LINKER_SET_LIMIT(_set) \ - LINKER_SET_OBJECT_LIMIT((kernel_mach_header_t *)&_mh_execute_header, _set) #define LINKER_SET_FOREACH(_pvar, _cast, _set) \ LINKER_SET_OBJECT_FOREACH((kernel_mach_header_t *)&_mh_execute_header, _pvar, _cast, _set) -#define LINKER_SET_ITEM(_set, _i) \ - LINKER_SET_OBJECT_ITEM((kernel_mach_header_t *)&_mh_execute_header, _set, _i) /* * Implementation. 
diff --git a/bsd/sys/lockstat.h b/bsd/sys/lockstat.h index a9e536d7a..870789261 100644 --- a/bsd/sys/lockstat.h +++ b/bsd/sys/lockstat.h @@ -182,7 +182,7 @@ extern void (lockstat_probe_wrapper)(int, uintptr_t, int); #define LOCKSTAT_RECORD4(probe, lp, arg0, arg1, arg2, arg3) \ { \ dtrace_id_t id; \ - if ((id = lockstat_probemap[(probe)])) { \ + if (__improbable(id = lockstat_probemap[(probe)])) { \ (*lockstat_probe)(id, (uintptr_t)(lp), (arg0), \ (arg1), (arg2), (arg3)); \ } \ diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index 0dd7117f5..fea78a29c 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -220,8 +220,9 @@ #define M_EVENTHANDLER 125 /* Eventhandler */ #define M_LLTABLE 126 /* Link layer table */ #define M_NWKWQ 127 /* Network work queue */ +#define M_CFIL 128 /* Content Filter */ -#define M_LAST 128 /* Must be last type + 1 */ +#define M_LAST 129 /* Must be last type + 1 */ #else /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index 8dafca387..b89ee1459 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2017 Apple Inc. All rights reserved. + * Copyright (c) 1999-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,14 +79,17 @@ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ -#include #include +#include #include /* u_int32_t */ #include /* u_int64_t */ #include /* u_short */ -#ifdef XNU_KERNEL_PRIVATE +#ifdef KERNEL +#include +#endif +#ifdef XNU_KERNEL_PRIVATE #include #include #include @@ -228,6 +231,8 @@ struct tcp_pktinfo { struct { u_int32_t segsz; /* segment size (actual MSS) */ u_int32_t start_seq; /* start seq of this packet */ + pid_t pid; + pid_t e_pid; } __tx; struct { u_int16_t lro_pktlen; /* max seg size encountered */ @@ -241,6 +246,8 @@ struct tcp_pktinfo { } __msgattr; #define tso_segsz proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.segsz #define tx_start_seq proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.start_seq +#define tx_tcp_pid proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.pid +#define tx_tcp_e_pid proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.e_pid #define lro_pktlen proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_pktlen #define lro_npkts proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_npkts #define lro_elapsed proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_timediff @@ -275,6 +282,20 @@ struct tcp_mtag { }; }; +struct udp_mtag { + pid_t _pid; + pid_t _e_pid; +#define tx_udp_pid proto_mtag.__pr_u.udp._pid +#define tx_udp_e_pid proto_mtag.__pr_u.udp._e_pid +}; + +struct rawip_mtag { + pid_t _pid; + pid_t _e_pid; +#define tx_rawip_pid proto_mtag.__pr_u.rawip._pid +#define tx_rawip_e_pid proto_mtag.__pr_u.rawip._e_pid +}; + struct driver_mtag_ { uintptr_t _drv_tx_compl_arg; uintptr_t _drv_tx_compl_data; @@ -297,6 +318,8 @@ struct driver_mtag_ { struct proto_mtag_ { union { struct tcp_mtag tcp; /* TCP specific */ + struct udp_mtag udp; /* UDP specific */ + struct rawip_mtag rawip; /* raw IPv4/IPv6 specific */ } __pr_u; }; @@ -305,9 +328,10 @@ struct proto_mtag_ { */ struct necp_mtag_ { u_int32_t necp_policy_id; - u_int32_t necp_last_interface_index; + u_int32_t necp_skip_policy_id; u_int32_t necp_route_rule_id; - 
u_int32_t necp_app_id; + u_int16_t necp_last_interface_index; + u_int16_t necp_app_id; }; union builtin_mtag { @@ -346,7 +370,11 @@ struct pkthdr { } _csum_tx; #define csum_tx_start _csum_tx.start #define csum_tx_stuff _csum_tx.stuff - u_int32_t csum_data; /* data field used by csum routines */ + /* + * Generic data field used by csum routines. + * It gets used differently in different contexts. + */ + u_int32_t csum_data; }; u_int16_t vlan_tag; /* VLAN tag, host byte order */ /* @@ -758,36 +786,61 @@ union m16kcluster { #define M_COPY_CLASSIFIER(to, from) m_copy_classifier(to, from) /* - * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place - * an object of the specified size at the end of the mbuf, longword aligned. + * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can + * be both the local data payload, or an external buffer area, depending on + * whether M_EXT is set). */ -#define M_ALIGN(m, len) \ -do { \ - (m)->m_data += (MLEN - (len)) &~ (sizeof (long) - 1); \ -} while (0) +#define M_WRITABLE(m) (((m)->m_flags & M_EXT) == 0 || !MCLHASREFERENCE(m)) /* - * As above, for mbufs allocated with m_gethdr/MGETHDR - * or initialized by M_COPY_PKTHDR. + * These macros are mapped to the appropriate KPIs, so that private code + * can be simply recompiled in order to be forward-compatible with future + * changes toward the structure sizes. + */ +#define MLEN mbuf_get_mlen() /* normal mbuf data len */ +#define MHLEN mbuf_get_mhlen() /* data len in an mbuf w/pkthdr */ +#define MINCLSIZE mbuf_get_minclsize() /* cluster usage threshold */ +/* + * Return the address of the start of the buffer associated with an mbuf, + * handling external storage, packet-header mbufs, and regular data mbufs. */ -#define MH_ALIGN(m, len) \ -do { \ - (m)->m_data += (MHLEN - (len)) &~ (sizeof (long) - 1); \ -} while (0) +#define M_START(m) \ + (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ + ((m)->m_flags & M_PKTHDR) ?
&(m)->m_pktdat[0] : \ + &(m)->m_dat[0]) /* - * Compute the amount of space available - * before the current start of data in an mbuf. - * Subroutine - data not available if certain references. + * Return the size of the buffer associated with an mbuf, handling external + * storage, packet-header mbufs, and regular data mbufs. */ -#define M_LEADINGSPACE(m) m_leadingspace(m) +#define M_SIZE(m) \ + (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ + ((m)->m_flags & M_PKTHDR) ? MHLEN : \ + MLEN) + +#define M_ALIGN(m, len) m_align(m, len) +#define MH_ALIGN(m, len) m_align(m, len) +#define MEXT_ALIGN(m, len) m_align(m, len) + +/* + * Compute the amount of space available before the current start of data in + * an mbuf. + * + * The M_WRITABLE() is a temporary, conservative safety measure: the burden + * of checking writability of the mbuf data area rests solely with the caller. + */ +#define M_LEADINGSPACE(m) \ + (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* - * Compute the amount of space available - * after the end of data in an mbuf. - * Subroutine - data not available if certain references. + * Compute the amount of space available after the end of data in an mbuf. + * + * The M_WRITABLE() is a temporary, conservative safety measure: the burden + * of checking writability of the mbuf data area rests solely with the caller. */ -#define M_TRAILINGSPACE(m) m_trailingspace(m) +#define M_TRAILINGSPACE(m) \ + (M_WRITABLE(m) ? \ + ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) /* * Arrange to prepend space of size plen to mbuf m. @@ -1175,16 +1228,6 @@ struct mbuf; #define M_COPYM_MUST_COPY_HDR 3 /* MUST copy pkthdr from old to new */ #define M_COPYM_MUST_MOVE_HDR 4 /* MUST move pkthdr from old to new */ -/* - * These macros are mapped to the appropriate KPIs, so that private code - * can be simply recompiled in order to be forward-compatible with future - * changes toward the struture sizes. 
- */ -#define MLEN mbuf_get_mlen() /* normal data len */ -#define MHLEN mbuf_get_mhlen() /* data len w/pkthdr */ - -#define MINCLSIZE mbuf_get_minclsize() /* cluster usage threshold */ - extern void m_freem(struct mbuf *); extern u_int64_t mcl_to_paddr(char *); extern void m_adj(struct mbuf *, int); @@ -1247,6 +1290,7 @@ extern void m_mclfree(caddr_t p); * MBUF_SC_AV ] ==> MBUF_TC_VI * MBUF_SC_RV ] * MBUF_SC_VI ] + * MBUF_SC_SIG ] * * MBUF_SC_VO ] ==> MBUF_TC_VO * MBUF_SC_CTL ] @@ -1276,6 +1320,7 @@ extern void m_mclfree(caddr_t p); #define SCIDX_AV MBUF_SCIDX(MBUF_SC_AV) #define SCIDX_RV MBUF_SCIDX(MBUF_SC_RV) #define SCIDX_VI MBUF_SCIDX(MBUF_SC_VI) +#define SCIDX_SIG MBUF_SCIDX(MBUF_SC_SIG) #define SCIDX_VO MBUF_SCIDX(MBUF_SC_VO) #define SCIDX_CTL MBUF_SCIDX(MBUF_SC_CTL) @@ -1287,26 +1332,27 @@ extern void m_mclfree(caddr_t p); #define SCVAL_AV MBUF_SCVAL(MBUF_SC_AV) #define SCVAL_RV MBUF_SCVAL(MBUF_SC_RV) #define SCVAL_VI MBUF_SCVAL(MBUF_SC_VI) +#define SCVAL_SIG MBUF_SCVAL(MBUF_SC_SIG) #define SCVAL_VO MBUF_SCVAL(MBUF_SC_VO) #define SCVAL_CTL MBUF_SCVAL(MBUF_SC_CTL) #define MBUF_VALID_SC(c) \ (c == MBUF_SC_BK_SYS || c == MBUF_SC_BK || c == MBUF_SC_BE || \ c == MBUF_SC_RD || c == MBUF_SC_OAM || c == MBUF_SC_AV || \ - c == MBUF_SC_RV || c == MBUF_SC_VI || c == MBUF_SC_VO || \ - c == MBUF_SC_CTL) + c == MBUF_SC_RV || c == MBUF_SC_VI || c == MBUF_SC_SIG || \ + c == MBUF_SC_VO || c == MBUF_SC_CTL) #define MBUF_VALID_SCIDX(c) \ (c == SCIDX_BK_SYS || c == SCIDX_BK || c == SCIDX_BE || \ c == SCIDX_RD || c == SCIDX_OAM || c == SCIDX_AV || \ - c == SCIDX_RV || c == SCIDX_VI || c == SCIDX_VO || \ - c == SCIDX_CTL) + c == SCIDX_RV || c == SCIDX_VI || c == SCIDX_SIG || \ + c == SCIDX_VO || c == SCIDX_CTL) #define MBUF_VALID_SCVAL(c) \ (c == SCVAL_BK_SYS || c == SCVAL_BK || c == SCVAL_BE || \ c == SCVAL_RD || c == SCVAL_OAM || c == SCVAL_AV || \ - c == SCVAL_RV || c == SCVAL_VI || c == SCVAL_VO || \ - c == SCVAL_CTL) + c == SCVAL_RV || c == SCVAL_VI || c == SCVAL_SIG || \ + 
c == SCVAL_VO || c == SCVAL_CTL) extern unsigned char *mbutl; /* start VA of mbuf pool */ extern unsigned char *embutl; /* end VA of mbuf pool */ @@ -1363,8 +1409,7 @@ __private_extern__ struct mbuf *m_dtom(void *); __private_extern__ int m_mtocl(void *); __private_extern__ union mcluster *m_cltom(int); -__private_extern__ int m_trailingspace(struct mbuf *); -__private_extern__ int m_leadingspace(struct mbuf *); +__private_extern__ void m_align(struct mbuf *, int); __private_extern__ struct mbuf *m_normalize(struct mbuf *m); __private_extern__ void m_mchtype(struct mbuf *m, int t); @@ -1389,7 +1434,7 @@ __private_extern__ uint32_t m_ext_get_prop(struct mbuf *); __private_extern__ int m_ext_paired_is_active(struct mbuf *); __private_extern__ void m_ext_paired_activate(struct mbuf *); -__private_extern__ void m_drain(void); +__private_extern__ void mbuf_drain(boolean_t); /* * Packets may have annotations attached by affixing a list of "packet @@ -1432,6 +1477,7 @@ enum { KERNEL_TAG_TYPE_INET6 = 9, KERNEL_TAG_TYPE_IPSEC = 10, KERNEL_TAG_TYPE_DRVAUX = 11, + KERNEL_TAG_TYPE_CFIL_UDP = 13, }; /* Packet tag routines */ @@ -1451,13 +1497,6 @@ __private_extern__ void m_tag_init(struct mbuf *, int); __private_extern__ struct m_tag *m_tag_first(struct mbuf *); __private_extern__ struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); -__END_DECLS -#endif /* XNU_KERNEL_PRIVATE */ -#ifdef KERNEL -#include -#ifdef XNU_KERNEL_PRIVATE -__BEGIN_DECLS - __private_extern__ void m_scratch_init(struct mbuf *); __private_extern__ u_int32_t m_scratch_get(struct mbuf *, u_int8_t **); @@ -1485,9 +1524,9 @@ __private_extern__ struct ext_ref *m_get_rfa(struct mbuf *); __private_extern__ m_ext_free_func_t m_get_ext_free(struct mbuf *); __private_extern__ caddr_t m_get_ext_arg(struct mbuf *); -extern void m_do_tx_compl_callback(struct mbuf *, struct ifnet *); +__private_extern__ void m_do_tx_compl_callback(struct mbuf *, struct ifnet *); +__private_extern__ mbuf_tx_compl_func
m_get_tx_compl_callback(u_int32_t); __END_DECLS #endif /* XNU_KERNEL_PRIVATE */ -#endif /* KERNEL */ #endif /* !_SYS_MBUF_H_ */ diff --git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 9bf6ed529..b8007aa6d 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -37,7 +37,6 @@ extern "C" { #include #include #include -#include #include #include diff --git a/bsd/sys/monotonic.h b/bsd/sys/monotonic.h index 883b6a0ad..e880a9a0a 100644 --- a/bsd/sys/monotonic.h +++ b/bsd/sys/monotonic.h @@ -12,9 +12,16 @@ __BEGIN_DECLS * XXX These declarations are subject to change at any time. */ +#define MT_IOC(x) _IO('m', (x)) + +#define MT_IOC_RESET MT_IOC(0) + +#define MT_IOC_ADD MT_IOC(1) + struct monotonic_config { uint64_t event; uint64_t allowed_ctr_mask; + uint64_t cpu_mask; }; union monotonic_ctl_add { @@ -27,12 +34,20 @@ union monotonic_ctl_add { } out; }; +/* + * - Consider a separate IOC for disable -- to avoid the copyin to determine + * which way to set it. + */ +#define MT_IOC_ENABLE MT_IOC(2) + union monotonic_ctl_enable { struct { bool enable; } in; }; +#define MT_IOC_COUNTS MT_IOC(3) + union monotonic_ctl_counts { struct { uint64_t ctr_mask; @@ -43,24 +58,15 @@ union monotonic_ctl_counts { } out; }; -#define MT_IOC(x) _IO('m', (x)) +#define MT_IOC_GET_INFO MT_IOC(4) -/* - * FIXME - * - * - Consider a separate IOC for disable -- to avoid the copyin to determine which way to set it. - * - * - Maybe IOC_COUNTS should just return all the enable counters' counts. 
- */ -enum monotonic_ioc { - MT_IOC_RESET = MT_IOC(0), - MT_IOC_ADD = MT_IOC(1), - MT_IOC_ENABLE = MT_IOC(2), - MT_IOC_COUNTS = MT_IOC(3), +union monotonic_ctl_info { + struct { + unsigned int nmonitors; + unsigned int ncounters; + } out; }; -#undef MT_IOC - #if XNU_KERNEL_PRIVATE #include @@ -125,18 +131,22 @@ enum monotonic_ioc { #define MT_KDBG_TMPTH_START(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_START) #define MT_KDBG_TMPTH_END(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_END) -/* maybe provider, bank, group, set, unit, pmu */ - -struct monotonic_dev { +struct mt_device { const char *mtd_name; - int (*mtd_init)(void); - int (*mtd_add)(struct monotonic_config *config, uint32_t *ctr_out); - void (*mtd_reset)(void); - void (*mtd_enable)(bool enable); - int (*mtd_read)(uint64_t ctr_mask, uint64_t *counts_out); + int (* const mtd_init)(struct mt_device *dev); + int (* const mtd_add)(struct monotonic_config *config, uint32_t *ctr_out); + void (* const mtd_reset)(void); + void (* const mtd_enable)(bool enable); + int (* const mtd_read)(uint64_t ctr_mask, uint64_t *counts_out); + decl_lck_mtx_data(, mtd_lock); + + uint8_t mtd_nmonitors; + uint8_t mtd_ncounters; + bool mtd_inuse; }; +typedef struct mt_device *mt_device_t; -extern const struct monotonic_dev monotonic_devs[]; +extern struct mt_device mt_devices[]; extern lck_grp_t *mt_lock_grp; diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index 644faceb1..243c75e02 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -222,6 +222,11 @@ struct mount { */ #define MNT_DEFAULT_IOQUEUE_DEPTH 32 +/* + * mnt_ioscale value for the given ioqueue depth + */ +#define MNT_IOSCALE(ioqueue_depth) ((ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH) + /* mount point to which dead vps point to */ extern struct mount * dead_mountp; @@ -484,7 +489,7 @@ extern int num_trailing_0(uint64_t n); /* sync lock */ extern lck_mtx_t * sync_mtx_lck; -extern int sync_timeout; +extern int 
sync_timeout_seconds; __END_DECLS diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 2f5b90bfb..26a5c707b 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -260,6 +260,7 @@ int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); int lookup_traverse_union(vnode_t dvp, vnode_t *new_dvp, vfs_context_t ctx); void lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create); +void kdebug_lookup(struct vnode *dp, struct componentname *cnp); /* * namecache function prototypes diff --git a/bsd/sys/persona.h b/bsd/sys/persona.h index 4e3d71f46..64d135168 100644 --- a/bsd/sys/persona.h +++ b/bsd/sys/persona.h @@ -177,10 +177,12 @@ int kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen); #include #include #include +#include #ifdef PERSONA_DEBUG +#include #define persona_dbg(fmt, ...) \ - printf("[%4d] %s: " fmt "\n", \ + os_log(OS_LOG_DEFAULT, "[%4d] %s: " fmt "\n", \ current_proc() ? current_proc()->p_pid : -1, \ __func__, ## __VA_ARGS__) #else @@ -193,7 +195,7 @@ int kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen); #ifdef XNU_KERNEL_PRIVATE /* only XNU proper needs to see the persona structure */ struct persona { - int32_t pna_refcount; + os_refcnt_t pna_refcount; int32_t pna_valid; uid_t pna_id; @@ -323,6 +325,9 @@ void personas_bootstrap(void); struct persona *persona_alloc(uid_t id, const char *login, int type, int *error); +int persona_init_begin(struct persona *persona); +void persona_init_end(struct persona *persona, int error); + struct persona *persona_lookup_and_invalidate(uid_t id); static inline int proc_has_persona(proc_t p) diff --git a/bsd/sys/priv.h b/bsd/sys/priv.h index 1933800d2..688da6449 100644 --- a/bsd/sys/priv.h +++ b/bsd/sys/priv.h @@ -133,7 +133,9 @@ #define PRIV_VFS_SNAPSHOT_REVERT 14003 /* Allow reverting filesystem to a previous snapshot */ #define PRIV_APFS_EMBED_DRIVER 14100 /* Allow embedding an EFI driver into the APFS container */ 
-#define PRIV_APFS_FUSION_DEBUG 14101 /* Allow getting internal statistics and controlling the APFS fusion container */ +#define PRIV_APFS_FUSION_DEBUG 14101 /* Allow getting internal statistics and controlling the APFS Fusion container */ +#define PRIV_APFS_FUSION_ALLOW_PIN_FASTPROMOTE 14102 /* Allow changing pinned/fastPromote inode flags in APFS Fusion container */ + #ifdef KERNEL /* * Privilege check interface. No flags are currently defined for the API. diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 652cc74b9..46536dee4 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -219,6 +219,8 @@ struct extern_proc { #define P_DIRTY_MARKED 0x00000080 /* marked dirty previously */ #define P_DIRTY_AGING_IN_PROGRESS 0x00000100 /* aging in one of the 'aging bands' */ #define P_DIRTY_LAUNCH_IN_PROGRESS 0x00000200 /* launch is in progress */ +#define P_DIRTY_DEFER_ALWAYS 0x00000400 /* defer going to idle-exit after every dirty->clean transition. + * For legacy jetsam policy only. This is the default with the other policies.*/ #define P_DIRTY_IS_DIRTY (P_DIRTY | P_DIRTY_SHUTDOWN) #define P_DIRTY_IDLE_EXIT_ENABLED (P_DIRTY_TRACK|P_DIRTY_ALLOW_IDLE_EXIT) @@ -255,6 +257,8 @@ extern int proc_isinferior(int pid1, int pid2); * routine is to be used typically for debugging */ void proc_name(int pid, char * buf, int size); +/* returns the 32-byte name if it exists, otherwise returns the 16-byte name */ +extern char *proc_best_name(proc_t p); /* This routine is simillar to proc_name except it returns for current process */ void proc_selfname(char * buf, int size); @@ -274,15 +278,23 @@ extern int proc_noremotehang(proc_t); extern int proc_forcequota(proc_t); /* returns 1 if the process is chrooted */ extern int proc_chrooted(proc_t); +/* returns TRUE if a sync EXC_RESOURCE should be sent for the process */ +extern boolean_t proc_send_synchronous_EXC_RESOURCE(proc_t p); -/* this routine returns 1 if the process is running with 64bit address space, else 0 */ +/* this routine returns 
1 if the process is running with a 64bit address space, else 0 */ extern int proc_is64bit(proc_t); +/* this routine returns 1 if the process is running with a 64bit register state, else 0 */ +extern int proc_is64bit_data(proc_t); /* is this process exiting? */ extern int proc_exiting(proc_t); +/* returns whether the process has started down proc_exit() */ +extern int proc_in_teardown(proc_t); /* this routine returns error if the process is not one with super user privileges */ int proc_suser(proc_t p); /* returns the cred assicaited with the process; temporary api */ kauth_cred_t proc_ucred(proc_t p); +/* returns 1 if the process is tainted by uid or gid changes, else 0 */ +extern int proc_issetugid(proc_t p); extern int proc_tbe(proc_t); @@ -367,7 +379,7 @@ extern void proc_coalitionids(proc_t, uint64_t [COALITION_NUM_TYPES]); #ifdef CONFIG_32BIT_TELEMETRY extern void proc_log_32bit_telemetry(proc_t p); #endif /* CONFIG_32BIT_TELEMETRY */ - +extern uint64_t get_current_unique_pid(void); #endif /* XNU_KERNEL_PRIVATE*/ #ifdef KERNEL_PRIVATE diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index f28ae3d10..8e247fcf1 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -646,7 +646,10 @@ struct kqueue_dyninfo { uint8_t kqdi_async_qos; uint16_t kqdi_request_state; uint8_t kqdi_events_qos; - uint8_t _kqdi_reserved0[7]; + uint8_t kqdi_pri; + uint8_t kqdi_pol; + uint8_t kqdi_cpupercent; + uint8_t _kqdi_reserved0[4]; uint64_t _kqdi_reserved1[4]; }; @@ -724,7 +727,6 @@ struct proc_fileportinfo { #define PROC_PIDLISTTHREADS 6 #define PROC_PIDLISTTHREADS_SIZE (2* sizeof(uint32_t)) - #define PROC_PIDREGIONINFO 7 #define PROC_PIDREGIONINFO_SIZE (sizeof(struct proc_regioninfo)) @@ -793,8 +795,12 @@ struct proc_fileportinfo { #define PROC_PIDLISTDYNKQUEUES 27 #define PROC_PIDLISTDYNKQUEUES_SIZE (sizeof(kqueue_id_t)) -#endif +#define PROC_PIDLISTTHREADIDS 28 +#define PROC_PIDLISTTHREADIDS_SIZE (2* sizeof(uint32_t)) +#define PROC_PIDVMRTFAULTINFO 29 +#define
PROC_PIDVMRTFAULTINFO_SIZE (7 * sizeof(uint64_t)) +#endif /* PRIVATE */ /* Flavors for proc_pidfdinfo */ #define PROC_PIDFDVNODEINFO 1 @@ -865,6 +871,7 @@ struct proc_fileportinfo { #define PROC_DIRTY_ALLOW_IDLE_EXIT 0x2 #define PROC_DIRTY_DEFER 0x4 #define PROC_DIRTY_LAUNCH_IN_PROGRESS 0x8 +#define PROC_DIRTY_DEFER_ALWAYS 0x10 /* proc_get_dirty() flags */ #define PROC_DIRTY_TRACKED 0x1 @@ -929,7 +936,6 @@ struct proc_fileportinfo { #define PROC_INFO_CALL_CANUSEFGHW 0xc #define PROC_INFO_CALL_PIDDYNKQUEUEINFO 0xd #define PROC_INFO_CALL_UDATA_INFO 0xe - #endif /* PRIVATE */ #ifdef XNU_KERNEL_PRIVATE diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 7119591d2..c2aacbc96 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -194,7 +194,6 @@ struct proc; struct proc { LIST_ENTRY(proc) p_list; /* List of all processes. */ - pid_t p_pid; /* Process identifier. (static)*/ void * task; /* corresponding task (static)*/ struct proc * p_pptr; /* Pointer to parent process.(LL) */ pid_t p_ppid; /* process's parent pid number */ @@ -209,7 +208,7 @@ struct proc { uint64_t p_puniqueid; /* parent's unique ID - set on fork/spawn/vfork, doesn't change if reparented. */ lck_mtx_t p_mlock; /* mutex lock for proc */ - + pid_t p_pid; /* Process identifier. (static)*/ char p_stat; /* S* process status. (PL)*/ char p_shutdownstate; char p_kdebug; /* P_KDEBUG eq (CC)*/ @@ -238,12 +237,12 @@ struct proc { struct plimit *p_limit; /* Process limits.(PL) */ struct sigacts *p_sigacts; /* Signal actions, state (PL) */ - int p_siglist; /* signals captured back from threads */ lck_spin_t p_slock; /* spin lock for itimer/profil protection */ #define p_rlimit p_limit->pl_rlimit struct plimit *p_olimit; /* old process limits - not inherited by child (PL) */ + int p_siglist; /* signals captured back from threads */ unsigned int p_flag; /* P_* flags. 
(atomic bit ops) */ unsigned int p_lflag; /* local flags (PL) */ unsigned int p_listflag; /* list flags (LL) */ @@ -251,10 +250,8 @@ struct proc { int p_refcount; /* number of outstanding users(LL) */ int p_childrencnt; /* children holding ref on parent (LL) */ int p_parentref; /* children lookup ref on parent (LL) */ - pid_t p_oppid; /* Save parent pid during ptrace. XXX */ u_int p_xstat; /* Exit status for wait; also stop signal. */ - uint8_t p_xhighbits; /* Stores the top byte of exit status to avoid truncation*/ #ifdef _PROC_HAS_SCHEDINFO_ /* may need cleanup, not used */ @@ -273,11 +270,9 @@ struct proc { boolean_t sigwait; /* indication to suspend (PL) */ void *sigwait_thread; /* 'thread' holding sigwait(PL) */ void *exit_thread; /* Which thread is exiting(PL) */ + void * p_vforkact; /* activation running this vfork proc)(static) */ int p_vforkcnt; /* number of outstanding vforks(PL) */ - void * p_vforkact; /* activation running this vfork proc)(static) */ int p_fpdrainwait; /* (PFDL) */ - pid_t p_contproc; /* last PID to send us a SIGCONT (PL) */ - /* Following fields are info from SIGCHLD (PL) */ pid_t si_pid; /* (PL) */ u_int si_status; /* (PL) */ @@ -290,9 +285,9 @@ struct proc { user_addr_t p_dtrace_argv; /* (write once, read only after that) */ user_addr_t p_dtrace_envp; /* (write once, read only after that) */ lck_mtx_t p_dtrace_sprlock; /* sun proc lock emulation */ + uint8_t p_dtrace_stop; /* indicates a DTrace-desired stop */ int p_dtrace_probes; /* (PL) are there probes for this proc? 
*/ u_int p_dtrace_count; /* (sprlock) number of DTrace tracepoints */ - uint8_t p_dtrace_stop; /* indicates a DTrace-desired stop */ struct dtrace_ptss_page* p_dtrace_ptss_pages; /* (sprlock) list of user ptss pages */ struct dtrace_ptss_page_entry* p_dtrace_ptss_free_list; /* (atomic) list of individual ptss entries */ struct dtrace_helpers* p_dtrace_helpers; /* (dtrace_lock) DTrace per-proc private */ @@ -321,7 +316,8 @@ struct proc { // types currently in sys/param.h command_t p_comm; proc_name_t p_name; /* can be changed by the process */ - + uint8_t p_xhighbits; /* Stores the top byte of exit status to avoid truncation*/ + pid_t p_contproc; /* last PID to send us a SIGCONT (PL) */ struct pgrp *p_pgrp; /* Pointer to process group. (LL) */ uint32_t p_csflags; /* flags for codesign (PL) */ @@ -346,10 +342,9 @@ struct proc { struct klist p_klist; /* knote list (PL ?)*/ struct rusage_superset *p_ru; /* Exit information. (PL) */ - int p_sigwaitcnt; thread_t p_signalholder; thread_t p_transholder; - + int p_sigwaitcnt; /* DEPRECATE following field */ u_short p_acflag; /* Accounting flags. */ volatile u_short p_vfs_iopolicy; /* VFS iopolicy flags. 
(atomic bit ops) */ @@ -359,7 +354,7 @@ struct proc { int p_pthsize; /* pthread size */ uint32_t p_pth_tsd_offset; /* offset from pthread_t to TSD for new threads */ user_addr_t p_stack_addr_hint; /* stack allocation hint for wq threads */ - void * p_wqptr; /* workq ptr */ + struct workqueue *_Atomic p_wqptr; /* workq ptr */ struct timeval p_start; /* starting time */ void * p_rcall; @@ -400,7 +395,9 @@ struct proc { int32_t p_memstat_memlimit_active; /* memory limit enforced when process is in active jetsam state */ int32_t p_memstat_memlimit_inactive; /* memory limit enforced when process is in inactive jetsam state */ #if CONFIG_FREEZE - uint32_t p_memstat_suspendedfootprint; /* footprint at time of suspensions */ + uint32_t p_memstat_freeze_sharedanon_pages; /* shared pages left behind after freeze */ + uint32_t p_memstat_frozen_count; + uint32_t p_memstat_thaw_count; #endif /* CONFIG_FREEZE */ #endif /* CONFIG_MEMORYSTATUS */ @@ -498,7 +495,9 @@ struct proc { #define P_LXBKIDLEINPROG 0x02 /* p_vfs_iopolicy flags */ -#define P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY 0x0001 +#define P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY 0x0001 +#define P_VFS_IOPOLICY_ATIME_UPDATES 0x0002 +#define P_VFS_IOPOLICY_VALID_MASK (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY) /* process creation arguments */ #define PROC_CREATE_FORK 0 /* independent child (running) */ @@ -514,10 +513,13 @@ struct proc { #ifdef KERNEL #include /* user_timeval, user_itimerval */ -/* This packing breaks symmetry with userspace side (struct extern_proc - * of proc.h) for the ARMV7K ABI where 64-bit types are 64-bit aligned +/* + * This packing is required to ensure symmetry between userspace and kernelspace + * when the kernel is 64-bit and the user application is 32-bit. All currently + * supported ARM slices (arm64/armv7k/arm64_32) contain the same struct + * alignment ABI so this packing isn't needed for ARM. 
*/ -#if !(__arm__ && (__BIGGEST_ALIGNMENT__ > 4)) +#if defined(__x86_64__) #pragma pack(4) #endif struct user32_extern_proc { diff --git a/bsd/sys/pthread_internal.h b/bsd/sys/pthread_internal.h index 1dff9968f..3f4c3f12c 100644 --- a/bsd/sys/pthread_internal.h +++ b/bsd/sys/pthread_internal.h @@ -40,12 +40,9 @@ struct ksyn_waitq_element { #endif }; -void workqueue_mark_exiting(struct proc *); -void workqueue_exit(struct proc *); +void workq_mark_exiting(struct proc *); +void workq_exit(struct proc *); void pthread_init(void); -int thread_qos_from_pthread_priority(unsigned long, unsigned long *); -unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t propagation); -boolean_t workq_thread_has_been_unbound(thread_t th, int qos_class); #endif /* _SYS_PTHREAD_INTERNAL_H_ */ diff --git a/bsd/sys/pthread_shims.h b/bsd/sys/pthread_shims.h index 2256a4a01..03b2333a1 100644 --- a/bsd/sys/pthread_shims.h +++ b/bsd/sys/pthread_shims.h @@ -2,7 +2,7 @@ * Copyright (c) 2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
* Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -33,10 +33,13 @@ #ifndef ASSEMBLER +#include #include #include #include #include +#include +#include #include #include #include @@ -44,6 +47,7 @@ #ifndef PTHREAD_INTERNAL struct uthread; +struct ksyn_waitq_element; #define M_PROC 41 #endif @@ -52,22 +56,6 @@ struct uthread; typedef void (*sched_call_t)(int type, thread_t thread); #endif -typedef struct workq_reqthreads_req_s {unsigned long priority; int count;} *workq_reqthreads_req_t; -typedef struct workq_threadreq_s { void *opaqueptr[2]; uint32_t opaqueint[2];} *workq_threadreq_t; -enum workq_threadreq_type { - WORKQ_THREADREQ_KEVENT = 1, - WORKQ_THREADREQ_WORKLOOP = 2, - WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL = 3, - WORKQ_THREADREQ_REDRIVE = 4, -}; -enum workq_threadreq_op { - WORKQ_THREADREQ_CHANGE_PRI = 1, - WORKQ_THREADREQ_CANCEL = 2, - WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL = 3, -}; -#define WORKQ_THREADREQ_FLAG_NOEMERGENCY 0x1 - - /* * Increment each time new reserved slots are used. 
When the pthread * kext registers this table, it will include the version of the xnu @@ -80,14 +68,14 @@ typedef const struct pthread_functions_s { /* internal calls, kernel core -> kext */ void (*pthread_init)(void); - int (*fill_procworkqueue)(proc_t p, void* pwqinfo); - void (*__unused1)(void); - void (*__unused2)(void); + void *__unused_was_fill_procworkqueue; + void *__unused1; + void *__unused2; + void *__unused_was_workqueue_exit; + void *__unused_was_workqueue_mark_exiting; + void *__unused_was_workqueue_thread_yielded; - void (*workqueue_exit)(struct proc *p); - void (*workqueue_mark_exiting)(struct proc *p); - void (*workqueue_thread_yielded)(void); void (*pth_proc_hashinit)(proc_t p); void (*pth_proc_hashdelete)(proc_t p); @@ -96,8 +84,8 @@ typedef const struct pthread_functions_s { int (*bsdthread_register)(struct proc *p, user_addr_t threadstart, user_addr_t wqthread, int pthsize, user_addr_t dummy_value, user_addr_t targetconc_ptr, uint64_t dispatchqueue_offset, int32_t *retval); int (*bsdthread_terminate)(struct proc *p, user_addr_t stackaddr, size_t size, uint32_t kthport, uint32_t sem, int32_t *retval); int (*thread_selfid)(struct proc *p, uint64_t *retval); - int (*workq_kernreturn)(struct proc *p, int options, user_addr_t item, int affinity, int prio, int32_t *retval); - int (*workq_open)(struct proc *p, int32_t *retval); + void *__unused_was_workq_kernreturn; + void *__unused_was_workq_open; /* psynch syscalls */ int (*psynch_mutexwait)(proc_t p, user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t tid, uint32_t flags, uint32_t *retval); @@ -112,68 +100,44 @@ typedef const struct pthread_functions_s { int (*psynch_rw_wrlock)(proc_t p, user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags, uint32_t *retval); int (*psynch_rw_yieldwrlock)(proc_t p, user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags, uint32_t *retval); - sched_call_t (*workqueue_get_sched_callback)(void); + void 
*__unused_was_workqueue_get_sched_callback; /* New register function with TSD offset */ int (*bsdthread_register2)(struct proc *p, user_addr_t threadstart, user_addr_t wqthread, uint32_t flags, user_addr_t stack_addr_hint, user_addr_t targetconc_ptr, uint32_t dispatchqueue_offset, uint32_t tsd_offset, int32_t *retval); - /* New pthreadctl system. */ - int (*bsdthread_ctl)(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval); + void *__unused_was_bsdthread_ctl; + void *__unused_was_workq_reqthreads; + + void *__unused_was_thread_qos_from_pthread_priority; + void *__unused_was_get_pwq_state_kdp; + void *__unused3; + void *__unused_was_pthread_priority_canonicalize2; + void *__unused_was_workq_thread_has_been_unbound; - /* Request threads to deliver kevents */ - thread_t (*workq_reqthreads)(struct proc *p, int requests_count, workq_reqthreads_req_t requests); + void (*pthread_find_owner)(thread_t thread, struct stackshot_thread_waitinfo *waitinfo); + void *(*pthread_get_thread_kwq)(thread_t thread); - /* Resolve a pthread_priority_t to a QoS/relative pri */ - integer_t (*thread_qos_from_pthread_priority)(unsigned long pthread_priority, unsigned long *flags); + void *__unused_was_workq_threadreq; - /* try to get wq flags in debugger context */ - uint32_t (*get_pwq_state_kdp)(proc_t p); + int (*workq_handle_stack_events)(proc_t p, thread_t th, vm_map_t map, + user_addr_t stackaddr, mach_port_name_t kport, + user_addr_t events, int nevents, int upcall_flags); - void (*__unused3)(void); - unsigned long (*pthread_priority_canonicalize2)(unsigned long pthread_priority, boolean_t propagation); + int (*workq_create_threadstack)(proc_t p, vm_map_t vmap, + mach_vm_offset_t *out_addr); - /* Returns true on success, false on mismatch */ - boolean_t (*workq_thread_has_been_unbound)(thread_t th, int qos_class); + int (*workq_destroy_threadstack)(proc_t p, vm_map_t vmap, + mach_vm_offset_t stackaddr); - void 
(*pthread_find_owner)(thread_t thread, struct stackshot_thread_waitinfo *waitinfo); - void *(*pthread_get_thread_kwq)(thread_t thread); + void (*workq_setup_thread)(proc_t p, thread_t th, vm_map_t map, + user_addr_t stackaddr, mach_port_name_t kport, int th_qos, + int setup_flags, int upcall_flags); - /* - * Submits a threadreq to the workq system. - * - * If type is WORKQ_THREADREQ_KEVENT, the semantics are similar to a call - * to workq_reqthreads and the kevent bind function will be called to - * indicate the thread fulfilling the request. The req argument is ignored. - * - * If type is WORKQ_THREADREQ_WORKLOOP, The req argument should point to - * allocated memory of at least the sizeof(workq_threadreq_t). That memory - * is lent to the workq system until workloop_fulfill_threadreq is called - * and passed the pointer, at which point it may be freed. - * - * The properties of the request are passed in the (pthread) priority and flags arguments. - * - * Will return zero upon success or an error value on failure. An error of - * ENOTSUP means the type argument was not understood. - */ - int (*workq_threadreq)(struct proc *p, workq_threadreq_t req, - enum workq_threadreq_type, unsigned long priority, int flags); - - /* - * Modifies an already submitted thread request. - * - * If operation is WORKQ_THREADREQ_CHANGE_PRI, arg1 is the new priority and arg2 is unused. - * - * If operation is WORKQ_THREADREQ_CANCEL, arg1 and arg2 are unused. - * - * Will return zero upon success or an error value on failure. An error of - * ENOTSUP means the operation argument was not understood. 
- */ - int (*workq_threadreq_modify)(struct proc *t, workq_threadreq_t req, - enum workq_threadreq_op operation, - unsigned long arg1, unsigned long arg2); + void (*workq_markfree_threadstack)(proc_t p, thread_t, vm_map_t map, + user_addr_t stackaddr); /* padding for future */ - void * _pad[87]; + void * _pad[83]; } * pthread_functions_t; typedef const struct pthread_callbacks_s { @@ -193,35 +157,42 @@ typedef const struct pthread_callbacks_s { void (*proc_set_wqthread)(struct proc *t, user_addr_t addr); int (*proc_get_pthsize)(struct proc *t); void (*proc_set_pthsize)(struct proc *t, int size); -#if defined(__arm64__) - unsigned __int128 (*atomic_fetch_add_128_relaxed)(_Atomic unsigned __int128 *ptr, - unsigned __int128 value); - unsigned __int128 (*atomic_load_128_relaxed)(_Atomic unsigned __int128 *ptr); -#else - void *unused_was_proc_get_targconc; - void *unused_was_proc_set_targconc; -#endif - uint64_t (*proc_get_dispatchqueue_offset)(struct proc *t); + + thread_t (*task_findtid)(task_t t, uint64_t tid); + void (*thread_deallocate_safe)(thread_t); + void *__unused_was_proc_get_dispatchqueue_offset; void (*proc_set_dispatchqueue_offset)(struct proc *t, uint64_t offset); - void *unused_was_proc_get_wqlockptr; - void *unused_was_proc_get_wqinitingptr; - void* (*proc_get_wqptr)(struct proc *t); - void (*proc_set_wqptr)(struct proc *t, void* ptr); - void *unused_was_proc_get_wqsize; - void *unused_was_proc_set_wqsize; - void (*proc_lock)(struct proc *t); - void (*proc_unlock)(struct proc *t); - task_t (*proc_get_task)(struct proc *t); + void *__unused_was_proc_get_wqlockptr; + void *__unused_was_proc_get_wqinitingptr; + void *__unused_was_proc_get_wqptr; + + wait_result_t (*psynch_wait_prepare)(uintptr_t kwq, + struct turnstile **tstore, thread_t owner, block_hint_t block_hint, + uint64_t deadline); + + void (*psynch_wait_update_complete)(struct turnstile *turnstile); + + void (*psynch_wait_complete)(uintptr_t kwq, struct turnstile **tstore); + + void 
(*psynch_wait_cleanup)(void); + + kern_return_t (*psynch_wait_wakeup)(uintptr_t kwq, + struct ksyn_waitq_element *kwe, struct turnstile **tstore); + + void (*psynch_wait_update_owner)(uintptr_t kwq, thread_t owner, + struct turnstile **tstore); + void* (*proc_get_pthhash)(struct proc *t); void (*proc_set_pthhash)(struct proc *t, void* ptr); /* bsd/sys/user.h */ - void* (*uthread_get_threadlist)(struct uthread *t); - void (*uthread_set_threadlist)(struct uthread *t, void* threadlist); - sigset_t (*uthread_get_sigmask)(struct uthread *t); - void (*uthread_set_sigmask)(struct uthread *t, sigset_t s); + void *__unused_was_uthread_get_threadlist; + void *__unused_was_uthread_set_threadlist; + void *__unused_was_uthread_get_sigmask; + void *__unused_was_uthread_set_sigmask; + void* (*uthread_get_uukwe)(struct uthread *t); - int (*uthread_get_returnval)(struct uthread *t); + void *__unused_was_uthread_get_returnval; void (*uthread_set_returnval)(struct uthread *t, int val); int (*uthread_is_cancelled)(struct uthread *t); @@ -231,7 +202,7 @@ typedef const struct pthread_callbacks_s { /* osfmk/vm/vm_map.h */ kern_return_t (*vm_map_page_info)(vm_map_t map, vm_map_offset_t offset, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count); - vm_map_t (*vm_map_switch)(vm_map_t map); + void *__unused_was_vm_map_switch; /* wq functions */ kern_return_t (*thread_set_wq_state32)(thread_t thread, thread_state_t state); @@ -243,29 +214,25 @@ typedef const struct pthread_callbacks_s { void (*thread_exception_return)(void); void (*thread_bootstrap_return)(void); - /* kern/clock.h */ - void (*absolutetime_to_microtime)(uint64_t abstime, clock_sec_t *secs, clock_usec_t *microsecs); - - kern_return_t (*thread_set_workq_pri)(thread_t thread, integer_t priority, integer_t policy); - kern_return_t (*thread_set_workq_qos)(thread_t thread, int qos_tier, int relprio); + void *__unused_was_absolutetime_to_microtime; + void *__unused_was_thread_set_workq_pri; + void 
*__unused_was_thread_set_workq_qos; /* osfmk/kern/thread.h */ struct uthread* (*get_bsdthread_info)(thread_t th); - void (*thread_sched_call)(thread_t t, sched_call_t call); - void (*thread_static_param)(thread_t t, boolean_t state); - kern_return_t (*thread_create_workq)(task_t t, thread_continue_t c, thread_t *new_t); + void *__unused_was_thread_sched_call; + void *__unused_was_thread_static_param; + void *__unused_was_thread_create_workq_waiting_parameter; kern_return_t (*thread_policy_set_internal)(thread_t t, thread_policy_flavor_t flavour, thread_policy_t info, mach_msg_type_number_t count); - /* osfmk/kern/affinity.h */ - kern_return_t (*thread_affinity_set)(thread_t thread, uint32_t tag); + void *__unused_was_thread_affinity_set; /* bsd/sys/systm.h */ void (*unix_syscall_return)(int error); - /* osfmk/kern/zalloc.h */ - void* (*zalloc)(zone_t zone); - void (*zfree)(zone_t zone, void* ptr); - zone_t (*zinit)(vm_size_t, vm_size_t maxmem, vm_size_t alloc, const char *name); + void *__unused_was_zalloc; + void *__unused_was_zfree; + void *__unused_was_zinit; /* bsd/kerb/kern_sig.c */ void (*__pthread_testcancel)(int); @@ -284,20 +251,16 @@ typedef const struct pthread_callbacks_s { /* mach/thread_act.h */ kern_return_t (*thread_resume)(thread_act_t target_act); - /* osfmk//machine_routines.h */ - int (*ml_get_max_cpus)(void); - - #if defined(__arm__) - uint32_t (*map_is_1gb)(vm_map_t); - #endif + void *__unused_was_ml_get_max_cpus; +#if defined(__arm__) + void *__unused_was_map_is_1gb; +#endif - /* xnu: struct proc p_dispatchqueue_serialno_offset additions */ - uint64_t (*proc_get_dispatchqueue_serialno_offset)(struct proc *p); - void (*proc_set_dispatchqueue_serialno_offset)(struct proc *p, uint64_t offset); + void *__unused_was_proc_get_dispatchqueue_serialno_offset; + void *__unused_was_proc_set_dispatchqueue_serialno_offset; - int (*proc_usynch_thread_qos_add_override_for_resource_check_owner)(thread_t thread, int override_qos, boolean_t 
first_override_for_resource, - user_addr_t resource, int resource_type, user_addr_t user_lock_addr, mach_port_name_t user_lock_owner); - void *unused_was_proc_set_stack_addr_hint; + void *__unused_was_proc_usynch_thread_qos_add_override_for_resource_check_owner; + void *__unused_was_proc_set_stack_addr_hint; uint32_t (*proc_get_pthread_tsd_offset)(struct proc *p); void (*proc_set_pthread_tsd_offset)(struct proc *p, uint32_t pthread_tsd_offset); @@ -311,56 +274,46 @@ typedef const struct pthread_callbacks_s { kern_return_t (*thread_policy_get)(thread_t t, thread_policy_flavor_t flavor, thread_policy_t info, mach_msg_type_number_t *count, boolean_t *get_default); boolean_t (*qos_main_thread_active)(void); - kern_return_t (*thread_set_voucher_name)(mach_port_name_t voucher_name); + kern_return_t (*thread_set_voucher_name)(mach_port_name_t name); boolean_t (*proc_usynch_thread_qos_add_override_for_resource)(task_t task, struct uthread *, uint64_t tid, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, int resource_type); boolean_t (*proc_usynch_thread_qos_remove_override_for_resource)(task_t task, struct uthread *, uint64_t tid, user_addr_t resource, int resource_type); - boolean_t (*proc_usynch_thread_qos_reset_override_for_resource)(task_t task, struct uthread *, uint64_t tid, user_addr_t resource, int resource_type); + void *__unused_was_proc_usynch_thread_qos_reset_override_for_resource; - boolean_t (*proc_init_wqptr_or_wait)(proc_t proc); + void *__unused_was_proc_init_wqptr_or_wait; uint16_t (*thread_set_tag)(thread_t thread, uint16_t tag); uint16_t (*thread_get_tag)(thread_t thread); - int (*proc_usynch_thread_qos_squash_override_for_resource)(thread_t thread, user_addr_t resource, int resource_type); - int (*task_get_default_manager_qos)(task_t task); - - int (*thread_create_workq_waiting)(task_t task, thread_continue_t thread_return, event_t event, thread_t *new_thread); + void 
*__unused_was_proc_usynch_thread_qos_squash_override_for_resource; + void *__unused_was_task_get_default_manager_qos; + void *__unused_was_thread_create_workq_waiting; user_addr_t (*proc_get_stack_addr_hint)(struct proc *p); void (*proc_set_stack_addr_hint)(struct proc *p, user_addr_t stack_addr_hint); - uint64_t (*proc_get_return_to_kernel_offset)(struct proc *t); + void *__unused_was_proc_get_return_to_kernel_offset; void (*proc_set_return_to_kernel_offset)(struct proc *t, uint64_t offset); - /* indicates call is being made synchronously with workq_threadreq call */ -# define WORKLOOP_FULFILL_THREADREQ_SYNC 0x1 -# define WORKLOOP_FULFILL_THREADREQ_CANCEL 0x2 - int (*workloop_fulfill_threadreq)(struct proc *p, workq_threadreq_t req, thread_t thread, int flags); + void *__unused_was_workloop_fulfill_threadreq; void (*thread_will_park_or_terminate)(thread_t thread); - /* For getting maximum parallelism for a given QoS */ - uint32_t (*qos_max_parallelism)(int qos, uint64_t options); + void *__unused_was_qos_max_parallelism; /* proc_internal.h: struct proc user_stack accessor */ user_addr_t (*proc_get_user_stack)(struct proc *p); - void (*proc_set_user_stack)(struct proc *p, user_addr_t user_stack); + void *__unused_was_proc_set_user_stack; /* padding for future */ void* _pad[69]; - } *pthread_callbacks_t; void pthread_kext_register(pthread_functions_t fns, pthread_callbacks_t *callbacks); #ifdef BSD_KERNEL_PRIVATE -void workqueue_mark_exiting(struct proc *); -void workqueue_exit(struct proc *); -void workqueue_thread_yielded(void); -sched_call_t workqueue_get_sched_callback(void); +void thread_will_park_or_terminate(thread_t thread); void pthread_init(void); - extern pthread_callbacks_t pthread_kern; extern pthread_functions_t pthread_functions; #endif diff --git a/bsd/sys/queue.h b/bsd/sys/queue.h index 294eec935..aa26d7636 100644 --- a/bsd/sys/queue.h +++ b/bsd/sys/queue.h @@ -59,6 +59,15 @@ #ifndef _SYS_QUEUE_H_ #define _SYS_QUEUE_H_ +#ifdef KERNEL_PRIVATE 
+#include /* panic function call */ +#include /* __improbable in kernelspace */ +#else +#ifndef __improbable +#define __improbable(x) (x) /* noop in userspace */ +#endif /* __improbable */ +#endif /* KERNEL_PRIVATE */ + /* * This file defines five types of data structures: singly-linked lists, * singly-linked tail queues, lists, tail queues, and circular queues. @@ -436,30 +445,32 @@ __MISMATCH_TAGS_POP * List functions. */ -#if (defined(_KERNEL) && defined(INVARIANTS)) || defined(QUEUE_MACRO_DEBUG) -#define QMD_LIST_CHECK_HEAD(head, field) do { \ - if (LIST_FIRST((head)) != NULL && \ - LIST_FIRST((head))->field.le_prev != \ - &LIST_FIRST((head))) \ - panic("Bad list head %p first->prev != head", (head)); \ +#ifdef KERNEL_PRIVATE +#define LIST_CHECK_HEAD(head, field) do { \ + if (__improbable( \ + LIST_FIRST((head)) != NULL && \ + LIST_FIRST((head))->field.le_prev != \ + &LIST_FIRST((head)))) \ + panic("Bad list head %p first->prev != head", (head)); \ } while (0) -#define QMD_LIST_CHECK_NEXT(elm, field) do { \ - if (LIST_NEXT((elm), field) != NULL && \ - LIST_NEXT((elm), field)->field.le_prev != \ - &((elm)->field.le_next)) \ - panic("Bad link elm %p next->prev != elm", (elm)); \ +#define LIST_CHECK_NEXT(elm, field) do { \ + if (__improbable( \ + LIST_NEXT((elm), field) != NULL && \ + LIST_NEXT((elm), field)->field.le_prev != \ + &((elm)->field.le_next))) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ } while (0) -#define QMD_LIST_CHECK_PREV(elm, field) do { \ - if (*(elm)->field.le_prev != (elm)) \ +#define LIST_CHECK_PREV(elm, field) do { \ + if (__improbable(*(elm)->field.le_prev != (elm))) \ panic("Bad link elm %p prev->next != elm", (elm)); \ } while (0) #else -#define QMD_LIST_CHECK_HEAD(head, field) -#define QMD_LIST_CHECK_NEXT(elm, field) -#define QMD_LIST_CHECK_PREV(elm, field) -#endif /* (_KERNEL && INVARIANTS) || QUEUE_MACRO_DEBUG */ +#define LIST_CHECK_HEAD(head, field) +#define LIST_CHECK_NEXT(elm, field) +#define LIST_CHECK_PREV(elm, field) 
+#endif /* KERNEL_PRIVATE */ #define LIST_EMPTY(head) ((head)->lh_first == NULL) @@ -480,7 +491,7 @@ __MISMATCH_TAGS_POP } while (0) #define LIST_INSERT_AFTER(listelm, elm, field) do { \ - QMD_LIST_CHECK_NEXT(listelm, field); \ + LIST_CHECK_NEXT(listelm, field); \ if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ LIST_NEXT((listelm), field)->field.le_prev = \ &LIST_NEXT((elm), field); \ @@ -489,7 +500,7 @@ __MISMATCH_TAGS_POP } while (0) #define LIST_INSERT_BEFORE(listelm, elm, field) do { \ - QMD_LIST_CHECK_PREV(listelm, field); \ + LIST_CHECK_PREV(listelm, field); \ (elm)->field.le_prev = (listelm)->field.le_prev; \ LIST_NEXT((elm), field) = (listelm); \ *(listelm)->field.le_prev = (elm); \ @@ -497,7 +508,7 @@ __MISMATCH_TAGS_POP } while (0) #define LIST_INSERT_HEAD(head, elm, field) do { \ - QMD_LIST_CHECK_HEAD((head), field); \ + LIST_CHECK_HEAD((head), field); \ if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ LIST_FIRST((head)) = (elm); \ @@ -507,8 +518,8 @@ __MISMATCH_TAGS_POP #define LIST_NEXT(elm, field) ((elm)->field.le_next) #define LIST_REMOVE(elm, field) do { \ - QMD_LIST_CHECK_NEXT(elm, field); \ - QMD_LIST_CHECK_PREV(elm, field); \ + LIST_CHECK_NEXT(elm, field); \ + LIST_CHECK_PREV(elm, field); \ if (LIST_NEXT((elm), field) != NULL) \ LIST_NEXT((elm), field)->field.le_prev = \ (elm)->field.le_prev; \ @@ -557,6 +568,33 @@ __MISMATCH_TAGS_POP /* * Tail queue functions. 
*/ +#ifdef KERNEL_PRIVATE +#define TAILQ_CHECK_HEAD(head, field) do { \ + if (__improbable( \ + TAILQ_FIRST((head)) != NULL && \ + TAILQ_FIRST((head))->field.tqe_prev != \ + &TAILQ_FIRST((head)))) \ + panic("Bad tailq head %p first->prev != head", (head)); \ +} while (0) + +#define TAILQ_CHECK_NEXT(elm, field) do { \ + if (__improbable( \ + TAILQ_NEXT((elm), field) != NULL && \ + TAILQ_NEXT((elm), field)->field.tqe_prev != \ + &((elm)->field.tqe_next))) \ + panic("Bad tailq elm %p next->prev != elm", (elm)); \ +} while(0) + +#define TAILQ_CHECK_PREV(elm, field) do { \ + if (__improbable(*(elm)->field.tqe_prev != (elm))) \ + panic("Bad tailq elm %p prev->next != elm", (elm)); \ +} while(0) +#else +#define TAILQ_CHECK_HEAD(head, field) +#define TAILQ_CHECK_NEXT(elm, field) +#define TAILQ_CHECK_PREV(elm, field) +#endif /* KERNEL_PRIVATE */ + #define TAILQ_CONCAT(head1, head2, field) do { \ if (!TAILQ_EMPTY(head2)) { \ *(head1)->tqh_last = (head2)->tqh_first; \ @@ -598,7 +636,9 @@ __MISMATCH_TAGS_POP QMD_TRACE_HEAD(head); \ } while (0) + #define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + TAILQ_CHECK_NEXT(listelm, field); \ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ TAILQ_NEXT((elm), field)->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ @@ -613,6 +653,7 @@ __MISMATCH_TAGS_POP } while (0) #define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + TAILQ_CHECK_PREV(listelm, field); \ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ TAILQ_NEXT((elm), field) = (listelm); \ *(listelm)->field.tqe_prev = (elm); \ @@ -622,6 +663,7 @@ __MISMATCH_TAGS_POP } while (0) #define TAILQ_INSERT_HEAD(head, elm, field) do { \ + TAILQ_CHECK_HEAD(head, field); \ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ TAILQ_FIRST((head))->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ @@ -655,6 +697,8 @@ __MISMATCH_TAGS_PUSH \ __MISMATCH_TAGS_POP #define TAILQ_REMOVE(head, elm, field) do { \ + TAILQ_CHECK_NEXT(elm, field); \ + 
TAILQ_CHECK_PREV(elm, field); \ if ((TAILQ_NEXT((elm), field)) != NULL) \ TAILQ_NEXT((elm), field)->field.tqe_prev = \ (elm)->field.tqe_prev; \ @@ -713,6 +757,31 @@ __MISMATCH_TAGS_POP /* * Circular queue functions. */ +#ifdef KERNEL_PRIVATE +#define CIRCLEQ_CHECK_HEAD(head, field) do { \ + if (__improbable( \ + CIRCLEQ_FIRST((head)) != ((void*)(head)) && \ + CIRCLEQ_FIRST((head))->field.cqe_prev != ((void*)(head))))\ + panic("Bad circleq head %p first->prev != head", (head)); \ +} while(0) +#define CIRCLEQ_CHECK_NEXT(head, elm, field) do { \ + if (__improbable( \ + CIRCLEQ_NEXT((elm), field) != ((void*)(head)) && \ + CIRCLEQ_NEXT((elm), field)->field.cqe_prev != (elm))) \ + panic("Bad circleq elm %p next->prev != elm", (elm)); \ +} while(0) +#define CIRCLEQ_CHECK_PREV(head, elm, field) do { \ + if (__improbable( \ + CIRCLEQ_PREV((elm), field) != ((void*)(head)) && \ + CIRCLEQ_PREV((elm), field)->field.cqe_next != (elm))) \ + panic("Bad circleq elm %p prev->next != elm", (elm)); \ +} while(0) +#else +#define CIRCLEQ_CHECK_HEAD(head, field) +#define CIRCLEQ_CHECK_NEXT(head, elm, field) +#define CIRCLEQ_CHECK_PREV(head, elm, field) +#endif /* KERNEL_PRIVATE */ + #define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) #define CIRCLEQ_FIRST(head) ((head)->cqh_first) @@ -728,6 +797,7 @@ __MISMATCH_TAGS_POP } while (0) #define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + CIRCLEQ_CHECK_NEXT(head, listelm, field); \ (elm)->field.cqe_next = (listelm)->field.cqe_next; \ (elm)->field.cqe_prev = (listelm); \ if ((listelm)->field.cqe_next == (void *)(head)) \ @@ -738,6 +808,7 @@ __MISMATCH_TAGS_POP } while (0) #define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + CIRCLEQ_CHECK_PREV(head, listelm, field); \ (elm)->field.cqe_next = (listelm); \ (elm)->field.cqe_prev = (listelm)->field.cqe_prev; \ if ((listelm)->field.cqe_prev == (void *)(head)) \ @@ -748,6 +819,7 @@ __MISMATCH_TAGS_POP } while (0) #define CIRCLEQ_INSERT_HEAD(head, elm, field) do 
{ \ + CIRCLEQ_CHECK_HEAD(head, field); \ (elm)->field.cqe_next = (head)->cqh_first; \ (elm)->field.cqe_prev = (void *)(head); \ if ((head)->cqh_last == (void *)(head)) \ @@ -774,6 +846,8 @@ __MISMATCH_TAGS_POP #define CIRCLEQ_PREV(elm,field) ((elm)->field.cqe_prev) #define CIRCLEQ_REMOVE(head, elm, field) do { \ + CIRCLEQ_CHECK_NEXT(head, elm, field); \ + CIRCLEQ_CHECK_PREV(head, elm, field); \ if ((elm)->field.cqe_next == (void *)(head)) \ (head)->cqh_last = (elm)->field.cqe_prev; \ else \ @@ -801,12 +875,37 @@ struct quehead { }; #ifdef __GNUC__ +#ifdef KERNEL_PRIVATE +static __inline void +chkquenext(void *a) +{ + struct quehead *element = (struct quehead *)a; + if (__improbable(element->qh_link != NULL && + element->qh_link->qh_rlink != element)) { + panic("Bad que elm %p next->prev != elm", a); + } +} + +static __inline void +chkqueprev(void *a) +{ + struct quehead *element = (struct quehead *)a; + if (__improbable(element->qh_rlink != NULL && + element->qh_rlink->qh_link != element)) { + panic("Bad que elm %p prev->next != elm", a); + } +} +#else /* !KERNEL_PRIVATE */ +#define chkquenext(a) +#define chkqueprev(a) +#endif /* KERNEL_PRIVATE */ static __inline void insque(void *a, void *b) { struct quehead *element = (struct quehead *)a, *head = (struct quehead *)b; + chkquenext(head); element->qh_link = head->qh_link; element->qh_rlink = head; @@ -818,6 +917,8 @@ static __inline void remque(void *a) { struct quehead *element = (struct quehead *)a; + chkquenext(element); + chkqueprev(element); element->qh_link->qh_rlink = element->qh_rlink; element->qh_rlink->qh_link = element->qh_link; @@ -831,7 +932,7 @@ void remque(void *a); #endif /* __GNUC__ */ -#endif +#endif /* NOTFB31 */ #endif /* _KERNEL */ #endif /* !_SYS_QUEUE_H_ */ diff --git a/bsd/sys/reason.h b/bsd/sys/reason.h index 81792a1b9..ce2d47670 100644 --- a/bsd/sys/reason.h +++ b/bsd/sys/reason.h @@ -105,11 +105,13 @@ void os_reason_free(os_reason_t cur_reason); #define OS_REASON_WATCHDOG 20 #define 
OS_REASON_METAL 21 #define OS_REASON_WATCHKIT 22 +#define OS_REASON_GUARD 23 +#define OS_REASON_ANALYTICS 24 /* * Update whenever new OS_REASON namespaces are added. */ -#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_WATCHKIT +#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_ANALYTICS #define OS_REASON_BUFFER_MAX_SIZE 5120 @@ -122,11 +124,12 @@ void os_reason_free(os_reason_t cur_reason); #define OS_REASON_FLAG_CONSISTENT_FAILURE 0x40 /* Whatever caused this reason to be created will happen again */ #define OS_REASON_FLAG_ONE_TIME_FAILURE 0x80 /* Whatever caused this reason to be created was a one time issue */ #define OS_REASON_FLAG_NO_CRASHED_TID 0x100 /* Don't include the TID that processed the exit in the crash report */ +#define OS_REASON_FLAG_ABORT 0x200 /* Reason created from abort_* rather than terminate_* */ /* * Set of flags that are allowed to be passed from userspace */ -#define OS_REASON_FLAG_MASK_ALLOWED_FROM_USER (OS_REASON_FLAG_CONSISTENT_FAILURE | OS_REASON_FLAG_ONE_TIME_FAILURE | OS_REASON_FLAG_NO_CRASH_REPORT) +#define OS_REASON_FLAG_MASK_ALLOWED_FROM_USER (OS_REASON_FLAG_CONSISTENT_FAILURE | OS_REASON_FLAG_ONE_TIME_FAILURE | OS_REASON_FLAG_NO_CRASH_REPORT | OS_REASON_FLAG_ABORT) /* * Macros to encode the exit reason namespace and first 32 bits of code in exception code @@ -234,6 +237,13 @@ int terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_c #define EXEC_EXIT_REASON_UPX 12 #define EXEC_EXIT_REASON_NO32EXEC 13 +/* + * guard reasons + */ +#define GUARD_REASON_VNODE 1 +#define GUARD_REASON_VIRT_MEMORY 2 +#define GUARD_REASON_MACH_PORT 3 + __END_DECLS #endif /* _REASON_H_ */ diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index 2f0316c87..55b553a1b 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -126,6 +126,7 @@ typedef __uint64_t rlim_t; #define PRIO_DARWIN_ROLE_NON_UI 0x3 /* Off screen, non-focal UI */ #define PRIO_DARWIN_ROLE_UI_NON_FOCAL 0x4 /* On screen, non-focal UI */ #define PRIO_DARWIN_ROLE_TAL_LAUNCH 0x5 /* Throttled-launch (for OS X TAL resume) */ +#define PRIO_DARWIN_ROLE_DARWIN_BG 0x6 /* Throttled for running in the background */ #endif /* PRIVATE */ @@ -337,8 +338,9 @@ struct rusage_info_v4 { uint64_t ri_cycles; uint64_t ri_billed_energy; uint64_t ri_serviced_energy; - // We're reserving 2 counters for future extension - uint64_t ri_unused[2]; + uint64_t ri_interval_max_phys_footprint; + // 1 reserve counter(s) remaining for future extension + uint64_t ri_unused[1]; }; typedef struct rusage_info_v4 rusage_info_current; @@ -454,6 +456,7 @@ struct rlimit { #define RLIMIT_WAKEUPS_MONITOR 0x1 /* Configure the wakeups monitor. */ #define RLIMIT_CPU_USAGE_MONITOR 0x2 /* Configure the CPU usage monitor. */ #define RLIMIT_THREAD_CPULIMITS 0x3 /* Configure a blocking, per-thread, CPU limits. */ +#define RLIMIT_FOOTPRINT_INTERVAL 0x4 /* Configure memory footprint interval tracking */ /* * Flags for wakeups monitor control. @@ -463,11 +466,17 @@ struct rlimit { #define WAKEMON_GET_PARAMS 0x04 #define WAKEMON_SET_DEFAULTS 0x08 #define WAKEMON_MAKE_FATAL 0x10 /* Configure the task so that violations are fatal. */ + /* * Flags for CPU usage monitor control. */ #define CPUMON_MAKE_FATAL 0x1000 +/* + * Flags for memory footprint interval tracking. 
+ */ +#define FOOTPRINT_INTERVAL_RESET 0x1 /* Reset the footprint interval counter to zero */ + struct proc_rlimit_control_wakeupmon { uint32_t wm_flags; int32_t wm_rate; @@ -488,6 +497,7 @@ struct proc_rlimit_control_wakeupmon { #if PRIVATE #define IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY 1 #endif +#define IOPOL_TYPE_VFS_ATIME_UPDATES 2 /* scope */ #define IOPOL_SCOPE_PROCESS 0 @@ -511,6 +521,9 @@ struct proc_rlimit_control_wakeupmon { #define IOPOL_VFS_HFS_CASE_SENSITIVITY_FORCE_CASE_SENSITIVE 1 #endif +#define IOPOL_ATIME_UPDATES_DEFAULT 0 +#define IOPOL_ATIME_UPDATES_OFF 1 + #ifdef PRIVATE /* * Structures for use in communicating via iopolicysys() between Libc and the diff --git a/bsd/sys/sdt_impl.h b/bsd/sys/sdt_impl.h index a0675b2b1..f48f83e50 100644 --- a/bsd/sys/sdt_impl.h +++ b/bsd/sys/sdt_impl.h @@ -68,6 +68,7 @@ extern int sdt_invop(uintptr_t, uintptr_t *, uintptr_t); extern uint64_t sdt_getarg(void *, dtrace_id_t, void *, int, int); void sdt_provide_module(void *, struct modctl *); +void sdt_early_init(void); void sdt_init(void); extern int sdt_probetab_size; diff --git a/bsd/sys/signal.h b/bsd/sys/signal.h index 817454ab5..e7218b566 100644 --- a/bsd/sys/signal.h +++ b/bsd/sys/signal.h @@ -473,6 +473,9 @@ struct __kern_sigaction { /* This will provide 64bit register set in a 32bit user address space */ #define SA_64REGSET 0x0200 /* signal handler with SA_SIGINFO args with 64bit regs information */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ +#ifdef BSD_KERNEL_PRIVATE +#define SA_VALIDATE_SIGRETURN_FROM_SIGTRAMP 0x0400 /* use token to validate sigreturn was called from matching sigtramp */ +#endif /* BSD_KERNEL_PRIVATE */ /* the following are the only bits we support from user space, the * rest are for kernel use only. 
diff --git a/bsd/sys/signalvar.h b/bsd/sys/signalvar.h index b280c686f..3419e8c23 100644 --- a/bsd/sys/signalvar.h +++ b/bsd/sys/signalvar.h @@ -67,6 +67,9 @@ #include #ifdef BSD_KERNEL_PRIVATE + +#include + /* * Kernel signal definitions and data structures, * not exported to user programs. @@ -86,13 +89,13 @@ struct sigacts { sigset_t ps_signodefer; /* signals not masked while handled */ sigset_t ps_siginfo; /* signals that want SA_SIGINFO args */ sigset_t ps_oldmask; /* saved mask from before sigpause */ + user_addr_t ps_sigreturn_token; /* random token used to validate sigreturn arguments */ + _Atomic uint32_t ps_sigreturn_validation; /* sigreturn argument validation state */ int ps_flags; /* signal flags, below */ struct kern_sigaltstack ps_sigstk; /* sp, length & flags */ int ps_sig; /* for core dump/debugger XXX */ int ps_code; /* for core dump/debugger XXX */ int ps_addr; /* for core dump/debugger XXX */ - sigset_t ps_usertramp; /* SunOS compat; libc sigtramp XXX */ - sigset_t ps_64regset; /* signals that want SA_EXSIGINFO args */ }; /* signal flags */ @@ -108,6 +111,11 @@ struct sigacts { #define KERN_SIG_HOLD CAST_USER_ADDR_T(3) #define KERN_SIG_WAIT CAST_USER_ADDR_T(4) +/* Values for ps_sigreturn_validation */ +#define PS_SIGRETURN_VALIDATION_DEFAULT 0x0u +#define PS_SIGRETURN_VALIDATION_ENABLED 0x1u +#define PS_SIGRETURN_VALIDATION_DISABLED 0x2u + /* * get signal action for process and signal; currently only for current process */ diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 3602f7764..f6bafa632 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -153,6 +153,7 @@ #ifdef PRIVATE #define SO_NOWAKEFROMSLEEP 0x10000 /* Don't wake for traffic to this socket */ #define SO_NOAPNFALLBK 0x20000 /* Don't attempt APN fallback for the socket */ +#define SO_TIMESTAMP_CONTINUOUS 0x40000 /* Continuous monotonic timestamp on rcvd dgram */ #endif #endif /* (!__APPLE__) */ @@ -276,7 +277,7 @@ (c == SO_TC_BK_SYS || c == SO_TC_BK || c == SO_TC_BE || \ c == SO_TC_RD || c == SO_TC_OAM || c == SO_TC_AV || \ c == SO_TC_RV || c == SO_TC_VI || c == SO_TC_VO || \ - c == SO_TC_CTL) + c == SO_TC_CTL || c == SO_TC_NETSVC_SIG) #define SO_TC_UNSPEC ((int)-1) /* Traffic class not specified */ @@ -760,7 +761,12 @@ struct sockaddr_storage { #define NET_RT_DUMPX 8 /* private */ #define NET_RT_DUMPX_FLAGS 9 /* private */ #endif /* PRIVATE */ -#define NET_RT_MAXID 10 +/* + * Allows read access non-local host's MAC address + * if the process has neighbor cache entitlement. + */ +#define NET_RT_FLAGS_PRIV 10 +#define NET_RT_MAXID 11 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #ifdef KERNEL_PRIVATE @@ -1084,6 +1090,7 @@ struct cmsgcred { #ifdef PRIVATE #define SCM_SEQNUM 0x05 /* TCP unordered recv seq no */ #define SCM_MSG_PRIORITY 0x06 /* TCP unordered snd priority */ +#define SCM_TIMESTAMP_CONTINUOUS 0x07 /* timestamp (uint64_t) */ #endif /* PRIVATE */ #ifdef KERNEL_PRIVATE diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index b0be72420..caf612051 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -81,6 +81,9 @@ #include #include #include +#ifdef BSD_KERNEL_PRIVATE +#include +#endif /* BSD_KERNEL_PRIVATE */ #endif /* KERNEL_PRIVATE */ typedef u_quad_t so_gen_t; @@ -310,7 +313,11 @@ struct socket { struct msg_state *so_msg_state; /* unordered snd/rcv state */ struct flow_divert_pcb *so_fd_pcb; /* Flow Divert control block */ - struct cfil_info *so_cfil; +#if CONTENT_FILTER + struct cfil_info *so_cfil; + struct cfil_db *so_cfil_db; + u_int32_t so_state_change_cnt; /* incr 
for each connect, disconnect */ +#endif u_int32_t so_eventmask; /* event mask */ @@ -748,6 +755,7 @@ __BEGIN_DECLS /* Exported */ extern int sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct mbuf *control, int *error_out); +extern int sbappendchain(struct sockbuf *sb, struct mbuf *m, int space); extern int sbappendrecord(struct sockbuf *sb, struct mbuf *m0); extern void sbflush(struct sockbuf *sb); extern int sbspace(struct sockbuf *sb); @@ -776,11 +784,17 @@ extern void soreserve_preconnect(struct socket *so, unsigned int pre_cc); extern void sorwakeup(struct socket *so); extern int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags); +extern int sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, + struct mbuf *control, uint32_t sendflags); extern int sosend_list(struct socket *so, struct uio **uio, u_int uiocnt, int flags); extern int soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int msgcnt, int *flags); extern void sonullevent(struct socket *so, void *arg, uint32_t hint); +extern struct mbuf *sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, + struct mbuf *control); + + __END_DECLS #ifdef BSD_KERNEL_PRIVATE diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index be5cfba2e..0ef6be269 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -278,7 +278,6 @@ #define SIOCSECNMODE _IOW('i', 177, struct ifreq) #define SIOCSIFORDER _IOWR('i', 178, struct if_order) -#define SIOCGIFORDER _IOWR('i', 179, struct if_order) #define SIOCSQOSMARKINGMODE _IOWR('i', 180, struct ifreq) #define SIOCSFASTLANECAPABLE SIOCSQOSMARKINGMODE @@ -316,4 +315,13 @@ #endif /* BSD_KERNEL_PRIVATE */ #endif /* PRIVATE */ +#ifdef PRIVATE +#define SIOCGIFLOWPOWER _IOWR('i', 199, struct ifreq) /* Low Power Mode */ +#define SIOCSIFLOWPOWER _IOWR('i', 200, struct ifreq) /* Low Power Mode */ + +#if INET6 +#define SIOCGIFCLAT46ADDR _IOWR('i', 201, struct if_clat46req) +#endif /* INET6 */ +#endif /* PRIVATE */ + #endif /* !_SYS_SOCKIO_H_ */ diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index 29ea49d3b..069897d1b 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -182,6 +182,9 @@ struct _posix_spawn_persona_info { * can be set, as well as any metadata whose validity is signalled by the * presence of a bit in the flags field. All fields are initialized to the * appropriate default values by posix_spawnattr_init(). + * + * Fields must be added at the end of this, but before extensions array + * pointers. */ typedef struct _posix_spawnattr { @@ -205,6 +208,9 @@ typedef struct _posix_spawnattr { uint64_t psa_qos_clamp; /* QoS Clamp to set on the new process */ uint64_t psa_darwin_role; /* PRIO_DARWIN_ROLE to set on the new process */ + int psa_thread_limit; /* thread limit */ + + uint64_t psa_max_addr; /* Max valid VM address */ /* * NOTE: Extensions array pointers must stay at the end so that diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index 183fdd207..1169924d5 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -368,14 +368,16 @@ struct user32_stat64 { __uint32_t st_gen; /* file generation number */ __uint32_t st_lspare; /* RESERVED: DO NOT USE! */ __int64_t st_qspare[2]; /* RESERVED: DO NOT USE! 
*/ -#if __arm__ && (__BIGGEST_ALIGNMENT__ > 4) -/* For the newer ARMv7k ABI where 64-bit types are 64-bit aligned, but pointers - * are 32-bit: - * Applying attributes here causes a mismatch with the user-space struct stat64 +#if defined(__x86_64__) +/* + * This packing is required to ensure symmetry between userspace and kernelspace + * when the kernel is 64-bit and the user application is 32-bit. All currently + * supported ARM slices (arm64/armv7k/arm64_32) contain the same struct + * alignment ABI so this packing isn't needed for ARM. */ -}; -#else } __attribute__((packed,aligned(4))); +#else +}; #endif extern void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp); diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index ff17198ca..0d4414a58 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -1139,6 +1139,7 @@ extern char machine[]; extern char osrelease[]; extern char ostype[]; extern char osversion[]; +extern char osbuild_config[]; struct linker_set; diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index 98dc4dd52..bec0bc45e 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -263,9 +263,7 @@ void *exec_spawnattr_getmacpolicyinfo(const void *macextensions, const char *pol #ifdef BSD_KERNEL_PRIVATE -#define THROTTLE_IO_ENABLE 1 -#define THROTTLE_IO_DISABLE 0 -void sys_override_io_throttle(int flag); +void sys_override_io_throttle(boolean_t enable_override); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index 0699b5b03..4c209a6e7 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -97,6 +98,8 @@ const char *cs_identity_get(proc_t); #endif /* cluster IO routines */ +void cluster_update_state(vnode_t, vm_object_offset_t, vm_object_offset_t, boolean_t); + int advisory_read(vnode_t, off_t, off_t, int); int advisory_read_ext(vnode_t, off_t, off_t, int, int (*)(buf_t, void *), void *, int); diff --git a/bsd/sys/ubc_internal.h 
b/bsd/sys/ubc_internal.h index 3724348ad..be82f0f66 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -120,9 +120,11 @@ struct cs_blob { void * csb_entitlements; /* The entitlements as an OSDictionary */ unsigned int csb_signer_type; + unsigned int csb_reconstituted; /* signature has potentially been modified after validation */ /* The following two will be replaced by the csb_signer_type. */ unsigned int csb_platform_binary:1; unsigned int csb_platform_path:1; + }; /* @@ -186,7 +188,6 @@ __private_extern__ uint32_t cluster_throttle_io_limit(vnode_t, uint32_t *); #define UBC_FOR_PAGEOUT 0x0002 memory_object_control_t ubc_getobject(vnode_t, int); -boolean_t ubc_strict_uncached_IO(vnode_t); int ubc_info_init(vnode_t); int ubc_info_init_withsize(vnode_t, off_t); diff --git a/bsd/sys/ulock.h b/bsd/sys/ulock.h index de799d8f1..5a1b5f62e 100644 --- a/bsd/sys/ulock.h +++ b/bsd/sys/ulock.h @@ -84,12 +84,17 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); /* * operation bits [23, 16] contain the flags for __ulock_wait - */ -/* The waiter is contending on this lock for synchronization around global data. + * + * @const ULF_WAIT_WORKQ_DATA_CONTENTION + * The waiter is contending on this lock for synchronization around global data. * This causes the workqueue subsystem to not create new threads to offset for * waiters on this lock. 
+ * + * @const ULF_WAIT_CANCEL_POINT + * This wait is a cancelation point */ -#define ULF_WAIT_WORKQ_DATA_CONTENTION 0x00010000 +#define ULF_WAIT_WORKQ_DATA_CONTENTION 0x00010000 +#define ULF_WAIT_CANCEL_POINT 0x00020000 /* * operation bits [31, 24] contain the generic flags @@ -104,7 +109,8 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value); #define ULF_GENERIC_MASK 0xFFFF0000 #define ULF_WAIT_MASK (ULF_NO_ERRNO | \ - ULF_WAIT_WORKQ_DATA_CONTENTION) + ULF_WAIT_WORKQ_DATA_CONTENTION | \ + ULF_WAIT_CANCEL_POINT) #define ULF_WAKE_MASK (ULF_WAKE_ALL | \ ULF_WAKE_THREAD | \ diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 92b235bb9..552bacac6 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995, 1997 Apple Computer, Inc. 
All Rights Reserved */ @@ -112,18 +112,19 @@ struct label; /* MAC label dummy struct */ /* * Per-thread U area. */ - + struct uthread { /* syscall parameters, results and catches */ u_int64_t uu_arg[8]; /* arguments to current system call */ - int uu_rval[2]; + int uu_rval[2]; + char uu_cursig; /* p_cursig for exc. */ unsigned int syscall_code; /* current syscall code */ /* thread exception handling */ + int uu_exception; mach_exception_code_t uu_code; /* ``code'' to trap */ mach_exception_subcode_t uu_subcode; - int uu_exception; - char uu_cursig; /* p_cursig for exc. */ + /* support for syscalls which use continuations */ union { struct _select_data { @@ -132,102 +133,165 @@ struct uthread { int count; struct select_nocancel_args *args; /* original syscall arguments */ int32_t *retval; /* place to store return val */ - } ss_select_data; + } uus_select_data; + struct _kqueue_scan { kevent_callback_t call; /* per-event callback */ kqueue_continue_t cont; /* whole call continuation */ filt_process_data_t process_data; /* needed for filter processing */ uint64_t deadline; /* computed deadline for operation */ void *data; /* caller's private data */ - } ss_kqueue_scan; /* saved state for kevent_scan() */ + } uus_kqueue_scan; /* saved state for kevent_scan() */ + struct _kevent { struct _kqueue_scan scan; /* space for the generic data */ struct fileproc *fp; /* fileproc we hold iocount on */ int fd; /* fd for fileproc (if held) */ - int eventcount; /* user-level event count */ + int eventcount; /* user-level event count */ int eventout; /* number of events output */ struct filt_process_s process_data; /* space for process data fed thru */ int32_t *retval; /* place to store return val */ user_addr_t eventlist; /* user-level event list address */ uint64_t data_available; /* [user/kernel] addr of in/out size */ - } ss_kevent; /* saved state for kevent() */ + } uus_kevent; /* saved state for kevent() */ + + struct _kevent_register { + struct kevent_internal_s kev; /* the 
kevent to maybe copy out */ + struct knote *knote; /* the knote used for the wait */ + struct fileproc *fp; /* fileproc we hold iocount on */ + thread_t handoff_thread; /* thread we handed off to, has +1 */ + struct kqueue *kq; + int fd; /* fd for fileproc (if held) */ + int eventcount; /* user-level event count */ + int eventout; /* number of events output */ + unsigned int flags; /* flags for kevent_copyout() */ + int32_t *retval; /* place to store return val */ + user_addr_t ueventlist; /* the user-address to copyout to */ + } uus_kevent_register; /* saved for EVFILT_WORKLOOP wait */ struct _kauth { - user_addr_t message; /* message in progress */ - } uu_kauth; + user_addr_t message; /* message in progress */ + } uus_kauth; - struct ksyn_waitq_element uu_kwe; /* user for pthread synch */ + struct ksyn_waitq_element uus_kwe; /* user for pthread synch */ struct _waitid_data { - struct waitid_nocancel_args *args; /* original syscall arguments */ - int32_t *retval; /* place to store return val */ - } uu_waitid_data; + struct waitid_nocancel_args *args; /* original syscall arguments */ + int32_t *retval; /* place to store return val */ + } uus_waitid_data; struct _wait4_data { - struct wait4_nocancel_args *args; /* original syscall arguments */ - int32_t *retval; /* place to store return val */ - } uu_wait4_data; - } uu_kevent; + struct wait4_nocancel_args *args; /* original syscall arguments */ + int32_t *retval; /* place to store return val */ + } uus_wait4_data; + + struct _workq_park_data { + uint64_t idle_stamp; + uint64_t workloop_params; + uint32_t fulfilled_snapshot; + uint32_t yields; + void *thread_request; /* request being fulfilled, for tracing only */ + uint32_t upcall_flags; + bool has_stack; + thread_qos_t qos; + } uus_workq_park_data; /* saved for parked workq threads */ + + struct _ulock_wait_data { + thread_t owner_thread; + thread_t old_owner; + int32_t *retval; + uint flags; + } uus_ulock_wait_data; + } uu_save; /* Persistent memory allocations 
across system calls */ struct _select { - u_int32_t *ibits, *obits; /* bits to select on */ - uint nbytes; /* number of bytes in ibits and obits */ + u_int32_t *ibits, *obits; /* bits to select on */ + uint nbytes; /* number of bytes in ibits and obits */ } uu_select; /* saved state for select() */ - /* internal support for continuation framework */ - int (*uu_continuation)(int); - int uu_pri; - int uu_timo; + /* internal support for continuation framework */ + int (*uu_continuation)(int); + int uu_pri; + int uu_timo; caddr_t uu_wchan; /* sleeping thread wait channel */ const char *uu_wmesg; /* ... wait message */ - struct proc * uu_proc; + struct proc *uu_proc; thread_t uu_thread; void * uu_userstate; struct waitq_set *uu_wqset; /* waitq state cached across select calls */ size_t uu_wqstate_sz; /* ...size of uu_wqset buffer */ int uu_flag; sigset_t uu_siglist; /* signals pending for the thread */ - sigset_t uu_sigwait; /* sigwait on this thread*/ - sigset_t uu_sigmask; /* signal mask for the thread */ - sigset_t uu_oldmask; /* signal mask saved before sigpause */ - sigset_t uu_vforkmask; /* saved signal mask during vfork */ + sigset_t uu_sigwait; /* sigwait on this thread*/ + sigset_t uu_sigmask; /* signal mask for the thread */ + sigset_t uu_oldmask; /* signal mask saved before sigpause */ + sigset_t uu_vforkmask; /* saved signal mask during vfork */ struct vfs_context uu_context; /* thread + cred */ TAILQ_ENTRY(uthread) uu_list; /* List of uthreads in proc */ - struct kaudit_record *uu_ar; /* audit record */ + struct kaudit_record *uu_ar; /* audit record */ struct task* uu_aio_task; /* target task for async io */ - + lck_mtx_t *uu_mtx; lck_spin_t uu_rethrottle_lock; /* locks was_rethrottled and is_throttled */ TAILQ_ENTRY(uthread) uu_throttlelist; /* List of uthreads currently throttled */ - void * uu_throttle_info; /* pointer to throttled I/Os info */ + void * uu_throttle_info; /* pointer to throttled I/Os info */ int uu_on_throttlelist; int uu_lowpri_window; - 
boolean_t uu_was_rethrottled; - boolean_t uu_is_throttled; - boolean_t uu_throttle_bc; + /* These boolean fields are protected by different locks */ + bool uu_was_rethrottled; + bool uu_is_throttled; + bool uu_throttle_bc; u_int32_t uu_network_marks; /* network control flow marks */ struct kern_sigaltstack uu_sigstk; - vnode_t uu_vreclaims; + vnode_t uu_vreclaims; vnode_t uu_cdir; /* per thread CWD */ int uu_dupfd; /* fd in fdesc_open/dupfdopen */ - int uu_defer_reclaims; - - struct kqueue *uu_kqueue_bound; /* kqueue we are bound to service */ - unsigned int uu_kqueue_qos_index; /* qos index we are bound to service */ - unsigned int uu_kqueue_flags; /* the flags we are using */ - boolean_t uu_kqueue_override_is_sync; /* sync qos override applied to servicer */ + int uu_defer_reclaims; + + /* + * Bound kqueue request. This field is only cleared by the current thread, + * hence can be dereferenced safely by the current thread without locks. + */ + struct kqrequest *uu_kqr_bound; + TAILQ_ENTRY(uthread) uu_workq_entry; + mach_vm_offset_t uu_workq_stackaddr; + mach_port_name_t uu_workq_thport; + struct uu_workq_policy { + uint16_t qos_req : 4; /* requested QoS */ + uint16_t qos_max : 4; /* current acked max qos */ + uint16_t qos_override : 4; /* received async override */ + uint16_t qos_bucket : 4; /* current acked bucket */ + } uu_workq_pri; + uint8_t uu_workq_flags; + kq_index_t uu_kqueue_override; #ifdef JOE_DEBUG - int uu_iocount; - int uu_vpindex; - void * uu_vps[32]; - void * uu_pcs[32][10]; + int uu_iocount; + int uu_vpindex; + void *uu_vps[32]; + void *uu_pcs[32][10]; +#endif +#if CONFIG_WORKLOOP_DEBUG +#define UU_KEVENT_HISTORY_COUNT 32 +#define UU_KEVENT_HISTORY_WRITE_ENTRY(uth, ...) 
({ \ + struct uthread *__uth = (uth); \ + unsigned int __index = __uth->uu_kevent_index++; \ + __uth->uu_kevent_history[__index % UU_KEVENT_HISTORY_COUNT] = \ + (struct uu_kevent_history)__VA_ARGS__; \ + }) + struct uu_kevent_history { + uint64_t uu_kqid; + struct kqueue *uu_kq; + int uu_error, uu_nchanges, uu_nevents; + unsigned int uu_flags; + } uu_kevent_history[UU_KEVENT_HISTORY_COUNT]; + unsigned int uu_kevent_index; #endif int uu_proc_refcount; #if PROC_REF_DEBUG @@ -241,22 +305,22 @@ struct uthread { #if CONFIG_DTRACE uint32_t t_dtrace_errno; /* Most recent errno */ siginfo_t t_dtrace_siginfo; - uint64_t t_dtrace_resumepid; /* DTrace's pidresume() pid */ - uint8_t t_dtrace_stop; /* indicates a DTrace desired stop */ - uint8_t t_dtrace_sig; /* signal sent via DTrace's raise() */ - - union __tdu { - struct __tds { - uint8_t _t_dtrace_on; /* hit a fasttrap tracepoint */ - uint8_t _t_dtrace_step; /* about to return to kernel */ - uint8_t _t_dtrace_ret; /* handling a return probe */ - uint8_t _t_dtrace_ast; /* saved ast flag */ + uint64_t t_dtrace_resumepid; /* DTrace's pidresume() pid */ + uint8_t t_dtrace_stop; /* indicates a DTrace desired stop */ + uint8_t t_dtrace_sig; /* signal sent via DTrace's raise() */ + + union __tdu { + struct __tds { + uint8_t _t_dtrace_on; /* hit a fasttrap tracepoint */ + uint8_t _t_dtrace_step; /* about to return to kernel */ + uint8_t _t_dtrace_ret; /* handling a return probe */ + uint8_t _t_dtrace_ast; /* saved ast flag */ #if __sol64 || defined(__APPLE__) - uint8_t _t_dtrace_reg; /* modified register */ + uint8_t _t_dtrace_reg; /* modified register */ #endif - } _tds; - u_int32_t _t_dtrace_ft; /* bitwise or of these flags */ - } _tdu; + } _tds; + u_int32_t _t_dtrace_ft; /* bitwise or of these flags */ + } _tdu; #define t_dtrace_ft _tdu._t_dtrace_ft #define t_dtrace_on _tdu._tds._t_dtrace_on #define t_dtrace_step _tdu._tds._t_dtrace_step @@ -266,20 +330,19 @@ struct uthread { #define t_dtrace_reg _tdu._tds._t_dtrace_reg #endif - 
user_addr_t t_dtrace_pc; /* DTrace saved pc from fasttrap */ - user_addr_t t_dtrace_npc; /* DTrace next pc from fasttrap */ - user_addr_t t_dtrace_scrpc; /* DTrace per-thread scratch location */ - user_addr_t t_dtrace_astpc; /* DTrace return sequence location */ + user_addr_t t_dtrace_pc; /* DTrace saved pc from fasttrap */ + user_addr_t t_dtrace_npc; /* DTrace next pc from fasttrap */ + user_addr_t t_dtrace_scrpc; /* DTrace per-thread scratch location */ + user_addr_t t_dtrace_astpc; /* DTrace return sequence location */ struct dtrace_ptss_page_entry* t_dtrace_scratch; /* scratch space entry */ #if __sol64 || defined(__APPLE__) - uint64_t t_dtrace_regv; /* DTrace saved reg from fasttrap */ + uint64_t t_dtrace_regv; /* DTrace saved reg from fasttrap */ #endif - void * t_dtrace_syscall_args; + void *t_dtrace_syscall_args; #endif /* CONFIG_DTRACE */ - void * uu_threadlist; - char * pth_name; + char *pth_name; /* Document Tracking struct used to track a "tombstone" for a document */ struct doc_tombstone *t_tombstone; @@ -300,10 +363,10 @@ typedef struct uthread * uthread_t; #define UT_THROTTLE_IO 0x00000080 /* this thread issues throttle I/O */ #define UT_PASSIVE_IO 0x00000100 /* this thread issues passive I/O */ #define UT_PROCEXIT 0x00000200 /* this thread completed the proc exit */ -#define UT_RAGE_VNODES 0x00000400 /* rapid age any vnodes created by this thread */ -/* 0x00000800 unused, used to be UT_BACKGROUND */ +#define UT_RAGE_VNODES 0x00000400 /* rapid age any vnodes created by this thread */ +#define UT_KERN_RAGE_VNODES 0x00000800 /* rapid age any vnodes created by this thread (kernel set) */ /* 0x00001000 unused, used to be UT_BACKGROUND_TRAFFIC_MGT */ - +#define UT_ATIME_UPDATE 0x00002000 /* don't update atime for files accessed by this thread */ #define UT_VFORK 0x02000000 /* thread has vfork children */ #define UT_SETUID 0x04000000 /* thread is settugid() */ #define UT_WASSETUID 0x08000000 /* thread was settugid() (in vfork) */ @@ -321,9 +384,9 @@ 
typedef struct uthread * uthread_t; * This structure may or may not be at the same kernel address * in all processes. */ - + struct user { - /* NOT USED ANYMORE */ + /* NOT USED ANYMORE */ }; #endif /* !_SYS_USER_H_ */ diff --git a/bsd/sys/ux_exception.h b/bsd/sys/ux_exception.h index 836c658ca..99352e29a 100644 --- a/bsd/sys/ux_exception.h +++ b/bsd/sys/ux_exception.h @@ -53,26 +53,22 @@ #endif /* __APPLE_API_UNSTABLE */ -#ifdef KERNEL -#ifdef __APPLE_API_PRIVATE -/* - * Kernel data structures for Unix exception handler. - */ +#ifdef XNU_KERNEL_PRIVATE -#include +/* Kernel functions for Unix exception handler. */ -#if defined(__x86_64__) || defined(__arm64__) -extern mach_port_t ux_exception_port; -#else -extern mach_port_name_t ux_exception_port; -#endif /* __x86_64__ */ +#include -boolean_t machine_exception(int exception, mach_exception_code_t code, - mach_exception_subcode_t subcode, - int *unix_signal, mach_exception_code_t *unix_code); -void ux_handler_init(void); +extern int +machine_exception(int exception, mach_exception_code_t code, + mach_exception_subcode_t subcode); -#endif /* __APPLE_API_PRIVATE */ -#endif /* KERNEL */ +extern kern_return_t +handle_ux_exception(thread_t thread, int exception, + mach_exception_code_t code, + mach_exception_subcode_t subcode); + +#endif /* XNU_KERNEL_PRIVATE */ #endif /* _SYS_UX_EXCEPTION_H_ */ + diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 74e0704e8..b7aa1efe8 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -1102,6 +1102,17 @@ int vnode_isswap(vnode_t vp); */ int vnode_isnamedstream(vnode_t vp); +#ifdef KERNEL_PRIVATE +/*! + @function vnode_setasnamedstream + @abstract Set svp as a named stream of vp and take appropriate references. + @param vp The vnode whose namedstream has to be set. + @param svp The namedstream vnode. + @return 0 if the operation is successful, an error otherwise. + */ +errno_t vnode_setasnamedstream(vnode_t vp, vnode_t svp); +#endif + /*! 
@function vnode_ismountedon @abstract Determine if a vnode is a block device on which a filesystem has been mounted. @@ -1653,6 +1664,7 @@ int vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pa #endif /* KERNEL_PRIVATE */ #define VNODE_UPDATE_PARENT 0x01 +#define VNODE_UPDATE_NAMEDSTREAM_PARENT VNODE_UPDATE_PARENT #define VNODE_UPDATE_NAME 0x02 #define VNODE_UPDATE_CACHE 0x04 #define VNODE_UPDATE_PURGE 0x08 @@ -2171,6 +2183,16 @@ const char *vnode_getname_printable(vnode_t vp); void vnode_putname_printable(const char *name); #endif // KERNEL_PRIVATE +/*! + @function vnode_getbackingvnode + @abstract If the input vnode is a NULLFS mirrored vnode, then return the vnode it wraps. + @Used to un-mirror files, primarily for security purposes. On success, out_vp is always set to a vp with an iocount. The caller must release the iocount. + @param in_vp The vnode being asked about + @param out_vpp A pointer to the output vnode, unchanged on error + @return 0 on Success, ENOENT if in_vp doesn't mirror anything, EINVAL on parameter errors. 
+ */ +int vnode_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp); + /* * Helper functions for implementing VNOP_GETATTRLISTBULK for a filesystem */ diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index d06102237..b34dbc110 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -253,9 +253,7 @@ struct vnode { #define VLOCKLOCAL 0x080000 /* this vnode does adv locking in vfs */ #define VISHARDLINK 0x100000 /* hard link needs special processing on lookup and in volfs */ #define VISUNION 0x200000 /* union special processing */ -#if NAMEDSTREAMS #define VISNAMEDSTREAM 0x400000 /* vnode is a named stream (eg HFS resource fork) */ -#endif #define VOPENEVT 0x800000 /* if process is P_CHECKOPENEVT, then or in the O_EVTONLY flag on open */ #define VNEEDSSNAPSHOT 0x1000000 #define VNOCS 0x2000000 /* is there no code signature available */ @@ -444,6 +442,9 @@ int vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct component int vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved); +int vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path, + vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved); int vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); typedef int (*vn_create_authorizer_t)(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); @@ -479,7 +480,7 @@ errno_t vnode_verifynamedstream (vnode_t vp); void nchinit(void); -int resize_namecache(uint32_t newsize); +int resize_namecache(int newsize); void name_cache_lock_shared(void); void name_cache_lock(void); void name_cache_unlock(void); diff --git a/bsd/sys/work_interval.h 
b/bsd/sys/work_interval.h index ae881a0f9..797f929bc 100644 --- a/bsd/sys/work_interval.h +++ b/bsd/sys/work_interval.h @@ -117,6 +117,7 @@ __BEGIN_DECLS #define WORK_INTERVAL_TYPE_COREANIMATION (0x2 << 28) #define WORK_INTERVAL_TYPE_CA_RENDER_SERVER (0x2 << 28) #define WORK_INTERVAL_TYPE_CA_CLIENT (0x3 << 28) +#define WORK_INTERVAL_TYPE_HID_DELIVERY (0x4 << 28) #define WORK_INTERVAL_TYPE_LAST (0xF << 28) #ifndef KERNEL diff --git a/bsd/tests/bsd_tests.c b/bsd/tests/bsd_tests.c new file mode 100644 index 000000000..dfb379143 --- /dev/null +++ b/bsd/tests/bsd_tests.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#if !(DEVELOPMENT || DEBUG) +#error "Testing is not enabled on RELEASE configurations" +#endif + +#ifdef __arm64__ +extern kern_return_t arm64_lock_test(void); +#endif +kern_return_t kalloc_test(void); +kern_return_t ipi_test(void); + +struct xnupost_test bsd_post_tests[] = { +#ifdef __arm64__ + XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test), +#endif + XNUPOST_TEST_CONFIG_BASIC(kalloc_test), + XNUPOST_TEST_CONFIG_BASIC(ipi_test) +}; + +uint32_t bsd_post_tests_count = sizeof(bsd_post_tests) / sizeof(xnupost_test_data_t); + +extern uint64_t last_loaded_timestamp; /* updated by OSKext::load() */ +extern uint64_t kernel_post_args; +int +bsd_list_tests() +{ + if (kernel_post_args == 0) { + return 0; + } + + uint64_t prev_load_time = last_loaded_timestamp; + int no_load_counter = 5; + int absolute_break_counter = 15; + int delay_duration_usecs = 300000; /* 0.3 second for kext loading to stabilize */ + + while (no_load_counter > 0) { + printf("bsd_list_tests:INFO waiting for %d usecs\n", delay_duration_usecs); + printf("bsd_list_tests: prev: %llu current: %llu\n", prev_load_time, last_loaded_timestamp); + + delay(delay_duration_usecs); + absolute_break_counter -= 1; + + if (absolute_break_counter <= 0) { + printf("bsd_list_tests: WARNING: Waiting beyond normal time for stabilizing kext loading\n"); + break; + } + + if (prev_load_time == last_loaded_timestamp) { + no_load_counter -= 1; + printf("bsd_list_tests: INFO: no new kexts loaded. 
remaining checks: %d\n", no_load_counter); + } + + prev_load_time = last_loaded_timestamp; + } + + return xnupost_list_tests(bsd_post_tests, bsd_post_tests_count); +} + +int +bsd_do_post() +{ + return xnupost_run_tests(bsd_post_tests, bsd_post_tests_count); +} + +kern_return_t +kalloc_test() +{ + uint64_t * data_ptr; + size_t alloc_size; + + T_LOG("Running kalloc test.\n"); + + alloc_size = sizeof(uint64_t); + data_ptr = kalloc(alloc_size); + T_ASSERT_NOTNULL(data_ptr, "kalloc sizeof(uint64_t) return not null"); + kfree(data_ptr, alloc_size); + + alloc_size = 3544; + data_ptr = kalloc(alloc_size); + T_ASSERT_NOTNULL(data_ptr, "kalloc 3544 return not null"); + kfree(data_ptr, alloc_size); + + return KERN_SUCCESS; +} + +/* kcdata type definition */ +#define XNUPOST_TNAME_MAXLEN 132 + +struct kcdata_subtype_descriptor kc_xnupost_test_def[] = { + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 0, sizeof(uint16_t), "config"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 1 * sizeof(uint16_t), sizeof(uint16_t), "test_num"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t), sizeof(int32_t), "retval"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t) + sizeof(int32_t), sizeof(int32_t), "expected_retval"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)), sizeof(uint64_t), "begin_time"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)) + sizeof(uint64_t), sizeof(uint64_t), "end_time"}, + {KCS_SUBTYPE_FLAGS_ARRAY, + KC_ST_CHAR, + 2 * (sizeof(uint16_t) + sizeof(int32_t) + sizeof(uint64_t)), + KCS_SUBTYPE_PACK_SIZE(XNUPOST_TNAME_MAXLEN * sizeof(char), sizeof(char)), + "test_name"}}; + +const uint32_t kc_xnupost_test_def_count = sizeof(kc_xnupost_test_def) / sizeof(struct kcdata_subtype_descriptor); + +kern_return_t xnupost_copyout_test(xnupost_test_t t, mach_vm_address_t outaddr); + +int +xnupost_copyout_test(xnupost_test_t t, mach_vm_address_t outaddr) +{ + /* code to copyout test config */ + int kret 
= 0; + uint32_t namelen = 0; + + kret = copyout(&t->xt_config, outaddr, sizeof(uint16_t)); + if (kret) + return kret; + outaddr += sizeof(uint16_t); + + kret = copyout(&t->xt_test_num, outaddr, sizeof(uint16_t)); + if (kret) + return kret; + outaddr += sizeof(uint16_t); + + kret = copyout(&t->xt_retval, outaddr, sizeof(uint32_t)); + if (kret) + return kret; + outaddr += sizeof(uint32_t); + + kret = copyout(&t->xt_expected_retval, outaddr, sizeof(uint32_t)); + if (kret) + return kret; + outaddr += sizeof(uint32_t); + + kret = copyout(&t->xt_begin_time, outaddr, sizeof(uint64_t)); + if (kret) + return kret; + outaddr += sizeof(uint64_t); + + kret = copyout(&t->xt_end_time, outaddr, sizeof(uint64_t)); + if (kret) + return kret; + outaddr += sizeof(uint64_t); + + namelen = strnlen(t->xt_name, XNUPOST_TNAME_MAXLEN); + kret = copyout(t->xt_name, outaddr, namelen); + if (kret) + return kret; + outaddr += namelen; + + return 0; +} + +uint32_t +xnupost_get_estimated_testdata_size(void) +{ + uint32_t total_tests = bsd_post_tests_count + kernel_post_tests_count; + uint32_t elem_size = kc_xnupost_test_def[kc_xnupost_test_def_count - 1].kcs_elem_offset + + kcs_get_elem_size(&kc_xnupost_test_def[kc_xnupost_test_def_count - 1]); + uint32_t retval = 1024; /* account for type definition and mach timebase */ + retval += 1024; /* kernel version and boot-args string data */ + retval += (total_tests * elem_size); + + return retval; +} + +int +xnupost_export_testdata(void * outp, uint32_t size, uint32_t * lenp) +{ + struct kcdata_descriptor kcd; + mach_vm_address_t user_addr = 0; + mach_vm_address_t tmp_entry_addr = 0; + kern_return_t kret = 0; + uint32_t i = 0; + char kctype_name[32] = "xnupost_test_config"; + mach_timebase_info_data_t timebase = {0, 0}; + uint32_t length_to_copy = 0; + +#define RET_IF_OP_FAIL \ + do { \ + if (kret != KERN_SUCCESS) { \ + return (kret == KERN_NO_ACCESS) ? EACCES : ((kret == KERN_RESOURCE_SHORTAGE) ? 
ENOMEM : EINVAL); \ + } \ + } while (0) + + kret = kcdata_memory_static_init(&kcd, (mach_vm_address_t)outp, KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG, size, KCFLAG_USE_COPYOUT); + RET_IF_OP_FAIL; + + /* add mach timebase info */ + clock_timebase_info(&timebase); + kret = kcdata_get_memory_addr(&kcd, KCDATA_TYPE_TIMEBASE, sizeof(timebase), &user_addr); + RET_IF_OP_FAIL; + kret = copyout(&timebase, user_addr, sizeof(timebase)); + RET_IF_OP_FAIL; + + /* save boot-args and osversion string */ + length_to_copy = MIN((uint32_t)(strlen(version) + 1), OSVERSIZE); + kret = kcdata_get_memory_addr(&kcd, STACKSHOT_KCTYPE_OSVERSION, length_to_copy, &user_addr); + RET_IF_OP_FAIL; + kret = copyout(&version[0], user_addr, length_to_copy); + RET_IF_OP_FAIL; + + length_to_copy = MIN((uint32_t)(strlen(PE_boot_args()) + 1), OSVERSIZE); + kret = kcdata_get_memory_addr(&kcd, STACKSHOT_KCTYPE_BOOTARGS, length_to_copy, &user_addr); + RET_IF_OP_FAIL; + kret = copyout(PE_boot_args(), user_addr, length_to_copy); + RET_IF_OP_FAIL; + + /* add type definition to buffer */ + kret = kcdata_add_type_definition(&kcd, XNUPOST_KCTYPE_TESTCONFIG, kctype_name, &kc_xnupost_test_def[0], + kc_xnupost_test_def_count); + RET_IF_OP_FAIL; + + /* add the tests to buffer as array */ + uint32_t total_tests = bsd_post_tests_count + kernel_post_tests_count; + uint32_t elem_size = kc_xnupost_test_def[kc_xnupost_test_def_count - 1].kcs_elem_offset + + kcs_get_elem_size(&kc_xnupost_test_def[kc_xnupost_test_def_count - 1]); + + kret = kcdata_get_memory_addr_for_array(&kcd, XNUPOST_KCTYPE_TESTCONFIG, elem_size, total_tests, &user_addr); + RET_IF_OP_FAIL; + + for (i = 0; i < bsd_post_tests_count; i++) { + tmp_entry_addr = (mach_vm_address_t)((uint64_t)(user_addr) + (uint64_t)(i * elem_size)); + kret = xnupost_copyout_test(&bsd_post_tests[i], tmp_entry_addr); + RET_IF_OP_FAIL; + } + user_addr = (mach_vm_address_t)((uint64_t)(user_addr) + (uint64_t)(i * elem_size)); + + for (i = 0; i < kernel_post_tests_count; i++) { + 
tmp_entry_addr = (mach_vm_address_t)((uint64_t)(user_addr) + (uint64_t)(i * elem_size)); + kret = xnupost_copyout_test(&kernel_post_tests[i], tmp_entry_addr); + RET_IF_OP_FAIL; + } + + if (kret == KERN_SUCCESS && lenp != NULL) + *lenp = (uint32_t)kcdata_memory_get_used_bytes(&kcd); + RET_IF_OP_FAIL; + +#undef RET_IF_OP_FAIL + return kret; +} + +int +xnupost_reset_all_tests(void) +{ + xnupost_reset_tests(&bsd_post_tests[0], bsd_post_tests_count); + xnupost_reset_tests(&kernel_post_tests[0], kernel_post_tests_count); + return 0; +} diff --git a/osfmk/mach/branch_predicates.h b/bsd/tests/ctrr_test_sysctl.c similarity index 81% rename from osfmk/mach/branch_predicates.h rename to bsd/tests/ctrr_test_sysctl.c index a551970dc..ca1056fcf 100644 --- a/osfmk/mach/branch_predicates.h +++ b/bsd/tests/ctrr_test_sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * Copyright (c) 2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,12 +26,5 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +#include -#ifndef _MACH_BRANCH_PREDICATES_H -#define _MACH_BRANCH_PREDICATES_H - -#define __probable(x) __builtin_expect(!!((long)(x)), 1L) - -#define __improbable(x) __builtin_expect(!!((long)(x)), 0L) - -#endif /* _MACH_BRANCH_PREDICATES_H */ diff --git a/osfmk/prng/YarrowCoreLib/src/macOnly.h b/bsd/tests/pmap_test_sysctl.c similarity index 53% rename from osfmk/prng/YarrowCoreLib/src/macOnly.h rename to bsd/tests/pmap_test_sysctl.c index 4586b0245..d1280372f 100644 --- a/osfmk/prng/YarrowCoreLib/src/macOnly.h +++ b/bsd/tests/pmap_test_sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,44 +26,37 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - File: macOnly.h - - Contains: Mac-specific #defines for Yarrow. 
- - Written by: Doug Mitchell - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/99 dpm Created. - -*/ +#include -#if !defined(macintosh) && !defined(__APPLE__) -#error Hey, why are you including macOnly for a non-Mac build!? -#endif +extern kern_return_t test_pmap_enter_disconnect(unsigned int); +extern kern_return_t test_pmap_iommu_disconnect(void); -#ifndef _MAC_ONLY_H_ -#define _MAC_ONLY_H_ - -#include "prng/YarrowCoreLib/include/WindowsTypesForMac.h" - -#if defined(__cplusplus) -extern "C" { -#endif - -/* - * No "slow poll" for Mac. - */ -#define SLOW_POLL_ENABLE 0 -#if SLOW_POLL_ENABLE -extern DWORD prng_slow_poll(BYTE* buf,UINT bufsize); -#endif /* SLOW_POLL_ENABLE */ +static int +sysctl_test_pmap_enter_disconnect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + unsigned int num_loops; + int error, changed; + error = sysctl_io_number(req, 0, sizeof(num_loops), &num_loops, &changed); + if (error || !changed) + return error; + return test_pmap_enter_disconnect(num_loops); +} -#if defined(__cplusplus) +SYSCTL_PROC(_kern, OID_AUTO, pmap_enter_disconnect_test, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_test_pmap_enter_disconnect, "I", ""); + +static int +sysctl_test_pmap_iommu_disconnect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + unsigned int run = 0; + int error, changed; + error = sysctl_io_number(req, 0, sizeof(run), &run, &changed); + if (error || !changed) + return error; + return test_pmap_iommu_disconnect(); } -#endif -#endif /* _MAC_ONLY_H_*/ +SYSCTL_PROC(_kern, OID_AUTO, pmap_iommu_disconnect_test, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_test_pmap_iommu_disconnect, "I", ""); diff --git a/bsd/uuid/uuid.h b/bsd/uuid/uuid.h index 65524909a..f751dc2ed 100644 --- a/bsd/uuid/uuid.h +++ b/bsd/uuid/uuid.h @@ -62,6 +62,8 @@ void 
uuid_generate(uuid_t out); void uuid_generate_random(uuid_t out); void uuid_generate_time(uuid_t out); +void uuid_generate_early_random(uuid_t out); + int uuid_is_null(const uuid_t uu); int uuid_parse(const uuid_string_t in, uuid_t uu); diff --git a/bsd/uxkern/ux_exception.c b/bsd/uxkern/ux_exception.c index 21bd3eec9..b69437f3d 100644 --- a/bsd/uxkern/ux_exception.c +++ b/bsd/uxkern/ux_exception.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2016 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,463 +22,157 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* +/* * Mach Operating System * Copyright (c) 1987 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies * the terms and conditions for use and redistribution. 
*/ -/* - ********************************************************************* - * HISTORY - ********************************************************************** - */ - #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include -#include /* MAXSSIZ */ - -#include /* get_task_ipcspace() */ -/* - * XXX Things that should be retrieved from Mach headers, but aren't - */ -struct ipc_object; -extern kern_return_t ipc_object_copyin(ipc_space_t space, mach_port_name_t name, - mach_msg_type_name_t msgt_name, struct ipc_object **objectp); -extern mach_msg_return_t mach_msg_receive(mach_msg_header_t *msg, - mach_msg_option_t option, mach_msg_size_t rcv_size, - mach_port_name_t rcv_name, mach_msg_timeout_t rcv_timeout, - void (*continuation)(mach_msg_return_t), - mach_msg_size_t slist_size); -extern mach_msg_return_t mach_msg_send(mach_msg_header_t *msg, - mach_msg_option_t option, mach_msg_size_t send_size, - mach_msg_timeout_t send_timeout, mach_port_name_t notify); -extern thread_t convert_port_to_thread(ipc_port_t port); -extern void ipc_port_release_send(ipc_port_t port); - - +#include /* MAXSSIZ */ +#include /* - * Unix exception handler. + * Translate Mach exceptions to UNIX signals. + * + * ux_exception translates a mach exception, code and subcode to + * a signal. Calls machine_exception (machine dependent) + * to attempt translation first. 
*/ - -static void ux_exception(int exception, mach_exception_code_t code, - mach_exception_subcode_t subcode, - int *ux_signal, mach_exception_code_t *ux_code); - -#if defined(__x86_64__) || defined(__arm64__) -mach_port_t ux_exception_port; -#else -mach_port_name_t ux_exception_port; -#endif /* __x86_64__ */ - -static task_t ux_handler_self; - -__attribute__((noreturn)) -static void -ux_handler(void) +static int +ux_exception(int exception, + mach_exception_code_t code, + mach_exception_subcode_t subcode) { - task_t self = current_task(); - mach_port_name_t exc_port_name; - mach_port_name_t exc_set_name; - - /* self->kernel_vm_space = TRUE; */ - ux_handler_self = self; - - - /* - * Allocate a port set that we will receive on. - */ - if (mach_port_allocate(get_task_ipcspace(ux_handler_self), MACH_PORT_RIGHT_PORT_SET, &exc_set_name) != MACH_MSG_SUCCESS) - panic("ux_handler: port_set_allocate failed"); - - /* - * Allocate an exception port and use object_copyin to - * translate it to the global name. Put it into the set. - */ - if (mach_port_allocate(get_task_ipcspace(ux_handler_self), MACH_PORT_RIGHT_RECEIVE, &exc_port_name) != MACH_MSG_SUCCESS) - panic("ux_handler: port_allocate failed"); - if (mach_port_move_member(get_task_ipcspace(ux_handler_self), - exc_port_name, exc_set_name) != MACH_MSG_SUCCESS) - panic("ux_handler: port_set_add failed"); - - if (ipc_object_copyin(get_task_ipcspace(self), exc_port_name, - MACH_MSG_TYPE_MAKE_SEND, - (void *) &ux_exception_port) != MACH_MSG_SUCCESS) - panic("ux_handler: object_copyin(ux_exception_port) failed"); - - proc_list_lock(); - thread_wakeup(&ux_exception_port); - proc_list_unlock(); - - /* Message handling loop. 
*/ - - for (;;) { - struct rep_msg { - mach_msg_header_t Head; - NDR_record_t NDR; - kern_return_t RetCode; - } rep_msg; - struct exc_msg { - mach_msg_header_t Head; - /* start of the kernel processed data */ - mach_msg_body_t msgh_body; - mach_msg_port_descriptor_t thread; - mach_msg_port_descriptor_t task; - /* end of the kernel processed data */ - NDR_record_t NDR; - exception_type_t exception; - mach_msg_type_number_t codeCnt; - mach_exception_data_t code; - /* some times RCV_TO_LARGE probs */ - char pad[512]; - } exc_msg; - mach_port_name_t reply_port; - kern_return_t result; - - exc_msg.Head.msgh_local_port = CAST_MACH_NAME_TO_PORT(exc_set_name); - exc_msg.Head.msgh_size = sizeof (exc_msg); -#if 0 - result = mach_msg_receive(&exc_msg.Head); -#else - result = mach_msg_receive(&exc_msg.Head, MACH_RCV_MSG, - sizeof (exc_msg), exc_set_name, - MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL, - 0); -#endif - if (result == MACH_MSG_SUCCESS) { - reply_port = CAST_MACH_PORT_TO_NAME(exc_msg.Head.msgh_remote_port); - - if (mach_exc_server(&exc_msg.Head, &rep_msg.Head)) { - result = mach_msg_send(&rep_msg.Head, MACH_SEND_MSG, - sizeof (rep_msg),MACH_MSG_TIMEOUT_NONE,MACH_PORT_NULL); - if (reply_port != 0 && result != MACH_MSG_SUCCESS) - mach_port_deallocate(get_task_ipcspace(ux_handler_self), reply_port); - } - + int machine_signal = 0; + + /* Try machine-dependent translation first. 
*/ + if ((machine_signal = machine_exception(exception, code, subcode)) != 0) + return machine_signal; + + switch(exception) { + case EXC_BAD_ACCESS: + if (code == KERN_INVALID_ADDRESS) + return SIGSEGV; + else + return SIGBUS; + + case EXC_BAD_INSTRUCTION: + return SIGILL; + + case EXC_ARITHMETIC: + return SIGFPE; + + case EXC_EMULATION: + return SIGEMT; + + case EXC_SOFTWARE: + switch (code) { + case EXC_UNIX_BAD_SYSCALL: + return SIGSYS; + case EXC_UNIX_BAD_PIPE: + return SIGPIPE; + case EXC_UNIX_ABORT: + return SIGABRT; + case EXC_SOFT_SIGNAL: + return SIGKILL; + } + break; + + case EXC_BREAKPOINT: + return SIGTRAP; } - else if (result == MACH_RCV_TOO_LARGE) - /* ignore oversized messages */; - else - panic("exception_handler"); - } -} -void -ux_handler_init(void) -{ - thread_t thread = THREAD_NULL; - - ux_exception_port = MACH_PORT_NULL; - (void) kernel_thread_start((thread_continue_t)ux_handler, NULL, &thread); - thread_deallocate(thread); - proc_list_lock(); - if (ux_exception_port == MACH_PORT_NULL) { - (void)msleep(&ux_exception_port, proc_list_mlock, 0, "ux_handler_wait", 0); - } - proc_list_unlock(); + return 0; } +/* + * Sends the corresponding UNIX signal to a thread that has triggered a Mach exception. 
+ */ kern_return_t -catch_exception_raise( - __unused mach_port_t exception_port, - mach_port_t thread, - mach_port_t task, - exception_type_t exception, - exception_data_t code, - __unused mach_msg_type_number_t codeCnt -) +handle_ux_exception(thread_t thread, + int exception, + mach_exception_code_t code, + mach_exception_subcode_t subcode) { - mach_exception_data_type_t big_code[EXCEPTION_CODE_MAX]; - big_code[0] = code[0]; - big_code[1] = code[1]; + /* Returns +1 proc reference */ + proc_t p = proc_findthread(thread); - return catch_mach_exception_raise(exception_port, - thread, - task, - exception, - big_code, - codeCnt); + /* Can't deliver a signal without a bsd process reference */ + if (p == NULL) + return KERN_FAILURE; -} + /* Translate exception and code to signal type */ + int ux_signal = ux_exception(exception, code, subcode); -kern_return_t -catch_mach_exception_raise( - __unused mach_port_t exception_port, - mach_port_t thread, - mach_port_t task, - exception_type_t exception, - mach_exception_data_t code, - __unused mach_msg_type_number_t codeCnt -) -{ - task_t self = current_task(); - thread_t th_act; - ipc_port_t thread_port; - struct proc *p; - kern_return_t result = MACH_MSG_SUCCESS; - int ux_signal = 0; - mach_exception_code_t ucode = 0; - struct uthread *ut; - mach_port_name_t thread_name = CAST_MACH_PORT_TO_NAME(thread); - mach_port_name_t task_name = CAST_MACH_PORT_TO_NAME(task); + uthread_t ut = get_bsdthread_info(thread); /* - * Convert local thread name to global port. + * Stack overflow should result in a SIGSEGV signal + * on the alternate stack. + * but we have one or more guard pages after the + * stack top, so we would get a KERN_PROTECTION_FAILURE + * exception instead of KERN_INVALID_ADDRESS, resulting in + * a SIGBUS signal. + * Detect that situation and select the correct signal. 
*/ - if (MACH_PORT_VALID(thread_name) && - (ipc_object_copyin(get_task_ipcspace(self), thread_name, - MACH_MSG_TYPE_PORT_SEND, - (void *) &thread_port) == MACH_MSG_SUCCESS)) { - if (IPC_PORT_VALID(thread_port)) { - th_act = convert_port_to_thread(thread_port); - ipc_port_release_send(thread_port); - } else { - th_act = THREAD_NULL; + if (code == KERN_PROTECTION_FAILURE && + ux_signal == SIGBUS) { + user_addr_t sp = subcode; + + user_addr_t stack_max = p->user_stack; + user_addr_t stack_min = p->user_stack - MAXSSIZ; + if (sp >= stack_min && sp < stack_max) { + /* + * This is indeed a stack overflow. Deliver a + * SIGSEGV signal. + */ + ux_signal = SIGSEGV; + + /* + * If the thread/process is not ready to handle + * SIGSEGV on an alternate stack, force-deliver + * SIGSEGV with a SIG_DFL handler. + */ + int mask = sigmask(ux_signal); + struct sigacts *ps = p->p_sigacts; + if ((p->p_sigignore & mask) || + (ut->uu_sigwait & mask) || + (ut->uu_sigmask & mask) || + (ps->ps_sigact[SIGSEGV] == SIG_IGN) || + (! (ps->ps_sigonstack & mask))) { + p->p_sigignore &= ~mask; + p->p_sigcatch &= ~mask; + ps->ps_sigact[SIGSEGV] = SIG_DFL; + ut->uu_sigwait &= ~mask; + ut->uu_sigmask &= ~mask; + } + } } - /* - * Catch bogus ports - */ - if (th_act != THREAD_NULL) { - - /* - * Convert exception to unix signal and code. - */ - ux_exception(exception, code[0], code[1], &ux_signal, &ucode); - - ut = get_bsdthread_info(th_act); - p = proc_findthread(th_act); - - /* Can't deliver a signal without a bsd process reference */ - if (p == NULL) { - ux_signal = 0; - result = KERN_FAILURE; - } - - /* - * Stack overflow should result in a SIGSEGV signal - * on the alternate stack. - * but we have one or more guard pages after the - * stack top, so we would get a KERN_PROTECTION_FAILURE - * exception instead of KERN_INVALID_ADDRESS, resulting in - * a SIGBUS signal. - * Detect that situation and select the correct signal. 
- */ - if (code[0] == KERN_PROTECTION_FAILURE && - ux_signal == SIGBUS) { - user_addr_t sp, stack_min, stack_max; - int mask; - struct sigacts *ps; - - sp = code[1]; - - stack_max = p->user_stack; - stack_min = p->user_stack - MAXSSIZ; - if (sp >= stack_min && - sp < stack_max) { - /* - * This is indeed a stack overflow. Deliver a - * SIGSEGV signal. - */ - ux_signal = SIGSEGV; - - /* - * If the thread/process is not ready to handle - * SIGSEGV on an alternate stack, force-deliver - * SIGSEGV with a SIG_DFL handler. - */ - mask = sigmask(ux_signal); - ps = p->p_sigacts; - if ((p->p_sigignore & mask) || - (ut->uu_sigwait & mask) || - (ut->uu_sigmask & mask) || - (ps->ps_sigact[SIGSEGV] == SIG_IGN) || - (! (ps->ps_sigonstack & mask))) { - p->p_sigignore &= ~mask; - p->p_sigcatch &= ~mask; - ps->ps_sigact[SIGSEGV] = SIG_DFL; - ut->uu_sigwait &= ~mask; - ut->uu_sigmask &= ~mask; - } - } - } - /* - * Send signal. - */ - if (ux_signal != 0) { - ut->uu_exception = exception; - //ut->uu_code = code[0]; // filled in by threadsignal - ut->uu_subcode = code[1]; - threadsignal(th_act, ux_signal, code[0], TRUE); - } - if (p != NULL) - proc_rele(p); - thread_deallocate(th_act); + /* Send signal to thread */ + if (ux_signal != 0) { + ut->uu_exception = exception; + //ut->uu_code = code; // filled in by threadsignal + ut->uu_subcode = subcode; + threadsignal(thread, ux_signal, code, TRUE); } - else - result = KERN_INVALID_ARGUMENT; - } - else - result = KERN_INVALID_ARGUMENT; - /* - * Delete our send rights to the task port. 
- */ - (void)mach_port_deallocate(get_task_ipcspace(ux_handler_self), task_name); + proc_rele(p); - return (result); + return KERN_SUCCESS; } -kern_return_t -catch_exception_raise_state( - __unused mach_port_t exception_port, - __unused exception_type_t exception, - __unused const exception_data_t code, - __unused mach_msg_type_number_t codeCnt, - __unused int *flavor, - __unused const thread_state_t old_state, - __unused mach_msg_type_number_t old_stateCnt, - __unused thread_state_t new_state, - __unused mach_msg_type_number_t *new_stateCnt) -{ - return(KERN_INVALID_ARGUMENT); -} - -kern_return_t -catch_mach_exception_raise_state( - __unused mach_port_t exception_port, - __unused exception_type_t exception, - __unused const mach_exception_data_t code, - __unused mach_msg_type_number_t codeCnt, - __unused int *flavor, - __unused const thread_state_t old_state, - __unused mach_msg_type_number_t old_stateCnt, - __unused thread_state_t new_state, - __unused mach_msg_type_number_t *new_stateCnt) -{ - return(KERN_INVALID_ARGUMENT); -} - -kern_return_t -catch_exception_raise_state_identity( - __unused mach_port_t exception_port, - __unused mach_port_t thread, - __unused mach_port_t task, - __unused exception_type_t exception, - __unused exception_data_t code, - __unused mach_msg_type_number_t codeCnt, - __unused int *flavor, - __unused thread_state_t old_state, - __unused mach_msg_type_number_t old_stateCnt, - __unused thread_state_t new_state, - __unused mach_msg_type_number_t *new_stateCnt) -{ - return(KERN_INVALID_ARGUMENT); -} - -kern_return_t -catch_mach_exception_raise_state_identity( - __unused mach_port_t exception_port, - __unused mach_port_t thread, - __unused mach_port_t task, - __unused exception_type_t exception, - __unused mach_exception_data_t code, - __unused mach_msg_type_number_t codeCnt, - __unused int *flavor, - __unused thread_state_t old_state, - __unused mach_msg_type_number_t old_stateCnt, - __unused thread_state_t new_state, - __unused 
mach_msg_type_number_t *new_stateCnt) -{ - return(KERN_INVALID_ARGUMENT); -} - - -/* - * ux_exception translates a mach exception, code and subcode to - * a signal and u.u_code. Calls machine_exception (machine dependent) - * to attempt translation first. - */ - -static -void ux_exception( - int exception, - mach_exception_code_t code, - mach_exception_subcode_t subcode, - int *ux_signal, - mach_exception_code_t *ux_code) -{ - /* - * Try machine-dependent translation first. - */ - if (machine_exception(exception, code, subcode, ux_signal, ux_code)) - return; - - switch(exception) { - - case EXC_BAD_ACCESS: - if (code == KERN_INVALID_ADDRESS) - *ux_signal = SIGSEGV; - else - *ux_signal = SIGBUS; - break; - - case EXC_BAD_INSTRUCTION: - *ux_signal = SIGILL; - break; - - case EXC_ARITHMETIC: - *ux_signal = SIGFPE; - break; - - case EXC_EMULATION: - *ux_signal = SIGEMT; - break; - - case EXC_SOFTWARE: - switch (code) { - - case EXC_UNIX_BAD_SYSCALL: - *ux_signal = SIGSYS; - break; - case EXC_UNIX_BAD_PIPE: - *ux_signal = SIGPIPE; - break; - case EXC_UNIX_ABORT: - *ux_signal = SIGABRT; - break; - case EXC_SOFT_SIGNAL: - *ux_signal = SIGKILL; - break; - } - break; - - case EXC_BREAKPOINT: - *ux_signal = SIGTRAP; - break; - } -} diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 060866928..f09e98f74 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -103,10 +103,12 @@ #include #include #include +#include #include #include #include +#include #include @@ -120,6 +122,10 @@ #include #endif +#if NULLFS +#include +#endif + #include #define ESUCCESS 0 @@ -1595,12 +1601,16 @@ vfs_ctx_skipatime (vfs_context_t ctx) { if (proc->p_lflag & P_LRAGE_VNODES) { return 1; } - + if (ut) { - if (ut->uu_flag & UT_RAGE_VNODES) { + if (ut->uu_flag & (UT_RAGE_VNODES | UT_ATIME_UPDATE)) { return 1; } } + + if (proc->p_vfs_iopolicy & P_VFS_IOPOLICY_ATIME_UPDATES) { + return 1; + } } return 0; } @@ -2904,6 +2914,20 @@ vnode_ismonitored(vnode_t vp) { return (vp->v_knotes.slh_first != 
NULL); } +int +vnode_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp) +{ + if (out_vpp) { + *out_vpp = NULLVP; + } +#if NULLFS + return nullfs_getbackingvnode(in_vp, out_vpp); +#else +#pragma unused(in_vp) + return ENOENT; +#endif +} + /* * Initialize a struct vnode_attr and activate the attributes required * by the vnode_notify() call. @@ -4003,37 +4027,35 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s * in the rename syscall. It's OK if the source file does not exist, since this * is only for AppleDouble files. */ - if (xfromname != NULL) { - MALLOC(fromnd, struct nameidata *, sizeof (struct nameidata), M_TEMP, M_WAITOK); - NDINIT(fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, - UIO_SYSSPACE, CAST_USER_ADDR_T(xfromname), ctx); - fromnd->ni_dvp = fdvp; - error = namei(fromnd); - - /* - * If there was an error looking up source attribute file, - * we'll behave as if it didn't exist. - */ + MALLOC(fromnd, struct nameidata *, sizeof (struct nameidata), M_TEMP, M_WAITOK); + NDINIT(fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, + UIO_SYSSPACE, CAST_USER_ADDR_T(xfromname), ctx); + fromnd->ni_dvp = fdvp; + error = namei(fromnd); - if (error == 0) { - if (fromnd->ni_vp) { - /* src_attr_vp indicates need to call vnode_put / nameidone later */ - src_attr_vp = fromnd->ni_vp; - - if (fromnd->ni_vp->v_type != VREG) { - src_attr_vp = NULLVP; - vnode_put(fromnd->ni_vp); - } - } - /* - * Either we got an invalid vnode type (not a regular file) or the namei lookup - * suppressed ENOENT as a valid error since we're renaming. Either way, we don't - * have a vnode here, so we drop our namei buffer for the source attribute file - */ - if (src_attr_vp == NULLVP) { - nameidone(fromnd); + /* + * If there was an error looking up source attribute file, + * we'll behave as if it didn't exist. 
+ */ + + if (error == 0) { + if (fromnd->ni_vp) { + /* src_attr_vp indicates need to call vnode_put / nameidone later */ + src_attr_vp = fromnd->ni_vp; + + if (fromnd->ni_vp->v_type != VREG) { + src_attr_vp = NULLVP; + vnode_put(fromnd->ni_vp); } } + /* + * Either we got an invalid vnode type (not a regular file) or the namei lookup + * suppressed ENOENT as a valid error since we're renaming. Either way, we don't + * have a vnode here, so we drop our namei buffer for the source attribute file + */ + if (src_attr_vp == NULLVP) { + nameidone(fromnd); + } } } #endif /* CONFIG_APPLEDOUBLE */ @@ -5466,8 +5488,11 @@ VNOP_CLONEFILE(vnode_t fvp, vnode_t dvp, vnode_t *vpp, _err = (*dvp->v_op[vnop_clonefile_desc.vdesc_offset])(&a); - if (_err == 0 && *vpp) + if (_err == 0 && *vpp) { DTRACE_FSINFO(clonefile, vnode_t, *vpp); + if (kdebug_enable) + kdebug_lookup(*vpp, cnp); + } post_event_if_success(dvp, _err, NOTE_WRITE); diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index cde828a7b..cd8cbacad 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2017 Apple Inc. All rights reserved. + * Copyright (c) 1995-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1136,8 +1136,8 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, * Note that since we won't ever copy out more than the caller requested, * we never need to allocate more than they offer. 
*/ - ab.allocated = ulmin(bufferSize, fixedsize + varsize); - if (ab.allocated > ATTR_MAX_BUFFER) { + ab.allocated = fixedsize + varsize; + if (((size_t)ab.allocated) > ATTR_MAX_BUFFER) { error = ENOMEM; VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer size too large (%d limit %d)", ab.allocated, ATTR_MAX_BUFFER); goto out; @@ -1182,6 +1182,10 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, ab.needed = fixedsize + varsize; /* common attributes **************************************************/ + if (alp->commonattr & ATTR_CMN_ERROR) { + ATTR_PACK4(ab, 0); + ab.actual.commonattr |= ATTR_CMN_ERROR; + } if (alp->commonattr & ATTR_CMN_NAME) { attrlist_pack_string(&ab, cnp, cnl); ab.actual.commonattr |= ATTR_CMN_NAME; @@ -1477,7 +1481,7 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, * of the result buffer, even if we copied less out. The caller knows how big a buffer * they gave us, so they can always check for truncation themselves. */ - *(uint32_t *)ab.base = (options & FSOPT_REPORT_FULLSIZE) ? ab.needed : imin(ab.allocated, ab.needed); + *(uint32_t *)ab.base = (options & FSOPT_REPORT_FULLSIZE) ? ab.needed : imin(bufferSize, ab.needed); /* Return attribute set output if requested. 
*/ if (return_valid && @@ -1493,9 +1497,9 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp, if (UIO_SEG_IS_USER_SPACE(segflg)) error = copyout(ab.base, CAST_USER_ADDR_T(attributeBuffer), - ab.allocated); + ulmin(bufferSize, ab.needed)); else - bcopy(ab.base, (void *)attributeBuffer, (size_t)ab.allocated); + bcopy(ab.base, (void *)attributeBuffer, (size_t)ulmin(bufferSize, ab.needed)); out: if (vs.f_vol_name != NULL) @@ -3700,6 +3704,7 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval) struct fileproc *fp; struct fd_vn_data *fvdata; vfs_context_t ctx; + uthread_t ut; enum uio_seg segflg; int count; uio_t auio = NULL; @@ -3719,6 +3724,7 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval) fvdata = NULL; eofflag = 0; ctx = vfs_context_current(); + ut = get_bsdthread_info(current_thread()); segflg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32; if ((fp->f_fglob->fg_flag & FREAD) == 0) { @@ -3865,8 +3871,14 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval) (void)getattrlist_setupvattr_all(&al, &va, VNON, NULL, IS_64BIT_PROCESS(p), (uap->options & FSOPT_ATTR_CMN_EXTENDED)); + /* + * Set UT_KERN_RAGE_VNODES to cause all vnodes created by the + * filesystem to be rapidly aged. 
+ */ + ut->uu_flag |= UT_KERN_RAGE_VNODES; error = VNOP_GETATTRLISTBULK(dvp, &al, &va, auio, NULL, options, &eofflag, &count, ctx); + ut->uu_flag &= ~UT_KERN_RAGE_VNODES; FREE(va_name, M_TEMP); @@ -3887,8 +3899,10 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval) eofflag = 0; count = 0; + ut->uu_flag |= UT_KERN_RAGE_VNODES; error = readdirattr(dvp, fvdata, auio, &al, options, &count, &eofflag, ctx); + ut->uu_flag &= ~UT_KERN_RAGE_VNODES; } if (count) { diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index c1019a327..d26613ce8 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -3255,13 +3255,14 @@ buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int if (kret != KERN_SUCCESS) panic("getblk: ubc_upl_map() failed with (%d)", kret); break; - } + } // end BLK_READ default: panic("getblk: paging or unknown operation - %x", operation); /*NOTREACHED*/ break; - } - } + } // end switch + } //end buf_t !incore + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END, bp, bp->b_datap, bp->b_flags, 3, 0); @@ -4044,9 +4045,11 @@ buf_biodone(buf_t bp) code |= DKIO_TIER_UPGRADE; } - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, - buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0); - } + KDBG_RELEASE_NOPROCFILT(FSDBG_CODE(DBG_DKRW, code), + buf_kernel_addrperm_addr(bp), + (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, + bp->b_error); + } microuptime(&real_elapsed); timevalsub(&real_elapsed, &bp->b_timestamp_tv); @@ -4579,7 +4582,7 @@ fs_buffer_cache_gc_dispatch_callouts(int all) lck_mtx_unlock(buf_gc_callout); } -boolean_t +static boolean_t buffer_cache_gc(int all) { buf_t bp; diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index b24dbc590..56c69754c 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -82,6 +82,7 @@ #include #include #include +#include #if CONFIG_MACF #include @@ -876,10 +877,8 @@ 
vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u { struct namecache *ncp; vnode_t old_parentvp = NULLVP; -#if NAMEDSTREAMS int isstream = (vp->v_flag & VISNAMEDSTREAM); int kusecountbumped = 0; -#endif kauth_cred_t tcred = NULL; const char *vname = NULL; const char *tname = NULL; @@ -888,7 +887,6 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u if (dvp && vnode_ref(dvp) != 0) { dvp = NULLVP; } -#if NAMEDSTREAMS /* Don't count a stream's parent ref during unmounts */ if (isstream && dvp && (dvp != vp) && (dvp != vp->v_parent) && (dvp->v_type == VREG)) { vnode_lock_spin(dvp); @@ -896,7 +894,6 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u kusecountbumped = 1; vnode_unlock(dvp); } -#endif } else { dvp = NULLVP; } @@ -960,7 +957,6 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u kauth_cred_unref(&tcred); } if (dvp != NULLVP) { -#if NAMEDSTREAMS /* Back-out the ref we took if we lost a race for vp->v_parent. */ if (kusecountbumped) { vnode_lock_spin(dvp); @@ -968,20 +964,17 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u --dvp->v_kusecount; vnode_unlock(dvp); } -#endif vnode_rele(dvp); } if (old_parentvp) { struct uthread *ut; -#if NAMEDSTREAMS if (isstream) { vnode_lock_spin(old_parentvp); if ((old_parentvp->v_type != VDIR) && (old_parentvp->v_kusecount > 0)) --old_parentvp->v_kusecount; vnode_unlock(old_parentvp); } -#endif ut = get_bsdthread_info(current_thread()); /* @@ -1437,7 +1430,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, * Force directory hardlinks to go to * file system for ".." requests. 
*/ - if (dp && (dp->v_flag & VISHARDLINK)) { + if ((dp->v_flag & VISHARDLINK)) { break; } /* @@ -2167,28 +2160,35 @@ name_cache_unlock(void) int -resize_namecache(u_int newsize) +resize_namecache(int newsize) { struct nchashhead *new_table; struct nchashhead *old_table; struct nchashhead *old_head, *head; struct namecache *entry, *next; uint32_t i, hashval; - int dNodes, dNegNodes; + int dNodes, dNegNodes, nelements; u_long new_size, old_size; + if (newsize < 0) + return EINVAL; + dNegNodes = (newsize / 10); dNodes = newsize + dNegNodes; - // we don't support shrinking yet if (dNodes <= desiredNodes) { - return 0; + return 0; + } + + if (os_mul_overflow(dNodes, 2, &nelements)) { + return EINVAL; } - new_table = hashinit(2 * dNodes, M_CACHE, &nchashmask); + + new_table = hashinit(nelements, M_CACHE, &nchashmask); new_size = nchashmask + 1; if (new_table == NULL) { - return ENOMEM; + return ENOMEM; } NAME_CACHE_LOCK(); diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 60807acea..cb023ccf9 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -207,18 +207,25 @@ static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag); +static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass, + off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated); + static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg); static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag); -static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, 
off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag); +static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, + int (*callback)(buf_t, void *), void *callback_arg, int bflag); -static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg); +static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated); -static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg, int *err); +static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), + void *callback_arg, int *err, boolean_t vm_initiated); -static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg); -static int sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg); -static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); +static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated); +static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag, + int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated); +static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, + int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated); static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp); static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t 
*offsetp, u_int *lengthp); @@ -487,7 +494,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c if (wbp->cl_number) { lck_mtx_lock(&wbp->cl_lockw); - cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL); + cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE); lck_mtx_unlock(&wbp->cl_lockw); } @@ -704,9 +711,9 @@ cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_fla * leave pages in the cache unchanged on error */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; - else if (page_out && ((error != ENXIO) || vnode_isswap(vp))) + else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) /* - * transient error... leave pages unchanged + * transient error on pageout/write path... leave pages unchanged */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; else if (page_in) @@ -830,9 +837,9 @@ cluster_iodone(buf_t bp, void *callback_arg) if (ISSET(b_flags, B_COMMIT_UPL)) { cluster_handle_associated_upl(iostate, - cbp_head->b_upl, - upl_offset, - transaction_size); + cbp_head->b_upl, + upl_offset, + transaction_size); } if (error == 0 && total_resid) @@ -881,12 +888,15 @@ cluster_iodone(buf_t bp, void *callback_arg) } if (b_flags & B_COMMIT_UPL) { + pg_offset = upl_offset & PAGE_MASK; commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; - if (error) + if (error) { + upl_set_iodone_error(upl, error); + upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp); - else { + } else { upl_flags = UPL_COMMIT_FREE_ON_EMPTY; if ((b_flags & B_PHYS) && (b_flags & B_READ)) @@ -2977,6 +2987,280 @@ cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off } +void +cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated) +{ + struct cl_extent cl; + boolean_t first_pass = TRUE; + + assert(s_offset < e_offset); + 
assert((s_offset & PAGE_MASK_64) == 0); + assert((e_offset & PAGE_MASK_64) == 0); + + cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64); + cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64); + + cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset), + vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated); +} + + +static void +cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, + boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF, + int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated) +{ + struct cl_writebehind *wbp; + int cl_index; + int ret_cluster_try_push; + u_int max_cluster_pgcount; + + + max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; + + /* + * take the lock to protect our accesses + * of the writebehind and sparse cluster state + */ + wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); + + if (wbp->cl_scmap) { + + if ( !(flags & IO_NOCACHE)) { + /* + * we've fallen into the sparse + * cluster method of delaying dirty pages + */ + sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated); + + lck_mtx_unlock(&wbp->cl_lockw); + return; + } + /* + * must have done cached writes that fell into + * the sparse cluster mechanism... 
we've switched + * to uncached writes on the file, so go ahead + * and push whatever's in the sparse map + * and switch back to normal clustering + */ + wbp->cl_number = 0; + + sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated); + /* + * no clusters of either type present at this point + * so just go directly to start_new_cluster since + * we know we need to delay this I/O since we've + * already released the pages back into the cache + * to avoid the deadlock with sparse_cluster_push + */ + goto start_new_cluster; + } + if (*first_pass == TRUE) { + if (write_off == wbp->cl_last_write) + wbp->cl_seq_written += write_cnt; + else + wbp->cl_seq_written = write_cnt; + + wbp->cl_last_write = write_off + write_cnt; + + *first_pass = FALSE; + } + if (wbp->cl_number == 0) + /* + * no clusters currently present + */ + goto start_new_cluster; + + for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { + /* + * check each cluster that we currently hold + * try to merge some or all of this write into + * one or more of the existing clusters... if + * any portion of the write remains, start a + * new cluster + */ + if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) { + /* + * the current write starts at or after the current cluster + */ + if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { + /* + * we have a write that fits entirely + * within the existing cluster limits + */ + if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) + /* + * update our idea of where the cluster ends + */ + wbp->cl_clusters[cl_index].e_addr = cl->e_addr; + break; + } + if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { + /* + * we have a write that starts in the middle of the current cluster + * but extends beyond the cluster's limit... 
we know this because + * of the previous checks + * we'll extend the current cluster to the max + * and update the b_addr for the current write to reflect that + * the head of it was absorbed into this cluster... + * note that we'll always have a leftover tail in this case since + * full absorbtion would have occurred in the clause above + */ + wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; + + cl->b_addr = wbp->cl_clusters[cl_index].e_addr; + } + /* + * we come here for the case where the current write starts + * beyond the limit of the existing cluster or we have a leftover + * tail after a partial absorbtion + * + * in either case, we'll check the remaining clusters before + * starting a new one + */ + } else { + /* + * the current write starts in front of the cluster we're currently considering + */ + if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) { + /* + * we can just merge the new request into + * this cluster and leave it in the cache + * since the resulting cluster is still + * less than the maximum allowable size + */ + wbp->cl_clusters[cl_index].b_addr = cl->b_addr; + + if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) { + /* + * the current write completely + * envelops the existing cluster and since + * each write is limited to at most max_cluster_pgcount pages + * we can just use the start and last blocknos of the write + * to generate the cluster limits + */ + wbp->cl_clusters[cl_index].e_addr = cl->e_addr; + } + break; + } + /* + * if we were to combine this write with the current cluster + * we would exceed the cluster size limit.... so, + * let's see if there's any overlap of the new I/O with + * the cluster we're currently considering... 
in fact, we'll + * stretch the cluster out to it's full limit and see if we + * get an intersection with the current write + * + */ + if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { + /* + * the current write extends into the proposed cluster + * clip the length of the current write after first combining it's + * tail with the newly shaped cluster + */ + wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; + + cl->e_addr = wbp->cl_clusters[cl_index].b_addr; + } + /* + * if we get here, there was no way to merge + * any portion of this write with this cluster + * or we could only merge part of it which + * will leave a tail... + * we'll check the remaining clusters before starting a new one + */ + } + } + if (cl_index < wbp->cl_number) + /* + * we found an existing cluster(s) that we + * could entirely merge this I/O into + */ + goto delay_io; + + if (defer_writes == FALSE && + wbp->cl_number == MAX_CLUSTERS && + wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { + uint32_t n; + + if (vp->v_mount->mnt_minsaturationbytecount) { + n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp); + + if (n > MAX_CLUSTERS) + n = MAX_CLUSTERS; + } else + n = 0; + + if (n == 0) { + if (disk_conditioner_mount_is_ssd(vp->v_mount)) + n = WRITE_BEHIND_SSD; + else + n = WRITE_BEHIND; + } + while (n--) + cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated); + } + if (wbp->cl_number < MAX_CLUSTERS) { + /* + * we didn't find an existing cluster to + * merge into, but there's room to start + * a new one + */ + goto start_new_cluster; + } + /* + * no exisitng cluster to merge with and no + * room to start a new one... we'll try + * pushing one of the existing ones... if none of + * them are able to be pushed, we'll switch + * to the sparse cluster mechanism + * cluster_try_push updates cl_number to the + * number of remaining clusters... 
and + * returns the number of currently unused clusters + */ + ret_cluster_try_push = 0; + + /* + * if writes are not deferred, call cluster push immediately + */ + if (defer_writes == FALSE) { + + ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated); + } + /* + * execute following regardless of writes being deferred or not + */ + if (ret_cluster_try_push == 0) { + /* + * no more room in the normal cluster mechanism + * so let's switch to the more expansive but expensive + * sparse mechanism.... + */ + sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated); + sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated); + + lck_mtx_unlock(&wbp->cl_lockw); + return; + } +start_new_cluster: + wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr; + wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr; + + wbp->cl_clusters[wbp->cl_number].io_flags = 0; + + if (flags & IO_NOCACHE) + wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE; + + if (flags & IO_PASSIVE) + wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE; + + wbp->cl_number++; +delay_io: + lck_mtx_unlock(&wbp->cl_lockw); + return; +} + + static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg) @@ -3005,9 +3289,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old int write_cnt = 0; boolean_t first_pass = FALSE; struct cl_extent cl; - struct cl_writebehind *wbp; int bflag; - u_int max_cluster_pgcount; u_int max_io_size; if (uio) { @@ -3036,7 +3318,6 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old zero_off = 0; zero_off1 = 0; - max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE); if 
(flags & IO_HEADZEROFILL) { @@ -3293,7 +3574,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested); if (retval) { - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, upl, 0, 0, retval, 0); @@ -3318,20 +3599,15 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old io_offset += bytes_to_zero; } if (retval == 0) { - int cl_index; - int ret_cluster_try_push; int do_zeroing = 1; - io_size += start_offset; - /* Force more restrictive zeroing behavior only on APFS */ if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) { do_zeroing = 0; } - if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) { /* @@ -3370,269 +3646,28 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); if (flags & IO_SYNC) { - /* - * if the IO_SYNC flag is set than we need to - * bypass any clusters and immediately issue - * the I/O - */ - goto issue_io; - } - /* - * take the lock to protect our accesses - * of the writebehind and sparse cluster state - */ - wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); - - if (wbp->cl_scmap) { - - if ( !(flags & IO_NOCACHE)) { - /* - * we've fallen into the sparse - * cluster method of delaying dirty pages - */ - sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); - - lck_mtx_unlock(&wbp->cl_lockw); - - continue; - } - /* - * must have done cached writes that fell into - * the sparse cluster mechanism... 
we've switched - * to uncached writes on the file, so go ahead - * and push whatever's in the sparse map - * and switch back to normal clustering - */ - wbp->cl_number = 0; - - sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); /* - * no clusters of either type present at this point - * so just go directly to start_new_cluster since - * we know we need to delay this I/O since we've - * already released the pages back into the cache - * to avoid the deadlock with sparse_cluster_push - */ - goto start_new_cluster; - } - if (first_pass) { - if (write_off == wbp->cl_last_write) - wbp->cl_seq_written += write_cnt; - else - wbp->cl_seq_written = write_cnt; - - wbp->cl_last_write = write_off + write_cnt; - - first_pass = FALSE; - } - if (wbp->cl_number == 0) - /* - * no clusters currently present - */ - goto start_new_cluster; - - for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { - /* - * check each cluster that we currently hold - * try to merge some or all of this write into - * one or more of the existing clusters... if - * any portion of the write remains, start a - * new cluster - */ - if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) { - /* - * the current write starts at or after the current cluster - */ - if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { - /* - * we have a write that fits entirely - * within the existing cluster limits - */ - if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) - /* - * update our idea of where the cluster ends - */ - wbp->cl_clusters[cl_index].e_addr = cl.e_addr; - break; - } - if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { - /* - * we have a write that starts in the middle of the current cluster - * but extends beyond the cluster's limit... 
we know this because - * of the previous checks - * we'll extend the current cluster to the max - * and update the b_addr for the current write to reflect that - * the head of it was absorbed into this cluster... - * note that we'll always have a leftover tail in this case since - * full absorbtion would have occurred in the clause above - */ - wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; - - cl.b_addr = wbp->cl_clusters[cl_index].e_addr; - } - /* - * we come here for the case where the current write starts - * beyond the limit of the existing cluster or we have a leftover - * tail after a partial absorbtion - * - * in either case, we'll check the remaining clusters before - * starting a new one - */ - } else { - /* - * the current write starts in front of the cluster we're currently considering - */ - if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) { - /* - * we can just merge the new request into - * this cluster and leave it in the cache - * since the resulting cluster is still - * less than the maximum allowable size - */ - wbp->cl_clusters[cl_index].b_addr = cl.b_addr; - - if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) { - /* - * the current write completely - * envelops the existing cluster and since - * each write is limited to at most max_cluster_pgcount pages - * we can just use the start and last blocknos of the write - * to generate the cluster limits - */ - wbp->cl_clusters[cl_index].e_addr = cl.e_addr; - } - break; - } - - /* - * if we were to combine this write with the current cluster - * we would exceed the cluster size limit.... so, - * let's see if there's any overlap of the new I/O with - * the cluster we're currently considering... 
in fact, we'll - * stretch the cluster out to it's full limit and see if we - * get an intersection with the current write - * - */ - if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { - /* - * the current write extends into the proposed cluster - * clip the length of the current write after first combining it's - * tail with the newly shaped cluster - */ - wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; - - cl.e_addr = wbp->cl_clusters[cl_index].b_addr; - } - /* - * if we get here, there was no way to merge - * any portion of this write with this cluster - * or we could only merge part of it which - * will leave a tail... - * we'll check the remaining clusters before starting a new one - */ - } - } - if (cl_index < wbp->cl_number) - /* - * we found an existing cluster(s) that we - * could entirely merge this I/O into - */ - goto delay_io; - - if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && - wbp->cl_number == MAX_CLUSTERS && - wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { - uint32_t n; - - if (vp->v_mount->mnt_minsaturationbytecount) { - n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp); - - if (n > MAX_CLUSTERS) - n = MAX_CLUSTERS; - } else - n = 0; - - if (n == 0) { - if (disk_conditioner_mount_is_ssd(vp->v_mount)) - n = WRITE_BEHIND_SSD; - else - n = WRITE_BEHIND; - } - while (n--) - cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL); - } - if (wbp->cl_number < MAX_CLUSTERS) { - /* - * we didn't find an existing cluster to - * merge into, but there's room to start - * a new one - */ - goto start_new_cluster; - } - /* - * no exisitng cluster to merge with and no - * room to start a new one... we'll try - * pushing one of the existing ones... if none of - * them are able to be pushed, we'll switch - * to the sparse cluster mechanism - * cluster_try_push updates cl_number to the - * number of remaining clusters... 
and - * returns the number of currently unused clusters - */ - ret_cluster_try_push = 0; - - /* - * if writes are not deferred, call cluster push immediately - */ - if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - - ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL); - } - - /* - * execute following regardless of writes being deferred or not - */ - if (ret_cluster_try_push == 0) { - /* - * no more room in the normal cluster mechanism - * so let's switch to the more expansive but expensive - * sparse mechanism.... + * if the IO_SYNC flag is set than we need to bypass + * any clustering and immediately issue the I/O + * + * we don't hold the lock at this point + * + * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set + * so that we correctly deal with a change in state of the hardware modify bit... + * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force + * cluster_push_now to wait until all the I/Os have completed... 
cluster_push_now is also + * responsible for generating the correct sized I/O(s) */ - sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); - sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); + retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE); + } else { + boolean_t defer_writes = FALSE; - lck_mtx_unlock(&wbp->cl_lockw); + if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) + defer_writes = TRUE; - continue; + cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass, + write_off, write_cnt, newEOF, callback, callback_arg, FALSE); } -start_new_cluster: - wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; - wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; - - wbp->cl_clusters[wbp->cl_number].io_flags = 0; - - if (flags & IO_NOCACHE) - wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE; - - if (bflag & CL_PASSIVE) - wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE; - - wbp->cl_number++; -delay_io: - lck_mtx_unlock(&wbp->cl_lockw); - - continue; -issue_io: - /* - * we don't hold the lock at this point - * - * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set - * so that we correctly deal with a change in state of the hardware modify bit... - * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force - * cluster_push_now to wait until all the I/Os have completed... 
cluster_push_now is also - * responsible for generating the correct sized I/O(s) - */ - retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg); } } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0); @@ -4368,7 +4403,6 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t max_rd_size; u_int32_t max_rd_ahead; u_int32_t max_vector_size; - boolean_t strict_uncached_IO = FALSE; boolean_t io_throttled = FALSE; u_int32_t vector_upl_iosize = 0; @@ -4433,8 +4467,6 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, devblocksize = PAGE_SIZE; } - strict_uncached_IO = ubc_strict_uncached_IO(vp); - orig_iov_base = uio_curriovbase(uio); last_iov_base = orig_iov_base; @@ -4512,7 +4544,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * cluster_copy_ubc_data returns the resid * in io_size */ - if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { + if ((flags & IO_ENCRYPTED) == 0) { retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); } /* @@ -4602,7 +4634,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * Don't re-check the UBC data if we are looking for uncached IO * or asking for encrypted blocks. */ - if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { + if ((flags & IO_ENCRYPTED) == 0) { if ((xsize = io_size) > max_rd_size) xsize = max_rd_size; @@ -4865,7 +4897,16 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, * we couldn't handle the tail of this request in DIRECT mode * so fire it through the copy path */ - retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg); + if (flags & IO_ENCRYPTED) { + /* + * We cannot fall back to the copy path for encrypted I/O. If this + * happens, there is something wrong with the user buffer passed + * down. 
+ */ + retval = EFAULT; + } else { + retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg); + } *read_type = IO_UNKNOWN; } @@ -5371,6 +5412,7 @@ cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca int retval; int my_sparse_wait = 0; struct cl_writebehind *wbp; + int local_err = 0; if (err) *err = 0; @@ -5440,22 +5482,35 @@ cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca lck_mtx_unlock(&wbp->cl_lockw); - retval = sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); + retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE); lck_mtx_lock(&wbp->cl_lockw); wbp->cl_sparse_pushes--; + + if (retval) { + if (wbp->cl_scmap != NULL) { + panic("cluster_push_err: Expected NULL cl_scmap\n"); + } + + wbp->cl_scmap = scmap; + } if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) wakeup((caddr_t)&wbp->cl_sparse_pushes); } else { - retval = sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); + retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE); } + + local_err = retval; + if (err) *err = retval; retval = 1; } else { - retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err); + retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE); + if (err) + *err = local_err; } lck_mtx_unlock(&wbp->cl_lockw); @@ -5476,7 +5531,7 @@ cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca lck_mtx_unlock(&wbp->cl_lockw); } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, - wbp->cl_scmap, wbp->cl_number, retval, 0, 0); + wbp->cl_scmap, wbp->cl_number, retval, local_err, 0); return (retval); } @@ -5516,7 +5571,7 @@ cluster_release(struct ubc_info *ubc) static int 
-cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err) +cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated) { int cl_index; int cl_index1; @@ -5597,6 +5652,9 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla goto dont_try; } } + if (vm_initiated == TRUE) + lck_mtx_unlock(&wbp->cl_lockw); + for (cl_index = 0; cl_index < cl_len; cl_index++) { int flags; struct cl_extent cl; @@ -5619,19 +5677,23 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla cl.b_addr = l_clusters[cl_index].b_addr; cl.e_addr = l_clusters[cl_index].e_addr; - retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); - - if (error == 0 && retval) - error = retval; + retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated); - l_clusters[cl_index].b_addr = 0; - l_clusters[cl_index].e_addr = 0; + if (retval == 0) { + cl_pushed++; - cl_pushed++; + l_clusters[cl_index].b_addr = 0; + l_clusters[cl_index].e_addr = 0; + } else if (error == 0) { + error = retval; + } if ( !(push_flag & PUSH_ALL) ) break; } + if (vm_initiated == TRUE) + lck_mtx_lock(&wbp->cl_lockw); + if (err) *err = error; @@ -5651,7 +5713,7 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla * * collect the active public clusters... 
*/ - sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); + sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated); for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) { if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) @@ -5671,7 +5733,7 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla * and collect the original clusters that were moved into the * local storage for sorting purposes */ - sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); + sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated); } else { /* @@ -5701,7 +5763,8 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla static int -cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg) +cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, + int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated) { upl_page_info_t *pl; upl_t upl; @@ -5758,6 +5821,13 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c } else size = upl_size; + + if (vm_initiated) { + vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size, + UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error); + + return (error); + } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0); /* @@ -5868,7 +5938,7 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c size -= io_size; } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0); return(error); } @@ -5877,12 +5947,13 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c /* * sparse_cluster_switch is called with the write behind lock held */ -static void -sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, 
off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) +static int +sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated) { int cl_index; + int error; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0); for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { int flags; @@ -5894,14 +5965,20 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c if (flags & UPL_POP_DIRTY) { cl.e_addr = cl.b_addr + 1; - sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg); + error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated); + + if (error) { + break; + } } } } } - wbp->cl_number = 0; + wbp->cl_number -= cl_index; + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0); + return error; } @@ -5911,11 +5988,13 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c * from the write-behind context (the cluster_push case), the wb lock is not held */ static int -sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) +sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag, + int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated) { struct cl_extent cl; off_t offset; u_int length; + void *l_scmap; int error = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0); @@ -5923,22 +6002,44 @@ sparse_cluster_push(void **scmap, 
vnode_t vp, off_t EOF, int push_flag, int io_f if (push_flag & PUSH_ALL) vfs_drt_control(scmap, 1); + l_scmap = *scmap; + for (;;) { int retval; + if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) break; + if (vm_initiated == TRUE) + lck_mtx_unlock(&wbp->cl_lockw); + cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); - retval = cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); + retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated); if (error == 0 && retval) error = retval; - if ( !(push_flag & PUSH_ALL) ) + if (vm_initiated == TRUE) { + lck_mtx_lock(&wbp->cl_lockw); + + if (*scmap != l_scmap) + break; + } + + if (error) { + if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) { + panic("Failed to restore dirty state on failure\n"); + } + + break; + } + + if ( !(push_flag & PUSH_ALL)) { break; + } } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0); return error; } @@ -5947,12 +6048,14 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_f /* * sparse_cluster_add is called with the write behind lock held */ -static void -sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) +static int +sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, + int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated) { u_int new_dirty; u_int length; off_t offset; + int error; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0); @@ -5965,12 +6068,18 @@ sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, in * only a partial update was 
done * push out some pages and try again */ - sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); + error = sparse_cluster_push(wbp, scmap, vp, EOF, 0, 0, callback, callback_arg, vm_initiated); + + if (error) { + break; + } offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0); + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0); + + return error; } @@ -6259,7 +6368,7 @@ is_file_clean(vnode_t vp, off_t filesize) * single hashtable entry. Each hashtable entry is aligned to this * size within the file. */ -#define DRT_BITVECTOR_PAGES ((1024 * 1024) / PAGE_SIZE) +#define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE) /* * File offset handling. @@ -6306,6 +6415,7 @@ is_file_clean(vnode_t vp, off_t filesize) } while(0); +#if CONFIG_EMBEDDED /* * Hash table moduli. * @@ -6314,13 +6424,14 @@ is_file_clean(vnode_t vp, off_t filesize) * both being prime and fitting within the desired allocation * size, these values need to be manually determined. * - * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes. + * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes. * - * The small hashtable allocation is 1024 bytes, so the modulus is 23. - * The large hashtable allocation is 16384 bytes, so the modulus is 401. + * The small hashtable allocation is 4096 bytes, so the modulus is 251. + * The large hashtable allocation is 32768 bytes, so the modulus is 2039. */ -#define DRT_HASH_SMALL_MODULUS 23 -#define DRT_HASH_LARGE_MODULUS 401 + +#define DRT_HASH_SMALL_MODULUS 251 +#define DRT_HASH_LARGE_MODULUS 2039 /* * Physical memory required before the large hash modulus is permitted. 
@@ -6330,11 +6441,58 @@ is_file_clean(vnode_t vp, off_t filesize) */ #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */ -#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ -#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ +#define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */ +#define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */ + +#else +/* + * Hash table moduli. + * + * Since the hashtable entry's size is dependent on the size of + * the bitvector, and since the hashtable size is constrained to + * both being prime and fitting within the desired allocation + * size, these values need to be manually determined. + * + * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes. + * + * The small hashtable allocation is 16384 bytes, so the modulus is 1019. + * The large hashtable allocation is 131072 bytes, so the modulus is 8179. + */ + +#define DRT_HASH_SMALL_MODULUS 1019 +#define DRT_HASH_LARGE_MODULUS 8179 + +/* + * Physical memory required before the large hash modulus is permitted. + * + * On small memory systems, the large hash modulus can lead to phsyical + * memory starvation, so we avoid using it there. + */ +#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */ + +#define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */ +#define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */ + +#endif /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */ +/* + * Hashtable entry. 
+ */ +struct vfs_drt_hashentry { + u_int64_t dhe_control; +/* +* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; +* DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE) +* Since PAGE_SIZE is only known at boot time, +* -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k) +* -declare dhe_bitvector array for largest possible length +*/ +#define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024) + u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32]; +}; + /* * Hashtable bitvector handling. * @@ -6351,30 +6509,12 @@ is_file_clean(vnode_t vp, off_t filesize) ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32))) #define DRT_BITVECTOR_CLEAR(scm, i) \ - bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) + bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \ bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \ &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \ - (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) - - - -/* - * Hashtable entry. - */ -struct vfs_drt_hashentry { - u_int64_t dhe_control; -/* -* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; -* DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE) -* Since PAGE_SIZE is only known at boot time, -* -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k) -* -declare dhe_bitvector array for largest possible length -*/ -#define MAX_DRT_BITVECTOR_PAGES (1024 * 1024)/( 4 * 1024) - u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32]; -}; + (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) /* * Dirty Region Tracking structure. 
@@ -6754,12 +6894,17 @@ vfs_drt_do_mark_pages( for (i = 0; i < pgcount; i++) { if (dirty) { if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { + if (ecount >= DRT_BITVECTOR_PAGES) + panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i); DRT_HASH_SET_BIT(cmap, index, pgoff + i); ecount++; setcount++; } } else { if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { + if (ecount <= 0) + panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i); + assert(ecount > 0); DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i); ecount--; setcount++; @@ -6870,7 +7015,8 @@ vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp) } if (fs == -1) { /* didn't find any bits set */ - panic("vfs_drt: entry summary count > 0 but no bits set in map"); + panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld", + cmap, index, DRT_HASH_GET_COUNT(cmap, index)); } for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) { if (!DRT_HASH_TEST_BIT(cmap, index, i)) diff --git a/bsd/vfs/vfs_cprotect.c b/bsd/vfs/vfs_cprotect.c index ff53ee1a3..26af78a75 100644 --- a/bsd/vfs/vfs_cprotect.c +++ b/bsd/vfs/vfs_cprotect.c @@ -33,6 +33,9 @@ #include #include #include +//for write protection +#include +#include #define PTR_ADD(type, base, offset) (type)((uintptr_t)(base) + (offset)) @@ -54,7 +57,10 @@ enum { // Using AES IV context generated from key CPX_IV_AES_CTX_VFS = 0x08, CPX_SYNTHETIC_OFFSET_FOR_IV = 0x10, - CPX_COMPOSITEKEY = 0x20 + CPX_COMPOSITEKEY = 0x20, + + //write page protection + CPX_WRITE_PROTECTABLE = 0x40 }; struct cpx { @@ -88,21 +94,39 @@ size_t cpx_sizex(const struct cpx *cpx) cpx_t cpx_alloc(size_t key_len) { - cpx_t cpx; + cpx_t cpx = NULL; -#if TARGET_OS_OSX +#if CONFIG_KEYPAGE_WP /* * Macs only use 1 key per volume, so force it into its own page. * This way, we can write-protect as needed. 
*/ size_t cpsize = cpx_size (key_len); if (cpsize < PAGE_SIZE) { - MALLOC(cpx, cpx_t, PAGE_SIZE, M_TEMP, M_WAITOK); + /* + * Don't use MALLOC to allocate the page-sized structure. Instead, + * use kmem_alloc to bypass KASAN since we are supplying our own + * unilateral write protection on this page. Note that kmem_alloc + * can block. + */ + if (kmem_alloc (kernel_map, (vm_offset_t *)&cpx, PAGE_SIZE, VM_KERN_MEMORY_FILE)) { + /* + * returning NULL at this point (due to failed allocation) would just + * result in a panic. fall back to attempting a normal MALLOC, and don't + * let the cpx get marked PROTECTABLE. + */ + MALLOC(cpx, cpx_t, cpx_size(key_len), M_TEMP, M_WAITOK); + } + else { + //mark the page as protectable, since kmem_alloc succeeded. + cpx->cpx_flags |= CPX_WRITE_PROTECTABLE; + } } else { panic ("cpx_size too large ! (%lu)", cpsize); } #else + /* If key page write protection disabled, just switch to kernel MALLOC */ MALLOC(cpx, cpx_t, cpx_size(key_len), M_TEMP, M_WAITOK); #endif cpx_init(cpx, key_len); @@ -113,10 +137,12 @@ cpx_t cpx_alloc(size_t key_len) /* this is really a void function */ void cpx_writeprotect (cpx_t cpx) { -#if TARGET_OS_OSX +#if CONFIG_KEYPAGE_WP void *cpxstart = (void*)cpx; void *cpxend = (void*)((uint8_t*)cpx + PAGE_SIZE); - vm_map_protect (kernel_map, cpxstart, cpxend, (VM_PROT_READ), FALSE); + if (cpx->cpx_flags & CPX_WRITE_PROTECTABLE) { + vm_map_protect (kernel_map, (vm_map_offset_t)cpxstart, (vm_map_offset_t)cpxend, (VM_PROT_READ), FALSE); + } #else (void) cpx; #endif @@ -136,15 +162,26 @@ void cpx_free(cpx_t cpx) assert(*PTR_ADD(uint32_t *, cpx, cpx_sizex(cpx) - 4) == cpx_magic2); #endif -#if TARGET_OS_OSX +#if CONFIG_KEYPAGE_WP /* unprotect the page before bzeroing */ void *cpxstart = (void*)cpx; - void *cpxend = (void*)((uint8_t*)cpx + PAGE_SIZE); - vm_map_protect (kernel_map, cpxstart, cpxend, (VM_PROT_DEFAULT), FALSE); -#endif + void *cpxend = (void*)((uint8_t*)cpx + PAGE_SIZE); + if (cpx->cpx_flags & 
CPX_WRITE_PROTECTABLE) { + vm_map_protect (kernel_map, (vm_map_offset_t)cpxstart, (vm_map_offset_t)cpxend, (VM_PROT_DEFAULT), FALSE); + //now zero the memory after un-protecting it + bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len); + + //If we are here, then we used kmem_alloc to get the page. Must use kmem_free to drop it. + kmem_free(kernel_map, (vm_offset_t)cpx, PAGE_SIZE); + return; + } +#else bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len); FREE(cpx, M_TEMP); + return; +#endif + } void cpx_init(cpx_t cpx, size_t key_len) diff --git a/bsd/vfs/vfs_disk_conditioner.c b/bsd/vfs/vfs_disk_conditioner.c index 8cc7237c6..79872204b 100644 --- a/bsd/vfs/vfs_disk_conditioner.c +++ b/bsd/vfs/vfs_disk_conditioner.c @@ -52,12 +52,19 @@ // idle period until assumed disk spin down #define DISK_IDLE_SEC (10 * 60) +struct saved_mount_fields { + uint32_t mnt_maxreadcnt; /* Max. byte count for read */ + uint32_t mnt_maxwritecnt; /* Max. byte count for write */ + uint32_t mnt_segreadcnt; /* Max. segment count for read */ + uint32_t mnt_segwritecnt; /* Max. segment count for write */ + uint32_t mnt_ioqueue_depth; /* the maxiumum number of commands a device can accept */ + uint32_t mnt_ioscale; /* scale the various throttles/limits imposed on the amount of I/O in flight */ +}; + struct _disk_conditioner_info_t { - boolean_t enabled; // if other fields have any effect - uint64_t access_time_usec; // maximum latency before an I/O transfer begins - uint64_t read_throughput_mbps; // throughput of an I/O read - uint64_t write_throughput_mbps; // throughput of an I/O write - boolean_t is_ssd; // behave like an SSD (for both conditioning and affecting behavior in other parts of VFS) + disk_conditioner_info dcinfo; // all the original data from fsctl + struct saved_mount_fields mnt_fields; // fields to restore in mount_t when conditioner is disabled + daddr64_t last_blkno; // approx. 
last transfered block for simulating seek times struct timeval last_io_timestamp; // the last time an I/O completed }; @@ -85,25 +92,33 @@ disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_e daddr64_t blkdiff; daddr64_t last_blkno; double access_time_scale; - struct _disk_conditioner_info_t *info = NULL; + struct _disk_conditioner_info_t *internal_info = NULL; + disk_conditioner_info *info = NULL; struct timeval elapsed; struct timeval start; + vnode_t vp; - mp = buf_vnode(bp)->v_mount; + vp = buf_vnode(bp); + if (!vp) { + return; + } + + mp = vp->v_mount; if (!mp) { return; } - info = mp->mnt_disk_conditioner_info; - if (!info || !info->enabled) { + internal_info = mp->mnt_disk_conditioner_info; + if (!internal_info || !internal_info->dcinfo.enabled) { return; } + info = &(internal_info->dcinfo); if (!info->is_ssd) { // calculate approximate seek time based on difference in block number - last_blkno = info->last_blkno; + last_blkno = internal_info->last_blkno; blkdiff = bp->b_blkno > last_blkno ? 
bp->b_blkno - last_blkno : last_blkno - bp->b_blkno; - info->last_blkno = bp->b_blkno + bp->b_bcount; + internal_info->last_blkno = bp->b_blkno + bp->b_bcount; } else { blkdiff = BLK_MAX(mp); } @@ -122,15 +137,15 @@ disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_e // try simulating disk spinup based on time since last I/O if (!info->is_ssd) { microuptime(&elapsed); - timevalsub(&elapsed, &info->last_io_timestamp); + timevalsub(&elapsed, &internal_info->last_io_timestamp); // avoid this delay right after boot (assuming last_io_timestamp is 0 and disk is already spinning) - if (elapsed.tv_sec > DISK_IDLE_SEC && info->last_io_timestamp.tv_sec != 0) { + if (elapsed.tv_sec > DISK_IDLE_SEC && internal_info->last_io_timestamp.tv_sec != 0) { delay_usec += DISK_SPINUP_SEC * USEC_PER_SEC; } } if (delay_usec <= already_elapsed_usec) { - microuptime(&info->last_io_timestamp); + microuptime(&internal_info->last_io_timestamp); return; } @@ -153,7 +168,7 @@ disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_e } } - microuptime(&info->last_io_timestamp); + microuptime(&internal_info->last_io_timestamp); } int @@ -167,23 +182,29 @@ disk_conditioner_get_info(mount_t mp, disk_conditioner_info *uinfo) info = mp->mnt_disk_conditioner_info; - if (!info) { - return 0; + if (info) { + memcpy(uinfo, &(info->dcinfo), sizeof(disk_conditioner_info)); } - uinfo->enabled = info->enabled; - uinfo->access_time_usec = info->access_time_usec; - uinfo->read_throughput_mbps = info->read_throughput_mbps; - uinfo->write_throughput_mbps = info->write_throughput_mbps; - uinfo->is_ssd = info->is_ssd; - return 0; } +static inline void +disk_conditioner_restore_mount_fields(mount_t mp, struct saved_mount_fields *mnt_fields) { + mp->mnt_maxreadcnt = mnt_fields->mnt_maxreadcnt; + mp->mnt_maxwritecnt = mnt_fields->mnt_maxwritecnt; + mp->mnt_segreadcnt = mnt_fields->mnt_segreadcnt; + mp->mnt_segwritecnt = mnt_fields->mnt_segwritecnt; + 
mp->mnt_ioqueue_depth = mnt_fields->mnt_ioqueue_depth; + mp->mnt_ioscale = mnt_fields->mnt_ioscale; +} + int disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo) { - struct _disk_conditioner_info_t *info; + struct _disk_conditioner_info_t *internal_info; + disk_conditioner_info *info; + struct saved_mount_fields *mnt_fields; if (!kauth_cred_issuser(kauth_cred_get()) || !IOTaskHasEntitlement(current_task(), DISK_CONDITIONER_SET_ENTITLEMENT)) { return EPERM; @@ -193,18 +214,62 @@ disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo) return EINVAL; } - info = mp->mnt_disk_conditioner_info; - if (!info) { - info = mp->mnt_disk_conditioner_info = kalloc(sizeof(struct _disk_conditioner_info_t)); - bzero(info, sizeof(struct _disk_conditioner_info_t)); + mount_lock(mp); + + internal_info = mp->mnt_disk_conditioner_info; + if (!internal_info) { + internal_info = mp->mnt_disk_conditioner_info = kalloc(sizeof(struct _disk_conditioner_info_t)); + bzero(internal_info, sizeof(struct _disk_conditioner_info_t)); + mnt_fields = &(internal_info->mnt_fields); + + /* save mount_t fields for restoration later */ + mnt_fields->mnt_maxreadcnt = mp->mnt_maxreadcnt; + mnt_fields->mnt_maxwritecnt = mp->mnt_maxwritecnt; + mnt_fields->mnt_segreadcnt = mp->mnt_segreadcnt; + mnt_fields->mnt_segwritecnt = mp->mnt_segwritecnt; + mnt_fields->mnt_ioqueue_depth = mp->mnt_ioqueue_depth; + mnt_fields->mnt_ioscale = mp->mnt_ioscale; + } + + info = &(internal_info->dcinfo); + mnt_fields = &(internal_info->mnt_fields); + + if (!uinfo->enabled && info->enabled) { + /* disk conditioner is being disabled when already enabled */ + disk_conditioner_restore_mount_fields(mp, mnt_fields); + } + + memcpy(info, uinfo, sizeof(disk_conditioner_info)); + + /* scale back based on hardware advertised limits */ + if (uinfo->ioqueue_depth == 0 || uinfo->ioqueue_depth > mnt_fields->mnt_ioqueue_depth) { + info->ioqueue_depth = mnt_fields->mnt_ioqueue_depth; + } + if (uinfo->maxreadcnt == 0 || 
uinfo->maxreadcnt > mnt_fields->mnt_maxreadcnt) { + info->maxreadcnt = mnt_fields->mnt_maxreadcnt; + } + if (uinfo->maxwritecnt == 0 || uinfo->maxwritecnt > mnt_fields->mnt_maxwritecnt) { + info->maxwritecnt = mnt_fields->mnt_maxwritecnt; + } + if (uinfo->segreadcnt == 0 || uinfo->segreadcnt > mnt_fields->mnt_segreadcnt) { + info->segreadcnt = mnt_fields->mnt_segreadcnt; } + if (uinfo->segwritecnt == 0 || uinfo->segwritecnt > mnt_fields->mnt_segwritecnt) { + info->segwritecnt = mnt_fields->mnt_segwritecnt; + } + + if (uinfo->enabled) { + mp->mnt_maxreadcnt = info->maxreadcnt; + mp->mnt_maxwritecnt = info->maxwritecnt; + mp->mnt_segreadcnt = info->segreadcnt; + mp->mnt_segwritecnt = info->segwritecnt; + mp->mnt_ioqueue_depth = info->ioqueue_depth; + mp->mnt_ioscale = MNT_IOSCALE(info->ioqueue_depth); + } + + mount_unlock(mp); - info->enabled = uinfo->enabled; - info->access_time_usec = uinfo->access_time_usec; - info->read_throughput_mbps = uinfo->read_throughput_mbps; - info->write_throughput_mbps = uinfo->write_throughput_mbps; - info->is_ssd = uinfo->is_ssd; - microuptime(&info->last_io_timestamp); + microuptime(&internal_info->last_io_timestamp); // make sure throttling picks up the new periods throttle_info_mount_reset_period(mp, info->is_ssd); @@ -215,21 +280,27 @@ disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo) void disk_conditioner_unmount(mount_t mp) { - if (!mp->mnt_disk_conditioner_info) { + struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info; + + if (!internal_info) { return; } - kfree(mp->mnt_disk_conditioner_info, sizeof(struct _disk_conditioner_info_t)); + + if (internal_info->dcinfo.enabled) { + disk_conditioner_restore_mount_fields(mp, &(internal_info->mnt_fields)); + } mp->mnt_disk_conditioner_info = NULL; + kfree(internal_info, sizeof(struct _disk_conditioner_info_t)); } boolean_t disk_conditioner_mount_is_ssd(mount_t mp) { - struct _disk_conditioner_info_t *info = mp->mnt_disk_conditioner_info; + 
struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info; - if (!info || !info->enabled) { - return (mp->mnt_kern_flag & MNTK_SSD); + if (!internal_info || !internal_info->dcinfo.enabled) { + return !!(mp->mnt_kern_flag & MNTK_SSD); } - return info->is_ssd; + return internal_info->dcinfo.is_ssd; } diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index f2e6b0bc3..5b8eac30e 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -1303,11 +1303,11 @@ copy_out_kfse(fs_event_watcher *watcher, kfs_event *kfse, struct uio *uio) return 0; } - if (kfse->type == FSE_RENAME && kfse->dest == NULL) { + if (((kfse->type == FSE_RENAME) || (kfse->type == FSE_CLONE)) && kfse->dest == NULL) { // // This can happen if an event gets recycled but we had a // pointer to it in our event queue. The event is the - // destination of a rename which we'll process separately + // destination of a rename or clone which we'll process separately // (that is, another kfse points to this one so it's ok // to skip this guy because we'll process it when we process // the other one) @@ -1967,7 +1967,7 @@ filt_fsevent(struct knote *kn, long hint) switch(kn->kn_filter) { case EVFILT_READ: kn->kn_data = amt; - + if (kn->kn_data != 0) { activate = 1; } @@ -2001,8 +2001,6 @@ filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev) /* accept new fflags/data as saved */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* restrict the current results to the (smaller?) set of new interest */ /* @@ -2079,8 +2077,6 @@ fseventsf_drain(struct fileproc *fp, __unused vfs_context_t ctx) int counter = 0; fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data; - fseh->watcher->flags |= WATCHER_CLOSING; - // if there are people still waiting, sleep for 10ms to // let them clean up and get out of there. 
however we // also don't want to get stuck forever so if they don't diff --git a/bsd/vfs/vfs_fslog.c b/bsd/vfs/vfs_fslog.c index 6dbd62b93..87db6067f 100644 --- a/bsd/vfs/vfs_fslog.c +++ b/bsd/vfs/vfs_fslog.c @@ -75,7 +75,7 @@ fslog_extmod_msgtracer(proc_t caller, proc_t target) strlcat(c_name, "(", sizeof(c_name)); strlcat(c_name, uuidstr, sizeof(c_name)); strlcat(c_name, ")", sizeof(c_name)); - if (0 != escape_str(c_name, strlen(c_name), sizeof(c_name))) { + if (0 != escape_str(c_name, strlen(c_name) + 1, sizeof(c_name))) { return; } @@ -84,7 +84,7 @@ fslog_extmod_msgtracer(proc_t caller, proc_t target) strlcat(t_name, "(", sizeof(t_name)); strlcat(t_name, uuidstr, sizeof(t_name)); strlcat(t_name, ")", sizeof(t_name)); - if (0 != escape_str(t_name, strlen(t_name), sizeof(t_name))) { + if (0 != escape_str(t_name, strlen(t_name) + 1, sizeof(t_name))) { return; } #if DEBUG diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 55b86f9e6..ccee2e1c5 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -105,8 +105,6 @@ #define VOLFS_MIN_PATH_LEN 9 -static void kdebug_lookup(struct vnode *dp, struct componentname *cnp); - #if CONFIG_VOLFS static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx); #define MAX_VOLFS_RESTARTS 5 @@ -1746,24 +1744,33 @@ nameidone(struct nameidata *ndp) #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) void -kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t lookup) +kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp, uint32_t flags) { int code; unsigned int i; + bool lookup = flags & KDBG_VFS_LOOKUP_FLAG_LOOKUP; + bool noprocfilt = flags & KDBG_VFS_LOOKUP_FLAG_NOPROCFILT; /* * In the event that we collect multiple, consecutive pathname * entries, we must mark the start of the path's string and the end. 
*/ - if (lookup == TRUE) + if (lookup) { code = VFS_LOOKUP | DBG_FUNC_START; - else + } else { code = VFS_LOOKUP_DONE | DBG_FUNC_START; + } if (dbg_namelen <= (int)(3 * sizeof(long))) code |= DBG_FUNC_END; - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, kdebug_vnode(dp), dbg_parms[0], dbg_parms[1], dbg_parms[2], 0); + if (noprocfilt) { + KDBG_RELEASE_NOPROCFILT(code, kdebug_vnode(dp), dbg_parms[0], + dbg_parms[1], dbg_parms[2]); + } else { + KDBG_RELEASE(code, kdebug_vnode(dp), dbg_parms[0], dbg_parms[1], + dbg_parms[2]); + } code &= ~DBG_FUNC_START; @@ -1771,11 +1778,25 @@ kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t l if (dbg_namelen <= (int)(4 * sizeof(long))) code |= DBG_FUNC_END; - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, dbg_parms[i], dbg_parms[i+1], dbg_parms[i+2], dbg_parms[i+3], 0); + if (noprocfilt) { + KDBG_RELEASE_NOPROCFILT(code, dbg_parms[i], dbg_parms[i + 1], + dbg_parms[i + 2], dbg_parms[i + 3]); + } else { + KDBG_RELEASE(code, dbg_parms[i], dbg_parms[i + 1], dbg_parms[i + 2], + dbg_parms[i + 3]); + } } } -static void +void +kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, + boolean_t lookup) +{ + kdebug_vfs_lookup(dbg_parms, dbg_namelen, dp, + lookup ? KDBG_VFS_LOOKUP_FLAG_LOOKUP : 0); +} + +void kdebug_lookup(vnode_t dp, struct componentname *cnp) { int dbg_namelen; @@ -1799,13 +1820,15 @@ kdebug_lookup(vnode_t dp, struct componentname *cnp) *(cnp->cn_nameptr + cnp->cn_namelen) ? 
'>' : 0, sizeof(dbg_parms) - dbg_namelen); } - kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)dp, TRUE); -} + kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)dp, + KDBG_VFS_LOOKUP_FLAG_LOOKUP); +} #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */ void -kdebug_lookup_gen_events(long *dbg_parms __unused, int dbg_namelen __unused, void *dp __unused) +kdebug_vfs_lookup(long *dbg_parms __unused, int dbg_namelen __unused, + void *dp __unused, __unused uint32_t flags) { } diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 4da9d4535..e1d18c7c3 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2017 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -3326,7 +3326,7 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) temp = MNT_DEFAULT_IOQUEUE_DEPTH; mp->mnt_ioqueue_depth = temp; - mp->mnt_ioscale = (mp->mnt_ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH; + mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth); if (mp->mnt_ioscale > 1) printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale); @@ -3782,8 +3782,6 @@ filt_fstouch(struct knote *kn, struct kevent_internal_s *kev) lck_mtx_lock(fs_klist_lock); kn->kn_sfflags = kev->fflags; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; /* * the above filter function sets bits even if nobody is looking for them. 
@@ -3919,7 +3917,7 @@ SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY, SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &maxvfstypenum, 0, ""); -SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout, 0, ""); +SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, ""); SYSCTL_NODE(_vfs_generic, VFS_CONF, conf, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_vfs_generic_conf, ""); @@ -5133,12 +5131,17 @@ vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp, ut = get_bsdthread_info(current_thread()); if ((current_proc()->p_lflag & P_LRAGE_VNODES) || - (ut->uu_flag & UT_RAGE_VNODES)) { + (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) { /* * process has indicated that it wants any * vnodes created on its behalf to be rapidly * aged to reduce the impact on the cached set * of vnodes + * + * if UT_KERN_RAGE_VNODES is set, then the + * kernel internally wants vnodes to be rapidly + * aged, even if the process hasn't requested + * this */ vp->v_flag |= VRAGE; } @@ -5843,9 +5846,17 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *v if (!batched) { *vpp = (vnode_t) 0; vnode_put(vp); + vp = NULLVP; } } + /* + * For creation VNOPs, this is the equivalent of + * lookup_handle_found_vnode. 
+ */ + if (kdebug_enable && *vpp) + kdebug_lookup(*vpp, cnp); + out: vn_attribute_cleanup(vap, defaulted); @@ -6135,6 +6146,15 @@ vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentna struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved) { + + return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved); +} + +int +vn_authorize_renamex_with_paths(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, const char *from_path, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, const char *to_path, + vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved) +{ int error = 0; int moving = 0; bool swap = flags & VFS_RENAME_SWAP; @@ -6231,6 +6251,23 @@ vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentna /***** *****/ + /* + * As part of the Kauth step, we call out to allow 3rd-party + * fileop notification of "about to rename". This is needed + * in the event that 3rd-parties need to know that the DELETE + * authorization is actually part of a rename. It's important + * that we guarantee that the DELETE call-out will always be + * made if the WILL_RENAME call-out is made. Another fileop + * call-out will be performed once the operation is completed. + * We can ignore the result of kauth_authorize_fileop(). + * + * N.B. We are passing the vnode and *both* paths to each + * call; kauth_authorize_fileop() extracts the "from" path + * when posting a KAUTH_FILEOP_WILL_RENAME notification. + * As such, we only post these notifications if all of the + * information we need is provided. 
+ */ + if (swap) { kauth_action_t f = 0, t = 0; @@ -6244,9 +6281,19 @@ vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentna if (vnode_isdir(tvp)) t = KAUTH_VNODE_ADD_SUBDIRECTORY; } + if (to_path != NULL) + kauth_authorize_fileop(vfs_context_ucred(ctx), + KAUTH_FILEOP_WILL_RENAME, + (uintptr_t)fvp, + (uintptr_t)to_path); error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx); if (error) goto out; + if (from_path != NULL) + kauth_authorize_fileop(vfs_context_ucred(ctx), + KAUTH_FILEOP_WILL_RENAME, + (uintptr_t)tvp, + (uintptr_t)from_path); error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx); if (error) goto out; @@ -6278,6 +6325,11 @@ vn_authorize_renamex(struct vnode *fdvp, struct vnode *fvp, struct componentna * If fvp is a directory, and we are changing it's parent, * then we also need rights to rewrite its ".." entry as well. */ + if (to_path != NULL) + kauth_authorize_fileop(vfs_context_ucred(ctx), + KAUTH_FILEOP_WILL_RENAME, + (uintptr_t)fvp, + (uintptr_t)to_path); if (vnode_isdir(fvp)) { if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) goto out; @@ -9880,7 +9932,8 @@ static int vnode_trace_path_callback(struct vnode *vp, void *arg) { /* vn_getpath() NUL-terminates, and len includes the NUL */ if (!rv) { - kdebug_lookup_gen_events(ctx->path, len, vp, TRUE); + kdebug_vfs_lookup(ctx->path, len, vp, + KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT); if (++(ctx->count) == 1000) { thread_yield_to_preemption(); diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index dccc77bd6..767d352c6 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -174,8 +174,6 @@ static int getfsstat_callback(mount_t mp, void * arg); static int getutimes(user_addr_t usrtvp, struct timespec *tsp); static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag); static int sync_callback(mount_t, void *); -static void 
hibernate_sync_thread(void *, __unused wait_result_t); -static int hibernate_sync_async(int); static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp, int *sizep, boolean_t is_64_bit, boolean_t partial_copy); @@ -217,6 +215,13 @@ static void mount_end_update(mount_t mp); static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index); #endif /* CONFIG_IMGSRC_ACCESS */ +//snapshot functions +#if CONFIG_MNT_ROOTSNAP +static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx); +#else +static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused)); +#endif + int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t); __private_extern__ @@ -2323,8 +2328,6 @@ int syncprt = 0; #endif int print_vmpage_stat=0; -int sync_timeout = 60; // Sync time limit (sec) - static int sync_callback(mount_t mp, __unused void *arg) @@ -2358,15 +2361,64 @@ sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval return 0; } +typedef enum { + SYNC_ALL = 0, + SYNC_ONLY_RELIABLE_MEDIA = 1, + SYNC_ONLY_UNRELIABLE_MEDIA = 2 +} sync_type_t; + +static int +sync_internal_callback(mount_t mp, void *arg) +{ + if (arg) { + int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) && + (mp->mnt_flag & MNT_LOCAL); + sync_type_t sync_type = *((sync_type_t *)arg); + + if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable) + return (VFS_RETURNED); + else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable) /* compare, do not assign: '=' is always true here and would skip every reliable mount in both passes */ + return (VFS_RETURNED); + } + + (void)sync_callback(mp, NULL); + + return (VFS_RETURNED); +} + +int sync_thread_state = 0; +int sync_timeout_seconds = 5; + +#define SYNC_THREAD_RUN 0x0001 +#define SYNC_THREAD_RUNNING 0x0002 + static void -hibernate_sync_thread(void *arg, __unused wait_result_t wr) +sync_thread(__unused
void *arg, __unused wait_result_t wr) { - int *timeout = (int *) arg; + sync_type_t sync_type; - vfs_iterate(LK_NOWAIT, sync_callback, NULL); + lck_mtx_lock(sync_mtx_lck); + while (sync_thread_state & SYNC_THREAD_RUN) { + sync_thread_state &= ~SYNC_THREAD_RUN; + lck_mtx_unlock(sync_mtx_lck); + + sync_type = SYNC_ONLY_RELIABLE_MEDIA; + vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type); + sync_type = SYNC_ONLY_UNRELIABLE_MEDIA; + vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type); + + lck_mtx_lock(sync_mtx_lck); + } + /* + * This wakeup _has_ to be issued before the lock is released otherwise + * we may end up waking up a thread in sync_internal which is + * expecting a wakeup from a thread it just created and not from this + * thread which is about to exit. + */ + wakeup(&sync_thread_state); + sync_thread_state &= ~SYNC_THREAD_RUNNING; + lck_mtx_unlock(sync_mtx_lck); - if (timeout) - wakeup((caddr_t) timeout); if (print_vmpage_stat) { vm_countdirtypages(); } @@ -2377,41 +2429,52 @@ hibernate_sync_thread(void *arg, __unused wait_result_t wr) #endif /* DIAGNOSTIC */ } +struct timeval sync_timeout_last_print = {0, 0}; + /* - * Sync in a separate thread so we can time out if it blocks. + * An in-kernel sync for power management to call. + * This function always returns within sync_timeout seconds. 
*/ -static int -hibernate_sync_async(int timeout) +__private_extern__ int +sync_internal(void) { thread_t thd; int error; - struct timespec ts = {timeout, 0}; + int thread_created = FALSE; + struct timespec ts = {sync_timeout_seconds, 0}; lck_mtx_lock(sync_mtx_lck); - if (kernel_thread_start(hibernate_sync_thread, &timeout, &thd) != KERN_SUCCESS) { - printf("hibernate_sync_thread failed\n"); - lck_mtx_unlock(sync_mtx_lck); - return (0); + sync_thread_state |= SYNC_THREAD_RUN; + if (!(sync_thread_state & SYNC_THREAD_RUNNING)) { + int kr; + + sync_thread_state |= SYNC_THREAD_RUNNING; + kr = kernel_thread_start(sync_thread, NULL, &thd); + if (kr != KERN_SUCCESS) { + sync_thread_state &= ~SYNC_THREAD_RUNNING; + lck_mtx_unlock(sync_mtx_lck); + printf("sync_thread failed\n"); + return (0); + } + thread_created = TRUE; } - error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "hibernate_sync_thread", &ts); + error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck, + (PVFS | PDROP | PCATCH), "sync_thread", &ts); if (error) { - printf("sync timed out: %d sec\n", timeout); + struct timeval now; + + microtime(&now); + if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) { + printf("sync timed out: %d sec\n", sync_timeout_seconds); + sync_timeout_last_print.tv_sec = now.tv_sec; + } } - thread_deallocate(thd); - return (0); -} + if (thread_created) + thread_deallocate(thd); -/* - * An in-kernel sync for power management to call. 
- */ -__private_extern__ int -sync_internal(void) -{ - (void) hibernate_sync_async(sync_timeout); - - return 0; + return (0); } /* end of sync_internal call */ /* @@ -2422,12 +2485,12 @@ int quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval) { struct mount *mp; - int error, quota_cmd, quota_status; + int error, quota_cmd, quota_status = 0; caddr_t datap; size_t fnamelen; struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - struct dqblk my_dqblk; + struct dqblk my_dqblk = {}; AUDIT_ARG(uid, uap->uid); AUDIT_ARG(cmd, uap->cmd); @@ -3646,6 +3709,12 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, strlen(vp->v_name)) || !strncmp(vp->v_name, "mediaserverd", + strlen(vp->v_name)) || + !strncmp(vp->v_name, + "SpringBoard", + strlen(vp->v_name)) || + !strncmp(vp->v_name, + "backboardd", strlen(vp->v_name))) { /* * This file matters when launching Camera: @@ -5294,7 +5363,7 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in error = ENOMEM; goto out; } - MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK); + MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO); if (result == NULL) { error = ENOMEM; goto out; @@ -7340,6 +7409,57 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, } batched = vnode_compound_rename_available(fdvp); + +#if CONFIG_FSE + need_event = need_fsevent(FSE_RENAME, fdvp); + if (need_event) { + if (fvp) { + get_fse_info(fvp, &from_finfo, ctx); + } else { + error = vfs_get_notify_attributes(&__rename_data->fv_attr); + if (error) { + goto out1; + } + + fvap = &__rename_data->fv_attr; + } + + if (tvp) { + get_fse_info(tvp, &to_finfo, ctx); + } else if (batched) { + error = vfs_get_notify_attributes(&__rename_data->tv_attr); + if (error) { + goto out1; + } + + tvap = &__rename_data->tv_attr; + } + } +#else + need_event = 0; +#endif /* CONFIG_FSE */ + + if (need_event || kauth_authorize_fileop_has_listeners()) 
{ + if (from_name == NULL) { + GET_PATH(from_name); + if (from_name == NULL) { + error = ENOMEM; + goto out1; + } + } + + from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated); + + if (to_name == NULL) { + GET_PATH(to_name); + if (to_name == NULL) { + error = ENOMEM; + goto out1; + } + } + + to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); + } if (!fvp) { /* * Claim: this check will never reject a valid rename. @@ -7359,7 +7479,7 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, } if (!batched) { - error = vn_authorize_renamex(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, flags, NULL); + error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL); if (error) { if (error == ENOENT) { assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES); @@ -7550,56 +7670,6 @@ renameat_internal(vfs_context_t ctx, int fromfd, user_addr_t from, oparent = fvp->v_parent; skipped_lookup: -#if CONFIG_FSE - need_event = need_fsevent(FSE_RENAME, fdvp); - if (need_event) { - if (fvp) { - get_fse_info(fvp, &from_finfo, ctx); - } else { - error = vfs_get_notify_attributes(&__rename_data->fv_attr); - if (error) { - goto out1; - } - - fvap = &__rename_data->fv_attr; - } - - if (tvp) { - get_fse_info(tvp, &to_finfo, ctx); - } else if (batched) { - error = vfs_get_notify_attributes(&__rename_data->tv_attr); - if (error) { - goto out1; - } - - tvap = &__rename_data->tv_attr; - } - } -#else - need_event = 0; -#endif /* CONFIG_FSE */ - - if (need_event || kauth_authorize_fileop_has_listeners()) { - if (from_name == NULL) { - GET_PATH(from_name); - if (from_name == NULL) { - error = ENOMEM; - goto out1; - } - } - - from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated); - - if (to_name == NULL) { - GET_PATH(to_name); - if (to_name == NULL) { - error = ENOMEM; - goto out1; - } - 
} - - to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); - } error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap, tdvp, &tvp, &tond->ni_cnd, tvap, flags, ctx); @@ -8658,10 +8728,10 @@ getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval struct fileproc *fp; uio_t auio = NULL; int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32; - uint32_t count, savecount; - uint32_t newstate; + uint32_t count = 0, savecount = 0; + uint32_t newstate = 0; int error, eofflag; - uint32_t loff; + uint32_t loff = 0; struct attrlist attributelist; vfs_context_t ctx = vfs_context_current(); int fd = uap->fd; @@ -10613,7 +10683,8 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) vp = nd.ni_vp; nameidone(&nd); - if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) { + error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen); + if (error != 0) { goto out; } if (xattr_protected(attrname)) { @@ -10693,7 +10764,8 @@ fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval) file_drop(uap->fd); return(error); } - if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) { + error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen); + if (error != 0) { goto out; } if (xattr_protected(attrname)) { @@ -10739,7 +10811,8 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval) if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT)) return (EINVAL); - if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) { + error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen); + if (error != 0) { if (error == EPERM) { /* if the string won't fit in attrname, copyinstr emits EPERM */ return (ENAMETOOLONG); @@ -10798,7 +10871,8 @@ fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval) if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) return 
(EINVAL); - if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) { + error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen); + if (error != 0) { if (error == EPERM) { /* if the string won't fit in attrname, copyinstr emits EPERM */ return (ENAMETOOLONG); @@ -11096,9 +11170,9 @@ static int fsgetpath_internal( if (kdebug_enable) { long dbg_parms[NUMPARMS]; - int dbg_namelen; + int dbg_namelen; - dbg_namelen = (int)sizeof(dbg_parms); + dbg_namelen = (int)sizeof(dbg_parms); if (length < dbg_namelen) { memcpy((char *)dbg_parms, buf, length); @@ -11109,7 +11183,8 @@ static int fsgetpath_internal( memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen); } - kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE); + kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp, + KDBG_VFS_LOOKUP_FLAG_LOOKUP); } *pathlen = (user_ssize_t)length; /* may be superseded by error */ @@ -11140,7 +11215,7 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) if (uap->bufsize > PAGE_SIZE) { return (EINVAL); } - MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK); + MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO); if (realpath == NULL) { return (ENOMEM); } @@ -12031,11 +12106,11 @@ fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap, case SNAPSHOT_OP_REVERT: error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx); break; -#if !TARGET_OS_OSX +#if CONFIG_MNT_ROOTSNAP case SNAPSHOT_OP_ROOT: error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx); break; -#endif /* !TARGET_OS_OSX */ +#endif /* CONFIG_MNT_ROOTSNAP */ default: error = ENOSYS; } diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index 797573d75..6b03aa5a4 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -1795,7 +1795,7 @@ filt_vndetach(struct knote *kn) * differently than the regular case for VREG files. 
If not in poll(), * then we need to know current fileproc offset for VREG. */ -static intptr_t +static int64_t vnode_readable_data_count(vnode_t vp, off_t current_offset, int ispoll) { if (vnode_isfifo(vp)) { @@ -1803,25 +1803,25 @@ vnode_readable_data_count(vnode_t vp, off_t current_offset, int ispoll) int cnt; int err = fifo_charcount(vp, &cnt); if (err == 0) { - return (intptr_t)cnt; + return (int64_t)cnt; } else #endif { - return (intptr_t)0; + return 0; } } else if (vnode_isreg(vp)) { if (ispoll) { - return (intptr_t)1; + return 1; } off_t amount; amount = vp->v_un.vu_ubcinfo->ui_size - current_offset; - if (amount > (off_t)INTPTR_MAX) { - return INTPTR_MAX; - } else if (amount < (off_t)INTPTR_MIN) { - return INTPTR_MIN; + if (amount > INT64_MAX) { + return INT64_MAX; + } else if (amount < INT64_MIN) { + return INT64_MIN; } else { - return (intptr_t)amount; + return (int64_t)amount; } } else { panic("Should never have an EVFILT_READ except for reg or fifo."); @@ -1936,8 +1936,6 @@ filt_vntouch(struct knote *kn, struct kevent_internal_s *kev) /* accept new input fflags mask */ kn->kn_sfflags = kev->fflags; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; activate = filt_vnode_common(kn, vp, hint); diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index b47ec5553..f01d117b5 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -397,6 +397,48 @@ xattr_protected(const char *attrname) } +static void +vnode_setasnamedstream_internal(vnode_t vp, vnode_t svp) +{ + uint32_t streamflags = VISNAMEDSTREAM; + + if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) { + streamflags |= VISSHADOW; + } + + /* Tag the vnode. */ + vnode_lock_spin(svp); + svp->v_flag |= streamflags; + vnode_unlock(svp); + + /* Tag the parent so we know to flush credentials for streams on setattr */ + vnode_lock_spin(vp); + vp->v_lflag |= VL_HASSTREAMS; + vnode_unlock(vp); + + /* Make the file it's parent. 
+ * Note: This parent link helps us distinguish vnodes for + * shadow stream files from vnodes for resource fork on file + * systems that support namedstream natively (both have + * VISNAMEDSTREAM set) by allowing access to mount structure + * for checking MNTK_NAMED_STREAMS bit at many places in the + * code. + */ + vnode_update_identity(svp, vp, NULL, 0, 0, VNODE_UPDATE_NAMEDSTREAM_PARENT); + + return; +} + +errno_t +vnode_setasnamedstream(vnode_t vp, vnode_t svp) +{ + if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) + return (EINVAL); + + vnode_setasnamedstream_internal(vp, svp); + return (0); +} + #if NAMEDSTREAMS /* @@ -417,33 +459,8 @@ vnode_getnamedstream(vnode_t vp, vnode_t *svpp, const char *name, enum nsoperati } if (error == 0) { - uint32_t streamflags = VISNAMEDSTREAM; - vnode_t svp = *svpp; - - if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) { - streamflags |= VISSHADOW; - } - - /* Tag the vnode. */ - vnode_lock_spin(svp); - svp->v_flag |= streamflags; - vnode_unlock(svp); - - /* Tag the parent so we know to flush credentials for streams on setattr */ - vnode_lock_spin(vp); - vp->v_lflag |= VL_HASSTREAMS; - vnode_unlock(vp); - - /* Make the file it's parent. - * Note: This parent link helps us distinguish vnodes for - * shadow stream files from vnodes for resource fork on file - * systems that support namedstream natively (both have - * VISNAMEDSTREAM set) by allowing access to mount structure - * for checking MNTK_NAMED_STREAMS bit at many places in the - * code. - */ - vnode_update_identity(svp, vp, NULL, 0, 0, VNODE_UPDATE_PARENT); - } + vnode_setasnamedstream_internal(vp, *svpp); + } return (error); } @@ -462,34 +479,9 @@ vnode_makenamedstream(vnode_t vp, vnode_t *svpp, const char *name, int flags, vf error = default_makenamedstream(vp, svpp, name, context); if (error == 0) { - uint32_t streamflags = VISNAMEDSTREAM; - vnode_t svp = *svpp; - - /* Tag the vnode. 
*/ - if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) { - streamflags |= VISSHADOW; - } - - /* Tag the vnode. */ - vnode_lock_spin(svp); - svp->v_flag |= streamflags; - vnode_unlock(svp); - - /* Tag the parent so we know to flush credentials for streams on setattr */ - vnode_lock_spin(vp); - vp->v_lflag |= VL_HASSTREAMS; - vnode_unlock(vp); - - /* Make the file it's parent. - * Note: This parent link helps us distinguish vnodes for - * shadow stream files from vnodes for resource fork on file - * systems that support namedstream natively (both have - * VISNAMEDSTREAM set) by allowing access to mount structure - * for checking MNTK_NAMED_STREAMS bit at many places in the - * code. - */ - vnode_update_identity(svp, vp, NULL, 0, 0, VNODE_UPDATE_PARENT); + vnode_setasnamedstream_internal(vp, *svpp); } + return (error); } diff --git a/bsd/vm/vm_compressor_backing_file.c b/bsd/vm/vm_compressor_backing_file.c index 295d023fa..e54b68356 100644 --- a/bsd/vm/vm_compressor_backing_file.c +++ b/bsd/vm/vm_compressor_backing_file.c @@ -39,15 +39,20 @@ #include #include #include +#include void vm_swapfile_open(const char *path, vnode_t *vp); void vm_swapfile_close(uint64_t path, vnode_t vp); int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin); uint64_t vm_swapfile_get_blksize(vnode_t vp); uint64_t vm_swapfile_get_transfer_size(vnode_t vp); -int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags); +int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *); int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size); +#if CONFIG_FREEZE +int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget); +#endif /* CONFIG_FREEZE */ + void vm_swapfile_open(const char *path, vnode_t *vp) @@ -115,7 +120,9 @@ vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin) int error = 0; uint64_t file_size = 0; vfs_context_t ctx = NULL; - +#if CONFIG_FREEZE + 
struct vnode_attr va; +#endif /* CONFIG_FREEZE */ ctx = vfs_context_kernel(); @@ -148,6 +155,18 @@ vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin) vnode_lock_spin(vp); SET(vp->v_flag, VSWAP); vnode_unlock(vp); + +#if CONFIG_FREEZE + VATTR_INIT(&va); + VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C); + error = VNOP_SETATTR(vp, &va, ctx); + + if (error) { + printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n", error); + goto done; + } +#endif /* CONFIG_FREEZE */ + done: return error; } @@ -170,7 +189,7 @@ vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size) int -vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags) +vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone) { int error = 0; uint64_t io_size = npages * PAGE_SIZE_64; @@ -184,11 +203,13 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE; + if (upl_iodone == NULL) + upl_control_flags = UPL_IOSYNC; + #if ENCRYPTED_SWAP - upl_control_flags = UPL_IOSYNC | UPL_PAGING_ENCRYPTED; -#else - upl_control_flags = UPL_IOSYNC; + upl_control_flags |= UPL_PAGING_ENCRYPTED; #endif + if ((flags & SWAP_READ) == FALSE) { upl_create_flags |= UPL_COPYOUT_FROM; } @@ -224,6 +245,8 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag } } else { + upl_set_iodone(upl, upl_iodone); + vnode_pageout(vp, upl, 0, @@ -367,3 +390,19 @@ u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl, boolean_t route_onl return error; } + +#if CONFIG_FREEZE +int +vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget) +{ + vnode_t devvp = NULL; + vfs_context_t ctx = vfs_context_kernel(); + errno_t err = 0; + + devvp = vp->v_mount->mnt_devvp; + + err = VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, ctx); + + return err; +} +#endif /* CONFIG_FREEZE */ diff --git 
a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index df5f607ae..d3109c564 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -103,6 +103,10 @@ #include #endif +#if CONFIG_CSR +#include +#endif /* CONFIG_CSR */ + int _shared_region_map_and_slide(struct proc*, int, unsigned int, struct shared_file_mapping_np*, uint32_t, user_addr_t, user_addr_t); int shared_region_copyin_mappings(struct proc*, user_addr_t, unsigned int, struct shared_file_mapping_np *); @@ -276,12 +280,6 @@ extern int allow_stack_exec, allow_data_exec; SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, ""); SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, ""); -#if __arm64__ -extern int fourk_binary_compatibility_unsafe; -extern int fourk_binary_compatibility_allow_wx; -SYSCTL_INT(_vm, OID_AUTO, fourk_binary_compatibility_unsafe, CTLFLAG_RW | CTLFLAG_LOCKED, &fourk_binary_compatibility_unsafe, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, fourk_binary_compatibility_allow_wx, CTLFLAG_RW | CTLFLAG_LOCKED, &fourk_binary_compatibility_allow_wx, 0, ""); -#endif /* __arm64__ */ #endif /* DEVELOPMENT || DEBUG */ static const char *prot_values[] = { @@ -330,7 +328,18 @@ static char scdir_path[] = "/System/Library/Caches/com.apple.dyld/"; #endif #ifndef SECURE_KERNEL -SYSCTL_INT(_vm, OID_AUTO, enforce_shared_cache_dir, CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, ""); +static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS +{ +#if CONFIG_CSR + if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) { + printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n"); + return EPERM; + } +#endif /* CONFIG_CSR */ + return sysctl_handle_int(oidp, arg1, arg2, req); +} + +SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", ""); #endif /* These log rate throttling state variables aren't thread safe, but @@ -1759,7 +1768,7 @@ _shared_region_map_and_slide( } /* 
check that the mappings are properly covered by code signatures */ - if (!cs_enforcement(NULL)) { + if (!cs_system_enforcement()) { /* code signing is not enforced: no need to check */ } else for (i = 0; i < mappings_count; i++) { if (mappings[i].sfm_init_prot & VM_PROT_ZF) { @@ -1790,7 +1799,7 @@ _shared_region_map_and_slide( } /* get the process's shared region (setup in vm_map_exec()) */ - shared_region = vm_shared_region_get(current_task()); + shared_region = vm_shared_region_trim_and_get(current_task()); if (shared_region == NULL) { SHARED_REGION_TRACE_ERROR( ("shared_region: %p [%d(%s)] map(%p:'%s'): " @@ -1798,6 +1807,7 @@ _shared_region_map_and_slide( (void *)VM_KERNEL_ADDRPERM(current_thread()), p->p_pid, p->p_comm, (void *)VM_KERNEL_ADDRPERM(vp), vp->v_name)); + error = EINVAL; goto done; } @@ -1970,9 +1980,8 @@ extern unsigned int vm_page_free_target; SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_target, 0, "Pageout daemon free target"); -extern unsigned int vm_memory_pressure; SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED, - &vm_memory_pressure, 0, "Memory pressure indicator"); + &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator"); static int vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS @@ -1995,9 +2004,42 @@ extern unsigned int vm_page_purgeable_wired_count; SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_purgeable_wired_count, 0, "Wired purgeable page count"); -extern unsigned int vm_pageout_purged_objects; +#if DEVELOPMENT || DEBUG +extern uint64_t get_pages_grabbed_count(void); + +static int +pages_grabbed SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2, oidp) + uint64_t value = get_pages_grabbed_count(); + return SYSCTL_OUT(req, &value, sizeof(value)); +} + +SYSCTL_PROC(_vm, OID_AUTO, pages_grabbed, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, &pages_grabbed, "QU", "Total pages grabbed"); +SYSCTL_ULONG(_vm, 
OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed"); + SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED, - &vm_pageout_purged_objects, 0, "System purged object count"); + &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)"); + +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated */ +SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, + &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */ +#endif extern int madvise_free_debug; SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED, @@ -2049,34 +2091,16 @@ SYSCTL_UINT(_vm, 
OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LO SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, ""); /* pageout counts */ -extern unsigned int vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external, vm_pageout_inactive_clean, vm_pageout_speculative_clean, vm_pageout_inactive_used; -extern unsigned int vm_pageout_freed_from_inactive_clean, vm_pageout_freed_from_speculative; -SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_dirty_internal, 0, ""); -SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_dirty_external, 0, ""); -SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_clean, 0, ""); -SYSCTL_UINT(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_speculative_clean, 0, ""); -SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_used, 0, ""); -SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_inactive_clean, 0, ""); -SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_speculative, 0, ""); - -extern unsigned int vm_pageout_freed_from_cleaned; -SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_cleaned, 0, ""); - -/* counts of pages entering the cleaned queue */ -extern unsigned int vm_pageout_enqueued_cleaned, vm_pageout_enqueued_cleaned_from_inactive_dirty; -SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */ -SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned_from_inactive_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, 
&vm_pageout_enqueued_cleaned_from_inactive_dirty, 0, ""); - -/* counts of pages leaving the cleaned queue */ -extern unsigned int vm_pageout_cleaned_reclaimed, vm_pageout_cleaned_reactivated, vm_pageout_cleaned_reference_reactivated, vm_pageout_cleaned_volatile_reactivated, vm_pageout_cleaned_fault_reactivated, vm_pageout_cleaned_commit_reactivated, vm_pageout_cleaned_busy, vm_pageout_cleaned_nolock; -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reclaimed, 0, "Cleaned pages reclaimed"); -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated */ -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated"); -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated"); -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated"); -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_commit_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_commit_reactivated, 0, "Cleaned pages commit reactivated"); -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)"); -SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)"); +SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | 
CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, ""); + +SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, ""); +SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, ""); +SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, ""); +SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, ""); +SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, ""); +SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, ""); + /* counts of pages prefaulted when entering a memory object */ extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout; @@ -2134,9 +2158,6 @@ SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLA SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, ""); -extern uint64_t vm_pageout_secluded_burst_count; -SYSCTL_QUAD(_vm, OID_AUTO, pageout_secluded_burst_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_secluded_burst_count, ""); - #endif /* CONFIG_SECLUDED_MEMORY */ #include @@ -2289,10 +2310,12 @@ SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *) extern uint32_t vm_page_pages; SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, ""); +extern uint32_t vm_page_busy_absent_skipped; +SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, 
&vm_page_busy_absent_skipped, 0, ""); + #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) -extern int pacified_footprint_suspend; -int footprint_suspend_allowed = 0; -SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &footprint_suspend_allowed, 0, ""); +extern int vm_footprint_suspend_allowed; +SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, ""); extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend); static int @@ -2309,8 +2332,7 @@ sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS if (error) { return error; } - if (pacified_footprint_suspend && - !footprint_suspend_allowed) { + if (!vm_footprint_suspend_allowed) { if (new_value != 0) { /* suspends are not allowed... */ return 0; @@ -2329,3 +2351,46 @@ SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_LOCKED|CTLFLAG_MASKED, 0, 0, &sysctl_vm_footprint_suspend, "I", ""); #endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */ + +extern uint64_t vm_map_corpse_footprint_count; +extern uint64_t vm_map_corpse_footprint_size_avg; +extern uint64_t vm_map_corpse_footprint_size_max; +extern uint64_t vm_map_corpse_footprint_full; +extern uint64_t vm_map_corpse_footprint_no_buf; +SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, ""); +SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, ""); +SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, ""); +SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, ""); +SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, ""); + +#if PMAP_CS +extern uint64_t vm_cs_defer_to_pmap_cs; +extern uint64_t 
vm_cs_defer_to_pmap_cs_not; +SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_pmap_cs, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_pmap_cs, ""); +SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_pmap_cs_not, + CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_pmap_cs_not, ""); +#endif /* PMAP_CS */ + +extern uint64_t shared_region_pager_copied; +extern uint64_t shared_region_pager_slid; +extern uint64_t shared_region_pager_slid_error; +extern uint64_t shared_region_pager_reclaimed; +SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied, + CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, ""); +SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid, + CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, ""); +SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error, + CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, ""); +SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed, + CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, ""); + +#if MACH_ASSERT +extern int pmap_ledgers_panic_leeway; +SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, ""); +#endif /* MACH_ASSERT */ diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index 69dad2981..fdbccff33 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -126,6 +126,15 @@ vnode_pager_issue_reprioritize_io(struct vnode *devvp, uint64_t blkno, uint32_t } #endif +void +vnode_pager_was_dirtied( + struct vnode *vp, + vm_object_offset_t s_offset, + vm_object_offset_t e_offset) +{ + cluster_update_state(vp, s_offset, e_offset, TRUE); +} + uint32_t vnode_pager_isinuse(struct vnode *vp) { diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index 83659ed3c..934486bb8 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -474,6 +474,7 @@ _proc_exiting _proc_find _proc_forcequota _proc_is64bit +_proc_is64bit_data _proc_is_classic _proc_isinferior _proc_issignal diff --git a/config/IOKit.arm.exports 
b/config/IOKit.arm.exports index f4ee08125..ad89576ca 100644 --- a/config/IOKit.arm.exports +++ b/config/IOKit.arm.exports @@ -307,3 +307,5 @@ __ZNK8IOPMprot12getMetaClassEv __ZNK8IOPMprot9MetaClass5allocEv __ZTV8IOPMprot __ZTVN8IOPMprot9MetaClassE + +__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionarylU13block_pointerFbPS_P10IONotifierE diff --git a/config/IOKit.arm64.exports b/config/IOKit.arm64.exports index ed271b62c..065a36f0f 100644 --- a/config/IOKit.arm64.exports +++ b/config/IOKit.arm64.exports @@ -228,3 +228,5 @@ __ZNK15IORegistryEntry12copyPropertyEPK8OSSymbolPK15IORegistryPlanej __ZNK15IORegistryEntry12copyPropertyEPKcPK15IORegistryPlanej __ZNK18IOMemoryDescriptor19dmaCommandOperationEjPvj __ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEjPvj + +__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryiU13block_pointerFbPS_P10IONotifierE diff --git a/config/IOKit.exports b/config/IOKit.exports index 0f0678d3d..c55892377 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -886,6 +886,7 @@ __ZN22_IOOpenServiceIteratorD0Ev __ZN22_IOOpenServiceIteratorD2Ev __ZN23IOMultiMemoryDescriptor10gMetaClassE __ZN23IOMultiMemoryDescriptor10superClassE +__ZN23IOMultiMemoryDescriptor16getPreparationIDEv __ZN23IOMultiMemoryDescriptor4freeEv __ZN23IOMultiMemoryDescriptor9MetaClassC1Ev __ZN23IOMultiMemoryDescriptor9MetaClassC2Ev @@ -964,6 +965,7 @@ __ZN28IOFilterInterruptEventSource20interruptEventSourceEP8OSObjectPFvS1_P22IOIn __ZN28IOFilterInterruptEventSource23normalInterruptOccurredEPvP9IOServicei __ZN28IOFilterInterruptEventSource24disableInterruptOccurredEPvP9IOServicei __ZN28IOFilterInterruptEventSource26filterInterruptEventSourceEP8OSObjectPFvS1_P22IOInterruptEventSourceiEPFbS1_PS_EP9IOServicei +__ZN28IOFilterInterruptEventSource4freeEv __ZN28IOFilterInterruptEventSource4initEP8OSObjectPFvS1_P22IOInterruptEventSourceiEP9IOServicei 
__ZN28IOFilterInterruptEventSource4initEP8OSObjectPFvS1_P22IOInterruptEventSourceiEPFbS1_PS_EP9IOServicei __ZN28IOFilterInterruptEventSource9MetaClassC1Ev @@ -1654,3 +1656,14 @@ __ZTVN14IOReportLegend9MetaClassE __ZTVN15IOStateReporter9MetaClassE __ZTVN16IOSimpleReporter9MetaClassE __ZTVN19IOHistogramReporter9MetaClassE +__ZN10IOWorkLoop14runActionBlockEU13block_pointerFivE +__ZN13IOCommandGate14runActionBlockEU13block_pointerFivE +__ZN13IOEventSource14setActionBlockEU13block_pointerFivE +__ZN18IOTimerEventSource16timerEventSourceEjP8OSObjectU13block_pointerFvPS_E +__ZN22IOInterruptEventSource20interruptEventSourceEP8OSObjectP9IOServiceiU13block_pointerFvPS_iE +__ZN28IOFilterInterruptEventSource26filterInterruptEventSourceEP8OSObjectP9IOServiceiU13block_pointerFvP22IOInterruptEventSourceiEU13block_pointerFbPS_E +__ZN9IOService16registerInterestEPK8OSSymbolU13block_pointerFijPS_PvmE +__ZN9IOService22registerInterruptBlockEiP8OSObjectU13block_pointerFvPS_iE +__ZNK13IOEventSource14getActionBlockEU13block_pointerFivE +__ZN13IOEventSource9setRefconEPv +__ZNK13IOEventSource9getRefconEv diff --git a/config/IOKit.x86_64.exports b/config/IOKit.x86_64.exports index 1f7734ca8..d53a169a5 100644 --- a/config/IOKit.x86_64.exports +++ b/config/IOKit.x86_64.exports @@ -499,3 +499,5 @@ __ZTV8IOSyncer __ZTVN8IOSyncer9MetaClassE _ev_try_lock _ev_unlock + +__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryiU13block_pointerFbPS_P10IONotifierE diff --git a/config/Libkern.arm.exports b/config/Libkern.arm.exports index 051d0d07b..ab47a9396 100644 --- a/config/Libkern.arm.exports +++ b/config/Libkern.arm.exports @@ -1,4 +1,5 @@ _OSAddAtomic64 _OSCompareAndSwap64 +__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE __ZN12OSOrderedSet12withCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ diff --git a/config/Libkern.arm64.exports b/config/Libkern.arm64.exports index cc07f5dc2..40f33219b 100644 --- 
a/config/Libkern.arm64.exports +++ b/config/Libkern.arm64.exports @@ -1,5 +1,6 @@ _OSAddAtomic64 _OSCompareAndSwap64 _PAGE_SHIFT_CONST +__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE __ZN12OSOrderedSet12withCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ diff --git a/config/Libkern.exports b/config/Libkern.exports index 173707504..d0c0a5554 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -1,4 +1,5 @@ _Assert +_img4_interface_register _MD5Final _MD5Init _MD5Update @@ -654,6 +655,7 @@ _kern_os_free _kern_os_malloc _kern_os_realloc _kext_assertions_enable +_kmod_info:_invalid_kmod_info _kprintf _lck_attr_alloc_init _lck_attr_free @@ -708,6 +710,12 @@ _os_log_debug_enabled _os_log_info_enabled _os_release _os_retain +_os_ref_init_count +_os_ref_retain +_os_ref_release_explicit +_os_ref_retain_try +_os_ref_retain_locked +_os_ref_release_locked _osrelease _ostype _page_mask @@ -761,3 +769,15 @@ _vsnprintf _vsscanf _zError _zlibVersion + +__Block_copy +__Block_release +__NSConcreteAutoBlock +__NSConcreteFinalizingBlock +__NSConcreteGlobalBlock +__NSConcreteMallocBlock +__NSConcreteStackBlock +__NSConcreteWeakBlockVariable +__ZN12OSCollection14iterateObjectsEU13block_pointerFbP8OSObjectE +__ZN12OSDictionary14iterateObjectsEU13block_pointerFbPK8OSSymbolP8OSObjectE +__ZN12OSSerializer9withBlockEU13block_pointerFbP11OSSerializeE diff --git a/config/MACFramework.exports b/config/MACFramework.exports index 67b209861..e594b265f 100644 --- a/config/MACFramework.exports +++ b/config/MACFramework.exports @@ -12,10 +12,6 @@ _mac_label_set _mac_audit_text _mac_iokit_check_hid_control -_mac_iokit_check_nvram_delete -_mac_iokit_check_nvram_get -_mac_iokit_check_nvram_set - _mac_vnode_check_trigger_resolve _sbuf_cat diff --git a/config/MASTER b/config/MASTER index d4561e50d..b3c36794a 100644 --- a/config/MASTER +++ b/config/MASTER @@ -1,7 +1,7 @@ # # Mach Operating System # Copyright (c) 1986 
Carnegie-Mellon University -# Copyright 2001-2014 Apple Inc. +# Copyright 2001-2018 Apple Inc. # # All rights reserved. The CMU software License Agreement # specifies the terms and conditions for use and redistribution. @@ -116,6 +116,7 @@ options CONFIG_IMAGEBOOT # local image boot # options CONFIG_MBUF_JUMBO # jumbo cluster pool # options CONFIG_WORKQUEUE # +options CONFIG_WORKLOOP_DEBUG # # # 4.4 filesystems @@ -141,6 +142,7 @@ options CONFIG_TRIGGERS # trigger vnodes # options CONFIG_EXT_RESOLVER # e.g. memberd # options CONFIG_SEARCHFS # searchfs syscall support # options CONFIG_MNT_SUID # allow suid binaries # +options CONFIG_MNT_ROOTSNAP # allow rooting from snapshot # # # NFS support @@ -172,6 +174,8 @@ options CRYPTO # options CRYPTO_SHA2 # options ENCRYPTED_SWAP # +options CONFIG_IMG4 # + options ZLIB # inflate/deflate support # options IF_BRIDGE # @@ -307,6 +311,12 @@ options CONFIG_NO_KPRINTF_STRINGS # # options CONFIG_FINE_LOCK_GROUPS # +# +# configurable kernel - general switch to say we are building for an +# embedded device +# +options CONFIG_EMBEDDED # + # support dynamic signing of code # @@ -326,6 +336,9 @@ options CONFIG_CODE_DECRYPTION # # options CONFIG_PROTECT # +#allow write-protection of key page +options CONFIG_KEYPAGE_WP # + # # enable per-process memory priority tracking # @@ -371,6 +384,11 @@ options CONFIG_SECLUDED_MEMORY # options CONFIG_BACKGROUND_QUEUE # +# +# Ledger features +# +options CONFIG_LEDGER_INTERVAL_MAX # + # # I/O Scheduling # @@ -477,6 +495,7 @@ options NO_KERNEL_HID # # options LIBKERNCPP # C++ implementation # +options CONFIG_BLOCKS # Blocks runtime # options CONFIG_KXLD # kxld/runtime linking of kexts # options CONFIG_KEC_FIPS # Kernel External Components for FIPS compliance (KEC_FIPS) # @@ -554,7 +573,7 @@ options MACH_MP_DEBUG # # # operations on each element. 
# options ZONE_DEBUG # # - +options CONFIG_ZCACHE #Enable per-cpu caching for zones # options CONFIG_ZLEAKS # Live zone leak debugging # # @@ -740,3 +759,6 @@ options COPYOUT_SHIM # Shim for copyout memory analysis via kext # + +options CONFIG_QUIESCE_COUNTER # Support for _COMM_PAGE_CPU_QUIESCENT_COUNTER # + diff --git a/config/MASTER.arm b/config/MASTER.arm index 8deb4e445..d463ad189 100644 --- a/config/MASTER.arm +++ b/config/MASTER.arm @@ -16,15 +16,15 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ arm xsmall config_embedded ] +# KERNEL_BASE = [ arm xsmall config_embedded config_enforce_signed_code config_zcache ] # KERNEL_RELEASE = [ KERNEL_BASE ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ] -# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_waitq_debug ] +# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] # BSD_DEV = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] # BSD_DEBUG = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# FILESYS_BASE = [ devfs fifo fs_compression config_protect config_fse routefs quota namedstreams ] +# FILESYS_BASE = [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE fdesc ] @@ -37,7 +37,7 @@ # IOKIT_RELEASE = [ IOKIT_BASE ] # IOKIT_DEV = [ IOKIT_BASE iokitstats iotracking ] # IOKIT_DEBUG = [ IOKIT_BASE iokitstats iotracking ] -# LIBKERN_BASE = [ libkerncpp config_kec_fips zlib crypto_sha2 ] +# LIBKERN_BASE 
= [ libkerncpp config_blocks config_kec_fips zlib crypto_sha2 config_img4 ] # LIBKERN_RELEASE =[ LIBKERN_BASE ] # LIBKERN_DEV = [ LIBKERN_BASE iotracking ] # LIBKERN_DEBUG = [ LIBKERN_BASE iotracking ] @@ -45,10 +45,10 @@ # PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ] # PERF_DBG_DEV = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ] # PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ] -# MACH_BASE = [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose ] +# MACH_BASE = [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose config_quiesce_counter ] # MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] -# MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace ] -# MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace importance_debug ] +# MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] +# MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ] # SCHED_BASE = [ config_sched_traditional config_sched_multiq ] # SCHED_RELEASE = [ SCHED_BASE ] # SCHED_DEV = [ SCHED_BASE ] diff --git a/config/MASTER.arm64 b/config/MASTER.arm64 index a6636b773..32189c5fd 100644 --- a/config/MASTER.arm64 +++ b/config/MASTER.arm64 @@ -16,15 +16,15 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ arm64 xsmall config_embedded config_requires_u32_munging ] +# KERNEL_BASE = [ arm64 xsmall config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache ] # KERNEL_RELEASE = [ KERNEL_BASE ] 
# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] -# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_waitq_debug pgtrace ] +# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] # BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ] # BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] # BSD_DEV = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] # BSD_DEBUG = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] -# FILESYS_BASE = [ devfs fifo fs_compression config_protect config_fse routefs quota namedstreams ] +# FILESYS_BASE = [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE fdesc ] # FILESYS_DEBUG = [ FILESYS_BASE fdesc ] @@ -37,7 +37,7 @@ # IOKIT_RELEASE = [ IOKIT_BASE ] # IOKIT_DEV = [ IOKIT_BASE iokitstats iotracking ] # IOKIT_DEBUG = [ IOKIT_BASE iokitstats iotracking] -# LIBKERN_BASE = [ libkerncpp config_kec_fips zlib crypto_sha2 ] +# LIBKERN_BASE = [ libkerncpp config_blocks config_kec_fips zlib crypto_sha2 config_img4 ] # LIBKERN_RELEASE =[ LIBKERN_BASE ] # LIBKERN_DEV = [ LIBKERN_BASE iotracking ] # LIBKERN_DEBUG = [ LIBKERN_BASE iotracking ] @@ -45,10 +45,10 @@ # PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ] # PERF_DBG_DEV = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ] # PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ] -# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions 
config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time] +# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter ] # MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] -# MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace ] -# MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace importance_debug ] +# MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ] +# MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ] # SCHED_BASE = [ config_sched_traditional config_sched_multiq config_sched_deferred_ast ] # SCHED_RELEASE = [ SCHED_BASE ] # SCHED_DEV = [ SCHED_BASE ] diff --git a/config/MASTER.arm64.bcm2837 b/config/MASTER.arm64.bcm2837 new file mode 100644 index 000000000..65dd4861b --- /dev/null +++ b/config/MASTER.arm64.bcm2837 @@ -0,0 +1,88 @@ +# +# Mach Operating System +# Copyright (c) 1986 Carnegie-Mellon University +# Copyright 2001-2016 Apple Inc. +# +# All rights reserved. The CMU software License Agreement +# specifies the terms and conditions for use and redistribution. +# +###################################################################### +# +# Master Apple configuration file (see the master machine independent +# configuration file for a description of the file format). 
+# +###################################################################### +# +# Standard Apple OS Configurations: +# -------- ----- -- --------------- +# +# KERNEL_BASE = [ arm64 xsmall config_embedded config_requires_u32_munging config_zcache ] +# KERNEL_RELEASE = [ KERNEL_BASE ] +# KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ] +# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ] +# BSD_BASE = [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ] +# BSD_RELEASE = [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ] +# BSD_DEV = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] +# BSD_DEBUG = [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ] +# FILESYS_BASE = [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ] +# FILESYS_RELEASE= [ FILESYS_BASE ] +# FILESYS_DEV = [ FILESYS_BASE fdesc ] +# FILESYS_DEBUG = [ FILESYS_BASE fdesc ] +# NFS = [ nfsclient nfsserver ] +# NETWORKING = [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto packet_mangler if_fake ] +# VPN = [ ipsec flow_divert necp content_filter ] +# PF = [ pf ] +# MULTIPATH = [ multipath mptcp ] +# IOKIT_BASE = [ iokit iokitcpp no_kextd no_kernel_hid config_sleep ] +# IOKIT_RELEASE = [ IOKIT_BASE ] +# IOKIT_DEV = [ IOKIT_BASE iokitstats iotracking ] +# IOKIT_DEBUG = [ IOKIT_BASE iokitstats iotracking] +# LIBKERN_BASE = [ libkerncpp config_kec_fips zlib crypto_sha2 ] +# LIBKERN_RELEASE =[ LIBKERN_BASE ] +# LIBKERN_DEV = [ LIBKERN_BASE iotracking ] +# LIBKERN_DEBUG = [ LIBKERN_BASE iotracking ] +# PERF_DBG_BASE = [ mach_kdp config_serial_kdp kperf kpc ] +# PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ] +# PERF_DBG_DEV = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging 
alternate_debugger interrupt_masked_debug ] +# PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ] +# MACH_BASE = [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter ] +# MACH_RELEASE = [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ] +# MACH_DEV = [ MACH_BASE task_zone_info config_io_accounting importance_trace ] +# MACH_DEBUG = [ MACH_BASE task_zone_info config_io_accounting importance_trace importance_debug ] +# SCHED_BASE = [ config_sched_traditional config_sched_multiq config_sched_deferred_ast ] +# SCHED_RELEASE = [ SCHED_BASE ] +# SCHED_DEV = [ SCHED_BASE ] +# SCHED_DEBUG = [ SCHED_BASE config_sched_grrr config_sched_proto ] +# VM_BASE = [ vm_pressure_events jetsam freeze memorystatus config_code_decryption phantom_cache config_secluded_memory config_background_queue config_cs_validation_bitmap] +# VM_RELEASE = [ VM_BASE ] +# VM_DEV = [ VM_BASE dynamic_codesigning ] +# VM_DEBUG = [ VM_BASE dynamic_codesigning ] +# SECURITY = [ config_macf kernel_integrity ] +# RELEASE = [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ] +# DEVELOPMENT = [ KERNEL_DEV BSD_DEV FILESYS_DEV NFS SKYWALK_DEV NETWORKING PF MULTIPATH VPN IOKIT_DEV LIBKERN_DEV PERF_DBG_DEV MACH_DEV SCHED_DEV VM_DEV SECURITY ] +# DEBUG = [ KERNEL_DEBUG BSD_DEBUG FILESYS_DEBUG SKYWALK_DEBUG NETWORKING PF MULTIPATH VPN IOKIT_DEBUG LIBKERN_DEBUG PERF_DBG_DEBUG MACH_DEBUG SCHED_DEBUG VM_DEBUG SECURITY ] +# KASAN = [ DEVELOPMENT ] +# +###################################################################### +# +machine "arm64" # + +makeoptions OSFMK_MACHINE = "arm64" # + +options 
COUNT_SYSCALLS # count bsd system calls # +options TRASH_VFP_ON_SAVE # +options ALTERNATE_DEBUGGER # + +options CONFIG_VNODES=1024 # + +options CONFIG_FREEZE_SUSPENDED_MIN=4 # + +options CONFIG_MACH_APPROXIMATE_TIME + +options CONFIG_KERNEL_INTEGRITY # + +options INTERRUPT_MASKED_DEBUG=1 # # + +options CONFIG_PGTRACE # +options CONFIG_PGTRACE_NONKEXT # +pseudo-device pgtrace 1 init pgtrace_dev_init # diff --git a/config/MASTER.x86_64 b/config/MASTER.x86_64 index 1a934777d..b14a338d6 100644 --- a/config/MASTER.x86_64 +++ b/config/MASTER.x86_64 @@ -1,7 +1,7 @@ # # Mach Operating System # Copyright (c) 1986 Carnegie-Mellon University -# Copyright 2001-2016 Apple Inc. +# Copyright 2001-2018 Apple Inc. # # All rights reserved. The CMU software License Agreement # specifies the terms and conditions for use and redistribution. @@ -16,15 +16,15 @@ # Standard Apple OS Configurations: # -------- ----- -- --------------- # -# KERNEL_BASE = [ intel medium config_requires_u32_munging ] +# KERNEL_BASE = [ intel medium config_requires_u32_munging config_zcache ] # KERNEL_RELEASE = [ KERNEL_BASE ] # KERNEL_DEV = [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ] -# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_waitq_debug ] +# KERNEL_DEBUG = [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ] # BSD_BASE = [ mach_bsd sysv_sem sysv_msg sysv_shm config_imageboot config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry ] # BSD_RELEASE = [ BSD_BASE ] # BSD_DEV = [ BSD_BASE config_vnguard ] # BSD_DEBUG = [ BSD_BASE config_vnguard ] -# FILESYS_BASE = [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs 
config_appledouble nullfs config_mnt_suid ] +# FILESYS_BASE = [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_mnt_rootsnap config_keypage_wp config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs config_mnt_suid ] # FILESYS_RELEASE= [ FILESYS_BASE ] # FILESYS_DEV = [ FILESYS_BASE ] # FILESYS_DEBUG = [ FILESYS_BASE ] @@ -37,15 +37,15 @@ # IOKIT_RELEASE = [ IOKIT_BASE ] # IOKIT_DEV = [ IOKIT_BASE iotracking ] # IOKIT_DEBUG = [ IOKIT_BASE iotracking ] -# LIBKERN_BASE = [ libkerncpp config_kxld config_kec_fips zlib crypto_sha2 ] +# LIBKERN_BASE = [ libkerncpp config_blocks config_kxld config_kec_fips zlib crypto_sha2 config_img4 ] # LIBKERN_RELEASE =[ LIBKERN_BASE ] # LIBKERN_DEV = [ LIBKERN_BASE iotracking ] # LIBKERN_DEBUG = [ LIBKERN_BASE iotracking ] # PERF_DBG = [ config_dtrace mach_kdp config_serial_kdp kdp_interactive_debugging kperf kpc zleaks config_gzalloc MONOTONIC_BASE ] # MACH_BASE = [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_coalitions hypervisor config_iosched config_sysdiagnose config_mach_bridge_send_time copyout_shim ] # MACH_RELEASE = [ MACH_BASE ] -# MACH_DEV = [ MACH_BASE task_zone_info importance_trace ] -# MACH_DEBUG = [ MACH_BASE task_zone_info importance_trace importance_debug ] +# MACH_DEV = [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max ] +# MACH_DEBUG = [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max importance_debug ] # SCHED_BASE = [ config_sched_traditional config_sched_multiq config_sched_sfi ] # SCHED_RELEASE = [ SCHED_BASE ] # SCHED_DEV = [ SCHED_BASE ] diff --git a/config/Makefile b/config/Makefile index 36b16f259..da46458ff 100644 --- a/config/Makefile +++ b/config/Makefile @@ -50,6 +50,10 @@ EXPORTS_FILES = $(foreach symbolset,$(SYMBOL_COMPONENT_LIST),$(symbolset).export 
SYMBOL_SET_BUILD = $(foreach symbolset, $(SYMBOL_COMPONENT_LIST), $(OBJPATH)/$(symbolset).symbolset) +ifeq ($(KASAN),1) +KASAN_EXPORTS = $(SRCROOT)/san/Kasan_kasan.exports +endif + $(OBJPATH)/allsymbols: $(OBJPATH)/$(KERNEL_FILE_NAME) $(_v)$(NM) -gj $< > $@ @@ -140,9 +144,12 @@ endif $(OBJPATH)/all-kpi.exp: $(EXPORTS_FILES) - $(_v)$(SOURCE)/generate_linker_exports.sh $@ $+ + $(_v)$(SOURCE)/generate_linker_exports.sh $@ $+ $(KASAN_EXPORTS) + +$(OBJPATH)/all-alias.exp: $(EXPORTS_FILES) + $(_v)$(SOURCE)/generate_linker_aliases.sh $@ $+ $(KASAN_EXPORTS) -do_build_all:: $(OBJPATH)/all-kpi.exp +do_build_all:: $(OBJPATH)/all-kpi.exp $(OBJPATH)/all-alias.exp include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/config/MasterVersion b/config/MasterVersion index b857211fe..68608495c 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -17.7.0 +18.2.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/config/Private.arm.exports b/config/Private.arm.exports index 0b393134f..36db1f3f7 100644 --- a/config/Private.arm.exports +++ b/config/Private.arm.exports @@ -14,8 +14,10 @@ _cpu_broadcast_xcall _cpu_xcall _cpu_number _enable_kernel_vfp_context +_get_preemption_level _PE_consistent_debug_register _ml_static_ptovirt _ml_static_mfree _sched_perfcontrol_register_callbacks _sched_perfcontrol_update_recommended_cores +_PE_panic_debugging_enabled diff --git a/config/Private.arm64.exports b/config/Private.arm64.exports index 3ac767939..a9e2160a2 100644 --- a/config/Private.arm64.exports +++ b/config/Private.arm64.exports @@ -7,6 +7,8 @@ _PE_mark_hwaccess _PE_smc_stashed_x86_system_state _PE_smc_stashed_x86_power_state _PE_smc_stashed_x86_efi_boot_state +_PE_smc_stashed_x86_shutdown_cause +_PE_smc_stashed_x86_prev_power_transitions _PE_pcie_stashed_link_state __ZN17IONVRAMController* __ZTV17IONVRAMController @@ -16,6 +18,7 @@ _cpu_cluster_id _cpu_number _cpu_qos_update_register _ecc_log_record_event +_get_preemption_level _ml_arm_sleep _ml_get_abstime_offset _ml_get_conttime_offset @@ -36,3 +39,4 @@ _pgtrace_stop _pgtrace_active _pgtrace_add_probe _pgtrace_clear_probe +_PE_panic_debugging_enabled diff --git a/config/Private.exports b/config/Private.exports index 5630d2a50..3e655ff10 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -1,4 +1,6 @@ _PE_i_can_has_debugger +__ZN15IORegistryEntry18setIndexedPropertyEjP8OSObject +__ZNK15IORegistryEntry18getIndexedPropertyEj __ZN16IOPlatformExpert* __ZNK16IOPlatformExpert* __ZTV16IOPlatformExpert @@ -88,14 +90,21 @@ _cpx_sizex _cpx_use_offset_for_iv _cpx_synthetic_offset_for_iv _cpx_writeprotect +_cs_blob_create_validated +_cs_blob_free _cs_blob_reset_cache _cs_debug -_cs_enforcement _cs_entitlement_flags _cs_entitlements_blob_get +_cs_debug_fail_on_unsigned_code +_cs_debug_unsigned_exec_failures +_cs_debug_unsigned_mmap_failures _cs_get_cdhash _cs_identity_get +_cs_process_enforcement 
+_cs_process_global_enforcement _cs_require_lv +_cs_system_enforcement _cs_system_require_lv _cs_restricted _cs_valid @@ -196,6 +205,7 @@ _ifnet_tx_compl_status _ifnet_get_unsent_bytes _ifnet_get_buffer_status _ifnet_normalise_unsent_data +_ifnet_set_low_power_mode _in6_localaddr _in6addr_local _in_localaddr @@ -237,8 +247,6 @@ _kern_stack_snapshot_with_reason _kernel_debug_string _kevent_id_internal _kevent_qos_internal -_kevent_qos_internal_bind -_kevent_qos_internal_unbind _kmem_alloc_kobject:_kmem_alloc_kobject_external _kmem_alloc_pageable:_kmem_alloc_pageable_external _kx_qsort @@ -330,8 +338,8 @@ _pffindproto:_pffindproto_old _port_name_to_task _port_name_to_thread _post_sys_powersource -_prng_factory_register _proc_getexecutablevnode +_proc_issetugid _proc_pidbackgrounded _proc_pidversion _proc_set_responsible_pid @@ -355,6 +363,7 @@ _pru_sockaddr_notsupp _pru_sopoll_notsupp _pthread_kext_register _q_to_b +_register_and_init_prng _register_crypto_functions _register_decmpfs_decompressor _rootdev @@ -484,6 +493,8 @@ _vnode_istty _vnode_lookup_continue_needed _vnode_clearnoflush _vnode_isnoflush +_vnode_getbackingvnode +_vnode_setasnamedstream _vnop_compound_mkdir_desc _vnop_compound_open_desc _vnop_compound_remove_desc @@ -600,3 +611,15 @@ _zone_change _fs_buffer_cache_gc_register _fs_buffer_cache_gc_unregister _cp_key_store_action_for_volume + +_Block_size +__Block_extended_layout +__Block_has_signature +__Block_isDeallocating +__Block_layout +__Block_object_assign +__Block_object_dispose +__Block_signature +__Block_tryRetain +__Block_use_RR2 +__Block_use_stret diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports index 52902c403..0ad58ec1a 100644 --- a/config/Private.x86_64.exports +++ b/config/Private.x86_64.exports @@ -61,3 +61,10 @@ _register_copyout_shim _getsegdatafromheader _getsegbynamefromheader __mh_execute_header + +#macOS only codesigning kpi +_csproc_disable_enforcement +_csproc_mark_invalid_allowed 
+_csproc_check_invalid_allowed +_csproc_hardened_runtime +_csproc_forced_lv diff --git a/config/Unsupported.arm64.exports b/config/Unsupported.arm64.exports index 883f24480..3687a2147 100644 --- a/config/Unsupported.arm64.exports +++ b/config/Unsupported.arm64.exports @@ -4,6 +4,7 @@ __ZN9IODTNVRAM19convertPropToObjectEPhjS0_jPPK8OSSymbolPP8OSObject __ZN9IODTNVRAM19searchNVRAMPropertyEP17IONVRAMDescriptorPj __ZN9IODTNVRAM19unescapeBytesToDataEPKhj _bsd_set_dependency_capable +__get_commpage_priv_address _kdp_register_callout _kdp_set_ip_and_mac_addresses _logwakeup diff --git a/config/generate_linker_aliases.sh b/config/generate_linker_aliases.sh new file mode 100755 index 000000000..45c7700ad --- /dev/null +++ b/config/generate_linker_aliases.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +set -e + +if [ $# -lt 2 ]; then + echo "Usage: $0 output.exp input1 [input2 ... ]" 1>&2 + exit 1 +fi + +OUTPUT="$1" +shift + +( grep -h ":" "$@" | awk -F: '{print $2 " " $1}' ) | sort -u > "$OUTPUT" + +exit 0 diff --git a/config/newvers.pl b/config/newvers.pl index f093b3378..8d670049e 100755 --- a/config/newvers.pl +++ b/config/newvers.pl @@ -14,6 +14,7 @@ # ###KERNEL_VERSION_REVISION### 3 # ###KERNEL_VERSION_STAGE### VERSION_STAGE_BETA (see libkern/version.h) # ###KERNEL_VERSION_PRERELEASE_LEVEL### 4 +# ###KERNEL_BUILD_CONFIG### development # ###KERNEL_BUILDER### root # ###KERNEL_BUILD_OBJROOT### xnu/xnu-690.obj~2/RELEASE_PPC # ###KERNEL_BUILD_DATE### Sun Oct 24 05:33:28 PDT 2004 @@ -56,6 +57,8 @@ sub WriteFile { $BUILD_OBJPATH =~ s,/+$,,; my $BUILD_DATE = `date`; $BUILD_DATE =~ s/[\n\t]//g; +my $BUILD_CONFIG = "unknown"; +$BUILD_CONFIG = $ENV{'CURRENT_KERNEL_CONFIG_LC'} if defined($ENV{'CURRENT_KERNEL_CONFIG_LC'}); my $BUILDER=`whoami`; $BUILDER =~ s/[\n\t]//g; my $RC_STRING = $ENV{'RC_ProjectNameAndSourceVersion'} . "~" . 
$ENV{'RC_ProjectBuildVersion'} if defined($ENV{'RC_XBS'}); @@ -166,6 +169,7 @@ sub describe { $count += $data =~ s/###KERNEL_VERSION_REVISION###/$VERSION_REVISION/g; $count += $data =~ s/###KERNEL_VERSION_STAGE###/$VERSION_STAGE/g; $count += $data =~ s/###KERNEL_VERSION_PRERELEASE_LEVEL###/$VERSION_PRERELEASE_LEVEL/g; + $count += $data =~ s/###KERNEL_BUILD_CONFIG###/$BUILD_CONFIG/g; $count += $data =~ s/###KERNEL_BUILDER###/$BUILDER/g; $count += $data =~ s/###KERNEL_BUILD_OBJROOT###/$BUILD_OBJROOT/g; $count += $data =~ s/###KERNEL_BUILD_DATE###/$BUILD_DATE/g; @@ -183,6 +187,7 @@ sub describe { print "newvers.pl: ###KERNEL_VERSION_REVISION### = $VERSION_REVISION\n"; print "newvers.pl: ###KERNEL_VERSION_STAGE### = $VERSION_STAGE\n"; print "newvers.pl: ###KERNEL_VERSION_PRERELEASE_LEVEL### = $VERSION_PRERELEASE_LEVEL\n"; + print "newvers.pl: ###KERNEL_BUILD_CONFIG### = $BUILD_CONFIG\n"; print "newvers.pl: ###KERNEL_BUILDER### = $BUILDER\n"; print "newvers.pl: ###KERNEL_BUILD_OBJROOT### = $BUILD_OBJROOT\n"; print "newvers.pl: ###KERNEL_BUILD_DATE### = $BUILD_DATE\n"; diff --git a/config/version.c b/config/version.c index 4870d134c..894ed9468 100644 --- a/config/version.c +++ b/config/version.c @@ -35,6 +35,8 @@ #include +// for what(1): +const char __kernelVersionString[] __attribute__((used)) = "@(#)VERSION: " OSTYPE " Kernel Version ###KERNEL_VERSION_LONG###: ###KERNEL_BUILD_DATE###; ###KERNEL_BUILDER###:###KERNEL_BUILD_OBJROOT###"; const char version[] = OSTYPE " Kernel Version ###KERNEL_VERSION_LONG###: ###KERNEL_BUILD_DATE###; ###KERNEL_BUILDER###:###KERNEL_BUILD_OBJROOT###"; const int version_major = VERSION_MAJOR; const int version_minor = VERSION_MINOR; @@ -42,6 +44,7 @@ const int version_revision = VERSION_REVISION; const int version_stage = VERSION_STAGE; const int version_prerelease_level = VERSION_PRERELEASE_LEVEL; const char version_variant[] = VERSION_VARIANT; +const char osbuild_config[] = "###KERNEL_BUILD_CONFIG###"; const char osbuilder[] = 
"###KERNEL_BUILDER###"; const char osrelease[] = OSRELEASE; const char ostype[] = OSTYPE; diff --git a/iokit/IOKit/IOBSD.h b/iokit/IOKit/IOBSD.h index 505e23efd..b72a4e8f5 100644 --- a/iokit/IOKit/IOBSD.h +++ b/iokit/IOKit/IOBSD.h @@ -63,6 +63,7 @@ extern void IOBSDMountChange(struct mount * mp, uint32_t op); extern boolean_t IOTaskHasEntitlement(task_t task, const char * entitlement); extern struct IOPolledFileIOVars * gIOPolledCoreFileVars; +extern kern_return_t gIOPolledCoreFileOpenRet; #ifdef __cplusplus } diff --git a/iokit/IOKit/IOCommandGate.h b/iokit/IOKit/IOCommandGate.h index 4bc13d299..431f179d4 100644 --- a/iokit/IOKit/IOCommandGate.h +++ b/iokit/IOKit/IOCommandGate.h @@ -154,6 +154,21 @@ work loop event sources. If the command is disabled the attempt to run a comman void *arg0 = 0, void *arg1 = 0, void *arg2 = 0, void *arg3 = 0); +#ifdef __BLOCKS__ +/*! @function runActionBlock + @abstract Single thread a call to an action with the target work loop. + @discussion Client function that causes the given action to be called in +a single threaded manner. Beware the work loop's gate is recursive and command +gates can cause direct or indirect re-entrancy. When the executing on a +client's thread runAction will sleep until the work loop's gate opens for +execution of client actions, the action is single threaded against all other +work loop event sources. If the command is disabled the attempt to run a command will be stalled until enable is called. + @param action Block to be executed in the context of the work loop. + @result The return value of action if it was called, kIOReturnBadArgument if action is not defined, kIOReturnAborted if a disabled command gate is free()ed before being reenabled. +*/ + IOReturn runActionBlock(ActionBlock action); +#endif /* __BLOCKS__ */ + /*! @function attemptCommand @abstract Single thread a command with the target work loop. 
@discussion Client function that causes the current action to be called in @@ -187,10 +202,10 @@ client's thread attemptCommand will fail if the work loop's gate is closed. /*! @function commandSleep @abstract Put a thread that is currently holding the command gate to sleep. - @discussion Put a thread to sleep waiting for an event but release the gate first. If the event occurs then the commandGate is closed before the function returns. + @discussion Put a thread to sleep waiting for an event but release the gate first. If the event occurs then the commandGate is closed before the function returns. If the thread does not hold the gate, panic. @param event Pointer to an address. @param interruptible THREAD_UNINT, THREAD_INTERRUPTIBLE or THREAD_ABORTSAFE. THREAD_UNINT specifies that the sleep cannot be interrupted by a signal. THREAD_INTERRUPTIBLE specifies that the sleep may be interrupted by a "kill -9" signal. THREAD_ABORTSAFE (the default value) specifies that the sleep may be interrupted by any user signal. - @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely, kIOReturnNotPermitted if the calling thread does not hold the command gate. */ + @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely. */ virtual IOReturn commandSleep(void *event, UInt32 interruptible = THREAD_ABORTSAFE); @@ -212,11 +227,11 @@ client's thread attemptCommand will fail if the work loop's gate is closed. /*! @function commandSleep @abstract Put a thread that is currently holding the command gate to sleep. - @discussion Put a thread to sleep waiting for an event but release the gate first. If the event occurs or timeout occurs then the commandGate is closed before the function returns. + @discussion Put a thread to sleep waiting for an event but release the gate first. 
If the event occurs or timeout occurs then the commandGate is closed before the function returns. If the thread does not hold the gate, panic. @param event Pointer to an address. @param deadline Clock deadline to timeout the sleep. @param interruptible THREAD_UNINT, THREAD_INTERRUPTIBLE or THREAD_ABORTSAFE. THREAD_UNINT specifies that the sleep cannot be interrupted by a signal. THREAD_INTERRUPTIBLE specifies that the sleep may be interrupted by a "kill -9" signal. THREAD_ABORTSAFE specifies that the sleep may be interrupted by any user signal. - @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely, kIOReturnNotPermitted if the calling thread does not hold the command gate. */ + @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely. */ virtual IOReturn commandSleep(void *event, AbsoluteTime deadline, UInt32 interruptible); diff --git a/iokit/IOKit/IOEventSource.h b/iokit/IOKit/IOEventSource.h index 44502a12e..34273c470 100644 --- a/iokit/IOKit/IOEventSource.h +++ b/iokit/IOKit/IOEventSource.h @@ -106,6 +106,10 @@ is implicitly the first paramter in the target member function's parameter list. @discussion Backward compatibilty define for the old non-class scoped type definition. See $link IOEventSource::Action */ #define IOEventSourceAction IOEventSource::Action +#ifdef __BLOCKS__ + typedef IOReturn (^ActionBlock)(); +#endif /* __BLOCKS__ */ + protected: /*! @var eventChainNext The next event source in the event chain. nil at end of chain. */ @@ -116,18 +120,24 @@ is implicitly the first paramter in the target member function's parameter list. /*! 
@var action The action method called when an event has been delivered */ + +#if XNU_KERNEL_PRIVATE + union { Action action; ActionBlock actionBlock; }; +#else /* XNU_KERNEL_PRIVATE */ Action action; +#endif /* !XNU_KERNEL_PRIVATE */ /*! @var enabled Is this event source enabled to deliver requests to the work-loop. */ bool enabled; #if XNU_KERNEL_PRIVATE - enum { - kPassive = 0x0001, - kActive = 0x0002, + kPassive = 0x0001, + kActive = 0x0002, + kActionBlock = 0x0004, + kSubClass0 = 0x0008, }; uint8_t eventSourceReserved1[1]; uint16_t flags; @@ -231,6 +241,26 @@ IOWorkLoop that at least reacts to signalWorkAvailable() and onThread functions. @result value of action. */ virtual IOEventSource::Action getAction() const; +#ifdef __BLOCKS__ +/*! @function setActionBlock + @abstract Setter for action ivar. The current block is released, & the new block is retained. + @param block Block pointer of type IOEventSource::ActionBlock. */ + void setActionBlock(ActionBlock block); +/*! @function getActionBlock + @abstract Getter for action ivar. + @result Block pointer of type IOEventSource::ActionBlock, if set, or NULL. */ + ActionBlock getActionBlock(ActionBlock) const; +#endif /* __BLOCKS__ */ + +/*! @function setRefcon + @abstract Setter for refcon ivar. This function will assert if a block action has been set. + @param refcon Refcon. */ + void setRefcon(void *refcon); +/*! @function getRefcon + @abstract Getter for refcon ivar. + @result The refcon. This function will assert if a block action has been set. */ + void * getRefcon() const; + /*! @function enable @abstract Enable event source. 
@discussion A subclass implementation is expected to respect the enabled diff --git a/iokit/IOKit/IOFilterInterruptEventSource.h b/iokit/IOKit/IOFilterInterruptEventSource.h index 3cf68bf11..1a5470b45 100644 --- a/iokit/IOKit/IOFilterInterruptEventSource.h +++ b/iokit/IOKit/IOFilterInterruptEventSource.h @@ -66,6 +66,10 @@ class IOFilterInterruptEventSource : public IOInterruptEventSource @discussion Backward compatibilty define for the old non-class scoped type definition. See $link IOFilterInterruptSource::Filter */ #define IOFilterInterruptAction IOFilterInterruptEventSource::Filter +#ifdef __BLOCKS__ + typedef bool (^FilterBlock)(IOFilterInterruptEventSource *sender); +#endif /* __BLOCKS__ */ + private: // Hide the superclass initializers virtual bool init(OSObject *inOwner, @@ -81,7 +85,12 @@ class IOFilterInterruptEventSource : public IOInterruptEventSource protected: /*! @var filterAction Filter callout */ + +#if XNU_KERNEL_PRIVATE + union { Filter filterAction; FilterBlock filterActionBlock; }; +#else /* XNU_KERNEL_PRIVATE */ Filter filterAction; +#endif /* !XNU_KERNEL_PRIVATE */ /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of the IOWorkLoop in the future. @@ -110,6 +119,30 @@ class IOFilterInterruptEventSource : public IOInterruptEventSource IOService *provider, int intIndex = 0); +#ifdef __BLOCKS__ +/*! @function filterInterruptEventSource + @abstract Factor method to create and initialise an IOFilterInterruptEventSource. See $link init. + @param owner Owner/client of this event source. + @param provider Service that provides interrupts. + @param intIndex The index of the interrupt within the provider's interrupt sources. + @param action Block for the callout routine of this event source. + @param filter Block to invoke when HW interrupt occurs. + @result a new event source if succesful, 0 otherwise. 
*/ + static IOFilterInterruptEventSource * + filterInterruptEventSource(OSObject *owner, + IOService *provider, + int intIndex, + IOInterruptEventSource::ActionBlock action, + FilterBlock filter); +#endif /* __BLOCKS__ */ + +#if XNU_KERNEL_PRIVATE + enum + { + kFilterBlock = kSubClass0, + }; +#endif + /*! @function init @abstract Primary initialiser for the IOFilterInterruptEventSource class. @param owner Owner/client of this event source. @@ -125,6 +158,7 @@ successfully. */ IOService *provider, int intIndex = 0); + virtual void free( void ) APPLE_KEXT_OVERRIDE; /*! @function signalInterrupt @abstract Cause the work loop to schedule the action. @@ -136,6 +170,13 @@ successfully. */ @result value of filterAction. */ virtual Filter getFilterAction() const; +#ifdef __BLOCKS__ +/*! @function getFilterActionBlock + @abstract Get'ter for filterAction variable. + @result value of filterAction. */ + FilterBlock getFilterActionBlock() const; +#endif /* __BLOCKS__ */ + /*! @function normalInterruptOccurred @abstract Override $link IOInterruptEventSource::normalInterruptOccured to make a filter callout. 
*/ virtual void normalInterruptOccurred(void *self, IOService *prov, int ind) APPLE_KEXT_OVERRIDE; diff --git a/iokit/IOKit/IOHibernatePrivate.h b/iokit/IOKit/IOHibernatePrivate.h index b6f3b7f62..f3195d0f1 100644 --- a/iokit/IOKit/IOHibernatePrivate.h +++ b/iokit/IOKit/IOHibernatePrivate.h @@ -310,8 +310,6 @@ typedef struct hibernate_statistics_t hibernate_statistics_t; void IOHibernateSystemInit(IOPMrootDomain * rootDomain); IOReturn IOHibernateSystemSleep(void); -void IOOpenDebugDataFile(const char *fname, uint64_t size); -void IOCloseDebugDataFile(); IOReturn IOHibernateIOKitSleep(void); IOReturn IOHibernateSystemHasSlept(void); IOReturn IOHibernateSystemWake(void); diff --git a/iokit/IOKit/IOInterruptEventSource.h b/iokit/IOKit/IOInterruptEventSource.h index 6acde040b..1d63d5c3a 100644 --- a/iokit/IOKit/IOInterruptEventSource.h +++ b/iokit/IOKit/IOInterruptEventSource.h @@ -70,6 +70,10 @@ class IOInterruptEventSource : public IOEventSource @param count Number of interrupts seen before delivery. */ typedef void (*Action)(OSObject *owner, IOInterruptEventSource *sender, int count); +#ifdef __BLOCKS__ + typedef void (^ActionBlock)(IOInterruptEventSource *sender, int count); +#endif /* __BLOCKS__ */ + /*! @defined IOInterruptEventAction @discussion Backward compatibilty define for the old non-class scoped type definition. See $link IOInterruptEventSource::Action */ #define IOInterruptEventAction IOInterruptEventSource::Action @@ -137,6 +141,26 @@ class IOInterruptEventSource : public IOEventSource IOService *provider = 0, int intIndex = 0); + +#ifdef __BLOCKS__ +/*! @function interruptEventSource + @abstract Factory function for IOInterruptEventSources creation and initialisation. + @param owner Owning client of the new event source. + @param provider IOService that represents the interrupt source. When no provider is defined the event source assumes that the client will in some manner call the interruptOccured method explicitly. 
This will start the ball rolling for safe delivery of asynchronous event's into the driver. + @param intIndex The index of the interrupt within the provider's interrupt sources. + @param action Block for the callout routine of this event source.. + @result A new interrupt event source if successfully created and initialised, 0 otherwise. */ + static IOInterruptEventSource * + interruptEventSource(OSObject *owner, + IOService *provider, + int intIndex, + ActionBlock action); +#endif /* __BLOCKS__ */ + +#if XNU_KERNEL_PRIVATE + static void actionToBlock(OSObject *owner, IOInterruptEventSource *sender, int count); +#endif /* XNU_KERNEL_PRIVATE */ + /*! @function init @abstract Primary initialiser for the IOInterruptEventSource class. @param owner Owning client of the new event source. diff --git a/iokit/IOKit/IOInterrupts.h b/iokit/IOKit/IOInterrupts.h index fa8aa7b33..d58ea9f07 100644 --- a/iokit/IOKit/IOInterrupts.h +++ b/iokit/IOKit/IOInterrupts.h @@ -50,6 +50,16 @@ struct IOInterruptSource { }; typedef struct IOInterruptSource IOInterruptSource; +#ifdef XNU_KERNEL_PRIVATE + +struct IOInterruptSourcePrivate { + void * vectorBlock; +}; +typedef struct IOInterruptSourcePrivate IOInterruptSourcePrivate; + +#endif /* XNU_KERNEL_PRIVATE */ + + #endif /* __cplusplus */ typedef void (*IOInterruptHandler)(void *target, void *refCon, diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h index 44ed11807..1d5bf5afa 100644 --- a/iokit/IOKit/IOKitKeys.h +++ b/iokit/IOKit/IOKitKeys.h @@ -135,6 +135,7 @@ #define kIOMinimumSegmentAlignmentByteCountKey "IOMinimumSegmentAlignmentByteCount" // (OSNumber) #define kIOMaximumSegmentAddressableBitCountKey "IOMaximumSegmentAddressableBitCount" // (OSNumber) #define kIOMinimumSaturationByteCountKey "IOMinimumSaturationByteCount" // (OSNumber) +#define kIOMaximumSwapWriteKey "IOMaximumSwapWrite" // (OSNumber) // properties found in services that wish to describe an icon // diff --git a/iokit/IOKit/IOMemoryDescriptor.h 
b/iokit/IOKit/IOMemoryDescriptor.h index 35c330581..35d037da0 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -109,9 +109,7 @@ enum { kIOMemoryPreparedReadOnly = 0x00008000, #endif kIOMemoryPersistent = 0x00010000, -#ifdef XNU_KERNEL_PRIVATE kIOMemoryMapCopyOnWrite = 0x00020000, -#endif kIOMemoryRemote = 0x00040000, kIOMemoryThreadSafe = 0x00100000, // Shared with Buffer MD kIOMemoryClearEncrypt = 0x00200000, // Shared with Buffer MD diff --git a/iokit/IOKit/IOMultiMemoryDescriptor.h b/iokit/IOKit/IOMultiMemoryDescriptor.h index 1a5883abd..8d3fd47fe 100644 --- a/iokit/IOKit/IOMultiMemoryDescriptor.h +++ b/iokit/IOKit/IOMultiMemoryDescriptor.h @@ -118,6 +118,8 @@ class IOMultiMemoryDescriptor : public IOMemoryDescriptor IOReturn getPageCounts(IOByteCount * residentPageCount, IOByteCount * dirtyPageCount); + virtual uint64_t getPreparationID( void ) APPLE_KEXT_OVERRIDE; + #define IOMULTIMEMORYDESCRIPTOR_SUPPORTS_GETPAGECOUNTS 1 private: diff --git a/iokit/IOKit/IOPolledInterface.h b/iokit/IOKit/IOPolledInterface.h index 84d27043b..f22e999fc 100644 --- a/iokit/IOKit/IOPolledInterface.h +++ b/iokit/IOKit/IOPolledInterface.h @@ -119,9 +119,17 @@ class IOPolledInterface : public OSObject #include #include +// kern_open_file_for_direct_io() flags enum { - kIOPolledFileSSD = 0x00000001 + kIOPolledFileCreate = 0x00000001, + kIOPolledFileHibernate = 0x00000002, +}; + +// kern_open_file_for_direct_io() oflags +enum +{ + kIOPolledFileSSD = 0x00000001 }; #if !defined(__cplusplus) @@ -174,7 +182,8 @@ typedef struct IOPolledFileCryptVars IOPolledFileCryptVars; #if defined(__cplusplus) -IOReturn IOPolledFileOpen(const char * filename, +IOReturn IOPolledFileOpen(const char * filename, + uint32_t flags, uint64_t setFileSize, uint64_t fsFreeSize, void * write_file_addr, size_t write_file_len, IOPolledFileIOVars ** fileVars, @@ -224,7 +233,8 @@ __BEGIN_DECLS typedef void (*kern_get_file_extents_callback_t)(void * ref, uint64_t start, uint64_t 
size); struct kern_direct_file_io_ref_t * -kern_open_file_for_direct_io(const char * name, boolean_t create_file, +kern_open_file_for_direct_io(const char * name, + uint32_t flags, kern_get_file_extents_callback_t callback, void * callback_ref, off_t set_file_size, diff --git a/iokit/IOKit/IORegistryEntry.h b/iokit/IOKit/IORegistryEntry.h index 97f66e612..59ba42d99 100644 --- a/iokit/IOKit/IORegistryEntry.h +++ b/iokit/IOKit/IORegistryEntry.h @@ -57,6 +57,14 @@ enum { kIORegistryIterateParents = 0x00000002, }; +#ifdef KERNEL_PRIVATE +enum +{ + kIORegistryEntryIndexedPropertyCLPC = 0, + kIORegistryEntryIndexedPropertyCount, +}; +#endif /* KERNEL_PRIVATE */ + /*! @class IORegistryEntry : public OSObject @abstract The base class for all objects in the registry. @discussion The IORegistryEntry base class provides functions for describing graphs of connected registry entries, each with a dictionary-based property table. Entries may be connected in different planes, with differing topologies. Access to the registry is protected against multiple threads. Inside the kernel planes are specified with plane objects and are published by the creator - IOService exports the gIOServicePlane plane object for example. Non kernel clients specify planes by their name. @@ -280,6 +288,11 @@ member function's parameter list. virtual bool setProperty(const OSSymbol * aKey, OSObject * anObject); +#ifdef KERNEL_PRIVATE + OSObject * setIndexedProperty(uint32_t index, OSObject * anObject); + OSObject * getIndexedProperty(uint32_t index) const; +#endif /* KERNEL_PRIVATE */ + /*! @function setProperty @abstract Synchronized method to add a property to a registry entry's property table. @discussion This method will add or replace a property in a registry entry's property table, using the OSDictionary::setObject semantics. This method is synchronized with other IORegistryEntry accesses to the property table. 
diff --git a/iokit/IOKit/IOReturn.h b/iokit/IOKit/IOReturn.h index 504d6f221..3f1b2877f 100644 --- a/iokit/IOKit/IOReturn.h +++ b/iokit/IOKit/IOReturn.h @@ -62,6 +62,7 @@ typedef kern_return_t IOReturn; #define sub_iokit_hidsystem err_sub(14) #define sub_iokit_scsi err_sub(16) #define sub_iokit_usbaudio err_sub(17) +#define sub_iokit_wirelesscharging err_sub(18) //#define sub_iokit_pccard err_sub(21) #ifdef PRIVATE #define sub_iokit_nvme err_sub(28) @@ -81,6 +82,7 @@ typedef kern_return_t IOReturn; #define sub_iokit_sdio err_sub(0x174) #define sub_iokit_wlan err_sub(0x208) #define sub_iokit_appleembeddedsleepwakehandler err_sub(0x209) +#define sub_iokit_appleppm err_sub(0x20A) #define sub_iokit_vendor_specific err_sub(-2) #define sub_iokit_reserved err_sub(-1) diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index 20ad4e6ab..e06839160 100644 --- a/iokit/IOKit/IOService.h +++ b/iokit/IOKit/IOService.h @@ -155,6 +155,10 @@ extern SInt32 IOServiceOrdering( const OSMetaClassBase * inObj1, const OSMetaCla typedef void (*IOInterruptAction)( OSObject * target, void * refCon, IOService * nub, int source ); +#ifdef __BLOCKS__ +typedef void (^IOInterruptActionBlock)(IOService * nub, int source); +#endif /* __BLOCKS__ */ + /*! @typedef IOServiceNotificationHandler @param target Reference supplied when the notification was registered. @param refCon Reference constant supplied when the notification was registered. @@ -167,6 +171,12 @@ typedef bool (*IOServiceMatchingNotificationHandler)( void * target, void * refC IOService * newService, IONotifier * notifier ); +#ifdef __BLOCKS__ +typedef bool (^IOServiceMatchingNotificationHandlerBlock)(IOService * newService, + IONotifier * notifier ); +#endif /* __BLOCKS__ */ + + /*! @typedef IOServiceInterestHandler @param target Reference supplied when the notification was registered. @param refCon Reference constant supplied when the notification was registered. 
@@ -179,6 +189,11 @@ typedef IOReturn (*IOServiceInterestHandler)( void * target, void * refCon, UInt32 messageType, IOService * provider, void * messageArgument, vm_size_t argSize ); +#ifdef __BLOCKS__ +typedef IOReturn (^IOServiceInterestHandlerBlock)( uint32_t messageType, IOService * provider, + void * messageArgument, size_t argSize ); +#endif /* __BLOCKS__ */ + typedef void (*IOServiceApplierFunction)(IOService * service, void * context); typedef void (*OSObjectApplierFunction)(OSObject * object, void * context); @@ -774,6 +789,14 @@ virtual IOReturn updateReport(IOReportChannelList *channels, void * target, void * ref = 0, SInt32 priority = 0 ); + +#ifdef __BLOCKS__ + static IONotifier * addMatchingNotification( + const OSSymbol * type, OSDictionary * matching, + SInt32 priority, + IOServiceMatchingNotificationHandlerBlock handler); +#endif /* __BLOCKS__ */ + /*! @function waitForService @abstract Deprecated use waitForMatchingService(). Waits for a matching to service to be published. @discussion Provides a method of waiting for an IOService object matching the supplied matching dictionary to be registered and fully matched. @@ -1113,6 +1136,19 @@ virtual IOReturn updateReport(IOReportChannelList *channels, virtual IOReturn registerInterrupt(int source, OSObject *target, IOInterruptAction handler, void *refCon = 0); + +#ifdef __BLOCKS__ +/*! @function registerInterruptBlock + @abstract Registers a block handler for a device supplying interrupts. + @discussion This method installs a block interrupt handler to be called at primary interrupt time for a device's interrupt. Only one handler may be installed per interrupt source. IOInterruptEventSource provides a work loop based abstraction for interrupt delivery that may be more appropriate for work loop based drivers. + @param source The index of the interrupt source in the device. + @param target An object instance to be passed to the interrupt handler.
+ @param handler The block to be invoked at primary interrupt time when the interrupt occurs. The handler should process the interrupt by clearing the interrupt, or by disabling the source. + @result An IOReturn code.
kIOReturnNoInterrupt is returned if the source is not valid; kIOReturnNoResources is returned if the interrupt already has an installed handler. */ + + IOReturn registerInterruptBlock(int source, OSObject *target, + IOInterruptActionBlock handler); +#endif /* __BLOCKS__ */ /*! @function unregisterInterrupt @abstract Removes a C function interrupt handler for a device supplying hardware interrupts. @@ -1215,6 +1251,11 @@ virtual IOReturn updateReport(IOReportChannelList *channels, IOServiceInterestHandler handler, void * target, void * ref = 0 ); +#ifdef __BLOCKS__ + IONotifier * registerInterest(const OSSymbol * typeOfInterest, + IOServiceInterestHandlerBlock handler); +#endif /* __BLOCKS__ */ + virtual void applyToProviders( IOServiceApplierFunction applier, void * context ); @@ -1841,13 +1882,15 @@ virtual IOReturn updateReport(IOReportChannelList *channels, uint32_t getPowerStateForClient( const OSSymbol * client ); static const char * getIOMessageString( uint32_t msg ); static void setAdvisoryTickleEnable( bool enable ); - void reset_watchdog_timer( void ); + void reset_watchdog_timer(IOService *obj, int timeout); void start_watchdog_timer ( void ); - bool stop_watchdog_timer ( void ); + void stop_watchdog_timer ( void ); + void start_watchdog_timer(uint64_t deadline); IOReturn registerInterestForNotifier( IONotifier *notify, const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, void * target, void * ref ); static IOWorkLoop * getIOPMWorkloop( void ); + bool getBlockingDriverCall(thread_t *thread, const void **callMethod); protected: bool tellClientsWithResponse( int messageType ); diff --git a/iokit/IOKit/IOServicePM.h b/iokit/IOKit/IOServicePM.h index 17662cb4e..f04fe93a1 100644 --- a/iokit/IOKit/IOServicePM.h +++ b/iokit/IOKit/IOServicePM.h @@ -72,6 +72,7 @@ struct IOPMDriverCallEntry { queue_chain_t link; thread_t thread; IOService * target; + const void *callMethod; }; // Power clients (desires) diff --git a/iokit/IOKit/IOSharedDataQueue.h 
b/iokit/IOKit/IOSharedDataQueue.h index 16e7cdde2..5b8a9f3e6 100644 --- a/iokit/IOKit/IOSharedDataQueue.h +++ b/iokit/IOKit/IOSharedDataQueue.h @@ -32,6 +32,9 @@ #ifdef dequeue #undef dequeue #endif +#ifdef enqueue +#undef enqueue +#endif #define DISABLE_DATAQUEUE_WARNING /* IODataQueue is deprecated, please use IOSharedDataQueue instead */ @@ -148,6 +151,12 @@ class IOSharedDataQueue : public IODataQueue */ virtual Boolean enqueue(void *data, UInt32 dataSize) APPLE_KEXT_OVERRIDE; +#ifdef PRIVATE + /* workaround for queue.h redefine, please do not use */ + __inline__ Boolean enqueue_tail(void *data, UInt32 dataSize) { return (IOSharedDataQueue::enqueue(data, dataSize)); } +#endif + +#if APPLE_KEXT_VTABLE_PADDING OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 0); OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 1); OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 2); @@ -156,6 +165,7 @@ class IOSharedDataQueue : public IODataQueue OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 5); OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 6); OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 7); +#endif }; #endif /* _IOKIT_IOSHAREDDATAQUEUE_H */ diff --git a/iokit/IOKit/IOTimerEventSource.h b/iokit/IOKit/IOTimerEventSource.h index 91ab47cf0..8ef49ef19 100644 --- a/iokit/IOKit/IOTimerEventSource.h +++ b/iokit/IOKit/IOTimerEventSource.h @@ -159,6 +159,10 @@ class IOTimerEventSource : public IOEventSource @param sender The object that timed out. */ typedef void (*Action)(OSObject *owner, IOTimerEventSource *sender); +#ifdef __BLOCKS__ + typedef void (^ActionBlock)(IOTimerEventSource *sender); +#endif /* __BLOCKS__ */ + static IOTimerEventSource * timerEventSource(OSObject *owner, Action action = 0); @@ -171,6 +175,22 @@ class IOTimerEventSource : public IOEventSource static IOTimerEventSource * timerEventSource(uint32_t options, OSObject *owner, Action action = 0); +#ifdef __BLOCKS__ +/*! 
@function timerEventSource + @abstract Allocates and returns an initialized timer instance. + @param options Mask of kIOTimerEventSourceOptions* options. + @param inOwner The object that will be passed to the Action callback. + @param action Block for the callout routine of this event source. + */ + static IOTimerEventSource * + timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock action); +#endif /* __BLOCKS__ */ + +#if XNU_KERNEL_PRIVATE + __inline__ void invokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts, + OSObject * owner, IOWorkLoop * workLoop); +#endif /* XNU_KERNEL_PRIVATE */ + /*! @function init @abstract Initializes the timer with an owner, and a handler to call when the timeout expires. */ diff --git a/iokit/IOKit/IOTypes.h b/iokit/IOKit/IOTypes.h index 62b5a6b08..c3f056001 100644 --- a/iokit/IOKit/IOTypes.h +++ b/iokit/IOKit/IOTypes.h @@ -83,7 +83,7 @@ typedef mach_vm_address_t IOVirtualAddress; typedef vm_address_t IOVirtualAddress; #endif -#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL)) +#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL)) && !(defined(__arm64__) && !defined(__LP64__)) typedef IOByteCount64 IOByteCount; #else typedef IOByteCount32 IOByteCount; diff --git a/iokit/IOKit/IOWorkLoop.h b/iokit/IOKit/IOWorkLoop.h index afc4979b0..c62c13216 100644 --- a/iokit/IOKit/IOWorkLoop.h +++ b/iokit/IOKit/IOWorkLoop.h @@ -74,6 +74,11 @@ member function's parameter list. typedef IOReturn (*Action)(OSObject *target, void *arg0, void *arg1, void *arg2, void *arg3); + +#ifdef __BLOCKS__ + typedef IOReturn (^ActionBlock)(); +#endif /* __BLOCKS__ */ + enum { kPreciousStack = 0x00000001, kTimeLockPanics = 0x00000002, @@ -292,6 +297,16 @@ IOWorkLoop uses this to determine if the event source should be polled in runEve void *arg0 = 0, void *arg1 = 0, void *arg2 = 0, void *arg3 = 0); +#ifdef __BLOCKS__ +/*!
@function runActionBlock + @abstract Single thread a call to an action with the work-loop. + @discussion Client function that causes the given action to be called in a single threaded manner. Beware: the work-loop's gate is recursive and runActionBlock can cause direct or indirect re-entrancy. When executing on a client's thread, runActionBlock will sleep until the work-loop's gate opens for execution of client actions, the action is single threaded against all other work-loop event sources. + @param action Block to be executed in work-loop context. + @result Returns the result of the action block. +*/ + IOReturn runActionBlock(ActionBlock action); +#endif /* __BLOCKS__ */ + /*! @function runEventSources @discussion Consists of the inner 2 loops of the threadMain function(qv). The outer loop terminates when there is no more work, and the inside loop diff --git a/iokit/IOKit/perfcontrol/IOPerfControl.h b/iokit/IOKit/perfcontrol/IOPerfControl.h new file mode 100644 index 000000000..886d0a0c1 --- /dev/null +++ b/iokit/IOKit/perfcontrol/IOPerfControl.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + */ + +#pragma once + +#ifdef KERNEL_PRIVATE +#ifdef __cplusplus + +#include + +struct thread_group; + +enum +{ + kIOPerfControlClientWorkUntracked = 0, +}; + +/*! + * @class IOPerfControlClient : public OSObject + * @abstract Class which implements an interface allowing device drivers to participate in performance control. + * @discussion TODO + */ +class IOPerfControlClient final : public OSObject +{ + OSDeclareDefaultStructors(IOPerfControlClient); + +protected: + virtual bool init(IOService *driver, uint64_t maxWorkCapacity); + +public: + /*! + * @function copyClient + * @abstract Return a retained reference to a client object, to be released by the driver. It may be + * shared with other drivers in the system. + * @param driver The device driver that will be using this interface.
+ * @param maxWorkCapacity The maximum number of concurrent work items supported by the device driver. + * @returns An instance of IOPerfControlClient. + */ + static IOPerfControlClient *copyClient(IOService *driver, uint64_t maxWorkCapacity); + + /*! + * @function registerDevice + * @abstract Inform the system that work will be dispatched to a device in the future. + * @discussion The system will do some one-time setup work associated with the device, and may block the + * current thread during the setup. Devices should not be passed to workSubmit, workSubmitAndBegin, + * workBegin, or workEnd until they have been successfully registered. The unregistration process happens + * automatically when the device object is deallocated. + * @param device The device object. Some platforms require device to be a specific subclass of IOService. + * @returns kIOReturnSuccess or an IOReturn error code + */ + virtual IOReturn registerDevice(IOService *driver, IOService *device); + + /*! + * @function unregisterDevice + * @abstract Inform the system that work will no longer be dispatched to a device in the future. + * @discussion This call is optional as the unregistration process happens automatically when the device + * object is deallocated. This call may block the current thread and/or acquire locks. It should not be + * called until after all submitted work has been ended using workEnd. + * @param device The device object. Some platforms require device to be a specific subclass of IOService. + */ + virtual void unregisterDevice(IOService *driver, IOService *device); + + /*! + * @struct WorkSubmitArgs + * @discussion Drivers may submit additional device-specific arguments related to the submission of a work item + * by passing a struct with WorkSubmitArgs as its first member. Note: Drivers are responsible for publishing + * a header file describing these arguments.
+ */ + struct WorkSubmitArgs + { + uint32_t version; + uint32_t size; + uint64_t submit_time; + uint64_t reserved[4]; + void *driver_data; + }; + + /*! + * @function workSubmit + * @abstract Tell the performance controller that work was submitted. + * @param device The device that will execute the work. Some platforms require device to be a + * specific subclass of IOService. + * @param args Optional device-specific arguments related to the submission of this work item. + * @returns A token representing this work item, which must be passed to workEnd when the work is finished + * unless the token equals kIOPerfControlClientWorkUntracked. Failure to do this will result in memory leaks + * and a degradation of system performance. + */ + virtual uint64_t workSubmit(IOService *device, WorkSubmitArgs *args = nullptr); + + /*! + * @struct WorkBeginArgs + * @discussion Drivers may submit additional device-specific arguments related to the start of a work item + * by passing a struct with WorkBeginArgs as its first member. Note: Drivers are responsible for publishing + * a header file describing these arguments. + */ + struct WorkBeginArgs + { + uint32_t version; + uint32_t size; + uint64_t begin_time; + uint64_t reserved[4]; + void *driver_data; + }; + + /*! + * @function workSubmitAndBegin + * @abstract Tell the performance controller that work was submitted and immediately began executing. + * @param device The device that is executing the work. Some platforms require device to be a + * specific subclass of IOService. + * @param submitArgs Optional device-specific arguments related to the submission of this work item. + * @param beginArgs Optional device-specific arguments related to the start of this work item. + * @returns A token representing this work item, which must be passed to workEnd when the work is finished + * unless the token equals kIOPerfControlClientWorkUntracked. Failure to do this will result in memory leaks + * and a degradation of system performance. 
+ */ + virtual uint64_t workSubmitAndBegin(IOService *device, WorkSubmitArgs *submitArgs = nullptr, + WorkBeginArgs *beginArgs = nullptr); + + /*! + * @function workBegin + * @abstract Tell the performance controller that previously submitted work began executing. + * @param device The device that is executing the work. Some platforms require device to be a + * specific subclass of IOService. + * @param args Optional device-specific arguments related to the start of this work item. + */ + virtual void workBegin(IOService *device, uint64_t token, WorkBeginArgs *args = nullptr); + + /*! + * @struct WorkEndArgs + * @discussion Drivers may submit additional device-specific arguments related to the end of a work item + * by passing a struct with WorkEndArgs as its first member. Note: Drivers are responsible for publishing + * a header file describing these arguments. + */ + struct WorkEndArgs + { + uint32_t version; + uint32_t size; + uint64_t end_time; + uint64_t reserved[4]; + void *driver_data; + }; + + /*! + * @function workEnd + * @abstract Tell the performance controller that previously started work finished executing. + * @param device The device that executed the work. Some platforms require device to be a + * specific subclass of IOService. + * @param args Optional device-specific arguments related to the end of this work item. + * @param done Optional Set to false if the work has not yet completed. Drivers are then responsible for + * calling workBegin when the work resumes and workEnd with done set to True when it has completed. + */ + virtual void workEnd(IOService *device, uint64_t token, WorkEndArgs *args = nullptr, bool done = true); + + /*! + * @struct PerfControllerInterface + * @discussion Function pointers necessary to register a performance controller. Not for general driver use. 
+ */ + struct PerfControllerInterface + { + struct WorkState { + uint64_t thread_group_id; + void *thread_group_data; + void *work_data; + uint32_t work_data_size; + }; + + using RegisterDeviceFunction = IOReturn (*)(IOService *); + using WorkCanSubmitFunction = bool (*)(IOService *, WorkState *, WorkSubmitArgs *); + using WorkSubmitFunction = void (*)(IOService *, uint64_t, WorkState *, WorkSubmitArgs *); + using WorkBeginFunction = void (*)(IOService *, uint64_t, WorkState *, WorkBeginArgs *); + using WorkEndFunction = void (*)(IOService *, uint64_t, WorkState *, WorkEndArgs *, bool); + + uint64_t version; + RegisterDeviceFunction registerDevice; + RegisterDeviceFunction unregisterDevice; + WorkCanSubmitFunction workCanSubmit; + WorkSubmitFunction workSubmit; + WorkBeginFunction workBegin; + WorkEndFunction workEnd; + }; + + /*! + * @function registerPerformanceController + * @abstract Register a performance controller to receive callbacks. Not for general driver use. + * @param interface Struct containing callback functions implemented by the performance controller. + * @returns kIOReturnSuccess or kIOReturnError if the interface was already registered. 
+ */ + virtual IOReturn registerPerformanceController(PerfControllerInterface interface); + +private: + struct WorkTableEntry + { + struct thread_group *thread_group; + bool started; + uint8_t perfcontrol_data[32]; + }; + + // TODO: size of table should match sum(maxWorkCapacity) of all users + static constexpr size_t kWorkTableNumEntries = 1024; + + uint64_t allocateToken(thread_group *thread_group); + void deallocateToken(uint64_t token); + bool getEntryForToken(uint64_t token, WorkTableEntry &entry); + void markEntryStarted(uint64_t token, bool started); + + PerfControllerInterface interface; + IOLock *interfaceLock; + OSSet *deviceRegistrationList; + + // TODO: replace with ltable or pool of objects + WorkTableEntry workTable[kWorkTableNumEntries]; + size_t workTableNextIndex; + IOSimpleLock *workTableLock; +}; + +#endif /* __cplusplus */ +#endif /* KERNEL_PRIVATE */ diff --git a/iokit/IOKit/perfcontrol/Makefile b/iokit/IOKit/perfcontrol/Makefile new file mode 100644 index 000000000..3f8cad1d5 --- /dev/null +++ b/iokit/IOKit/perfcontrol/Makefile @@ -0,0 +1,32 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +IOKIT_FRAMEDIR = $(FRAMEDIR)/IOKit.framework/Versions/A +INCDIR = $(IOKIT_FRAMEDIR)/Headers +LCLDIR = $(IOKIT_FRAMEDIR)/PrivateHeaders + +include $(MakeInc_cmd) +include $(MakeInc_def) + +MI_DIR = perfcontrol +NOT_EXPORT_HEADERS = + +ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) + +# Install these only in Kernel.framework's PrivateHeaders (not Headers). 
+NOT_KF_MI_HEADERS = $(NOT_EXPORT_HEADERS) \ + IOPerfControl.h + +INSTALL_MI_LIST = +INSTALL_MI_LCL_LIST = +INSTALL_MI_DIR = $(MI_DIR) + +EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) +EXPORT_MI_DIR = IOKit/$(MI_DIR) + +INSTALL_KF_MI_LIST = $(filter-out $(NOT_KF_MI_HEADERS), $(ALL_HEADERS)) + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index c83933543..930a45d8d 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -249,11 +249,29 @@ enum { /* kIOPMDeepSleepDelayKey * Key refers to a CFNumberRef that represents the delay in seconds before - * entering Deep Sleep state. The property is not present if Deep Sleep is - * unsupported. + * entering Deep Sleep state when on battery power and when remaining + * battery capacity is below a particular threshold (e.g., 50%.) The + * property is not present if Deep Sleep is unsupported. */ #define kIOPMDeepSleepDelayKey "Standby Delay" +/* kIOPMDeepSleepDelayHighKey + * Key refers to a CFNumberRef that represents the delay in seconds before + * entering Deep Sleep state. This is used instead of the value specified by + * kIOPMDeepSleepDelayKey if the remaining battery capacity is above a + * particular threshold (e.g. 50%) or on AC power. The property is not + * present if Deep Sleep is unsupported. + */ +#define kIOPMDeepSleepDelayHighKey "High Standby Delay" + +/* kIOPMStandbyBatteryThresholdKey + * Key refers to a CFNumberRef that represents the threshold used to choose + * between the normal deep sleep delay and the high deep sleep delay (as a + * percentage of total battery capacity remaining.) The property is not + * present if Deep Sleep is unsupported.
+ */ +#define kIOPMStandbyBatteryThresholdKey "Standby Battery Threshold" + /* kIOPMDestroyFVKeyOnStandbyKey * Specifies if FileVault key can be stored when going to standby mode * It has a boolean value, @@ -631,11 +649,15 @@ enum { kIOPSFamilyCodeUSBChargingPortDownstream = iokit_family_err(sub_iokit_usb, 5), kIOPSFamilyCodeUSBChargingPort = iokit_family_err(sub_iokit_usb, 6), kIOPSFamilyCodeUSBUnknown = iokit_family_err(sub_iokit_usb, 7), + kIOPSFamilyCodeUSBCBrick = iokit_family_err(sub_iokit_usb, 8), + kIOPSFamilyCodeUSBCTypeC = iokit_family_err(sub_iokit_usb, 9), + kIOPSFamilyCodeUSBCPD = iokit_family_err(sub_iokit_usb, 10), kIOPSFamilyCodeAC = iokit_family_err(sub_iokit_pmu, 0), kIOPSFamilyCodeExternal = iokit_family_err(sub_iokit_pmu, 1), kIOPSFamilyCodeExternal2 = iokit_family_err(sub_iokit_pmu, 2), kIOPSFamilyCodeExternal3 = iokit_family_err(sub_iokit_pmu, 3), kIOPSFamilyCodeExternal4 = iokit_family_err(sub_iokit_pmu, 4), + kIOPSFamilyCodeExternal5 = iokit_family_err(sub_iokit_pmu, 5), }; // values for kIOPMPSAdapterDetailsErrorFlagsKey diff --git a/iokit/IOKit/pwr_mgt/IOPMPowerSource.h b/iokit/IOKit/pwr_mgt/IOPMPowerSource.h index cd0db25bf..015c70a05 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPowerSource.h +++ b/iokit/IOKit/pwr_mgt/IOPMPowerSource.h @@ -40,6 +40,7 @@ enum { kTenMinutesInSeconds = 600 }; + /*! @class IOPMPowerSource * * See IOKit/pwr_mgt/IOPM.h for power source keys relevant to this class. 
These diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h index b3f7b3397..798be5d88 100644 --- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h +++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h @@ -857,11 +857,14 @@ typedef struct { #define SWD_HDR_SIGNATURE 0xdeb8da2a -#define SWD_BUF_SIZE (40*PAGE_SIZE) -#define SWD_INITIAL_STACK_SIZE ((SWD_BUF_SIZE/2)-sizeof(swd_hdr)) +#define SWD_STACKSHOT_SIZE (40*PAGE_SIZE) +#define SWD_COMPRESSED_BUFSIZE (5*PAGE_SIZE) +#define SWD_ZLIB_BUFSIZE (10*PAGE_SIZE) +#define SWD_STACKSHOT_VAR_PREFIX "sleepwake_diags" #define SWD_SPINDUMP_SIZE (256*1024) #define SWD_INITIAL_SPINDUMP_SIZE ((SWD_SPINDUMP_SIZE/2)-sizeof(swd_hdr)) +#define SWD_MAX_STACKSHOTS (10) /* Bits in swd_flags */ #define SWD_WDOG_ENABLED 0x01 @@ -880,25 +883,11 @@ typedef struct { /* Filenames associated with the stackshots/logs generated by the SWD */ -#define kSleepWakeStackBinFilename "/var/log/SleepWakeStacks.bin" -#define kSleepWakeStackFilename "/var/log/SleepWakeStacks.dump" -#define kSleepWakeLogFilename "/var/log/SleepWakeLog.dump" -#define kAppleOSXWatchdogStackFilename "/var/log/AppleOSXWatchdogStacks.dump" -#define kAppleOSXWatchdogLogFilename "/var/log/AppleOSXWatchdogLog.dump" +#define kOSWatchdogStacksFilename "/var/log/OSXWatchdogStacks.gz" +#define kOSWatchdogFailureStringFile "/var/log/OSWatchdogFailureString.txt" +#define kSleepWakeStacksFilename "/var/log/SleepWakeStacks.gz" +#define kSleepWakeFailureStringFile "/var/log/SleepWakeFailureString.txt" -inline char const* getDumpStackFilename(swd_hdr *hdr) -{ - if (hdr && hdr->is_osx_watchdog) - return kAppleOSXWatchdogStackFilename; - return kSleepWakeStackFilename; -} - -inline char const* getDumpLogFilename(swd_hdr *hdr) -{ - if (hdr && hdr->is_osx_watchdog) - return kAppleOSXWatchdogLogFilename; - return kSleepWakeLogFilename; -} /* RootDomain IOReporting channels */ #define kSleepCntChID IOREPORT_MAKEID('S','l','e','e','p','C','n','t') diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h 
b/iokit/IOKit/pwr_mgt/RootDomain.h index 95474d652..eef58a320 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -511,7 +511,7 @@ class IOPMrootDomain: public IOService uintptr_t param1, uintptr_t param2, uintptr_t param3 = 0); void tracePoint(uint8_t point); void traceDetail(uint32_t msgType, uint32_t msgIndex, uint32_t delay); - void traceDetail(OSObject *notifier); + void traceDetail(OSObject *notifier, bool start); void traceAckDelay(OSObject *notifier, uint32_t response, uint32_t delay_ms); void startSpinDump(uint32_t spindumpKind); @@ -553,12 +553,10 @@ class IOPMrootDomain: public IOService void sleepWakeDebugTrig(bool restart); void sleepWakeDebugEnableWdog(); bool sleepWakeDebugIsWdogEnabled(); - static void saveTimeoutAppStackShot(void *p0, void *p1); void sleepWakeDebugSaveSpinDumpFile(); - void swdDebugSetup(); - void swdDebugTeardown(); bool checkShutdownTimeout(); void panicWithShutdownLog(uint32_t timeoutInMs); + uint32_t getWatchdogTimeout(); private: friend class PMSettingObject; @@ -581,10 +579,6 @@ class IOPMrootDomain: public IOService IOService * newService, IONotifier * notifier); - static bool IONVRAMMatchPublished( void * target, void * refCon, - IOService * newService, - IONotifier * notifier); - static bool batteryPublished( void * target, void * refCon, IOService * resourceService, IONotifier * notifier); @@ -654,8 +648,6 @@ class IOPMrootDomain: public IOService thread_call_t extraSleepTimer; thread_call_t diskSyncCalloutEntry; thread_call_t fullWakeThreadCall; - thread_call_t swdDebugSetupEntry; - thread_call_t swdDebugTearDownEntry; thread_call_t updateConsoleUsersEntry; // Track system capabilities. 
@@ -787,13 +779,13 @@ class IOPMrootDomain: public IOService volatile uint32_t swd_lock; /* Lock to access swd_buffer & and its header */ void * swd_buffer; /* Memory allocated for dumping sleep/wake logs */ uint32_t swd_flags; /* Flags defined in IOPMPrivate.h */ - uint8_t swd_DebugImageSetup; + void * swd_compressed_buffer; void * swd_spindump_buffer; + thread_t notifierThread; + OSObject *notifierObject; IOBufferMemoryDescriptor *swd_memDesc; - IOMemoryMap * swd_logBufMap; /* Memory with sleep/wake logs from previous boot */ - // Wake Event Reporting OSArray * _systemWakeEventsArray; bool _acceptSystemWakeEvents; @@ -858,19 +850,12 @@ class IOPMrootDomain: public IOService uint32_t checkForValidDebugData(const char *fname, vfs_context_t *ctx, void *tmpBuf, struct vnode **vp); + void getFailureData(thread_t *thread, char *failureStr, size_t strLen); + void saveFailureData2File(); + void tracePhase2String(uint32_t tracePhase, const char **phaseString, const char **description); void sleepWakeDebugMemAlloc( ); void sleepWakeDebugSpinDumpMemAlloc( ); - void sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap); - void sleepWakeDebugDumpFromFile( ); - IOMemoryMap *sleepWakeDebugRetrieve(); errno_t sleepWakeDebugSaveFile(const char *name, char *buf, int len); - errno_t sleepWakeDebugCopyFile( struct vnode *srcVp, - vfs_context_t srcCtx, - char *tmpBuf, uint64_t tmpBufSize, - uint64_t srcOffset, - const char *dstFname, - uint64_t numBytes, - uint32_t crc); #if HIBERNATION diff --git a/iokit/IOKit/rtc/IORTCController.h b/iokit/IOKit/rtc/IORTCController.h index 2757c8a13..159f3eb93 100644 --- a/iokit/IOKit/rtc/IORTCController.h +++ b/iokit/IOKit/rtc/IORTCController.h @@ -55,7 +55,7 @@ OSDeclareAbstractStructors(IORTC); /*! @var reserved Reserved for future use. 
(Internal use only) */ struct ExpansionData { }; - ExpansionData *reserved; + ExpansionData *iortc_reserved __unused; public: diff --git a/iokit/Kernel/IOCPU.cpp b/iokit/Kernel/IOCPU.cpp index e9173c655..8ad8d76cd 100644 --- a/iokit/Kernel/IOCPU.cpp +++ b/iokit/Kernel/IOCPU.cpp @@ -47,6 +47,8 @@ extern void kperf_kernel_configure(char *); extern "C" void console_suspend(); extern "C" void console_resume(); +extern "C" void sched_override_recommended_cores_for_sleep(void); +extern "C" void sched_restore_recommended_cores_after_sleep(void); typedef kern_return_t (*iocpu_platform_action_t)(void * refcon0, void * refcon1, uint32_t priority, void * param1, void * param2, void * param3, @@ -352,64 +354,63 @@ IORemoveServicePlatformActions(IOService * service) kern_return_t PE_cpu_start(cpu_id_t target, vm_offset_t start_paddr, vm_offset_t arg_paddr) { - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); + IOCPU *targetCPU = (IOCPU *)target; - if (targetCPU == 0) return KERN_FAILURE; - return targetCPU->startCPU(start_paddr, arg_paddr); + if (targetCPU == NULL) return KERN_FAILURE; + return targetCPU->startCPU(start_paddr, arg_paddr); } void PE_cpu_halt(cpu_id_t target) { - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); + IOCPU *targetCPU = (IOCPU *)target; - if (targetCPU) targetCPU->haltCPU(); + targetCPU->haltCPU(); } void PE_cpu_signal(cpu_id_t source, cpu_id_t target) { - IOCPU *sourceCPU = OSDynamicCast(IOCPU, (OSObject *)source); - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); + IOCPU *sourceCPU = (IOCPU *)source; + IOCPU *targetCPU = (IOCPU *)target; - if (sourceCPU && targetCPU) sourceCPU->signalCPU(targetCPU); + sourceCPU->signalCPU(targetCPU); } void PE_cpu_signal_deferred(cpu_id_t source, cpu_id_t target) { - IOCPU *sourceCPU = OSDynamicCast(IOCPU, (OSObject *)source); - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); + IOCPU *sourceCPU = (IOCPU *)source; + IOCPU *targetCPU = (IOCPU *)target; - if (sourceCPU 
&& targetCPU) sourceCPU->signalCPUDeferred(targetCPU); + sourceCPU->signalCPUDeferred(targetCPU); } void PE_cpu_signal_cancel(cpu_id_t source, cpu_id_t target) { - IOCPU *sourceCPU = OSDynamicCast(IOCPU, (OSObject *)source); - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); + IOCPU *sourceCPU = (IOCPU *)source; + IOCPU *targetCPU = (IOCPU *)target; - if (sourceCPU && targetCPU) sourceCPU->signalCPUCancel(targetCPU); + sourceCPU->signalCPUCancel(targetCPU); } void PE_cpu_machine_init(cpu_id_t target, boolean_t bootb) { - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); - - if (targetCPU) { - targetCPU->initCPU(bootb); + IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); + + if (targetCPU == NULL) + panic("%s: invalid target CPU %p", __func__, target); + + targetCPU->initCPU(bootb); #if defined(__arm__) || defined(__arm64__) - if (!bootb && (targetCPU->getCPUNumber() == (UInt32)master_cpu)) ml_set_is_quiescing(false); + if (!bootb && (targetCPU->getCPUNumber() == (UInt32)master_cpu)) ml_set_is_quiescing(false); #endif /* defined(__arm__) || defined(__arm64__) */ - } } void PE_cpu_machine_quiesce(cpu_id_t target) { - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); - if (targetCPU) { + IOCPU *targetCPU = (IOCPU*)target; #if defined(__arm__) || defined(__arm64__) - if (targetCPU->getCPUNumber() == (UInt32)master_cpu) ml_set_is_quiescing(true); + if (targetCPU->getCPUNumber() == (UInt32)master_cpu) ml_set_is_quiescing(true); #endif /* defined(__arm__) || defined(__arm64__) */ - targetCPU->quiesceCPU(); - } + targetCPU->quiesceCPU(); } #if defined(__arm__) || defined(__arm64__) @@ -424,15 +425,17 @@ kern_return_t PE_cpu_perfmon_interrupt_install_handler(perfmon_interrupt_handler void PE_cpu_perfmon_interrupt_enable(cpu_id_t target, boolean_t enable) { - IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target); + IOCPU *targetCPU = (IOCPU*)target; - if (targetCPU) { - if (enable) { - 
targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, 0); - targetCPU->getProvider()->enableInterrupt(1); - } else { - targetCPU->getProvider()->disableInterrupt(1); - } + if (targetCPU == nullptr) { + return; + } + + if (enable) { + targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, 0); + targetCPU->getProvider()->enableInterrupt(1); + } else { + targetCPU->getProvider()->disableInterrupt(1); } } #endif @@ -461,6 +464,9 @@ void IOCPUSleepKernel(void) IOPMrootDomain *rootDomain = IOService::getPMRootDomain(); kprintf("IOCPUSleepKernel\n"); +#if defined(__arm64__) + sched_override_recommended_cores_for_sleep(); +#endif IORegistryIterator * iter; OSOrderedSet * all; @@ -526,10 +532,12 @@ void IOCPUSleepKernel(void) console_suspend(); rootDomain->tracePoint( kIOPMTracePointSleepPlatformDriver ); + rootDomain->stop_watchdog_timer(); // Now sleep the boot CPU. bootCPU->haltCPU(); + rootDomain->start_watchdog_timer(); rootDomain->tracePoint( kIOPMTracePointWakePlatformActions ); console_resume(); @@ -564,6 +572,10 @@ void IOCPUSleepKernel(void) processor_start(target->getMachProcessor()); } } + +#if defined(__arm64__) + sched_restore_recommended_cores_after_sleep(); +#endif } bool IOCPU::start(IOService *provider) diff --git a/iokit/Kernel/IOCommandGate.cpp b/iokit/Kernel/IOCommandGate.cpp index 6c1f45767..e69457efe 100644 --- a/iokit/Kernel/IOCommandGate.cpp +++ b/iokit/Kernel/IOCommandGate.cpp @@ -162,6 +162,19 @@ IOReturn IOCommandGate::attemptCommand(void *arg0, void *arg1, return attemptAction((Action) action, arg0, arg1, arg2, arg3); } + +static IOReturn IOCommandGateActionToBlock(OSObject *owner, + void *arg0, void *arg1, + void *arg2, void *arg3) +{ + return ((IOEventSource::ActionBlock) arg0)(); +} + +IOReturn IOCommandGate::runActionBlock(ActionBlock action) +{ + return (runAction(&IOCommandGateActionToBlock, action)); +} + IOReturn IOCommandGate::runAction(Action inAction, void *arg0, 
void *arg1, void *arg2, void *arg3) @@ -275,16 +288,20 @@ IOReturn IOCommandGate::attemptAction(Action inAction, IOReturn IOCommandGate::commandSleep(void *event, UInt32 interruptible) { - if (!workLoop->inGate()) - return kIOReturnNotPermitted; + if (!workLoop->inGate()) { + /* The equivalent of 'msleep' while not holding the mutex is invalid */ + panic("invalid commandSleep while not holding the gate"); + } return sleepGate(event, interruptible); } IOReturn IOCommandGate::commandSleep(void *event, AbsoluteTime deadline, UInt32 interruptible) { - if (!workLoop->inGate()) - return kIOReturnNotPermitted; + if (!workLoop->inGate()) { + /* The equivalent of 'msleep' while not holding the mutex is invalid */ + panic("invalid commandSleep while not holding the gate"); + } return sleepGate(event, deadline, interruptible); } diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index 261f86b1b..ee6642b22 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -379,7 +379,8 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar fInternalState->fNewMD = true; mem->retain(); fMemory = mem; - if (!fMapper) mem->dmaCommandOperation(kIOMDSetDMAActive, this, 0); + fInternalState->fSetActiveNoMapper = (!fMapper); + if (fInternalState->fSetActiveNoMapper) mem->dmaCommandOperation(kIOMDSetDMAActive, this, 0); if (autoPrepare) { err = prepare(); if (err) { @@ -399,7 +400,7 @@ IODMACommand::clearMemoryDescriptor(bool autoComplete) if (fMemory) { while (fActive) complete(); - if (!fMapper) fMemory->dmaCommandOperation(kIOMDSetDMAInactive, this, 0); + if (fInternalState->fSetActiveNoMapper) fMemory->dmaCommandOperation(kIOMDSetDMAInactive, this, 0); fMemory->release(); fMemory = 0; } @@ -823,8 +824,6 @@ IODMACommand::prepare(UInt64 offset, UInt64 length, bool flushCache, bool synchr state->fLocalMapperAllocValid = false; state->fLocalMapperAllocLength = 0; - state->fLocalMapper = (fMapper && (fMapper != 
IOMapper::gSystem)); - state->fSourceAlignMask = fAlignMask; if (fMapper) state->fSourceAlignMask &= page_mask; diff --git a/iokit/Kernel/IODataQueue.cpp b/iokit/Kernel/IODataQueue.cpp index e6124dfcf..15f68a362 100644 --- a/iokit/Kernel/IODataQueue.cpp +++ b/iokit/Kernel/IODataQueue.cpp @@ -239,18 +239,28 @@ Boolean IODataQueue::enqueue(void * data, UInt32 dataSize) } } - // Store tail with a release memory barrier - __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE); - - // Send notification (via mach message) that data is available. - - if ( ( head == tail ) /* queue was empty prior to enqueue() */ - || ( tail == __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_ACQUIRE) ) ) /* queue was emptied during enqueue() */ - { - sendDataAvailableNotification(); - } + // Publish the data we just enqueued + __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE); + + if (tail != head) { + // + // The memory barrier below paris with the one in ::dequeue + // so that either our store to the tail cannot be missed by + // the next dequeue attempt, or we will observe the dequeuer + // making the queue empty. + // + // Of course, if we already think the queue is empty, + // there's no point paying this extra cost. + // + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); + head = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED); + } - return true; + if (tail == head) { + // Send notification (via mach message) that data is now available. 
+ sendDataAvailableNotification(); + } + return true; } void IODataQueue::setNotificationPort(mach_port_t port) diff --git a/iokit/Kernel/IOEventSource.cpp b/iokit/Kernel/IOEventSource.cpp index 3393993e0..76c2d5032 100644 --- a/iokit/Kernel/IOEventSource.cpp +++ b/iokit/Kernel/IOEventSource.cpp @@ -36,6 +36,7 @@ HISTORY #include #include +#include #define super OSObject @@ -162,6 +163,8 @@ bool IOEventSource::init(OSObject *inOwner, void IOEventSource::free( void ) { IOStatisticsUnregisterCounter(); + + if ((kActionBlock & flags) && actionBlock) Block_release(actionBlock); if (reserved) IODelete(reserved, ExpansionData, 1); @@ -169,13 +172,41 @@ void IOEventSource::free( void ) super::free(); } -IOEventSource::Action IOEventSource::getAction () const { return action; }; +void IOEventSource::setRefcon(void *newrefcon) +{ + refcon = newrefcon; +} + +void * IOEventSource::getRefcon() const +{ + return refcon; +} + +IOEventSource::Action IOEventSource::getAction() const +{ + if (kActionBlock & flags) return NULL; + return (action); +} + +IOEventSource::ActionBlock IOEventSource::getActionBlock(ActionBlock) const +{ + if (kActionBlock & flags) return actionBlock; + return (NULL); +} void IOEventSource::setAction(Action inAction) { + if ((kActionBlock & flags) && actionBlock) Block_release(actionBlock); action = inAction; } +void IOEventSource::setActionBlock(ActionBlock block) +{ + if ((kActionBlock & flags) && actionBlock) Block_release(actionBlock); + actionBlock = Block_copy(block); + flags |= kActionBlock; +} + IOEventSource *IOEventSource::getNext() const { return eventChainNext; }; void IOEventSource::setNext(IOEventSource *inNext) diff --git a/iokit/Kernel/IOFilterInterruptEventSource.cpp b/iokit/Kernel/IOFilterInterruptEventSource.cpp index c6f79e91d..f3c61367b 100644 --- a/iokit/Kernel/IOFilterInterruptEventSource.cpp +++ b/iokit/Kernel/IOFilterInterruptEventSource.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #if IOKITSTATS @@ -123,6 +124,39 @@ 
::filterInterruptEventSource(OSObject *inOwner, return me; } + +IOFilterInterruptEventSource *IOFilterInterruptEventSource +::filterInterruptEventSource(OSObject *inOwner, + IOService *inProvider, + int inIntIndex, + ActionBlock inAction, + FilterBlock inFilterAction) +{ + IOFilterInterruptEventSource *me = new IOFilterInterruptEventSource; + + FilterBlock filter = Block_copy(inFilterAction); + if (!filter) return 0; + + if (me + && !me->init(inOwner, (Action) NULL, (Filter) filter, inProvider, inIntIndex)) { + me->release(); + Block_release(filter); + return 0; + } + me->flags |= kFilterBlock; + me->setActionBlock((IOEventSource::ActionBlock) inAction); + + return me; +} + + +void IOFilterInterruptEventSource::free( void ) +{ + if ((kFilterBlock & flags) && filterActionBlock) Block_release(filterActionBlock); + + super::free(); +} + void IOFilterInterruptEventSource::signalInterrupt() { bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; @@ -144,11 +178,16 @@ void IOFilterInterruptEventSource::signalInterrupt() IOFilterInterruptEventSource::Filter IOFilterInterruptEventSource::getFilterAction() const { + if (kFilterBlock & flags) return NULL; return filterAction; } - - +IOFilterInterruptEventSource::FilterBlock +IOFilterInterruptEventSource::getFilterActionBlock() const +{ + if (kFilterBlock & flags) return filterActionBlock; + return (NULL); +} void IOFilterInterruptEventSource::normalInterruptOccurred (void */*refcon*/, IOService */*prov*/, int /*source*/) @@ -169,7 +208,8 @@ void IOFilterInterruptEventSource::normalInterruptOccurred } // Call the filter. - filterRes = (*filterAction)(owner, this); + if (kFilterBlock & flags) filterRes = (filterActionBlock)(this); + else filterRes = (*filterAction)(owner, this); if (IOInterruptEventSource::reserved->statistics) { if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) { @@ -210,7 +250,8 @@ void IOFilterInterruptEventSource::disableInterruptOccurred } // Call the filter. 
- filterRes = (*filterAction)(owner, this); + if (kFilterBlock & flags) filterRes = (filterActionBlock)(this); + else filterRes = (*filterAction)(owner, this); if (IOInterruptEventSource::reserved->statistics) { if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) { diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index 6db950f67..94d5b465e 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -214,9 +214,6 @@ static OSData * gIOHibernateBoot0082Data; static OSData * gIOHibernateBootNextData; static OSObject * gIOHibernateBootNextSave; -static IOPolledFileIOVars * gDebugImageFileVars; -static IOLock * gDebugImageLock; - #endif /* defined(__i386__) || defined(__x86_64__) */ static IOLock * gFSLock; @@ -530,19 +527,11 @@ IOHibernateSystemSleep(void) } } - // Invalidate the image file - if (gDebugImageLock) { - IOLockLock(gDebugImageLock); - if (gDebugImageFileVars != 0) { - IOSetBootImageNVRAM(0); - IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0); - } - IOLockUnlock(gDebugImageLock); - } - vars->volumeCryptKeySize = sizeof(vars->volumeCryptKey); - err = IOPolledFileOpen(gIOHibernateFilename, setFileSize, 0, - gIOHibernateCurrentHeader, sizeof(gIOHibernateCurrentHeader), + err = IOPolledFileOpen(gIOHibernateFilename, + (kIOPolledFileCreate | kIOPolledFileHibernate), + setFileSize, 0, + gIOHibernateCurrentHeader, sizeof(gIOHibernateCurrentHeader), &vars->fileVars, &nvramData, &vars->volumeCryptKey[0], &vars->volumeCryptKeySize); @@ -890,75 +879,6 @@ IOWriteExtentsToFile(IOPolledFileIOVars * vars, uint32_t signature) return err; } -extern "C" boolean_t root_is_CF_drive; - -void -IOOpenDebugDataFile(const char *fname, uint64_t size) -{ - IOReturn err; - OSData * imagePath = NULL; - uint64_t padding; - - if (!gDebugImageLock) { - gDebugImageLock = IOLockAlloc(); - } - - if (root_is_CF_drive) return; - - // Try to get a lock, but don't block for getting lock - if 
(!IOLockTryLock(gDebugImageLock)) { - HIBLOG("IOOpenDebugDataFile: Failed to get lock\n"); - return; - } - - if (gDebugImageFileVars || !fname || !size) { - HIBLOG("IOOpenDebugDataFile: conditions failed\n"); - goto exit; - } - - padding = (PAGE_SIZE*2); // allocate couple more pages for header and fileextents - err = IOPolledFileOpen(fname, size+padding, 32ULL*1024*1024*1024, - NULL, 0, - &gDebugImageFileVars, &imagePath, NULL, 0); - - if ((kIOReturnSuccess == err) && imagePath) - { - if ((gDebugImageFileVars->fileSize < (size+padding)) || - (gDebugImageFileVars->fileExtents->getLength() > PAGE_SIZE)) { - // Can't use the file - IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0); - HIBLOG("IOOpenDebugDataFile: too many file extents\n"); - goto exit; - } - - // write extents for debug data usage in EFI - IOWriteExtentsToFile(gDebugImageFileVars, kIOHibernateHeaderOpenSignature); - IOSetBootImageNVRAM(imagePath); - } - -exit: - IOLockUnlock(gDebugImageLock); - - if (imagePath) imagePath->release(); - return; -} - -void -IOCloseDebugDataFile() -{ - IOSetBootImageNVRAM(0); - - if (gDebugImageLock) { - IOLockLock(gDebugImageLock); - if (gDebugImageFileVars != 0) { - IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0); - } - IOLockUnlock(gDebugImageLock); - } - - -} - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ DECLARE_IOHIBERNATEPROGRESSALPHA @@ -1394,6 +1314,8 @@ IOReturn IOHibernateSystemPostWake(bool now) { gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; + IOSetBootImageNVRAM(0); + IOLockLock(gFSLock); if (kFSTrimDelay == gFSState) { @@ -1913,12 +1835,11 @@ hibernate_write_image(void) }; bool cpuAES = (0 != (CPUID_FEATURE_AES & cpuid_features())); -#define _pmap_is_noencrypt(x) (cpuAES ? 
false : pmap_is_noencrypt((x))) for (pageType = kWiredEncrypt; pageType >= kUnwiredEncrypt; pageType--) { if (kUnwiredEncrypt == pageType) - { + { // start unwired image if (!vars->hwEncrypt && (kIOHibernateModeEncrypt & gIOHibernateMode)) { @@ -1933,27 +1854,36 @@ hibernate_write_image(void) } for (iterDone = false, ppnum = 0; !iterDone; ) { - count = hibernate_page_list_iterate((kWired & pageType) - ? vars->page_list_wired : vars->page_list, - &ppnum); + if (cpuAES && (pageType == kWiredClear)) + { + count = 0; + } + else + { + count = hibernate_page_list_iterate((kWired & pageType) ? vars->page_list_wired : vars->page_list, + &ppnum); + } // kprintf("[%d](%x : %x)\n", pageType, ppnum, count); iterDone = !count; - if (count && (kWired & pageType) && needEncrypt) - { - uint32_t checkIndex; - for (checkIndex = 0; - (checkIndex < count) - && (((kEncrypt & pageType) == 0) == _pmap_is_noencrypt(ppnum + checkIndex)); - checkIndex++) - {} - if (!checkIndex) - { - ppnum++; - continue; - } - count = checkIndex; - } + if (!cpuAES) + { + if (count && (kWired & pageType) && needEncrypt) + { + uint32_t checkIndex; + for (checkIndex = 0; + (checkIndex < count) + && (((kEncrypt & pageType) == 0) == pmap_is_noencrypt(ppnum + checkIndex)); + checkIndex++) + {} + if (!checkIndex) + { + ppnum++; + continue; + } + count = checkIndex; + } + } switch (pageType) { diff --git a/iokit/Kernel/IOInterruptEventSource.cpp b/iokit/Kernel/IOInterruptEventSource.cpp index a410de27e..0d96bbb86 100644 --- a/iokit/Kernel/IOInterruptEventSource.cpp +++ b/iokit/Kernel/IOInterruptEventSource.cpp @@ -220,6 +220,19 @@ IOInterruptEventSource::interruptEventSource(OSObject *inOwner, return me; } +IOInterruptEventSource * +IOInterruptEventSource::interruptEventSource(OSObject *inOwner, + IOService *inProvider, + int inIntIndex, + ActionBlock inAction) +{ + IOInterruptEventSource * ies; + ies = IOInterruptEventSource::interruptEventSource(inOwner, (Action) NULL, inProvider, inIntIndex); + if (ies) 
ies->setActionBlock((IOEventSource::ActionBlock) inAction); + + return ies; +} + void IOInterruptEventSource::free() { if (provider && intIndex >= 0) @@ -300,6 +313,7 @@ bool IOInterruptEventSource::checkForWork() unsigned int cacheProdCount = producerCount; int numInts = cacheProdCount - consumerCount; IOInterruptEventAction intAction = (IOInterruptEventAction) action; + ActionBlock intActionBlock = (ActionBlock) actionBlock; bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; IOStatisticsCheckForWork(); @@ -322,7 +336,8 @@ bool IOInterruptEventSource::checkForWork() } // Call the handler - (*intAction)(owner, this, numInts); + if (kActionBlock & flags) (intActionBlock)(this, numInts); + else (*intAction)(owner, this, numInts); if (reserved->statistics) { if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingSecondLevelCountIndex)) { @@ -368,7 +383,8 @@ bool IOInterruptEventSource::checkForWork() } // Call the handler - (*intAction)(owner, this, -numInts); + if (kActionBlock & flags) (intActionBlock)(this, numInts); + else (*intAction)(owner, this, numInts); if (reserved->statistics) { if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingSecondLevelCountIndex)) { diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h index d1711fcc3..4b0cf6ffc 100644 --- a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -131,10 +131,10 @@ struct IODMACommandInternal UInt8 fPrepared; UInt8 fDoubleBuffer; UInt8 fNewMD; - UInt8 fLocalMapper; UInt8 fLocalMapperAllocValid; UInt8 fIOVMAddrValid; UInt8 fForceDoubleBuffer; + UInt8 fSetActiveNoMapper; vm_page_t fCopyPageAlloc; vm_page_t fCopyNext; diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp index 73057d7a9..385ce056f 100644 --- a/iokit/Kernel/IOLib.cpp +++ b/iokit/Kernel/IOLib.cpp @@ -548,8 +548,8 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP alignment = 1; alignMask = alignment - 1; - adjustedSize = (2 * size) + 
sizeofIOLibPageMallocHeader; - if (adjustedSize < size) return (0); + + if (os_mul_and_add_overflow(2, size, sizeofIOLibPageMallocHeader, &adjustedSize)) return (0); contiguous = (contiguous && (adjustedSize > page_size)) || (alignment > page_size); diff --git a/iokit/Kernel/IOMemoryCursor.cpp b/iokit/Kernel/IOMemoryCursor.cpp index 99999991d..3314a68a1 100644 --- a/iokit/Kernel/IOMemoryCursor.cpp +++ b/iokit/Kernel/IOMemoryCursor.cpp @@ -244,8 +244,13 @@ IOBigMemoryCursor::outputSegment(PhysicalSegment inSegment, IOPhysicalAddress * segment; segment = &((PhysicalSegment *) inSegments)[inSegmentIndex].location; +#if IOPhysSize == 64 + OSWriteBigInt64(segment, 0, inSegment.location); + OSWriteBigInt64(segment, sizeof(IOPhysicalAddress), inSegment.length); +#else OSWriteBigInt(segment, 0, inSegment.location); OSWriteBigInt(segment, sizeof(IOPhysicalAddress), inSegment.length); +#endif } IOBigMemoryCursor * @@ -291,8 +296,13 @@ IOLittleMemoryCursor::outputSegment(PhysicalSegment inSegment, IOPhysicalAddress * segment; segment = &((PhysicalSegment *) inSegments)[inSegmentIndex].location; +#if IOPhysSize == 64 + OSWriteLittleInt64(segment, 0, inSegment.location); + OSWriteLittleInt64(segment, sizeof(IOPhysicalAddress), inSegment.length); +#else OSWriteLittleInt(segment, 0, inSegment.location); OSWriteLittleInt(segment, sizeof(IOPhysicalAddress), inSegment.length); +#endif } IOLittleMemoryCursor * diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index 0d03e32ca..3c1c4674b 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -42,6 +42,7 @@ #include #include +#include #include "IOKitKernelInternal.h" @@ -873,7 +874,7 @@ IOGeneralMemoryDescriptor::memoryReferenceMap( * kIOMapPrefault is redundant in that case, so don't try to use it for UPL * operations. 
*/ - if ((reserved != NULL) && (reserved->dp.devicePager) && (_memoryEntries == NULL) && (_wireCount != 0)) + if ((reserved != NULL) && (reserved->dp.devicePager) && (_wireCount != 0)) options &= ~kIOMapPrefault; /* @@ -1704,6 +1705,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, && (VM_KERN_MEMORY_NONE == _kernelTag)) { _kernelTag = IOMemoryTag(kernel_map); + if (_kernelTag == gIOSurfaceTag) _userTag = VM_MEMORY_IOSURFACE; } if ( (kIOMemoryPersistent & _flags) && !_memRef) @@ -1962,7 +1964,11 @@ IOByteCount IOMemoryDescriptor::writeBytes assert(!remaining); +#if defined(__x86_64__) + // copypv does not cppvFsnk on intel +#else if (!srcAddr) performOperation(kIOMemoryIncoherentIOFlush, inoffset, length); +#endif return length - remaining; } @@ -3642,6 +3648,7 @@ IOReturn IOGeneralMemoryDescriptor::doMap( && (mapping->fAddressTask == _task) && (mapping->fAddressMap == get_task_map(_task)) && (options & kIOMapAnywhere) + && (!(kIOMapUnique & options)) && (1 == _rangesCount) && (0 == offset) && range0Addr @@ -4535,9 +4542,8 @@ IOMemoryMap * IOMemoryDescriptor::makeMapping( if (!(kIOMap64Bit & options)) panic("IOMemoryDescriptor::makeMapping !64bit"); #endif /* !__LP64__ */ - IOMemoryDescriptor * mapDesc = 0; - IOMemoryMap * result = 0; - OSIterator * iter; + IOMemoryDescriptor * mapDesc = 0; + __block IOMemoryMap * result = 0; IOMemoryMap * mapping = (IOMemoryMap *) __address; mach_vm_size_t offset = mapping->fOffset + __offset; @@ -4582,20 +4588,17 @@ IOMemoryMap * IOMemoryDescriptor::makeMapping( else { // look for a compatible existing mapping - if( (iter = OSCollectionIterator::withCollection(_mappings))) + if (_mappings) _mappings->iterateObjects(^(OSObject * object) { - IOMemoryMap * lookMapping; - while ((lookMapping = (IOMemoryMap *) iter->getNextObject())) + IOMemoryMap * lookMapping = (IOMemoryMap *) object; + if ((result = lookMapping->copyCompatible(mapping))) { - if ((result = lookMapping->copyCompatible(mapping))) - { - 
addMapping(result); - result->setMemoryDescriptor(this, offset); - break; - } + addMapping(result); + result->setMemoryDescriptor(this, offset); + return (true); } - iter->release(); - } + return (false); + }); if (result || (options & kIOMapReference)) { if (result != mapping) diff --git a/iokit/Kernel/IOMultiMemoryDescriptor.cpp b/iokit/Kernel/IOMultiMemoryDescriptor.cpp index 13a5a39a6..d54824088 100644 --- a/iokit/Kernel/IOMultiMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMultiMemoryDescriptor.cpp @@ -394,3 +394,31 @@ IOReturn IOMultiMemoryDescriptor::getPageCounts(IOByteCount * pResidentPageCount return (err); } + +uint64_t IOMultiMemoryDescriptor::getPreparationID( void ) +{ + + if (!super::getKernelReserved()) + { + return (kIOPreparationIDUnsupported); + } + + for (unsigned index = 0; index < _descriptorsCount; index++) + { + uint64_t preparationID = _descriptors[index]->getPreparationID(); + + if ( preparationID == kIOPreparationIDUnsupported ) + { + return (kIOPreparationIDUnsupported); + } + + if ( preparationID == kIOPreparationIDUnprepared ) + { + return (kIOPreparationIDUnprepared); + } + } + + super::setPreparationID(); + + return (super::getPreparationID()); +} diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index 94d6b75dd..4814258d1 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -36,13 +36,6 @@ #include #include -#if CONFIG_MACF -extern "C" { -#include -#include -}; -#endif /* MAC */ - #define super IOService #define kIONVRAMPrivilege kIOClientPrivilegeAdministrator @@ -296,11 +289,7 @@ bool IODTNVRAM::serializeProperties(OSSerialize *s) const variablePerm = getOFVariablePerm(key); if ((hasPrivilege || (variablePerm != kOFVariablePermRootOnly)) && - ( ! (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) ) -#if CONFIG_MACF - && (current_task() == kernel_task || mac_iokit_check_nvram_get(kauth_cred_get(), key->getCStringNoCopy()) == 0) -#endif - ) { } + ( ! 
(variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )) { } else { dict->removeObject(key); iter->reset(); @@ -332,12 +321,6 @@ OSObject *IODTNVRAM::copyProperty(const OSSymbol *aKey) const } if (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) return 0; -#if CONFIG_MACF - if (current_task() != kernel_task && - mac_iokit_check_nvram_get(kauth_cred_get(), aKey->getCStringNoCopy()) != 0) - return 0; -#endif - IOLockLock(_ofLock); theObject = _ofDict->getObject(aKey); if (theObject) theObject->retain(); @@ -384,7 +367,7 @@ bool IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) { bool result; UInt32 propType, propPerm; - OSString *tmpString; + OSString *tmpString = 0; OSObject *propObject = 0, *oldObject; if (_ofDict == 0) return false; @@ -399,12 +382,6 @@ bool IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) // Don't allow change of 'aapl,panic-info'. if (aKey->isEqualTo(kIODTNVRAMPanicInfoKey)) return false; -#if CONFIG_MACF - if (current_task() != kernel_task && - mac_iokit_check_nvram_set(kauth_cred_get(), aKey->getCStringNoCopy(), anObject) != 0) - return false; -#endif - // Make sure the object is of the correct type. propType = getOFVariableType(aKey); switch (propType) { @@ -458,6 +435,9 @@ bool IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject) if (oldObject) { oldObject->release(); } + if (tmpString) { + propObject->release(); + } IOLockUnlock(_ofLock); @@ -482,12 +462,6 @@ void IODTNVRAM::removeProperty(const OSSymbol *aKey) // Don't allow change of 'aapl,panic-info'. if (aKey->isEqualTo(kIODTNVRAMPanicInfoKey)) return; -#if CONFIG_MACF - if (current_task() != kernel_task && - mac_iokit_check_nvram_delete(kauth_cred_get(), aKey->getCStringNoCopy()) != 0) - return; -#endif - // If the object exists, remove it from the dictionary. 
IOLockLock(_ofLock); diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index b6ee3bce8..40a11e05e 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -59,11 +59,14 @@ #include #include #include +#include #include #include "IOServicePrivate.h" // _IOServiceInterestNotifier #include "IOServicePMPrivate.h" +#include + __BEGIN_DECLS #include #include @@ -182,6 +185,7 @@ IOReturn OSKextSystemSleepOrWake( UInt32 ); } extern "C" ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); extern "C" addr64_t kvtophys(vm_offset_t va); +extern "C" boolean_t kdp_has_polled_corefile(); static void idleSleepTimerExpired( thread_call_param_t, thread_call_param_t ); static void notifySystemShutdown( IOService * root, uint32_t messageType ); @@ -198,7 +202,8 @@ static const OSSymbol *sleepMessagePEFunction = NULL; #define kIORequestWranglerIdleKey "IORequestIdle" #define kDefaultWranglerIdlePeriod 1000 // in milliseconds -#define kIOSleepWakeDebugKey "Persistent-memory-note" +#define kIOSleepWakeFailureString "SleepWakeFailureString" +#define kIOOSWatchdogFailureString "OSWatchdogFailureString" #define kIOEFIBootRomFailureKey "wake-failure" #define kRD_AllPowerSources (kIOPMSupportedOnAC \ @@ -331,8 +336,13 @@ uuid_string_t bootsessionuuid_string; static uint32_t gDarkWakeFlags = kDarkWakeFlagHIDTickleNone; static uint32_t gNoIdleFlag = 0; +static uint32_t gSwdPanic = 0; +static uint32_t gSwdSleepTimeout = 0; +static uint32_t gSwdWakeTimeout = 0; +static uint32_t gSwdSleepWakeTimeout = 0; static PMStatsStruct gPMStats; + #if HIBERNATION static IOPMSystemSleepPolicyHandler gSleepPolicyHandler = 0; static IOPMSystemSleepPolicyVariables * gSleepPolicyVars = 0; @@ -346,11 +356,18 @@ static char gWakeReasonString[128]; static bool gWakeReasonSysctlRegistered = false; static AbsoluteTime gIOLastWakeAbsTime; static AbsoluteTime gIOLastSleepAbsTime; +static AbsoluteTime gUserActiveAbsTime; +static AbsoluteTime gUserInactiveAbsTime; #if 
defined(__i386__) || defined(__x86_64__) static bool gSpinDumpBufferFull = false; #endif +z_stream swd_zs; +vm_offset_t swd_zs_zmem; +//size_t swd_zs_zsize; +size_t swd_zs_zoffset; + static unsigned int gPMHaltBusyCount; static unsigned int gPMHaltIdleCount; static int gPMHaltDepth; @@ -359,7 +376,6 @@ static IOLock * gPMHaltLock = 0; static OSArray * gPMHaltArray = 0; static const OSSymbol * gPMHaltClientAcknowledgeKey = 0; static bool gPMQuiesced; -static uint32_t gIOPMPCIHostBridgeWakeDelay; // Constants used as arguments to IOPMrootDomain::informCPUStateChange #define kCPUUnknownIndex 9999999 @@ -697,7 +713,6 @@ extern "C" void IOSystemShutdownNotification(int stage) #if HIBERNATION startTime = mach_absolute_time(); IOHibernateSystemPostWake(true); - gRootDomain->swdDebugTeardown(); halt_log_enter("IOHibernateSystemPostWake", 0, mach_absolute_time() - startTime); #endif if (OSCompareAndSwap(0, 1, &gPagingOff)) @@ -789,76 +804,6 @@ void IOPMrootDomain::updateConsoleUsers(void) //****************************************************************************** -static void swdDebugSetupCallout( thread_call_param_t p0, thread_call_param_t p1 ) -{ - IOPMrootDomain * rootDomain = (IOPMrootDomain *) p0; - uint32_t notifyRef = (uint32_t)(uintptr_t) p1; - - rootDomain->swdDebugSetup(); - - if (p1) { - rootDomain->allowPowerChange(notifyRef); - } - DLOG("swdDebugSetupCallout finish\n"); -} - -void IOPMrootDomain::swdDebugSetup( ) -{ -#if HIBERNATION - static int32_t noDebugFile = -1; - if (noDebugFile == -1) { - if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) - noDebugFile = 1; - else if (PE_parse_boot_argn("swd_mem_only", &noDebugFile, sizeof(noDebugFile)) == false) - noDebugFile = 0; - } - - if ((noDebugFile == 1) || (gRootDomain->sleepWakeDebugIsWdogEnabled() == false)) { - return; - } - DLOG("swdDebugSetup state:%d\n", swd_DebugImageSetup); - if (swd_DebugImageSetup == FALSE) { - swd_DebugImageSetup = TRUE; - if (CAP_GAIN(kIOPMSystemCapabilityGraphics) || - 
(CAP_LOSS(kIOPMSystemCapabilityGraphics))) { - IOHibernateSystemPostWake(true); - IOCloseDebugDataFile(); - } - IOOpenDebugDataFile(kSleepWakeStackBinFilename, SWD_BUF_SIZE); - } -#endif - - -} - -static void swdDebugTeardownCallout( thread_call_param_t p0, thread_call_param_t p1 ) -{ - IOPMrootDomain * rootDomain = (IOPMrootDomain *) p0; - uint32_t notifyRef = (uint32_t)(uintptr_t) p1; - - rootDomain->swdDebugTeardown(); - if (p1) { - rootDomain->allowPowerChange(notifyRef); - } - DLOG("swdDebugTeardownCallout finish\n"); -} - -void IOPMrootDomain::swdDebugTeardown( ) -{ - -#if HIBERNATION - DLOG("swdDebugTeardown state:%d\n", swd_DebugImageSetup); - if (swd_DebugImageSetup == TRUE) { - swd_DebugImageSetup = FALSE; - IOCloseDebugDataFile(); - } -#endif - - -} -//****************************************************************************** - - static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 ) { IOService * rootDomain = (IOService *) p0; @@ -875,12 +820,10 @@ static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 ) // Block sleep until trim issued on previous wake path is completed. 
IOHibernateSystemPostWake(true); #endif - swdDebugSetupCallout(p0, NULL); } #if HIBERNATION else { - swdDebugTeardownCallout(p0, NULL); IOHibernateSystemPostWake(false); if (gRootDomain) @@ -943,6 +886,8 @@ static SYSCTL_PROC(_kern, OID_AUTO, waketime, SYSCTL_QUAD(_kern, OID_AUTO, wake_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gIOLastWakeAbsTime, ""); SYSCTL_QUAD(_kern, OID_AUTO, sleep_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gIOLastSleepAbsTime, ""); +SYSCTL_QUAD(_kern, OID_AUTO, useractive_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gUserActiveAbsTime, ""); +SYSCTL_QUAD(_kern, OID_AUTO, userinactive_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gUserInactiveAbsTime, ""); static int sysctl_willshutdown @@ -1081,6 +1026,11 @@ SYSCTL_PROC(_hw, OID_AUTO, targettype, static SYSCTL_INT(_debug, OID_AUTO, darkwake, CTLFLAG_RW, &gDarkWakeFlags, 0, ""); static SYSCTL_INT(_debug, OID_AUTO, noidle, CTLFLAG_RW, &gNoIdleFlag, 0, ""); +static SYSCTL_INT(_debug, OID_AUTO, swd_sleep_timeout, CTLFLAG_RW, &gSwdSleepTimeout, 0, ""); +static SYSCTL_INT(_debug, OID_AUTO, swd_wake_timeout, CTLFLAG_RW, &gSwdWakeTimeout, 0, ""); +static SYSCTL_INT(_debug, OID_AUTO, swd_timeout, CTLFLAG_RW, &gSwdSleepWakeTimeout, 0, ""); +static SYSCTL_INT(_debug, OID_AUTO, swd_panic, CTLFLAG_RW, &gSwdPanic, 0, ""); + static const OSSymbol * gIOPMSettingAutoWakeCalendarKey; static const OSSymbol * gIOPMSettingAutoWakeSecondsKey; @@ -1103,9 +1053,6 @@ bool IOPMrootDomain::start( IOService * nub ) OSIterator *psIterator; OSDictionary *tmpDict; IORootParent * patriarch; -#if defined(__i386__) || defined(__x86_64__) - IONotifier * notifier; -#endif super::start(nub); @@ -1151,9 +1098,11 @@ bool IOPMrootDomain::start( IOService * nub ) PE_parse_boot_argn("darkwake", &gDarkWakeFlags, sizeof(gDarkWakeFlags)); PE_parse_boot_argn("noidle", &gNoIdleFlag, sizeof(gNoIdleFlag)); + PE_parse_boot_argn("swd_sleeptimeout", &gSwdSleepTimeout, sizeof(gSwdSleepTimeout)); + PE_parse_boot_argn("swd_waketimeout", &gSwdWakeTimeout, 
sizeof(gSwdWakeTimeout)); + PE_parse_boot_argn("swd_timeout", &gSwdSleepWakeTimeout, sizeof(gSwdSleepWakeTimeout)); PE_parse_boot_argn("haltmspanic", &gHaltTimeMaxPanic, sizeof(gHaltTimeMaxPanic)); PE_parse_boot_argn("haltmslog", &gHaltTimeMaxLog, sizeof(gHaltTimeMaxLog)); - PE_parse_boot_argn("pcihostbridge_wake_delay", &gIOPMPCIHostBridgeWakeDelay, sizeof(gIOPMPCIHostBridgeWakeDelay)); queue_init(&aggressivesQueue); aggressivesThreadCall = thread_call_allocate(handleAggressivesFunction, this); @@ -1173,12 +1122,6 @@ bool IOPMrootDomain::start( IOService * nub ) diskSyncCalloutEntry = thread_call_allocate( &disk_sync_callout, (thread_call_param_t) this); - swdDebugSetupEntry = thread_call_allocate( - &swdDebugSetupCallout, - (thread_call_param_t) this); - swdDebugTearDownEntry = thread_call_allocate( - &swdDebugTeardownCallout, - (thread_call_param_t) this); updateConsoleUsersEntry = thread_call_allocate( &updateConsoleUsersCallout, (thread_call_param_t) this); @@ -1215,6 +1158,7 @@ bool IOPMrootDomain::start( IOService * nub ) // Will never transition to user inactive w/o wrangler. fullWakeReason = kFullWakeReasonLocalUser; userIsActive = userWasActive = true; + clock_get_uptime(&gUserActiveAbsTime); setProperty(gIOPMUserIsActiveKey, kOSBooleanTrue); // Set the default system capabilities at boot. 
@@ -1302,15 +1246,6 @@ bool IOPMrootDomain::start( IOService * nub ) #if defined(__i386__) || defined(__x86_64__) - if ((tmpDict = serviceMatching("IODTNVRAM"))) - { - notifier = addMatchingNotification( - gIOFirstPublishNotification, tmpDict, - (IOServiceMatchingNotificationHandler) &IONVRAMMatchPublished, - this, 0); - tmpDict->release(); - } - wranglerIdleSettings = NULL; OSNumber * wranglerIdlePeriod = NULL; wranglerIdleSettings = OSDictionary::withCapacity(1); @@ -2324,6 +2259,7 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) DLOG("PowerChangeDone: %u->%u\n", (uint32_t) previousPowerState, (uint32_t) getPowerState()); + notifierThread = current_thread(); switch ( getPowerState() ) { case SLEEP_STATE: { @@ -2376,7 +2312,6 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) } } assertOnWakeSecs = 0; - ((IOService *)this)->stop_watchdog_timer(); //14456299 lowBatteryCondition = false; #if DEVELOPMENT || DEBUG @@ -2403,7 +2338,6 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) IOLog("gIOLastWakeAbsTime: %lld\n", gIOLastWakeAbsTime); _highestCapability = 0; - ((IOService *)this)->start_watchdog_timer(); //14456299 #if HIBERNATION IOHibernateSystemWake(); #endif @@ -2611,6 +2545,7 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) #endif } + notifierThread = NULL; } //****************************************************************************** @@ -3082,19 +3017,7 @@ IOReturn IOPMrootDomain::sysPowerDownHandler( if (!gRootDomain) return kIOReturnUnsupported; - if (messageType == kIOMessageSystemWillSleep) - { -#if HIBERNATION - IOPowerStateChangeNotification *notify = - (IOPowerStateChangeNotification *)messageArgs; - - notify->returnValue = 30 * 1000 * 1000; - thread_call_enter1( - gRootDomain->swdDebugSetupEntry, - (thread_call_param_t)(uintptr_t) notify->powerRef); -#endif - } - else if (messageType == kIOMessageSystemCapabilityChange) + if (messageType == 
kIOMessageSystemCapabilityChange) { IOPMSystemCapabilityChangeParameters * params = (IOPMSystemCapabilityChangeParameters *) messageArgs; @@ -3161,25 +3084,6 @@ IOReturn IOPMrootDomain::sysPowerDownHandler( gRootDomain->diskSyncCalloutEntry, (thread_call_param_t)(uintptr_t) params->notifyRef); } - else if (CAP_WILL_CHANGE_TO_OFF(params, kIOPMSystemCapabilityGraphics) || - CAP_WILL_CHANGE_TO_ON(params, kIOPMSystemCapabilityGraphics)) - { - // WillChange for Full wake -> Darkwake - params->maxWaitForReply = 30 * 1000 * 1000; - thread_call_enter1( - gRootDomain->swdDebugSetupEntry, - (thread_call_param_t)(uintptr_t) params->notifyRef); - } - else if (CAP_DID_CHANGE_TO_OFF(params, kIOPMSystemCapabilityGraphics) || - CAP_DID_CHANGE_TO_ON(params, kIOPMSystemCapabilityGraphics)) - { - // DidChange for Full wake -> Darkwake - params->maxWaitForReply = 30 * 1000 * 1000; - thread_call_enter1( - gRootDomain->swdDebugTearDownEntry, - (thread_call_param_t)(uintptr_t) params->notifyRef); - - } #endif ret = kIOReturnSuccess; } @@ -3402,6 +3306,7 @@ void IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState if (SLEEP_STATE == newPowerState) { + notifierThread = current_thread(); if (!tasksSuspended) { AbsoluteTime deadline; @@ -3431,6 +3336,7 @@ void IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState if (secs) secs->release(); } + notifierThread = NULL; } } @@ -4524,7 +4430,8 @@ void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void ) { if ((kIOPMSleepTypeStandby == params.sleepType) && gIOHibernateStandbyDisabled && gSleepPolicyVars - && (!(kIOPMSleepFactorStandbyForced & gSleepPolicyVars->sleepFactors))) + && (!((kIOPMSleepFactorStandbyForced|kIOPMSleepFactorAutoPowerOffForced|kIOPMSleepFactorHibernateForced) + & gSleepPolicyVars->sleepFactors))) { standbyNixed = true; wakeNow = true; @@ -4925,8 +4832,6 @@ IOReturn IOPMrootDomain::restartSystem( void ) // MARK: - // MARK: System Capability -SYSCTL_UINT(_kern, OID_AUTO, 
pcihostbridge_wake_delay, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (uint32_t *)&gIOPMPCIHostBridgeWakeDelay, 0, ""); - //****************************************************************************** // tagPowerPlaneService // @@ -4997,7 +4902,7 @@ void IOPMrootDomain::tagPowerPlaneService( while (child != this) { - if ((gIOPMPCIHostBridgeWakeDelay ? (parent == pciHostBridgeDriver) : (parent->metaCast("IOPCIDevice") != NULL)) || + if (parent->metaCast("IOPCIDevice") || (parent == this)) { if (OSDynamicCast(IOPowerConnection, child)) @@ -6262,50 +6167,6 @@ bool IOPMrootDomain::displayWranglerMatchPublished( return true; } -#if defined(__i386__) || defined(__x86_64__) - -bool IOPMrootDomain::IONVRAMMatchPublished( - void * target, - void * refCon, - IOService * newService, - IONotifier * notifier) -{ - unsigned int len = 0; - IOPMrootDomain *rd = (IOPMrootDomain *)target; - OSNumber *statusCode = NULL; - - if (PEReadNVRAMProperty(kIOSleepWakeDebugKey, NULL, &len)) - { - statusCode = OSDynamicCast(OSNumber, rd->getProperty(kIOPMSleepWakeFailureCodeKey)); - if (statusCode != NULL) { - if (statusCode->unsigned64BitValue() != 0) { - rd->swd_flags |= SWD_BOOT_BY_SW_WDOG; - MSG("System was rebooted due to Sleep/Wake failure\n"); - } - else { - rd->swd_flags |= SWD_BOOT_BY_OSX_WDOG; - MSG("System was non-responsive and was rebooted by watchdog\n"); - } - } - - rd->swd_logBufMap = rd->sleepWakeDebugRetrieve(); - } - if (notifier) notifier->remove(); - return true; -} - -#else -bool IOPMrootDomain::IONVRAMMatchPublished( - void * target, - void * refCon, - IOService * newService, - IONotifier * notifier __unused) -{ - return false; -} - -#endif - //****************************************************************************** // reportUserInput // @@ -6663,19 +6524,9 @@ void IOPMrootDomain::dispatchPowerEvent( break; } - if (swd_flags & SWD_VALID_LOGS) { - if (swd_flags & SWD_LOGS_IN_MEM) { - sleepWakeDebugDumpFromMem(swd_logBufMap); - swd_logBufMap->release(); - 
swd_logBufMap = 0; - } - else if (swd_flags & SWD_LOGS_IN_FILE) - sleepWakeDebugDumpFromFile(); - } - else if (swd_flags & (SWD_BOOT_BY_SW_WDOG|SWD_BOOT_BY_OSX_WDOG)) { - // If logs are invalid, write the failure code - sleepWakeDebugDumpFromMem(NULL); - } + sleepWakeDebugMemAlloc(); + saveFailureData2File(); + // If lid is closed, re-send lid closed notification // now that booting is complete. if ( clamshellClosed ) @@ -7004,20 +6855,25 @@ void IOPMrootDomain::handlePowerNotification( UInt32 msg ) */ if (msg & kIOPMClamshellClosed) { - DLOG("Clamshell closed\n"); - // Received clamshel open message from clamshell controlling driver - // Update our internal state and tell general interest clients - clamshellClosed = true; - clamshellExists = true; + if (clamshellClosed && clamshellExists) { + DLOG("Ignoring redundant Clamshell close event\n"); + } + else { + DLOG("Clamshell closed\n"); + // Received clamshel open message from clamshell controlling driver + // Update our internal state and tell general interest clients + clamshellClosed = true; + clamshellExists = true; - // Tell PMCPU - informCPUStateChange(kInformLid, 1); + // Tell PMCPU + informCPUStateChange(kInformLid, 1); - // Tell general interest clients - sendClientClamshellNotification(); + // Tell general interest clients + sendClientClamshellNotification(); - // And set eval_clamshell = so we can attempt - eval_clamshell = true; + // And set eval_clamshell = so we can attempt + eval_clamshell = true; + } } /* @@ -7190,6 +7046,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) { userIsActive = true; userWasActive = true; + clock_get_uptime(&gUserActiveAbsTime); // Stay awake after dropping demand for display power on if (kFullWakeReasonDisplayOn == fullWakeReason) { @@ -7209,6 +7066,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) DLOG("evaluatePolicy( %d, 0x%x )\n", stimulus, arg); if (userIsActive) { + clock_get_uptime(&gUserInactiveAbsTime); userIsActive = 
false; clock_get_uptime(&userBecameInactiveTime); flags.bit.userBecameInactive = true; @@ -7987,15 +7845,20 @@ void IOPMrootDomain::tracePoint( uint8_t point ) pmTracer->tracePoint(point); } -void IOPMrootDomain::traceDetail(OSObject *object) +void IOPMrootDomain::traceDetail(OSObject *object, bool start) { - IOPMServiceInterestNotifier *notifier = OSDynamicCast(IOPMServiceInterestNotifier, object); + IOPMServiceInterestNotifier *notifier; + + if (systemBooting) { + return; + } + + notifier = OSDynamicCast(IOPMServiceInterestNotifier, object); if (!notifier) { - DLOG("Unknown notifier\n"); return; } - if (!systemBooting) { + if (start) { pmTracer->traceDetail( notifier->uuid0 >> 32 ); kdebugTrace(kPMLogSleepWakeMessage, pmTracer->getTracePhase(), notifier->msgType, notifier->uuid0, notifier->uuid1); if (notifier->identifier) { @@ -8005,8 +7868,15 @@ void IOPMrootDomain::traceDetail(OSObject *object) else { DLOG("trace point 0x%02x msg 0x%x\n", pmTracer->getTracePhase(), notifier->msgType); } + notifierThread = current_thread(); + notifierObject = notifier; + notifier->retain(); + } + else { + notifierThread = NULL; + notifierObject = NULL; + notifier->release(); } - } @@ -9762,13 +9632,24 @@ OSObject * IORootParent::copyProperty( const char * aKey) const return (IOService::copyProperty(aKey)); } +uint32_t IOPMrootDomain::getWatchdogTimeout() +{ + if (gSwdSleepWakeTimeout) { + gSwdSleepTimeout = gSwdWakeTimeout = gSwdSleepWakeTimeout; + } + if ((pmTracer->getTracePhase() < kIOPMTracePointSystemSleep) || + (pmTracer->getTracePhase() == kIOPMTracePointDarkWakeEntry)) { + return gSwdSleepTimeout ? gSwdSleepTimeout : WATCHDOG_SLEEP_TIMEOUT; + } + else { + return gSwdWakeTimeout ? 
gSwdWakeTimeout : WATCHDOG_WAKE_TIMEOUT; + } +} + #if defined(__i386__) || defined(__x86_64__) IOReturn IOPMrootDomain::restartWithStackshot() { - if ((swd_flags & SWD_WDOG_ENABLED) == 0) - return kIOReturnError; - takeStackshot(true, true, false); return kIOReturnSuccess; @@ -9779,200 +9660,688 @@ void IOPMrootDomain::sleepWakeDebugTrig(bool wdogTrigger) takeStackshot(wdogTrigger, false, false); } -void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool isSpinDump) +void IOPMrootDomain::tracePhase2String(uint32_t tracePhase, const char **phaseString, const char **description) { - swd_hdr * hdr = NULL; - addr64_t data[3]; - int wdog_panic = -1; - int stress_rack = -1; - int cnt = 0; - pid_t pid = 0; - kern_return_t kr = KERN_SUCCESS; - uint32_t flags; + switch (tracePhase) { - char * dstAddr; - uint32_t size; - uint32_t bytesRemaining; - unsigned bytesWritten = 0; - unsigned totalBytes = 0; - unsigned int len; - OSString * UUIDstring = NULL; - uint64_t code; - IOMemoryMap * logBufMap = NULL; + case kIOPMTracePointSleepStarted: + *phaseString = "kIOPMTracePointSleepStarted"; + *description = "starting sleep"; + break; + case kIOPMTracePointSleepApplications: + *phaseString = "kIOPMTracePointSleepApplications"; + *description = "notifying applications"; + break; - uint32_t bufSize; - uint32_t initialStackSize; + case kIOPMTracePointSleepPriorityClients: + *phaseString = "kIOPMTracePointSleepPriorityClients"; + *description = "notifying clients about upcoming system capability changes"; + break; - if (isSpinDump) { - if (_systemTransitionType != kSystemTransitionSleep && - _systemTransitionType != kSystemTransitionWake) - return; - } else { - if ( kIOSleepWakeWdogOff & gIOKitDebug ) - return; - } + case kIOPMTracePointSleepWillChangeInterests: + *phaseString = "kIOPMTracePointSleepWillChangeInterests"; + *description = "creating hibernation file or while calling rootDomain's clients about upcoming rootDomain's state changes"; + break; - if 
(wdogTrigger) { - PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)); - PE_parse_boot_argn("stress-rack", &stress_rack, sizeof(stress_rack)); - if ((wdog_panic == 1) || (stress_rack == 1) || (PEGetCoprocessorVersion() >= kCoprocessorVersion2)) { - // If boot-arg specifies to panic then panic. - panic("Sleep/Wake hang detected"); - return; - } - else if (swd_flags & SWD_BOOT_BY_SW_WDOG) { - // If current boot is due to this watch dog trigger restart in previous boot, - // then don't trigger again until at least 1 successful sleep & wake. - if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) { - IOLog("Shutting down due to repeated Sleep/Wake failures\n"); - if (!tasksSuspended) { - tasksSuspended = TRUE; - tasks_system_suspend(true); - } - PEHaltRestart(kPEHaltCPU); - return; - } - } + case kIOPMTracePointSleepPowerPlaneDrivers: + *phaseString = "kIOPMTracePointSleepPowerPlaneDrivers"; + *description = "calling power state change callbacks"; + break; - } + case kIOPMTracePointSleepDidChangeInterests: + *phaseString = "kIOPMTracePointSleepDidChangeInterests"; + *description = "calling rootDomain's clients about rootDomain's state changes"; + break; - if (isSpinDump) { - if (gSpinDumpBufferFull) - return; - if (swd_spindump_buffer == NULL) { - sleepWakeDebugSpinDumpMemAlloc(); - if (swd_spindump_buffer == NULL) return; - } + case kIOPMTracePointSleepCapabilityClients: + *phaseString = "kIOPMTracePointSleepCapabilityClients"; + *description = "notifying clients about current system capabilities"; + break; - bufSize = SWD_SPINDUMP_SIZE; - initialStackSize = SWD_INITIAL_SPINDUMP_SIZE; - } else { - if (sleepWakeDebugIsWdogEnabled() == false) - return; + case kIOPMTracePointSleepPlatformActions: + *phaseString = "kIOPMTracePointSleepPlatformActions"; + *description = "calling Quiesce/Sleep action callbacks"; + break; - if (swd_buffer == NULL) { - sleepWakeDebugMemAlloc(); - if (swd_buffer == NULL) return; - } + case kIOPMTracePointSleepCPUs: + *phaseString = 
"kIOPMTracePointSleepCPUs"; + *description = "halting all non-boot CPUs"; + break; - bufSize = SWD_BUF_SIZE; - initialStackSize = SWD_INITIAL_STACK_SIZE; - } + case kIOPMTracePointSleepPlatformDriver: + *phaseString = "kIOPMTracePointSleepPlatformDriver"; + *description = "executing platform specific code"; + break; - if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) - return; + case kIOPMTracePointHibernate: + *phaseString = "kIOPMTracePointHibernate"; + *description = "writing the hibernation image"; + break; - if (isSpinDump) { - hdr = (swd_hdr *)swd_spindump_buffer; - } - else { - hdr = (swd_hdr *)swd_buffer; - } + case kIOPMTracePointSystemSleep: + *phaseString = "kIOPMTracePointSystemSleep"; + *description = "in EFI/Bootrom after last point of entry to sleep"; + break; - memset(hdr->UUID, 0x20, sizeof(hdr->UUID)); - if ((UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey))) != NULL ) { + case kIOPMTracePointWakePlatformDriver: + *phaseString = "kIOPMTracePointWakePlatformDriver"; + *description = "executing platform specific code"; + break; - if (wdogTrigger || (!UUIDstring->isEqualTo(hdr->UUID))) { - const char *str = UUIDstring->getCStringNoCopy(); - snprintf(hdr->UUID, sizeof(hdr->UUID), "UUID: %s", str); - } - else { - DLOG("Data for current UUID already exists\n"); - goto exit; - } - } - dstAddr = (char*)hdr + hdr->spindump_offset; - bytesRemaining = bufSize - hdr->spindump_offset; + case kIOPMTracePointWakePlatformActions: + *phaseString = "kIOPMTracePointWakePlatformActions"; + *description = "calling Wake action callbacks"; + break; - /* if AppleOSXWatchdog triggered the stackshot, set the flag in the heaer */ - hdr->is_osx_watchdog = isOSXWatchdog; + case kIOPMTracePointWakeCPUs: + *phaseString = "kIOPMTracePointWakeCPUs"; + *description = "starting non-boot CPUs"; + break; - DLOG("Taking snapshot. 
bytesRemaining: %d\n", bytesRemaining); + case kIOPMTracePointWakeWillPowerOnClients: + *phaseString = "kIOPMTracePointWakeWillPowerOnClients"; + *description = "sending kIOMessageSystemWillPowerOn message to kernel and userspace clients"; + break; - flags = STACKSHOT_KCDATA_FORMAT|STACKSHOT_NO_IO_STATS|STACKSHOT_SAVE_KEXT_LOADINFO; - while (kr == KERN_SUCCESS) { + case kIOPMTracePointWakeWillChangeInterests: + *phaseString = "kIOPMTracePointWakeWillChangeInterests"; + *description = "calling rootDomain's clients about upcoming rootDomain's state changes"; + break; - if (cnt == 0) { - /* - * Take stackshot of all process on first sample. Size is restricted - * to SWD_INITIAL_STACK_SIZE - */ - pid = -1; - size = (bytesRemaining > initialStackSize) ? initialStackSize : bytesRemaining; - flags |= STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY; - } - else { - /* Take sample of kernel threads only */ - pid = 0; - size = bytesRemaining; - } + case kIOPMTracePointWakeDidChangeInterests: + *phaseString = "kIOPMTracePointWakeDidChangeInterests"; + *description = "calling rootDomain's clients about completed rootDomain's state changes"; + break; - kr = stack_snapshot_from_kernel(pid, dstAddr, size, flags, 0, &bytesWritten); - DLOG("stack_snapshot_from_kernel returned 0x%x. pid: %d bufsize:0x%x flags:0x%x bytesWritten: %d\n", - kr, pid, size, flags, bytesWritten); - if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) { - if (pid == -1) { - // Insufficient buffer when trying to take stackshot of user & kernel space threads. 
- // Continue to take stackshot of just kernel threads - ++cnt; - kr = KERN_SUCCESS; - continue; - } - else if (totalBytes == 0) { - MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags); - } - } + case kIOPMTracePointWakePowerPlaneDrivers: + *phaseString = "kIOPMTracePointWakePowerPlaneDrivers"; + *description = "calling power state change callbacks"; + break; - dstAddr += bytesWritten; - totalBytes += bytesWritten; - bytesRemaining -= bytesWritten; + case kIOPMTracePointWakeCapabilityClients: + *phaseString = "kIOPMTracePointWakeCapabilityClients"; + *description = "informing clients about current system capabilities"; + break; - if (++cnt == 10) { - break; - } - IOSleep(10); // 10 ms - } + case kIOPMTracePointWakeApplications: + *phaseString = "kIOPMTracePointWakeApplications"; + *description = "sending asynchronous kIOMessageSystemHasPoweredOn message to userspace clients"; + break; - hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset); + case kIOPMTracePointDarkWakeEntry: + *phaseString = "kIOPMTracePointDarkWakeEntry"; + *description = "entering darkwake on way to sleep"; + break; + case kIOPMTracePointDarkWakeExit: + *phaseString = "kIOPMTracePointDarkWakeExit"; + *description = "entering fullwake from darkwake"; + break; - memset(hdr->spindump_status, 0x20, sizeof(hdr->spindump_status)); - code = pmTracer->getPMStatusCode(); - memset(hdr->PMStatusCode, 0x20, sizeof(hdr->PMStatusCode)); - snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: %08x %08x", - (uint32_t)((code >> 32) & 0xffffffff), (uint32_t)(code & 0xffffffff)); - memset(hdr->reason, 0x20, sizeof(hdr->reason)); - if (isSpinDump) { - snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: PSC Delay\n\n"); - gRootDomain->swd_lock = 0; - gSpinDumpBufferFull = true; - return; + default: + *phaseString = NULL; + *description = NULL; + } + +} + +void IOPMrootDomain::saveFailureData2File( ) +{ + unsigned int len = 0; + char failureStr[512]; 
+ errno_t error; + char *outbuf; + bool oswatchdog = false; + + if (!PEReadNVRAMProperty(kIOSleepWakeFailureString, NULL, &len) && + !PEReadNVRAMProperty(kIOOSWatchdogFailureString, NULL, &len) ) { + DLOG("No SleepWake failure or OSWatchdog failure string to read\n"); + return; + } + + if (len == 0) { + DLOG("Ignoring zero byte SleepWake failure string\n"); + goto exit; + } + + if (len > sizeof(failureStr)) { + len = sizeof(failureStr); + } + failureStr[0] = 0; + if (PEReadNVRAMProperty(kIOSleepWakeFailureString, failureStr, &len) == false) { + if (PEReadNVRAMProperty(kIOOSWatchdogFailureString, failureStr, &len)) { + oswatchdog = true; + } + } + if (failureStr[0] != 0) { + error = sleepWakeDebugSaveFile(oswatchdog ? kOSWatchdogFailureStringFile : kSleepWakeFailureStringFile, + failureStr, len); + if (error) { + DLOG("Failed to save SleepWake failure string to file. error:%d\n", error); + } + else { + DLOG("Saved SleepWake failure string to file.\n"); + } + if (!oswatchdog) { + swd_flags |= SWD_BOOT_BY_SW_WDOG; + } + } + + if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) + goto exit; + + if (swd_buffer) { + unsigned int len = 0; + errno_t error; + char nvram_var_name_buffer[20]; + unsigned int concat_len = 0; + swd_hdr *hdr = NULL; + + + hdr = (swd_hdr *)swd_buffer; + outbuf = (char *)hdr + hdr->spindump_offset; + + for (int i=0; i < 8; i++) { + snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, i+1); + if (!PEReadNVRAMProperty(nvram_var_name_buffer, NULL, &len)) { + LOG("No SleepWake blob to read beyond chunk %d\n", i); + break; + } + if (PEReadNVRAMProperty(nvram_var_name_buffer, outbuf+concat_len, &len) == FALSE) { + PERemoveNVRAMProperty(nvram_var_name_buffer); + LOG("Could not read the property :-(\n"); + break; + } + PERemoveNVRAMProperty(nvram_var_name_buffer); + concat_len += len; + } + LOG("Concatenated length for the SWD blob %d\n", concat_len); + + if (concat_len) { + error = sleepWakeDebugSaveFile(oswatchdog ? 
kOSWatchdogStacksFilename : kSleepWakeStacksFilename, + outbuf, concat_len); + if (error) { + LOG("Failed to save SleepWake zipped data to file. error:%d\n", error); + } else { + LOG("Saved SleepWake zipped data to file.\n"); + } + } + + } + else { + LOG("No buffer allocated to save failure stackshot\n"); + } + + + gRootDomain->swd_lock = 0; +exit: + PERemoveNVRAMProperty(oswatchdog ? kIOOSWatchdogFailureString : kIOSleepWakeFailureString); + return; +} + + +void IOPMrootDomain::getFailureData(thread_t *thread, char *failureStr, size_t strLen) +{ + IORegistryIterator * iter; + IORegistryEntry * entry; + IOService * node; + bool nodeFound = false; + + const void * callMethod = NULL; + const char * objectName = NULL; + uint32_t timeout = getWatchdogTimeout(); + const char * phaseString = NULL; + const char * phaseDescription = NULL; + + IOPMServiceInterestNotifier *notifier = OSDynamicCast(IOPMServiceInterestNotifier, notifierObject); + uint32_t tracePhase = pmTracer->getTracePhase(); + + *thread = NULL; + if ((tracePhase < kIOPMTracePointSystemSleep) || (tracePhase == kIOPMTracePointDarkWakeEntry)) { + snprintf(failureStr, strLen, "%sSleep transition timed out after %d seconds", failureStr, timeout); + } + else { + snprintf(failureStr, strLen, "%sWake transition timed out after %d seconds", failureStr,timeout); + } + tracePhase2String(tracePhase, &phaseString, &phaseDescription); + + if (notifierThread) { + if (notifier && (notifier->identifier)) { + objectName = notifier->identifier->getCStringNoCopy(); + } + *thread = notifierThread; + } + else { + + iter = IORegistryIterator::iterateOver( + getPMRootDomain(), gIOPowerPlane, kIORegistryIterateRecursively); + + if (iter) + { + while ((entry = iter->getNextObject())) + { + node = OSDynamicCast(IOService, entry); + if (!node) + continue; + if (OSDynamicCast(IOPowerConnection, node)) { + continue; + } + + if(node->getBlockingDriverCall(thread, &callMethod)) { + nodeFound = true; + break; + } + } + iter->release(); + } 
+ if (nodeFound) { + OSKext *kext = OSKext::lookupKextWithAddress((vm_address_t)callMethod); + if (kext) { + objectName = kext->getIdentifierCString(); + } + } + } + if (phaseDescription) { + snprintf(failureStr, strLen, "%s while %s.", failureStr, phaseDescription); + } + if (objectName) { + snprintf(failureStr, strLen, "%s Suspected bundle: %s.", failureStr, objectName); + } + if (*thread) { + snprintf(failureStr, strLen, "%s Thread 0x%llx.", failureStr, thread_tid(*thread)); + } + + DLOG("%s\n", failureStr); +} + +struct swd_stackshot_compressed_data +{ + z_output_func zoutput; + size_t zipped; + uint64_t totalbytes; + uint64_t lastpercent; + IOReturn error; + unsigned outremain; + unsigned outlen; + unsigned writes; + Bytef * outbuf; +}; +struct swd_stackshot_compressed_data swd_zip_var = { }; + +static void *swd_zs_alloc(void *__unused ref, u_int items, u_int size) +{ + void *result; + LOG("Alloc in zipping %d items of size %d\n", items, size); + + result = (void *)(swd_zs_zmem + swd_zs_zoffset); + swd_zs_zoffset += ~31L & (31 + (items * size)); // 32b align for vector crc + LOG("Offset %zu\n", swd_zs_zoffset); + return (result); +} + +static int swd_zinput(z_streamp strm, Bytef *buf, unsigned size) +{ + unsigned len; + + len = strm->avail_in; + + if (len > size) + len = size; + if (len == 0) + return 0; + + if (strm->next_in != (Bytef *) strm) + memcpy(buf, strm->next_in, len); + else + bzero(buf, len); + + strm->adler = z_crc32(strm->adler, buf, len); + + strm->avail_in -= len; + strm->next_in += len; + strm->total_in += len; + + return (int)len; +} + +static int swd_zoutput(z_streamp strm, Bytef *buf, unsigned len) +{ + unsigned int i = 0; + // if outlen > max size don't add to the buffer + if (strm && buf) { + if (swd_zip_var.outlen + len > SWD_COMPRESSED_BUFSIZE) { + LOG("No space to GZIP... 
not writing to NVRAM\n"); + return (len); + } + } + for (i = 0; i < len; i++) { + *(swd_zip_var.outbuf + swd_zip_var.outlen + i) = *(buf +i); + } + swd_zip_var.outlen += len; + return (len); +} +static void swd_zs_free(void * __unused ref, void * __unused ptr) {} + +static int swd_compress(char *inPtr, char *outPtr, size_t numBytes) +{ + int wbits = 12; + int memlevel = 3; + + if (!swd_zs.zalloc) { + swd_zs.zalloc = swd_zs_alloc; + swd_zs.zfree = swd_zs_free; + if (deflateInit2(&swd_zs, Z_BEST_SPEED, Z_DEFLATED, wbits + 16, memlevel, Z_DEFAULT_STRATEGY)) { + // allocation failed + bzero(&swd_zs, sizeof(swd_zs)); + // swd_zs_zoffset = 0; + } else { + LOG("PMRD inited the zlib allocation routines\n"); + } } - snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n"); - data[0] = round_page(sizeof(swd_hdr) + hdr->spindump_size); - /* Header & rootdomain log is constantly changing and is not covered by CRC */ - data[1] = hdr->crc = crc32(0, ((char*)swd_buffer+hdr->spindump_offset), hdr->spindump_size); - data[2] = kvtophys((vm_offset_t)swd_buffer); - len = sizeof(addr64_t)*3; - DLOG("bytes: 0x%llx crc:0x%llx paddr:0x%llx\n", - data[0], data[1], data[2]); - if (PEWriteNVRAMProperty(kIOSleepWakeDebugKey, data, len) == false) + swd_zip_var.zipped = 0; + swd_zip_var.totalbytes = 0; // should this be the max that we have? + swd_zip_var.lastpercent = 0; + swd_zip_var.error = kIOReturnSuccess; + swd_zip_var.outremain = 0; + swd_zip_var.outlen = 0; + swd_zip_var.writes = 0; + swd_zip_var.outbuf = (Bytef *)outPtr; + + swd_zip_var.totalbytes = numBytes; + + swd_zs.avail_in = 0; + swd_zs.next_in = NULL; + swd_zs.avail_out = 0; + swd_zs.next_out = NULL; + + deflateResetWithIO(&swd_zs, swd_zinput, swd_zoutput); + + z_stream *zs; + int zr; + zs = &swd_zs; + + zr = Z_OK; + + while (swd_zip_var.error >= 0) { + if (!zs->avail_in) { + zs->next_in = (unsigned char *)inPtr ? (Bytef *)inPtr : (Bytef *)zs; /* zero marker? 
*/ + zs->avail_in = numBytes; + } + if (!zs->avail_out) { + zs->next_out = (Bytef *)zs; + zs->avail_out = UINT32_MAX; + } + zr = deflate(zs, Z_NO_FLUSH); + if (Z_STREAM_END == zr) + break; + if (zr != Z_OK) { + LOG("ZERR %d\n", zr); + swd_zip_var.error = zr; + } else { + if (zs->total_in == numBytes) { + break; + } + } + } + zr = Z_OK; + //now flush the stream + while (swd_zip_var.error >= 0) { + if (!zs->avail_out) { + zs->next_out = (Bytef *)zs; + zs->avail_out = UINT32_MAX; + } + zr = deflate(zs, Z_FINISH); + if (Z_STREAM_END == zr) { + break; + } + if (zr != Z_OK) { + LOG("ZERR %d\n", zr); + swd_zip_var.error = zr; + } else { + if (zs->total_in == numBytes) { + LOG("Total output size %d\n", swd_zip_var.outlen); + break; + } + } + } + + return swd_zip_var.outlen; +} + +void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool isSpinDump) +{ + swd_hdr * hdr = NULL; + int wdog_panic = -1; + int cnt = 0; + pid_t pid = 0; + kern_return_t kr = KERN_SUCCESS; + uint32_t flags; + + char * dstAddr; + uint32_t size; + uint32_t bytesRemaining; + unsigned bytesWritten = 0; + unsigned totalBytes = 0; + OSString * UUIDstring = NULL; + + char failureStr[512]; + thread_t thread = NULL; + const char * uuid; + + + uint32_t bufSize; + uint32_t initialStackSize; + + + + failureStr[0] = 0; + if (isSpinDump) { + if (_systemTransitionType != kSystemTransitionSleep && + _systemTransitionType != kSystemTransitionWake) + return; + + if (gSpinDumpBufferFull) + return; + if (swd_spindump_buffer == NULL) { + sleepWakeDebugSpinDumpMemAlloc(); + if (swd_spindump_buffer == NULL) return; + } + + bufSize = SWD_SPINDUMP_SIZE; + initialStackSize = SWD_INITIAL_SPINDUMP_SIZE; + hdr = (swd_hdr *)swd_spindump_buffer; + + } else { + if ( (kIOSleepWakeWdogOff & gIOKitDebug) || systemBooting || systemShutdown || gWillShutdown) + return; + + if (isOSXWatchdog) { + snprintf(failureStr, sizeof(failureStr), "Stackshot Reason: "); + snprintf(failureStr, sizeof(failureStr), "%smacOS 
watchdog triggered failure\n", failureStr); + } + else if (wdogTrigger) { + if ((UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey))) != NULL ) { + uuid = UUIDstring->getCStringNoCopy(); + snprintf(failureStr, sizeof(failureStr), "UUID: %s\n", uuid); + } + + snprintf(failureStr, sizeof(failureStr), "%sStackshot Reason: ", failureStr); + getFailureData(&thread, failureStr, sizeof(failureStr)); + if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) { + goto skip_stackshot; + } + + } + else { + snprintf(failureStr, sizeof(failureStr), "%sStackshot triggered for debugging stackshot collection.\n", failureStr); + } + // Take only one stackshot in this case. + cnt = SWD_MAX_STACKSHOTS-1; + + if (swd_buffer == NULL) { + sleepWakeDebugMemAlloc(); + if (swd_buffer == NULL) return; + } + hdr = (swd_hdr *)swd_buffer; + + bufSize = hdr->alloc_size;; + initialStackSize = bufSize; + + } + + + if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) + return; + + + dstAddr = (char*)hdr + hdr->spindump_offset; + bytesRemaining = bufSize - hdr->spindump_offset; + + DLOG("Taking snapshot. bytesRemaining: %d\n", bytesRemaining); + + flags = STACKSHOT_KCDATA_FORMAT|STACKSHOT_NO_IO_STATS|STACKSHOT_SAVE_KEXT_LOADINFO|STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY|STACKSHOT_THREAD_WAITINFO; + while (kr == KERN_SUCCESS) { + + if (cnt == 0) { + /* + * Take stackshot of all process on first sample. Size is restricted + * to SWD_INITIAL_STACK_SIZE + */ + pid = -1; + size = (bytesRemaining > initialStackSize) ? initialStackSize : bytesRemaining; + } + else { + /* Take sample of kernel threads only */ + pid = 0; + size = bytesRemaining; + } + + kr = stack_snapshot_from_kernel(pid, dstAddr, size, flags, 0, &bytesWritten); + DLOG("stack_snapshot_from_kernel returned 0x%x. 
pid: %d bufsize:0x%x flags:0x%x bytesWritten: %d\n", + kr, pid, size, flags, bytesWritten); + if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) { + if (pid == -1) { + // Insufficient buffer when trying to take stackshot of user & kernel space threads. + // Continue to take stackshot of just kernel threads + ++cnt; + kr = KERN_SUCCESS; + continue; + } + else if (totalBytes == 0) { + MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags); + } + } + + dstAddr += bytesWritten; + totalBytes += bytesWritten; + bytesRemaining -= bytesWritten; + + if (++cnt == SWD_MAX_STACKSHOTS) { + break; + } + IOSleep(10); // 10 ms + } + + hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset); + + memset(hdr->reason, 0x20, sizeof(hdr->reason)); + if (isSpinDump) { + snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Power State Change Delay\n\n"); + gRootDomain->swd_lock = 0; + gSpinDumpBufferFull = true; + return; + } + + // Compress stackshot and save to NVRAM { - DLOG("Failed to update nvram boot-args\n"); - goto exit; + char *outbuf = (char *)swd_compressed_buffer; + int outlen = 0; + int num_chunks = 0; + int max_chunks = 0; + int leftover = 0; + char nvram_var_name_buffer[20]; + + outlen = swd_compress((char*)hdr + hdr->spindump_offset, outbuf, bytesWritten); + + if (outlen) { + max_chunks = outlen / (2096 - 200); + leftover = outlen % (2096 - 200); + + if (max_chunks < 8) { + for (num_chunks = 0; num_chunks < max_chunks; num_chunks++) { + snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks+1); + if (PEWriteNVRAMProperty(nvram_var_name_buffer, (outbuf + (num_chunks * (2096-200))), (2096 - 200)) == FALSE) { + LOG("Failed to update NVRAM %d\n", num_chunks); + break; + } + } + if (leftover) { + snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks+1); + if (PEWriteNVRAMProperty(nvram_var_name_buffer, (outbuf + (num_chunks * (2096-200))), leftover) == FALSE) { + LOG("Failed to 
update NVRAM with leftovers\n"); + } + } + } + else { + LOG("Compressed failure stackshot is too large. size=%d bytes\n", outlen); + } + } } -exit: + if (failureStr[0]) { + if (!isOSXWatchdog) { + // append sleep-wake failure code + snprintf(failureStr, sizeof(failureStr), "%s\nFailure code:: 0x%08x %08x\n", + failureStr, pmTracer->getTraceData(), pmTracer->getTracePhase()); + if (PEWriteNVRAMProperty(kIOSleepWakeFailureString, failureStr, strlen(failureStr)) == false) { + DLOG("Failed to write SleepWake failure string\n"); + } + } + else { + if (PEWriteNVRAMProperty(kIOOSWatchdogFailureString, failureStr, strlen(failureStr)) == false) { + DLOG("Failed to write OSWatchdog failure string\n"); + } + } + } gRootDomain->swd_lock = 0; +skip_stackshot: if (wdogTrigger) { - IOLog("Restarting to collect Sleep wake debug logs\n"); + PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic)); + + if ((wdog_panic == 1) || (PEGetCoprocessorVersion() >= kCoprocessorVersion2)) { + if (thread) { + panic_with_thread_context(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, thread, "%s", failureStr); + } + else { + panic_with_options(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, "%s", failureStr); + } + return; + } + else if (swd_flags & SWD_BOOT_BY_SW_WDOG) { + // If current boot is due to this watch dog trigger restart in previous boot, + // then don't trigger again until at least 1 successful sleep & wake. 
+ if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) { + LOG("Shutting down due to repeated Sleep/Wake failures\n"); + if (!tasksSuspended) { + tasksSuspended = TRUE; + tasks_system_suspend(true); + } + PEHaltRestart(kPEHaltCPU); + return; + } + } + } + + + if (wdogTrigger) { + LOG("Restarting to collect Sleep wake debug logs\n"); if (!tasksSuspended) { tasksSuspended = TRUE; tasks_system_suspend(true); @@ -9981,20 +10350,16 @@ void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool is PEHaltRestart(kPERestartCPU); } else { - logBufMap = sleepWakeDebugRetrieve(); - if (logBufMap) { - sleepWakeDebugDumpFromMem(logBufMap); - logBufMap->release(); - logBufMap = 0; - } + saveFailureData2File(); } } void IOPMrootDomain::sleepWakeDebugMemAlloc( ) { - vm_size_t size = SWD_BUF_SIZE; + vm_size_t size = SWD_STACKSHOT_SIZE + SWD_COMPRESSED_BUFSIZE + SWD_ZLIB_BUFSIZE; swd_hdr *hdr = NULL; + void *bufPtr = NULL; IOBufferMemoryDescriptor *memDesc = NULL; @@ -10008,28 +10373,31 @@ void IOPMrootDomain::sleepWakeDebugMemAlloc( ) if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) return; - // Try allocating above 4GB. 
If that fails, try at 2GB - memDesc = IOBufferMemoryDescriptor::inTaskWithPhysicalMask( - kernel_task, kIOMemoryPhysicallyContiguous|kIOMemoryMapperNone, - size, 0xFFFFFFFF00000000ULL); - if (!memDesc) { - memDesc = IOBufferMemoryDescriptor::inTaskWithPhysicalMask( - kernel_task, kIOMemoryPhysicallyContiguous|kIOMemoryMapperNone, - size, 0xFFFFFFFF10000000ULL); - } - + memDesc = IOBufferMemoryDescriptor::inTaskWithOptions( + kernel_task, kIODirectionIn|kIOMemoryMapperNone, + size); if (memDesc == NULL) { DLOG("Failed to allocate Memory descriptor for sleepWake debug\n"); goto exit; } + bufPtr = memDesc->getBytesNoCopy(); - hdr = (swd_hdr *)memDesc->getBytesNoCopy(); + // Carve out memory for zlib routines + swd_zs_zmem = (vm_offset_t)bufPtr; + bufPtr = (char *)bufPtr + SWD_ZLIB_BUFSIZE; + + // Carve out memory for compressed stackshots + swd_compressed_buffer = bufPtr; + bufPtr = (char *)bufPtr + SWD_COMPRESSED_BUFSIZE; + + // Remaining is used for holding stackshot + hdr = (swd_hdr *)bufPtr; memset(hdr, 0, sizeof(swd_hdr)); hdr->signature = SWD_HDR_SIGNATURE; - hdr->alloc_size = size; + hdr->alloc_size = SWD_STACKSHOT_SIZE; hdr->spindump_offset = sizeof(swd_hdr); swd_buffer = (void *)hdr; @@ -10077,15 +10445,11 @@ void IOPMrootDomain::sleepWakeDebugSpinDumpMemAlloc( ) void IOPMrootDomain::sleepWakeDebugEnableWdog() { - swd_flags |= SWD_WDOG_ENABLED; - if (!swd_buffer) - sleepWakeDebugMemAlloc(); } bool IOPMrootDomain::sleepWakeDebugIsWdogEnabled() { - return ((swd_flags & SWD_WDOG_ENABLED) && - !systemBooting && !systemShutdown && !gWillShutdown); + return (!systemBooting && !systemShutdown && !gWillShutdown); } void IOPMrootDomain::sleepWakeDebugSaveSpinDumpFile() @@ -10120,7 +10484,7 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int if (vnode_open(name, (O_CREAT | FWRITE | O_NOFOLLOW), S_IRUSR|S_IRGRP|S_IROTH, VNODE_LOOKUP_NOFOLLOW, &vp, ctx) != 0) { - IOLog("Failed to open the file %s\n", name); + LOG("Failed to open the file 
%s\n", name); swd_flags |= SWD_FILEOP_ERROR; goto exit; } @@ -10129,7 +10493,7 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int /* Don't dump to non-regular files or files with links. */ if (vp->v_type != VREG || vnode_getattr(vp, &va, ctx) || va.va_nlink != 1) { - IOLog("Bailing as this is not a regular file\n"); + LOG("Bailing as this is not a regular file\n"); swd_flags |= SWD_FILEOP_ERROR; goto exit; } @@ -10140,9 +10504,9 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int if (buf != NULL) { error = vn_rdwr(UIO_WRITE, vp, buf, len, 0, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0, vfs_context_proc(ctx)); + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, vfs_context_proc(ctx)); if (error != 0) { - IOLog("Failed to save sleep wake log. err 0x%x\n", error); + LOG("Failed to save sleep wake log. err 0x%x\n", error); swd_flags |= SWD_FILEOP_ERROR; } else { @@ -10158,515 +10522,6 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int } -errno_t IOPMrootDomain::sleepWakeDebugCopyFile( - struct vnode *srcVp, - vfs_context_t srcCtx, - char *tmpBuf, uint64_t tmpBufSize, - uint64_t srcOffset, - const char *dstFname, - uint64_t numBytes, - uint32_t crc) -{ - struct vnode *vp = NULL; - vfs_context_t ctx = vfs_context_create(vfs_context_current()); - struct vnode_attr va; - errno_t error = EIO; - uint64_t bytesToRead, bytesToWrite; - uint64_t readFileOffset, writeFileOffset, srcDataOffset; - uint32_t newcrc = 0; - - if (vnode_open(dstFname, (O_CREAT | FWRITE | O_NOFOLLOW), - S_IRUSR|S_IRGRP|S_IROTH, VNODE_LOOKUP_NOFOLLOW, &vp, ctx) != 0) - { - IOLog("Failed to open the file %s\n", dstFname); - swd_flags |= SWD_FILEOP_ERROR; - goto exit; - } - VATTR_INIT(&va); - VATTR_WANTED(&va, va_nlink); - /* Don't dump to non-regular files or files with links. 
*/ - if (vp->v_type != VREG || - vnode_getattr(vp, &va, ctx) || va.va_nlink != 1) { - IOLog("Bailing as this is not a regular file\n"); - swd_flags |= SWD_FILEOP_ERROR; - goto exit; - } - VATTR_INIT(&va); - VATTR_SET(&va, va_data_size, 0); - vnode_setattr(vp, &va, ctx); - - writeFileOffset = 0; - while(numBytes) { - bytesToRead = (round_page(numBytes) > tmpBufSize) ? tmpBufSize : round_page(numBytes); - readFileOffset = trunc_page(srcOffset); - - DLOG("Read file (numBytes:0x%llx offset:0x%llx)\n", bytesToRead, readFileOffset); - error = vn_rdwr(UIO_READ, srcVp, tmpBuf, bytesToRead, readFileOffset, - UIO_SYSSPACE, IO_SKIP_ENCRYPTION|IO_SYNC|IO_NODELOCKED|IO_UNIT|IO_NOCACHE, - vfs_context_ucred(srcCtx), (int *) 0, - vfs_context_proc(srcCtx)); - if (error) { - IOLog("Failed to read file(numBytes:0x%llx)\n", bytesToRead); - swd_flags |= SWD_FILEOP_ERROR; - break; - } - - srcDataOffset = (uint64_t)tmpBuf + (srcOffset - readFileOffset); - bytesToWrite = bytesToRead - (srcOffset - readFileOffset); - if (bytesToWrite > numBytes) bytesToWrite = numBytes; - - if (crc) { - newcrc = crc32(newcrc, (void *)srcDataOffset, bytesToWrite); - } - DLOG("Write file (numBytes:0x%llx offset:0x%llx)\n", bytesToWrite, writeFileOffset); - error = vn_rdwr(UIO_WRITE, vp, (char *)srcDataOffset, bytesToWrite, writeFileOffset, - UIO_SYSSPACE, IO_SYNC|IO_NODELOCKED|IO_UNIT, - vfs_context_ucred(ctx), (int *) 0, - vfs_context_proc(ctx)); - if (error) { - IOLog("Failed to write file(numBytes:0x%llx)\n", bytesToWrite); - swd_flags |= SWD_FILEOP_ERROR; - break; - } - - writeFileOffset += bytesToWrite; - numBytes -= bytesToWrite; - srcOffset += bytesToWrite; - - } - if (crc != newcrc) { - /* Set stackshot size to 0 if crc doesn't match */ - VATTR_INIT(&va); - VATTR_SET(&va, va_data_size, 0); - vnode_setattr(vp, &va, ctx); - - IOLog("CRC check failed. 
expected:0x%x actual:0x%x\n", crc, newcrc); - swd_flags |= SWD_DATA_CRC_ERROR; - error = EFAULT; - } -exit: - if (vp) { - error = vnode_close(vp, FWRITE, ctx); - DLOG("vnode_close on file %s returned 0x%x\n",dstFname, error); - } - if (ctx) vfs_context_rele(ctx); - - return error; - - - -} -uint32_t IOPMrootDomain::checkForValidDebugData(const char *fname, vfs_context_t *ctx, - void *tmpBuf, struct vnode **vp) -{ - int rc; - uint64_t hdrOffset; - uint32_t error = 0; - - struct vnode_attr va; - IOHibernateImageHeader *imageHdr; - - *vp = NULL; - if (vnode_open(fname, (FREAD | O_NOFOLLOW), 0, - VNODE_LOOKUP_NOFOLLOW, vp, *ctx) != 0) - { - DMSG("sleepWakeDebugDumpFromFile: Failed to open the file %s\n", fname); - goto err; - } - VATTR_INIT(&va); - VATTR_WANTED(&va, va_nlink); - VATTR_WANTED(&va, va_data_alloc); - if ((*vp)->v_type != VREG || - vnode_getattr((*vp), &va, *ctx) || va.va_nlink != 1) { - IOLog("sleepWakeDebugDumpFromFile: Bailing as %s is not a regular file\n", fname); - error = SWD_FILEOP_ERROR; - goto err; - } - - /* Read the sleepimage file header */ - rc = vn_rdwr(UIO_READ, *vp, (char *)tmpBuf, round_page(sizeof(IOHibernateImageHeader)), 0, - UIO_SYSSPACE, IO_SKIP_ENCRYPTION|IO_SYNC|IO_NODELOCKED|IO_UNIT|IO_NOCACHE, - vfs_context_ucred(*ctx), (int *) 0, - vfs_context_proc(*ctx)); - if (rc != 0) { - IOLog("sleepWakeDebugDumpFromFile: Failed to read header size %llu(rc=%d) from %s\n", - mach_vm_round_page(sizeof(IOHibernateImageHeader)), rc, fname); - error = SWD_FILEOP_ERROR; - goto err; - } - - imageHdr = ((IOHibernateImageHeader *)tmpBuf); - if (imageHdr->signature != kIOHibernateHeaderDebugDataSignature) { - IOLog("sleepWakeDebugDumpFromFile: File %s header has unexpected value 0x%x\n", - fname, imageHdr->signature); - error = SWD_HDR_SIGNATURE_ERROR; - goto err; - } - - /* Sleep/Wake debug header(swd_hdr) is at the beggining of the second block */ - hdrOffset = imageHdr->deviceBlockSize; - if (hdrOffset + sizeof(swd_hdr) >= va.va_data_alloc) { - 
IOLog("sleepWakeDebugDumpFromFile: header is crossing file size(0x%llx) in file %s\n", - va.va_data_alloc, fname); - error = SWD_HDR_SIZE_ERROR; - goto err; - } - - return 0; - -err: - if (*vp) vnode_close(*vp, FREAD, *ctx); - *vp = NULL; - - return error; -} - -void IOPMrootDomain::sleepWakeDebugDumpFromFile( ) -{ -#if HIBERNATION - int rc; - char hibernateFilename[MAXPATHLEN+1]; - void *tmpBuf; - swd_hdr *hdr = NULL; - uint32_t stacksSize, logSize; - uint64_t tmpBufSize; - uint64_t hdrOffset, stacksOffset, logOffset; - errno_t error = EIO; - OSObject *obj = NULL; - OSString *str = NULL; - OSNumber *failStat = NULL; - struct vnode *vp = NULL; - vfs_context_t ctx = NULL; - const char *stacksFname, *logFname; - - IOBufferMemoryDescriptor *tmpBufDesc = NULL; - - DLOG("sleepWakeDebugDumpFromFile\n"); - if ((swd_flags & SWD_LOGS_IN_FILE) == 0) - return; - - if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) - return; - - - /* Allocate a temp buffer to copy data between files */ - tmpBufSize = 2*4096; - tmpBufDesc = IOBufferMemoryDescriptor:: - inTaskWithOptions(kernel_task, kIODirectionOutIn | kIOMemoryMapperNone, - tmpBufSize, PAGE_SIZE); - - if (!tmpBufDesc) { - DMSG("sleepWakeDebugDumpFromFile: Fail to allocate temp buf\n"); - goto exit; - } - - tmpBuf = tmpBufDesc->getBytesNoCopy(); - - ctx = vfs_context_create(vfs_context_current()); - - /* First check if 'kSleepWakeStackBinFilename' has valid data */ - swd_flags |= checkForValidDebugData(kSleepWakeStackBinFilename, &ctx, tmpBuf, &vp); - if (vp == NULL) { - /* Check if the debug data is saved to hibernation file */ - hibernateFilename[0] = 0; - if ((obj = copyProperty(kIOHibernateFileKey))) - { - if ((str = OSDynamicCast(OSString, obj))) - strlcpy(hibernateFilename, str->getCStringNoCopy(), - sizeof(hibernateFilename)); - obj->release(); - } - if (!hibernateFilename[0]) { - DMSG("sleepWakeDebugDumpFromFile: Failed to get hibernation file name\n"); - goto exit; - } - - swd_flags |= 
checkForValidDebugData(hibernateFilename, &ctx, tmpBuf, &vp); - if (vp == NULL) { - DMSG("sleepWakeDebugDumpFromFile: No valid debug data is found\n"); - goto exit; - } - DLOG("Getting SW Stacks image from file %s\n", hibernateFilename); - } - else { - DLOG("Getting SW Stacks image from file %s\n", kSleepWakeStackBinFilename); - } - - hdrOffset = ((IOHibernateImageHeader *)tmpBuf)->deviceBlockSize; - - DLOG("Reading swd_hdr len 0x%llx offset 0x%lx\n", mach_vm_round_page(sizeof(swd_hdr)), trunc_page(hdrOffset)); - /* Read the sleep/wake debug header(swd_hdr) */ - rc = vn_rdwr(UIO_READ, vp, (char *)tmpBuf, round_page(sizeof(swd_hdr)), trunc_page(hdrOffset), - UIO_SYSSPACE, IO_SKIP_ENCRYPTION|IO_SYNC|IO_NODELOCKED|IO_UNIT|IO_NOCACHE, - vfs_context_ucred(ctx), (int *) 0, - vfs_context_proc(ctx)); - if (rc != 0) { - DMSG("sleepWakeDebugDumpFromFile: Failed to debug read header size %llu. rc=%d\n", - mach_vm_round_page(sizeof(swd_hdr)), rc); - swd_flags |= SWD_FILEOP_ERROR; - goto exit; - } - - hdr = (swd_hdr *)((char *)tmpBuf + (hdrOffset - trunc_page(hdrOffset))); - if ((hdr->signature != SWD_HDR_SIGNATURE) || (hdr->alloc_size > SWD_BUF_SIZE) || - (hdr->spindump_offset > SWD_BUF_SIZE) || (hdr->spindump_size > SWD_BUF_SIZE)) { - DMSG("sleepWakeDebugDumpFromFile: Invalid data in debug header. 
sign:0x%x size:0x%x spindump_offset:0x%x spindump_size:0x%x\n", - hdr->signature, hdr->alloc_size, hdr->spindump_offset, hdr->spindump_size); - swd_flags |= SWD_BUF_SIZE_ERROR; - goto exit; - } - stacksSize = hdr->spindump_size; - - /* Get stacks & log offsets in the image file */ - stacksOffset = hdrOffset + hdr->spindump_offset; - logOffset = hdrOffset + offsetof(swd_hdr, UUID); - logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); - stacksFname = getDumpStackFilename(hdr); - logFname = getDumpLogFilename(hdr); - - error = sleepWakeDebugCopyFile(vp, ctx, (char *)tmpBuf, tmpBufSize, stacksOffset, - stacksFname, stacksSize, hdr->crc); - if (error == EFAULT) { - DMSG("sleepWakeDebugDumpFromFile: Stackshot CRC doesn't match\n"); - goto exit; - } - error = sleepWakeDebugCopyFile(vp, ctx, (char *)tmpBuf, tmpBufSize, logOffset, - logFname, logSize, 0); - if (error) { - DMSG("sleepWakeDebugDumpFromFile: Failed to write the log file(0x%x)\n", error); - goto exit; - } -exit: - if (error) { - // Write just the SleepWakeLog.dump with failure code - uint64_t fcode = 0; - const char *fname; - swd_hdr hdrCopy; - char *offset = NULL; - int size; - - hdr = &hdrCopy; - if (swd_flags & SWD_BOOT_BY_SW_WDOG) { - failStat = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey)); - fcode = failStat->unsigned64BitValue(); - fname = kSleepWakeLogFilename; - } - else { - fname = kAppleOSXWatchdogLogFilename; - } - - offset = (char*)hdr+offsetof(swd_hdr, UUID); - size = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); - memset(offset, 0x20, size); // Fill with spaces - - - snprintf(hdr->spindump_status, sizeof(hdr->spindump_status), "\nstatus: 0x%x", swd_flags); - snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: 0x%llx", fcode); - snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n"); - sleepWakeDebugSaveFile(fname, offset, size); - - } - gRootDomain->swd_lock = 0; - - if (vp) vnode_close(vp, FREAD, ctx); - if (ctx) vfs_context_rele(ctx); - if 
(tmpBufDesc) tmpBufDesc->release(); -#endif /* HIBERNATION */ -} - -void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap) -{ - IOVirtualAddress srcBuf = NULL; - char *stackBuf = NULL, *logOffset = NULL; - int logSize = 0; - - errno_t error = EIO; - uint64_t bufSize = 0; - swd_hdr *hdr = NULL; - OSNumber *failStat = NULL; - - if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) - return; - - if ((logBufMap == 0) || ( (srcBuf = logBufMap->getVirtualAddress()) == 0) ) - { - DLOG("Nothing saved to dump to file\n"); - goto exit; - } - - hdr = (swd_hdr *)srcBuf; - bufSize = logBufMap->getLength(); - if (bufSize <= sizeof(swd_hdr)) - { - IOLog("SleepWake log buffer size is invalid\n"); - swd_flags |= SWD_BUF_SIZE_ERROR; - goto exit; - } - - stackBuf = (char*)hdr+hdr->spindump_offset; - - error = sleepWakeDebugSaveFile(getDumpStackFilename(hdr), stackBuf, hdr->spindump_size); - if (error) goto exit; - - logOffset = (char*)hdr+offsetof(swd_hdr, UUID); - logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); - - error = sleepWakeDebugSaveFile(getDumpLogFilename(hdr), logOffset, logSize); - if (error) goto exit; - - hdr->spindump_size = 0; - error = 0; - -exit: - if (error) { - // Write just the SleepWakeLog.dump with failure code - uint64_t fcode = 0; - const char *sname, *lname; - swd_hdr hdrCopy; - - /* Try writing an empty stacks file */ - hdr = &hdrCopy; - if (swd_flags & SWD_BOOT_BY_SW_WDOG) { - failStat = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey)); - fcode = failStat->unsigned64BitValue(); - lname = kSleepWakeLogFilename; - sname = kSleepWakeStackFilename; - } - else { - lname = kAppleOSXWatchdogLogFilename; - sname= kAppleOSXWatchdogStackFilename; - } - - sleepWakeDebugSaveFile(sname, NULL, 0); - - logOffset = (char*)hdr+offsetof(swd_hdr, UUID); - logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); - memset(logOffset, 0x20, logSize); // Fill with spaces - - - snprintf(hdr->spindump_status, sizeof(hdr->spindump_status), "\nstatus: 
0x%x", swd_flags); - snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: 0x%llx", fcode); - snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n"); - sleepWakeDebugSaveFile(lname, logOffset, logSize); - } - - gRootDomain->swd_lock = 0; -} - -IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) -{ - IOVirtualAddress vaddr = NULL; - IOMemoryDescriptor * desc = NULL; - IOMemoryMap * logBufMap = NULL; - - uint32_t len = INT_MAX; - addr64_t data[3]; - uint64_t bufSize = 0; - uint64_t crc = 0; - uint64_t newcrc = 0; - uint64_t paddr = 0; - swd_hdr *hdr = NULL; - bool ret = false; - char str[20]; - - - if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock)) - return NULL; - - if (!PEReadNVRAMProperty(kIOSleepWakeDebugKey, 0, &len)) { - DLOG("No sleepWakeDebug note to read\n"); - goto exit; - } - - if (len == strlen("sleepimage")) { - str[0] = 0; - PEReadNVRAMProperty(kIOSleepWakeDebugKey, str, &len); - - if (!strncmp((char*)str, "sleepimage", strlen("sleepimage"))) { - DLOG("sleepWakeDebugRetrieve: in file logs\n"); - swd_flags |= SWD_LOGS_IN_FILE|SWD_VALID_LOGS; - goto exit; - } - } - else if (len == sizeof(addr64_t)*3) { - PEReadNVRAMProperty(kIOSleepWakeDebugKey, data, &len); - } - else { - DLOG("Invalid sleepWakeDebug note length(%d)\n", len); - goto exit; - } - - - - DLOG("sleepWakeDebugRetrieve: data[0]:0x%llx data[1]:0x%llx data[2]:0x%llx\n", - data[0], data[1], data[2]); - DLOG("sleepWakeDebugRetrieve: in mem logs\n"); - bufSize = data[0]; - crc = data[1]; - paddr = data[2]; - if ( (bufSize <= sizeof(swd_hdr)) ||(bufSize > SWD_BUF_SIZE) || (crc == 0) ) - { - IOLog("SleepWake log buffer size is invalid\n"); - swd_flags |= SWD_BUF_SIZE_ERROR; - return NULL; - } - - DLOG("size:0x%llx crc:0x%llx paddr:0x%llx\n", - bufSize, crc, paddr); - - - desc = IOMemoryDescriptor::withAddressRange( paddr, bufSize, - kIODirectionOutIn | kIOMemoryMapperNone, NULL); - if (desc == NULL) - { - IOLog("Fail to map SleepWake log buffer\n"); - swd_flags 
|= SWD_INTERNAL_FAILURE; - goto exit; - } - - logBufMap = desc->map(); - - vaddr = logBufMap->getVirtualAddress(); - - - if ( (logBufMap->getLength() <= sizeof(swd_hdr)) || (vaddr == NULL) ) { - IOLog("Fail to map SleepWake log buffer\n"); - swd_flags |= SWD_INTERNAL_FAILURE; - goto exit; - } - - hdr = (swd_hdr *)vaddr; - if (hdr->spindump_offset+hdr->spindump_size > bufSize) - { - IOLog("SleepWake log header size is invalid\n"); - swd_flags |= SWD_HDR_SIZE_ERROR; - goto exit; - } - - hdr->crc = crc; - newcrc = crc32(0, (void *)((char*)vaddr+hdr->spindump_offset), - hdr->spindump_size); - if (newcrc != crc) { - IOLog("SleepWake log buffer contents are invalid\n"); - swd_flags |= SWD_DATA_CRC_ERROR; - goto exit; - } - - ret = true; - swd_flags |= SWD_LOGS_IN_MEM | SWD_VALID_LOGS; - - -exit: - PERemoveNVRAMProperty(kIOSleepWakeDebugKey); - if (!ret) { - if (logBufMap) logBufMap->release(); - logBufMap = 0; - } - if (desc) desc->release(); - gRootDomain->swd_lock = 0; - - return logBufMap; -} #else @@ -10693,28 +10548,8 @@ void IOPMrootDomain::takeStackshot(bool restart, bool isOSXWatchdog, bool isSpin void IOPMrootDomain::sleepWakeDebugMemAlloc( ) { } -void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *map) -{ -} -errno_t IOPMrootDomain::sleepWakeDebugCopyFile( - struct vnode *srcVp, - vfs_context_t srcCtx, - char *tmpBuf, uint64_t tmpBufSize, - uint64_t srcOffset, - const char *dstFname, - uint64_t numBytes, - uint32_t crc) -{ - return EIO; -} - -void IOPMrootDomain::sleepWakeDebugDumpFromFile() -{ -} - -IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( ) +void IOPMrootDomain::saveFailureData2File( ) { - return NULL; } void IOPMrootDomain::sleepWakeDebugEnableWdog() diff --git a/iokit/Kernel/IOPerfControl.cpp b/iokit/Kernel/IOPerfControl.cpp new file mode 100644 index 000000000..e5ece3480 --- /dev/null +++ b/iokit/Kernel/IOPerfControl.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. 
+ */ + +#include + +#include + +#include + +#undef super +#define super OSObject +OSDefineMetaClassAndStructors(IOPerfControlClient, OSObject); + +bool IOPerfControlClient::init(IOService *driver, uint64_t maxWorkCapacity) +{ + if (!super::init()) + return false; + + interface = PerfControllerInterface{ + .version = 0, + .registerDevice = + [](IOService *device) { + return kIOReturnSuccess; + }, + .unregisterDevice = + [](IOService *device) { + return kIOReturnSuccess; + }, + .workCanSubmit = + [](IOService *device, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) { + return false; + }, + .workSubmit = + [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) { + }, + .workBegin = + [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkBeginArgs *args) { + }, + .workEnd = + [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkEndArgs *args, bool done) { + }, + }; + + interfaceLock = IOLockAlloc(); + if (!interfaceLock) + goto error; + + deviceRegistrationList = OSSet::withCapacity(4); + if (!deviceRegistrationList) + goto error; + + bzero(workTable, sizeof(workTable)); + memset(&workTable[kIOPerfControlClientWorkUntracked], ~0, sizeof(WorkTableEntry)); + workTableNextIndex = kIOPerfControlClientWorkUntracked + 1; + + workTableLock = IOSimpleLockAlloc(); + if (!workTableLock) + goto error; + + // TODO: check sum(maxWorkCapacities) < table size + + return true; + +error: + if (interfaceLock) + IOLockFree(interfaceLock); + if (deviceRegistrationList) + deviceRegistrationList->release(); + if (workTableLock) + IOSimpleLockFree(workTableLock); + return false; +} + +IOPerfControlClient *_Atomic gSharedClient = nullptr; + +IOPerfControlClient *IOPerfControlClient::copyClient(IOService *driver, uint64_t maxWorkCapacity) +{ + IOPerfControlClient *client = atomic_load_explicit(&gSharedClient, memory_order_acquire); + if (client == nullptr) { + 
IOPerfControlClient *expected = client; + client = new IOPerfControlClient; + if (!client || !client->init(driver, maxWorkCapacity)) + panic("could not create IOPerfControlClient"); + if (!atomic_compare_exchange_strong_explicit(&gSharedClient, &expected, client, memory_order_acq_rel, + memory_order_acquire)) { + client->release(); + client = expected; + } + } + // TODO: add maxWorkCapacity to existing client + client->retain(); + return client; +} + +uint64_t IOPerfControlClient::allocateToken(thread_group *thread_group) +{ + uint64_t token = kIOPerfControlClientWorkUntracked; + + + return token; +} + +void IOPerfControlClient::deallocateToken(uint64_t token) +{ +} + +bool IOPerfControlClient::getEntryForToken(uint64_t token, IOPerfControlClient::WorkTableEntry &entry) +{ + if (token == kIOPerfControlClientWorkUntracked) + return false; + + if (token >= kWorkTableNumEntries) + panic("Invalid work token (%llu): index out of bounds.", token); + + entry = workTable[token]; + auto *thread_group = entry.thread_group; + assertf(thread_group, "Invalid work token: %llu", token); + return thread_group != nullptr; +} + +void IOPerfControlClient::markEntryStarted(uint64_t token, bool started) +{ + if (token == kIOPerfControlClientWorkUntracked) + return; + + if (token >= kWorkTableNumEntries) + panic("Invalid work token (%llu): index out of bounds.", token); + + workTable[token].started = started; +} + +IOReturn IOPerfControlClient::registerDevice(__unused IOService *driver, IOService *device) +{ + IOReturn ret = kIOReturnSuccess; + + IOLockLock(interfaceLock); + + if (interface.version > 0) + ret = interface.registerDevice(device); + else + deviceRegistrationList->setObject(device); + + IOLockUnlock(interfaceLock); + + return ret; +} + +void IOPerfControlClient::unregisterDevice(__unused IOService *driver, IOService *device) +{ + IOLockLock(interfaceLock); + + if (interface.version > 0) + interface.unregisterDevice(device); + else + 
deviceRegistrationList->removeObject(device); + + IOLockUnlock(interfaceLock); +} + +uint64_t IOPerfControlClient::workSubmit(IOService *device, WorkSubmitArgs *args) +{ + return kIOPerfControlClientWorkUntracked; +} + +uint64_t IOPerfControlClient::workSubmitAndBegin(IOService *device, WorkSubmitArgs *submitArgs, WorkBeginArgs *beginArgs) +{ + return kIOPerfControlClientWorkUntracked; +} + +void IOPerfControlClient::workBegin(IOService *device, uint64_t token, WorkBeginArgs *args) +{ +} + +void IOPerfControlClient::workEnd(IOService *device, uint64_t token, WorkEndArgs *args, bool done) +{ +} + +IOReturn IOPerfControlClient::registerPerformanceController(PerfControllerInterface pci) +{ + IOReturn result = kIOReturnError; + + IOLockLock(interfaceLock); + + if (interface.version == 0 && pci.version > 0) { + assert(pci.registerDevice && pci.unregisterDevice && pci.workCanSubmit && pci.workSubmit && pci.workBegin && pci.workEnd); + result = kIOReturnSuccess; + + OSObject *obj; + while ((obj = deviceRegistrationList->getAnyObject())) { + IOService *device = OSDynamicCast(IOService, obj); + if (device) + pci.registerDevice(device); + deviceRegistrationList->removeObject(obj); + } + + interface = pci; + } + + IOLockUnlock(interfaceLock); + + return result; +} diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index 7372f799a..035177e8f 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -1637,48 +1637,6 @@ IOWorkLoop *IOPlatformExpertDevice::getWorkLoop() const IOReturn IOPlatformExpertDevice::setProperties( OSObject * properties ) { - OSDictionary * dictionary; - OSObject * object; - IOReturn status; - - status = super::setProperties( properties ); - if ( status != kIOReturnUnsupported ) return status; - - status = IOUserClient::clientHasPrivilege( current_task( ), kIOClientPrivilegeAdministrator ); - if ( status != kIOReturnSuccess ) return status; - - dictionary = OSDynamicCast( OSDictionary, 
properties ); - if ( dictionary == 0 ) return kIOReturnBadArgument; - - object = dictionary->getObject( kIOPlatformUUIDKey ); - if ( object ) - { - IORegistryEntry * entry; - OSString * string; - uuid_t uuid; - - string = ( OSString * ) getProperty( kIOPlatformUUIDKey ); - if ( string ) return kIOReturnNotPermitted; - - string = OSDynamicCast( OSString, object ); - if ( string == 0 ) return kIOReturnBadArgument; - - status = uuid_parse( string->getCStringNoCopy( ), uuid ); - if ( status != 0 ) return kIOReturnBadArgument; - - entry = IORegistryEntry::fromPath( "/options", gIODTPlane ); - if ( entry ) - { - entry->setProperty( "platform-uuid", uuid, sizeof( uuid_t ) ); - entry->release( ); - } - - setProperty( kIOPlatformUUIDKey, string ); - publishResource( kIOPlatformUUIDKey, string ); - - return kIOReturnSuccess; - } - return kIOReturnUnsupported; } diff --git a/iokit/Kernel/IOPolledInterface.cpp b/iokit/Kernel/IOPolledInterface.cpp index 28256a35c..8218c5dde 100644 --- a/iokit/Kernel/IOPolledInterface.cpp +++ b/iokit/Kernel/IOPolledInterface.cpp @@ -584,6 +584,7 @@ IOGetVolumeCryptKey(dev_t block_dev, OSString ** pKeyUUID, IOReturn IOPolledFileOpen(const char * filename, + uint32_t flags, uint64_t setFileSize, uint64_t fsFreeSize, void * write_file_addr, size_t write_file_len, IOPolledFileIOVars ** fileVars, @@ -614,7 +615,7 @@ IOPolledFileOpen(const char * filename, clock_get_uptime(&startTime); vars->fileRef = kern_open_file_for_direct_io(filename, - (write_file_addr != NULL) || (0 != setFileSize), + flags, &file_extent_callback, &ctx, setFileSize, fsFreeSize, diff --git a/iokit/Kernel/IORegistryEntry.cpp b/iokit/Kernel/IORegistryEntry.cpp index f07b42318..03bb8724f 100644 --- a/iokit/Kernel/IORegistryEntry.cpp +++ b/iokit/Kernel/IORegistryEntry.cpp @@ -33,7 +33,7 @@ #include #include - +#include #include #include "IOKitKernelInternal.h" @@ -60,9 +60,10 @@ OSDefineMetaClassAndStructors(IORegistryEntry, OSObject) struct IORegistryEntry::ExpansionData { - 
IORecursiveLock * fLock; - uint64_t fRegistryEntryID; - SInt32 fRegistryEntryGenerationCount; + IORecursiveLock * fLock; + uint64_t fRegistryEntryID; + SInt32 fRegistryEntryGenerationCount; + OSObject **_Atomic fIndexedProperties; }; @@ -404,7 +405,15 @@ void IORegistryEntry::free( void ) if (reserved) { - if (reserved->fLock) IORecursiveLockFree(reserved->fLock); + if (reserved->fIndexedProperties) + { + for (int idx = 0; idx < kIORegistryEntryIndexedPropertyCount; idx++) + { + if (reserved->fIndexedProperties[idx]) reserved->fIndexedProperties[idx]->release(); + } + IODelete(reserved->fIndexedProperties, OSObject *, kIORegistryEntryIndexedPropertyCount); + } + if (reserved->fLock) IORecursiveLockFree(reserved->fLock); IODelete(reserved, ExpansionData, 1); } @@ -744,6 +753,40 @@ IORegistryEntry::setProperty( const char * aKey, /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +OSObject * IORegistryEntry::setIndexedProperty(uint32_t index, OSObject * anObject) +{ + OSObject ** array; + OSObject * prior; + + if (index >= kIORegistryEntryIndexedPropertyCount) return (0); + + array = atomic_load_explicit(&reserved->fIndexedProperties, memory_order_acquire); + if (!array) + { + array = IONew(OSObject *, kIORegistryEntryIndexedPropertyCount); + if (!array) return (0); + bzero(array, kIORegistryEntryIndexedPropertyCount * sizeof(array[0])); + if (!OSCompareAndSwapPtr(NULL, array, &reserved->fIndexedProperties)) IODelete(array, OSObject *, kIORegistryEntryIndexedPropertyCount); + } + if (!reserved->fIndexedProperties) return (0); + + prior = reserved->fIndexedProperties[index]; + if (anObject) anObject->retain(); + reserved->fIndexedProperties[index] = anObject; + + return (prior); +} + +OSObject * IORegistryEntry::getIndexedProperty(uint32_t index) const +{ + if (index >= kIORegistryEntryIndexedPropertyCount) return (0); + if (!reserved->fIndexedProperties) return (0); + + return (reserved->fIndexedProperties[index]); +} + +/* * * * * * * * * * 
* * * * * * * * * * * * * * * * * * * * * * * * * * */ + /* Name, location, paths */ const char * IORegistryEntry::getName( const IORegistryPlane * plane ) const @@ -886,7 +929,7 @@ IORegistryEntry::compareName( OSString * name, OSString ** matched ) const const OSSymbol * sym = copyName(); bool isEqual; - isEqual = sym->isEqualTo( name ); + isEqual = (sym && sym->isEqualTo(name)); if( isEqual && matched) { name->retain(); diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 5d4be71a6..cfb2d9c0f 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -613,7 +614,12 @@ void IOService::free( void ) if (_numInterruptSources && _interruptSources) { - IOFree(_interruptSources, _numInterruptSources * sizeof(IOInterruptSource)); + for (i = 0; i < _numInterruptSources; i++) { + void * block = _interruptSourcesPrivate(this)[i].vectorBlock; + if (block) Block_release(block); + } + IOFree(_interruptSources, + _numInterruptSources * sizeofAllIOInterruptSource); _interruptSources = 0; } @@ -724,7 +730,7 @@ void IOService::detach( IOService * provider ) unlockForArbitration(); - if( newProvider) { + if( newProvider && adjParent) { newProvider->lockForArbitration(); newProvider->_adjustBusy(1); newProvider->unlockForArbitration(); @@ -1810,12 +1816,45 @@ IONotifier * IOService::registerInterest( const OSSymbol * typeOfInterest, return( notify ); } + + +static IOReturn +IOServiceInterestHandlerToBlock( void * target __unused, void * refCon, + UInt32 messageType, IOService * provider, + void * messageArgument, vm_size_t argSize ) +{ + return ((IOServiceInterestHandlerBlock) refCon)(messageType, provider, messageArgument, argSize); +} + +IONotifier * IOService::registerInterest(const OSSymbol * typeOfInterest, + IOServiceInterestHandlerBlock handler) +{ + IONotifier * notify; + void * block; + + block = Block_copy(handler); + if (!block) return (NULL); + + 
notify = registerInterest(typeOfInterest, &IOServiceInterestHandlerToBlock, NULL, block); + + if (!notify) Block_release(block); + + return (notify); +} + IOReturn IOService::registerInterestForNotifier( IONotifier *svcNotify, const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, void * target, void * ref ) { IOReturn rc = kIOReturnSuccess; _IOServiceInterestNotifier *notify = 0; + if (!svcNotify || !(notify = OSDynamicCast(_IOServiceInterestNotifier, svcNotify))) + return( kIOReturnBadArgument ); + + notify->handler = handler; + notify->target = target; + notify->ref = ref; + if( (typeOfInterest != gIOGeneralInterest) && (typeOfInterest != gIOBusyInterest) && (typeOfInterest != gIOAppPowerStateInterest) @@ -1823,15 +1862,9 @@ IOReturn IOService::registerInterestForNotifier( IONotifier *svcNotify, const OS && (typeOfInterest != gIOPriorityPowerStateInterest)) return( kIOReturnBadArgument ); - if (!svcNotify || !(notify = OSDynamicCast(_IOServiceInterestNotifier, svcNotify))) - return( kIOReturnBadArgument ); - lockForArbitration(); if( 0 == (__state[0] & kIOServiceInactiveState)) { - notify->handler = handler; - notify->target = target; - notify->ref = ref; notify->state = kIOServiceNotifyEnable; ////// queue @@ -1942,6 +1975,9 @@ void _IOServiceInterestNotifier::wait() void _IOServiceInterestNotifier::free() { assert( queue_empty( &handlerInvocations )); + + if (handler == &IOServiceInterestHandlerToBlock) Block_release(ref); + OSObject::free(); } @@ -4680,6 +4716,34 @@ IONotifier * IOService::addMatchingNotification( return( ret ); } +static bool +IOServiceMatchingNotificationHandlerToBlock( void * target __unused, void * refCon, + IOService * newService, + IONotifier * notifier ) +{ + return ((IOServiceMatchingNotificationHandlerBlock) refCon)(newService, notifier); +} + +IONotifier * IOService::addMatchingNotification( + const OSSymbol * type, OSDictionary * matching, + SInt32 priority, + IOServiceMatchingNotificationHandlerBlock handler) +{ + 
IONotifier * notify; + void * block; + + block = Block_copy(handler); + if (!block) return (NULL); + + notify = addMatchingNotification(type, matching, + &IOServiceMatchingNotificationHandlerToBlock, NULL, block, priority); + + if (!notify) Block_release(block); + + return (notify); +} + + bool IOService::syncNotificationHandler( void * /* target */, void * ref, IOService * newService, @@ -4981,6 +5045,9 @@ void _IOServiceNotifier::wait() void _IOServiceNotifier::free() { assert( queue_empty( &handlerInvocations )); + + if (handler == &IOServiceMatchingNotificationHandlerToBlock) Block_release(ref); + OSObject::free(); } @@ -6303,10 +6370,11 @@ IOReturn IOService::resolveInterrupt(IOService *nub, int source) // Allocate space for the IOInterruptSources if needed... then return early. if (nub->_interruptSources == 0) { numSources = array->getCount(); - interruptSources = (IOInterruptSource *)IOMalloc(numSources * sizeof(IOInterruptSource)); + interruptSources = (IOInterruptSource *)IOMalloc( + numSources * sizeofAllIOInterruptSource); if (interruptSources == 0) return kIOReturnNoMemory; - bzero(interruptSources, numSources * sizeof(IOInterruptSource)); + bzero(interruptSources, numSources * sizeofAllIOInterruptSource); nub->_numInterruptSources = numSources; nub->_interruptSources = interruptSources; @@ -6353,7 +6421,7 @@ IOReturn IOService::lookupInterrupt(int source, bool resolve, IOInterruptControl if (*interruptController == NULL) { if (!resolve) return kIOReturnNoInterrupt; - /* Try to reslove the interrupt */ + /* Try to resolve the interrupt */ ret = resolveInterrupt(this, source); if (ret != kIOReturnSuccess) return ret; @@ -6379,16 +6447,49 @@ IOReturn IOService::registerInterrupt(int source, OSObject *target, refCon); } +static void IOServiceInterruptActionToBlock( OSObject * target, void * refCon, + IOService * nub, int source ) +{ + ((IOInterruptActionBlock)(refCon))(nub, source); +} + +IOReturn IOService::registerInterruptBlock(int source, OSObject 
*target, + IOInterruptActionBlock handler) +{ + IOReturn ret; + void * block; + + block = Block_copy(handler); + if (!block) return (kIOReturnNoMemory); + + ret = registerInterrupt(source, target, &IOServiceInterruptActionToBlock, block); + if (kIOReturnSuccess != ret) { + Block_release(block); + return (ret); + } + _interruptSourcesPrivate(this)[source].vectorBlock = block; + + return (ret); +} + IOReturn IOService::unregisterInterrupt(int source) { - IOInterruptController *interruptController; IOReturn ret; + IOInterruptController *interruptController; + void *block; ret = lookupInterrupt(source, false, &interruptController); if (ret != kIOReturnSuccess) return ret; /* Unregister the source */ - return interruptController->unregisterInterrupt(this, source); + block = _interruptSourcesPrivate(this)[source].vectorBlock; + ret = interruptController->unregisterInterrupt(this, source); + if ((kIOReturnSuccess == ret) && (block = _interruptSourcesPrivate(this)[source].vectorBlock)) { + _interruptSourcesPrivate(this)[source].vectorBlock = NULL; + Block_release(block); + } + + return ret; } IOReturn IOService::addInterruptStatistics(IOInterruptAccountingData * statistics, int source) diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index 6fc90a603..e6bf85a01 100644 --- a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -474,6 +474,9 @@ void IOService::PMinit( void ) { fWatchdogTimer = thread_call_allocate( &IOService::watchdog_timer_expired, (thread_call_param_t)this); + fWatchdogLock = IOLockAlloc(); + + fBlockedArray = OSArray::withCapacity(4); } fAckTimer = thread_call_allocate( @@ -544,6 +547,16 @@ void IOService::PMfree( void ) fWatchdogTimer = NULL; } + if (fWatchdogLock) { + IOLockFree(fWatchdogLock); + fWatchdogLock = NULL; + } + + if (fBlockedArray) { + fBlockedArray->release(); + fBlockedArray = NULL; + } + if ( fSettleTimer ) { thread_call_cancel(fSettleTimer); thread_call_free(fSettleTimer); @@ -1080,6 +1093,7 @@ 
IOReturn IOService::removePowerChild( IOPowerConnection * theNub ) if ( fHeadNotePendingAcks == 0 ) { stop_ack_timer(); + getPMRootDomain()->reset_watchdog_timer(this, 0); // This parent may have a request in the work queue that is // blocked on fHeadNotePendingAcks=0. And removePowerChild() @@ -1600,6 +1614,7 @@ bool IOService::handleAcknowledgePowerChange( IOPMRequest * request ) stop_ack_timer(); // and now we can continue all_acked = true; + getPMRootDomain()->reset_watchdog_timer(this, 0); } } else { OUR_PMLog(kPMLogAcknowledgeErr3, 0, 0); // not expecting anybody to ack @@ -3608,6 +3623,7 @@ void IOService::notifyInterestedDriversDone( void ) IOItemCount count; DriverCallParam * param; IOReturn result; + int maxTimeout = 0; PM_ASSERT_IN_GATE(); assert( fDriverCallBusy == false ); @@ -3650,6 +3666,9 @@ void IOService::notifyInterestedDriversDone( void ) result = kMinAckTimeoutTicks; informee->timer = (result / (ACK_TIMER_PERIOD / ns_per_us)) + 1; + if (result > maxTimeout) { + maxTimeout = result; + } } // else, child has already acked or driver has removed interest, // and head_note_pendingAcks decremented. 
@@ -3665,6 +3684,7 @@ void IOService::notifyInterestedDriversDone( void ) { OUR_PMLog(kPMLogStartAckTimer, 0, 0); start_ack_timer(); + getPMRootDomain()->reset_watchdog_timer(this, maxTimeout/USEC_PER_SEC+1); } } @@ -3986,6 +4006,7 @@ void IOService::driverSetPowerState( void ) param = (DriverCallParam *) fDriverCallParamPtr; powerState = fHeadNotePowerState; + callEntry.callMethod = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState); if (assertPMDriverCall(&callEntry)) { OUR_PMLogFuncStart(kPMLogProgramHardware, (uintptr_t) this, powerState); @@ -4066,6 +4087,12 @@ void IOService::driverInformPowerChange( void ) informee = (IOPMinformee *) param->Target; driver = informee->whatObject; + if (fDriverCallReason == kDriverCallInformPreChange) { + callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateWillChangeTo); + } + else { + callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateDidChangeTo); + } if (assertPMDriverCall(&callEntry, 0, informee)) { if (fDriverCallReason == kDriverCallInformPreChange) @@ -4277,6 +4304,7 @@ void IOService::notifyControllingDriverDone( void ) { OUR_PMLog(kPMLogStartAckTimer, 0, 0); start_ack_timer(); + getPMRootDomain()->reset_watchdog_timer(this, result/USEC_PER_SEC+1); } } @@ -5311,6 +5339,7 @@ bool IOService::ackTimerTick( void ) done = true; } #endif + getPMRootDomain()->reset_watchdog_timer(this, 0); } else { // still waiting, set timer again start_ack_timer(); @@ -5392,55 +5421,124 @@ bool IOService::ackTimerTick( void ) //********************************************************************************* void IOService::start_watchdog_timer( void ) { - AbsoluteTime deadline; - boolean_t pending; - static int timeout = -1; + int timeout; + uint64_t deadline; if (!fWatchdogTimer || (kIOSleepWakeWdogOff & gIOKitDebug)) return; - if (thread_call_isactive(fWatchdogTimer)) return; - if (timeout == -1) { - PE_parse_boot_argn("swd_timeout", 
&timeout, sizeof(timeout)); - } - if (timeout < 60) { - timeout = WATCHDOG_TIMER_PERIOD; - } + IOLockLock(fWatchdogLock); + timeout = getPMRootDomain()->getWatchdogTimeout(); clock_interval_to_deadline(timeout, kSecondScale, &deadline); + fWatchdogDeadline = deadline; + start_watchdog_timer(deadline); + IOLockUnlock(fWatchdogLock); +} - retain(); - pending = thread_call_enter_delayed(fWatchdogTimer, deadline); - if (pending) release(); +void IOService::start_watchdog_timer(uint64_t deadline) +{ + + IOLockAssert(fWatchdogLock, kIOLockAssertOwned); + + if (!thread_call_isactive(fWatchdogTimer)) { + thread_call_enter_delayed(fWatchdogTimer, deadline); + } } //********************************************************************************* // [private] stop_watchdog_timer -// Returns true if watchdog was enabled and stopped now //********************************************************************************* -bool IOService::stop_watchdog_timer( void ) +void IOService::stop_watchdog_timer( void ) { - boolean_t pending; - if (!fWatchdogTimer || (kIOSleepWakeWdogOff & gIOKitDebug)) - return false; + return; - pending = thread_call_cancel(fWatchdogTimer); - if (pending) release(); + IOLockLock(fWatchdogLock); + + thread_call_cancel(fWatchdogTimer); + fWatchdogDeadline = 0; + + while (fBlockedArray->getCount()) { + IOService *obj = OSDynamicCast(IOService, fBlockedArray->getObject(0)); + if (obj) { + PM_ERROR("WDOG:Object %s unexpected in blocked array\n", obj->fName); + } + fBlockedArray->removeObject(0); /* always drop the head entry so a failed cast cannot spin this loop forever */ + } - return pending; + IOLockUnlock(fWatchdogLock); } //********************************************************************************* // reset_watchdog_timer //********************************************************************************* -void IOService::reset_watchdog_timer( void ) +void IOService::reset_watchdog_timer(IOService *blockedObject, int pendingResponseTimeout) { - if (stop_watchdog_timer()) - start_watchdog_timer(); + unsigned int i; + uint64_t
deadline; + IOService *obj; + + if (!fWatchdogTimer || (kIOSleepWakeWdogOff & gIOKitDebug)) + return; + + + IOLockLock(fWatchdogLock); + if (!fWatchdogDeadline) { + goto exit; + } + + i = fBlockedArray->getNextIndexOfObject(blockedObject, 0); + if (pendingResponseTimeout == 0) { + blockedObject->fPendingResponseDeadline = 0; + if (i == (unsigned int)-1) { + goto exit; + } + fBlockedArray->removeObject(i); + } + else { + // Set deadline 2secs after the expected response timeout to allow + // ack timer to handle the timeout. + clock_interval_to_deadline(pendingResponseTimeout+2, kSecondScale, &deadline); + + if (i != (unsigned int)-1) { + PM_ERROR("WDOG:Object %s is already blocked for responses. Ignoring timeout %d\n", + blockedObject->fName, pendingResponseTimeout); /* name the object being queued, not the receiver */ + goto exit; + } + + + for (i = 0; i < fBlockedArray->getCount(); i++) { + obj = OSDynamicCast(IOService, fBlockedArray->getObject(i)); + if (obj && (obj->fPendingResponseDeadline < deadline)) { + blockedObject->fPendingResponseDeadline = deadline; + fBlockedArray->setObject(i, blockedObject); + break; + } + } + if (i == fBlockedArray->getCount()) { + blockedObject->fPendingResponseDeadline = deadline; + fBlockedArray->setObject(blockedObject); + } + } + + obj = OSDynamicCast(IOService, fBlockedArray->getObject(0)); + if (!obj) { + int timeout = getPMRootDomain()->getWatchdogTimeout(); + clock_interval_to_deadline(timeout, kSecondScale, &deadline); + } + else { + deadline = obj->fPendingResponseDeadline; + } + + thread_call_cancel(fWatchdogTimer); + start_watchdog_timer(deadline); + +exit: + IOLockUnlock(fWatchdogLock); } @@ -5493,10 +5591,6 @@ void IOService::start_ack_timer ( UInt32 interval, UInt32 scale ) pending = thread_call_enter_delayed(fAckTimer, deadline); if (pending) release(); - // Stop watchdog if ack is delayed by more than a sec - if (interval * scale > kSecondScale) { - stop_watchdog_timer(); - } } //********************************************************************************* @@ -5509,8 +5603,6
@@ void IOService::stop_ack_timer( void ) pending = thread_call_cancel(fAckTimer); if (pending) release(); - - start_watchdog_timer(); } //********************************************************************************* @@ -5535,7 +5627,6 @@ IOService::actionAckTimerExpired( if (done && gIOPMWorkQueue) { gIOPMWorkQueue->signalWorkAvailable(); - me->start_watchdog_timer(); } return kIOReturnSuccess; @@ -5832,6 +5923,9 @@ void IOService::cleanClientResponses( bool logErrors ) } } + if (IS_ROOT_DOMAIN) { + getPMRootDomain()->reset_watchdog_timer(this, 0); + } if (fResponseArray) { fResponseArray->release(); @@ -5924,6 +6018,7 @@ bool IOService::tellClientsWithResponse( int messageType ) } } context.maxTimeRequested = maxTimeOut; + context.enableTracing = isRootDomain; applyToInterested( gIOGeneralInterest, pmTellClientWithResponse, (void *) &context ); @@ -5972,6 +6067,7 @@ bool IOService::tellClientsWithResponse( int messageType ) OUR_PMLog(kPMLogStartAckTimer, context.maxTimeRequested, 0); if (context.enableTracing) { getPMRootDomain()->traceDetail(context.messageType, 0, context.maxTimeRequested / 1000); + getPMRootDomain()->reset_watchdog_timer(this, context.maxTimeRequested/USEC_PER_SEC+1); } start_ack_timer( context.maxTimeRequested / 1000, kMillisecondScale ); return false; @@ -6160,13 +6256,19 @@ void IOService::pmTellClientWithResponse( OSObject * object, void * arg ) if (context->enableTracing && (notifier != 0)) { - getPMRootDomain()->traceDetail(notifier); + getPMRootDomain()->traceDetail(notifier, true); } clock_get_uptime(&start); retCode = context->us->messageClient(msgType, object, (void *) &notify, sizeof(notify)); clock_get_uptime(&end); + if (context->enableTracing && (notifier != NULL)) + { + getPMRootDomain()->traceDetail(notifier, false); + } + + if (kIOReturnSuccess == retCode) { if (0 == notify.returnValue) { @@ -6373,13 +6475,17 @@ void IOService::pmTellCapabilityClientWithResponse( if (context->enableTracing && (notifier != 0)) { -
getPMRootDomain()->traceDetail(notifier); + getPMRootDomain()->traceDetail(notifier, true); } clock_get_uptime(&start); retCode = context->us->messageClient( msgType, object, (void *) &msgArg, sizeof(msgArg)); clock_get_uptime(&end); + if (context->enableTracing && (notifier != NULL)) + { + getPMRootDomain()->traceDetail(notifier, false); + } if ( kIOReturnSuccess == retCode ) { @@ -6490,6 +6596,7 @@ void IOService::tellClients( int messageType ) context.stateNumber = fHeadNotePowerState; context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; context.changeFlags = fHeadNoteChangeFlags; + context.enableTracing = IS_ROOT_DOMAIN; context.messageFilter = (IS_ROOT_DOMAIN) ? OSMemberFunctionCast( IOPMMessageFilter, @@ -6539,7 +6646,17 @@ static void tellKernelClientApplier( OSObject * object, void * arg ) notify.stateNumber = context->stateNumber; notify.stateFlags = context->stateFlags; + if (context->enableTracing && object) + { + IOService::getPMRootDomain()->traceDetail(object, true); + } context->us->messageClient(context->messageType, object, &notify, sizeof(notify)); + if (context->enableTracing && object) + { + IOService::getPMRootDomain()->traceDetail(object, false); + } + + if ((kIOLogDebugPower & gIOKitDebug) && (OSDynamicCast(_IOServiceInterestNotifier, object))) @@ -7974,6 +8091,7 @@ bool IOService::actionPMReplyQueue( IOPMRequest * request, IOPMRequestQueue * qu // expected ack, stop the timer stop_ack_timer(); + getPMRootDomain()->reset_watchdog_timer(this, 0); uint64_t nsec = computeTimeDeltaNS(&fDriverCallStartTime); if (nsec > LOG_SETPOWER_TIMES) { @@ -8109,6 +8227,33 @@ void IOService::deassertPMDriverCall( IOPMDriverCallEntry * entry ) PM_LOCK_WAKEUP(&fPMDriverCallQueue); } +bool IOService::getBlockingDriverCall(thread_t *thread, const void **callMethod) +{ + const IOPMDriverCallEntry * entry = NULL; + bool blocked = false; + + if (!initialized) { + return false; + } + + if (current_thread() != gIOPMWatchDogThread) { + // Meant to be accessed
only from watchdog thread + return false; + } + + PM_LOCK(); + entry = qe_queue_first(&fPMDriverCallQueue, IOPMDriverCallEntry, link); + if (entry) { + *thread = entry->thread; + *callMethod = entry->callMethod; + blocked = true; + } + PM_UNLOCK(); + + return blocked; +} + + void IOService::waitForPMDriverCall( IOService * target ) { const IOPMDriverCallEntry * entry; diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index 0dbc58aca..26bfbee7f 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ b/iokit/Kernel/IOServicePMPrivate.h @@ -186,6 +186,11 @@ class IOServicePM : public OSObject thread_call_t WatchdogTimer; thread_call_t SpinDumpTimer; + IOLock * WatchdogLock; + OSArray * BlockedArray; + uint64_t PendingResponseDeadline; + uint64_t WatchdogDeadline; + // Settle time after changing power state. uint32_t SettleTimeUS; uint32_t IdleTimerGeneration; @@ -360,6 +365,10 @@ class IOServicePM : public OSObject #define fSettleTimer pwrMgt->SettleTimer #define fIdleTimer pwrMgt->IdleTimer #define fWatchdogTimer pwrMgt->WatchdogTimer +#define fWatchdogDeadline pwrMgt->WatchdogDeadline +#define fWatchdogLock pwrMgt->WatchdogLock +#define fBlockedArray pwrMgt->BlockedArray +#define fPendingResponseDeadline pwrMgt->PendingResponseDeadline #define fSpinDumpTimer pwrMgt->SpinDumpTimer #define fSettleTimeUS pwrMgt->SettleTimeUS #define fIdleTimerGeneration pwrMgt->IdleTimerGeneration @@ -459,9 +468,11 @@ the ack timer is ticking every tenth of a second. 
#define ACK_TIMER_PERIOD 100000000 #if defined(__i386__) || defined(__x86_64__) -#define WATCHDOG_TIMER_PERIOD (300) // 300 secs +#define WATCHDOG_SLEEP_TIMEOUT (180) // 180 secs +#define WATCHDOG_WAKE_TIMEOUT (180) // 180 secs #else -#define WATCHDOG_TIMER_PERIOD (180) // 180 secs +#define WATCHDOG_SLEEP_TIMEOUT (180) // 180 secs +#define WATCHDOG_WAKE_TIMEOUT (180) // 180 secs #endif // Max wait time in microseconds for kernel priority and capability clients diff --git a/iokit/Kernel/IOServicePrivate.h b/iokit/Kernel/IOServicePrivate.h index 7b01220b3..5b420452e 100644 --- a/iokit/Kernel/IOServicePrivate.h +++ b/iokit/Kernel/IOServicePrivate.h @@ -71,7 +71,8 @@ enum { // notify state enum { kIOServiceNotifyEnable = 0x00000001, - kIOServiceNotifyWaiter = 0x00000002 + kIOServiceNotifyWaiter = 0x00000002, + kIOServiceNotifyBlock = 0x00000004 }; struct _IOServiceNotifierInvocation @@ -225,5 +226,12 @@ extern const OSSymbol * gIOConsoleSessionAuditIDKey; extern const OSSymbol * gIOConsoleSessionOnConsoleKey; extern const OSSymbol * gIOConsoleSessionSecureInputPIDKey; + +#define _interruptSourcesPrivate(service) \ + ((IOInterruptSourcePrivate *)(&(service)->_interruptSources[(service)->_numInterruptSources])) + +#define sizeofAllIOInterruptSource \ + (sizeof(IOInterruptSourcePrivate) + sizeof(IOInterruptSource)) + #endif /* ! _IOKIT_IOSERVICEPRIVATE_H */ diff --git a/iokit/Kernel/IOSharedDataQueue.cpp b/iokit/Kernel/IOSharedDataQueue.cpp index 17656644a..385393f65 100644 --- a/iokit/Kernel/IOSharedDataQueue.cpp +++ b/iokit/Kernel/IOSharedDataQueue.cpp @@ -285,18 +285,28 @@ Boolean IOSharedDataQueue::enqueue(void * data, UInt32 dataSize) } } - // Update tail with release barrier - __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE); - - // Send notification (via mach message) that data is available. 
- - if ( ( tail == head ) /* queue was empty prior to enqueue() */ - || ( tail == __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_ACQUIRE) ) ) /* queue was emptied during enqueue() */ - { - sendDataAvailableNotification(); - } - - return true; + // Publish the data we just enqueued + __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE); + + if (tail != head) { + // + // The memory barrier below pairs with the one in ::dequeue + // so that either our store to the tail cannot be missed by + // the next dequeue attempt, or we will observe the dequeuer + // making the queue empty. + // + // Of course, if we already think the queue is empty, + // there's no point paying this extra cost. + // + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); + head = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED); + } + + if (tail == head) { + // Send notification (via mach message) that data is now available. + sendDataAvailableNotification(); + } + return true; } Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize) @@ -308,7 +318,7 @@ Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize) UInt32 tailOffset = 0; UInt32 newHeadOffset = 0; - if (!dataQueue) { + if (!dataQueue || (data && !dataSize)) { return false; } @@ -356,30 +366,30 @@ Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize) } newHeadOffset = headOffset + entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE; } - } - - if (entry) { - if (data) { - if (dataSize) { - if (entrySize <= *dataSize) { - memcpy(data, &(entry->data), entrySize); - __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE); - } else { - retVal = FALSE; - } - } else { - retVal = FALSE; - } - } else { - __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE); - } - - if (dataSize) { - *dataSize = entrySize; - } - } else { - retVal = FALSE; - } + } else { + // empty queue + return false; + } + + if (data) {
+ if (entrySize > *dataSize) { + // not enough space + return false; + } + memcpy(data, &(entry->data), entrySize); + *dataSize = entrySize; + } + + __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE); + + if (newHeadOffset == tailOffset) { + // + // If we are making the queue empty, then we need to make sure + // that either the enqueuer notices, or we notice the enqueue + // that raced with our making of the queue empty. + // + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); + } return retVal; } diff --git a/iokit/Kernel/IOStatistics.cpp b/iokit/Kernel/IOStatistics.cpp index f3771602a..141eecabf 100644 --- a/iokit/Kernel/IOStatistics.cpp +++ b/iokit/Kernel/IOStatistics.cpp @@ -762,7 +762,7 @@ int IOStatistics::getWorkLoopStatistics(sysctl_req *req) error = ENOMEM; goto exit; } - + memset(buffer, 0, calculatedSize); header = (IOStatisticsWorkLoopHeader*)((void*)buffer); header->sig = IOSTATISTICS_SIG_WORKLOOP; @@ -827,7 +827,7 @@ int IOStatistics::getUserClientStatistics(sysctl_req *req) error = ENOMEM; goto exit; } - + memset(buffer, 0, calculatedSize); header = (IOStatisticsUserClientHeader*)((void*)buffer); header->sig = IOSTATISTICS_SIG_USERCLIENT; diff --git a/iokit/Kernel/IOStringFuncs.c b/iokit/Kernel/IOStringFuncs.c index c4f9458fe..6890305d6 100644 --- a/iokit/Kernel/IOStringFuncs.c +++ b/iokit/Kernel/IOStringFuncs.c @@ -475,17 +475,18 @@ strtouq(const char *nptr, char * strncat(char *s1, const char *s2, unsigned long n) { - char *os1; - int i = n; + if (n != 0) { + char *d = s1; + const char *s = s2; - os1 = s1; - while (*s1++) - ; - --s1; - while ((*s1++ = *s2++)) - if (--i < 0) { - *--s1 = '\0'; - break; - } - return(os1); + while (*d != 0) + d++; + do { + if ((*d = *s++) == '\0') + break; + d++; + } while (--n != 0); + *d = '\0'; + } + return (s1); } diff --git a/iokit/Kernel/IOTimerEventSource.cpp b/iokit/Kernel/IOTimerEventSource.cpp index eedde763e..22987d8b8 100644 --- a/iokit/Kernel/IOTimerEventSource.cpp +++ 
b/iokit/Kernel/IOTimerEventSource.cpp @@ -45,6 +45,9 @@ __END_DECLS #include #endif +#include + + #define super IOEventSource OSDefineMetaClassAndStructors(IOTimerEventSource, IOEventSource) OSMetaClassDefineReservedUsed(IOTimerEventSource, 0); @@ -96,8 +99,8 @@ do { \ // the timeout interval expires. // -static __inline__ void -InvokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts, +__inline__ void +IOTimerEventSource::invokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts, OSObject * owner, IOWorkLoop * workLoop) { bool trace = (gIOKitTrace & kIOTraceTimers) ? true : false; @@ -106,7 +109,8 @@ InvokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts, IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION), VM_KERNEL_ADDRHIDE(action), VM_KERNEL_ADDRHIDE(owner)); - (*action)(owner, ts); + if (kActionBlock & flags) ((IOTimerEventSource::ActionBlock) actionBlock)(ts); + else (*action)(owner, ts); #if CONFIG_DTRACE DTRACE_TMR3(iotescallout__expire, Action, action, OSObject, owner, void, workLoop); @@ -135,7 +139,7 @@ void IOTimerEventSource::timeout(void *self) doit = (Action) me->action; if (doit && me->enabled && AbsoluteTime_to_scalar(&me->abstime)) { - InvokeAction(doit, me, me->owner, me->workLoop); + me->invokeAction(doit, me, me->owner, me->workLoop); } IOStatisticsOpenGate(); wl->openGate(); @@ -164,7 +168,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c) doit = (Action) me->action; if (doit && (me->reserved->calloutGeneration == count)) { - InvokeAction(doit, me, me->owner, me->workLoop); + me->invokeAction(doit, me, me->owner, me->workLoop); } IOStatisticsOpenGate(); wl->openGate(); @@ -186,7 +190,7 @@ bool IOTimerEventSource::checkForWork() && enabled && (doit = (Action) action)) { reserved->calloutGenerationSignaled = ~reserved->calloutGeneration; - InvokeAction(doit, this, owner, workLoop); + invokeAction(doit, this, owner, workLoop); } return false; @@ -303,6 +307,16 @@ 
IOTimerEventSource::timerEventSource(uint32_t inOptions, OSObject *inOwner, Acti return me; } +IOTimerEventSource * +IOTimerEventSource::timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock action) +{ + IOTimerEventSource * tes; + tes = IOTimerEventSource::timerEventSource(options, inOwner, (Action) NULL); + if (tes) tes->setActionBlock((IOEventSource::ActionBlock) action); + + return tes; +} + #define _thread_call_cancel(tc) ((kActive & flags) ? thread_call_cancel_wait((tc)) : thread_call_cancel((tc))) IOTimerEventSource * diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index d331cb2b0..b99f027dd 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -383,6 +383,7 @@ class IOUserIterator : public OSIterator virtual void reset() APPLE_KEXT_OVERRIDE; virtual bool isValid() APPLE_KEXT_OVERRIDE; virtual OSObject * getNextObject() APPLE_KEXT_OVERRIDE; + virtual OSObject * copyNextObject(); }; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -471,11 +472,20 @@ IOUserIterator::isValid() OSObject * IOUserIterator::getNextObject() { - OSObject * ret; + assert(false); + return (NULL); +} + +OSObject * +IOUserIterator::copyNextObject() +{ + OSObject * ret = NULL; IOLockLock(lock); - assert(OSDynamicCast(OSIterator, userIteratorObject)); - ret = ((OSIterator *)userIteratorObject)->getNextObject(); + if (userIteratorObject) { + ret = ((OSIterator *)userIteratorObject)->getNextObject(); + if (ret) ret->retain(); + } IOLockUnlock(lock); return (ret); @@ -616,7 +626,6 @@ class IOServiceUserNotification : public IOUserNotification PingMsg * pingMsg; vm_size_t msgSize; OSArray * newSet; - OSObject * lastEntry; bool armed; bool ipcLogged; @@ -633,6 +642,7 @@ class IOServiceUserNotification : public IOUserNotification virtual bool handler( void * ref, IOService * newService ); virtual OSObject * getNextObject() APPLE_KEXT_OVERRIDE; + virtual OSObject * copyNextObject() 
APPLE_KEXT_OVERRIDE; }; class IOServiceMessageUserNotification : public IOUserNotification @@ -670,6 +680,7 @@ class IOServiceMessageUserNotification : public IOUserNotification void * messageArgument, vm_size_t argSize ); virtual OSObject * getNextObject() APPLE_KEXT_OVERRIDE; + virtual OSObject * copyNextObject() APPLE_KEXT_OVERRIDE; }; /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -776,11 +787,9 @@ void IOServiceUserNotification::free( void ) PingMsg * _pingMsg; vm_size_t _msgSize; OSArray * _newSet; - OSObject * _lastEntry; _pingMsg = pingMsg; _msgSize = msgSize; - _lastEntry = lastEntry; _newSet = newSet; super::free(); @@ -792,9 +801,6 @@ void IOServiceUserNotification::free( void ) IOFree(_pingMsg, _msgSize); } - if( _lastEntry) - _lastEntry->release(); - if( _newSet) _newSet->release(); } @@ -850,16 +856,19 @@ bool IOServiceUserNotification::handler( void * ref, return( true ); } - OSObject * IOServiceUserNotification::getNextObject() +{ + assert(false); + return (NULL); +} + +OSObject * IOServiceUserNotification::copyNextObject() { unsigned int count; OSObject * result; - OSObject * releaseEntry; IOLockLock(lock); - releaseEntry = lastEntry; count = newSet->getCount(); if( count ) { result = newSet->getObject( count - 1 ); @@ -869,12 +878,9 @@ OSObject * IOServiceUserNotification::getNextObject() result = 0; armed = true; } - lastEntry = result; IOLockUnlock(lock); - if (releaseEntry) releaseEntry->release(); - return( result ); } @@ -1068,6 +1074,11 @@ OSObject * IOServiceMessageUserNotification::getNextObject() return( 0 ); } +OSObject * IOServiceMessageUserNotification::copyNextObject() +{ + return( NULL ); +} + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #undef super @@ -1438,7 +1449,7 @@ IOUserClient::registerOwner(task_t task) if (newOwner) { owner = IONew(IOUserClientOwner, 1); - if (!newOwner) ret = kIOReturnNoMemory; + if (!owner) ret = kIOReturnNoMemory; else { owner->task = task; 
@@ -2049,12 +2060,24 @@ kern_return_t is_io_iterator_next( { IOReturn ret; OSObject * obj; + OSIterator * iter; + IOUserIterator * uiter; - CHECK( OSIterator, iterator, iter ); + if ((uiter = OSDynamicCast(IOUserIterator, iterator))) + { + obj = uiter->copyNextObject(); + } + else if ((iter = OSDynamicCast(OSIterator, iterator))) + { + obj = iter->getNextObject(); + if (obj) obj->retain(); + } + else + { + return( kIOReturnBadArgument ); + } - obj = iter->getNextObject(); if( obj) { - obj->retain(); *object = obj; ret = kIOReturnSuccess; } else @@ -3292,8 +3315,8 @@ kern_return_t is_io_registry_entry_get_child_iterator( { CHECK( IORegistryEntry, registry_entry, entry ); - *iterator = entry->getChildIterator( - IORegistryEntry::getPlane( plane )); + *iterator = IOUserIterator::withIterator(entry->getChildIterator( + IORegistryEntry::getPlane( plane ))); return( kIOReturnSuccess ); } @@ -3306,8 +3329,8 @@ kern_return_t is_io_registry_entry_get_parent_iterator( { CHECK( IORegistryEntry, registry_entry, entry ); - *iterator = entry->getParentIterator( - IORegistryEntry::getPlane( plane )); + *iterator = IOUserIterator::withIterator(entry->getParentIterator( + IORegistryEntry::getPlane( plane ))); return( kIOReturnSuccess ); } @@ -4944,7 +4967,7 @@ kern_return_t is_io_catalog_send_data( return kIOReturnBadArgument; } - if (!IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-management")) + if (!IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management")) { OSString * taskName = IOCopyLogNameForPID(proc_selfpid()); IOLog("IOCatalogueSendData(%s): Not entitled\n", taskName ? 
taskName->getCStringNoCopy() : ""); diff --git a/iokit/Kernel/IOWorkLoop.cpp b/iokit/Kernel/IOWorkLoop.cpp index a5eb85181..74efbd0cd 100644 --- a/iokit/Kernel/IOWorkLoop.cpp +++ b/iokit/Kernel/IOWorkLoop.cpp @@ -416,11 +416,14 @@ void IOWorkLoop::disableAllInterrupts() const } while(workToDo); exitThread: - thread_t thread = workThread; + closeGate(); + thread_t thread = workThread; workThread = 0; // Say we don't have a loop and free ourselves + openGate(); + free(); - thread_deallocate(thread); + thread_deallocate(thread); (void) thread_terminate(thread); } @@ -494,6 +497,18 @@ void IOWorkLoop::wakeupGate(void *event, bool oneThread) IORecursiveLockWakeup(gateLock, event, oneThread); } +static IOReturn IOWorkLoopActionToBlock(OSObject *owner, + void *arg0, void *arg1, + void *arg2, void *arg3) +{ + return ((IOWorkLoop::ActionBlock) arg0)(); +} + +IOReturn IOWorkLoop::runActionBlock(ActionBlock action) +{ + return (runAction(&IOWorkLoopActionToBlock, this, action)); +} + IOReturn IOWorkLoop::runAction(Action inAction, OSObject *target, void *arg0, void *arg1, void *arg2, void *arg3) diff --git a/iokit/Tests/Tests.cpp b/iokit/Tests/Tests.cpp index d1ba499d8..dca24997b 100644 --- a/iokit/Tests/Tests.cpp +++ b/iokit/Tests/Tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2018 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -29,7 +29,7 @@ * */ -#define TEST_HEADERS 0 +#define TEST_HEADERS 0 #if TEST_HEADERS @@ -177,11 +177,17 @@ #include #include "Tests.h" -#include -#include #if DEVELOPMENT || DEBUG +#include +#include +#include +#include +#include +#include +#include + static uint64_t gIOWorkLoopTestDeadline; static void @@ -197,6 +203,7 @@ IOWorkLoopTest(int newValue) uint32_t idx; IOWorkLoop * wl; IOTimerEventSource * tes; + IOInterruptEventSource * ies; wl = IOWorkLoop::workLoop(); assert(wl); @@ -204,7 +211,7 @@ IOWorkLoopTest(int newValue) assert(tes); err = wl->addEventSource(tes); assert(kIOReturnSuccess == err); - clock_interval_to_deadline(2000, kMillisecondScale, &gIOWorkLoopTestDeadline); + clock_interval_to_deadline(100, kMillisecondScale, &gIOWorkLoopTestDeadline); for (idx = 0; mach_absolute_time() < gIOWorkLoopTestDeadline; idx++) { tes->setTimeout(idx & 1023, kNanosecondScale); @@ -212,11 +219,166 @@ IOWorkLoopTest(int newValue) tes->cancelTimeout(); wl->removeEventSource(tes); tes->release(); + + int value = 3; + + tes = IOTimerEventSource::timerEventSource(kIOTimerEventSourceOptionsDefault, wl, ^(IOTimerEventSource * tes){ + kprintf("wl %p, value %d\n", wl, value); + }); + err = wl->addEventSource(tes); + assert(kIOReturnSuccess == err); + + value = 2; + tes->setTimeout(1, kNanosecondScale); + IOSleep(1); + wl->removeEventSource(tes); + tes->release(); + + ies = IOInterruptEventSource::interruptEventSource(wl, NULL, 0, ^void(IOInterruptEventSource *sender, int count){ + kprintf("ies block %p, %d\n", sender, count); + }); + + assert(ies); + kprintf("ies %p\n", ies); + err = wl->addEventSource(ies); + assert(kIOReturnSuccess == err); + ies->interruptOccurred(NULL, NULL, 0); + IOSleep(1); + ies->interruptOccurred(NULL, NULL, 0); + IOSleep(1); + wl->removeEventSource(ies); + ies->release(); + wl->release(); return (0); } +static int +OSCollectionTest(int newValue) +{ + OSArray * array = OSArray::withCapacity(8); + 
array->setObject(kOSBooleanTrue); + array->setObject(kOSBooleanFalse); + array->setObject(kOSBooleanFalse); + array->setObject(kOSBooleanTrue); + array->setObject(kOSBooleanFalse); + array->setObject(kOSBooleanTrue); + + __block unsigned int index; + index = 0; + array->iterateObjects(^bool(OSObject * obj) { + kprintf("%d:%d ", index, (obj == kOSBooleanTrue) ? 1 : (obj == kOSBooleanFalse) ? 0 : 2); + index++; + return (false); + }); + kprintf("\n"); + array->release(); + + OSDictionary * dict = IOService::resourceMatching("hello"); + assert(dict); + index = 0; + dict->iterateObjects(^bool(const OSSymbol * sym, OSObject * obj) { + OSString * str = OSDynamicCast(OSString, obj); + assert(str); + kprintf("%d:%s=%s\n", index, sym->getCStringNoCopy(), str->getCStringNoCopy()); + index++; + return (false); + }); + dict->release(); + + OSSerializer * serializer = OSSerializer::withBlock(^bool(OSSerialize * s){ + return (gIOBSDUnitKey->serialize(s)); + }); + assert(serializer); + IOService::getPlatform()->setProperty("OSSerializer_withBlock", serializer); + serializer->release(); + + return (0); +} + +#if 0 +#include +class TestUserClient : public IOUserClient +{ + OSDeclareDefaultStructors(TestUserClient); + virtual void stop( IOService *provider) APPLE_KEXT_OVERRIDE; + virtual bool finalize(IOOptionBits options) APPLE_KEXT_OVERRIDE; + virtual IOReturn externalMethod( uint32_t selector, + IOExternalMethodArguments * arguments, + IOExternalMethodDispatch * dispatch, + OSObject * target, + void * reference ) APPLE_KEXT_OVERRIDE; +}; + +void TestUserClient::stop( IOService *provider) +{ + kprintf("TestUserClient::stop\n"); +} +bool TestUserClient::finalize(IOOptionBits options) +{ + kprintf("TestUserClient::finalize\n"); + return(true); +} +IOReturn TestUserClient::externalMethod( uint32_t selector, + IOExternalMethodArguments * arguments, + IOExternalMethodDispatch * dispatch, + OSObject * target, + void * reference ) +{ + getProvider()->terminate(); + IOSleep(500); + return 
(0); +} +OSDefineMetaClassAndStructors(TestUserClient, IOUserClient); +#endif + +static int +IOServiceTest(int newValue) +{ + OSDictionary * matching; + IONotifier * note; + __block IOService * found; + +#if 0 + found = new IOService; + found->init(); + found->setName("IOTestUserClientProvider"); + found->attach(IOService::getPlatform()); + found->setProperty("IOUserClientClass", "TestUserClient"); + found->registerService(); +#endif + + matching = IOService::serviceMatching("IOPlatformExpert"); + assert(matching); + found = nullptr; + note = IOService::addMatchingNotification(gIOMatchedNotification, matching, 0, + ^bool(IOService * newService, IONotifier * notifier) { + kprintf("found %s, %d\n", newService->getName(), newService->getRetainCount()); + found = newService; + found->retain(); + return (true); + } + ); + assert(note); + assert(found); + matching->release(); + note->remove(); + + note = found->registerInterest(gIOBusyInterest, + ^IOReturn(uint32_t messageType, IOService * provider, + void * messageArgument, size_t argSize) { + kprintf("%p messageType 0x%08x %p\n", provider, messageType, messageArgument); + return (kIOReturnSuccess); + }); + assert(note); + IOSleep(1*1000); + note->remove(); + found->release(); + + return (0); +} + #endif /* DEVELOPMENT || DEBUG */ static int @@ -229,9 +391,34 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused if (error) return (error); #if DEVELOPMENT || DEBUG + if (changed && (66==newValue)) + { + IOReturn ret; + IOWorkLoop * wl = IOWorkLoop::workLoop(); + IOCommandGate * cg = IOCommandGate::commandGate(wl); + ret = wl->addEventSource(cg); + + struct x + { + uint64_t h; + uint64_t l; + }; + struct x y; + + y.h = 0x1111111122222222; + y.l = 0x3333333344444444; + + kprintf("ret1 %d\n", ret); + ret = cg->runActionBlock(^(){ + printf("hello %d 0x%qx\n", wl->inGate(), y.h); + return 99; + }); + kprintf("ret %d\n", ret); + } + if (changed && (999==newValue)) { - OSData * data = 
OSData::withCapacity(16); + OSData * data = OSData::withCapacity(16); data->release(); data->release(); } @@ -241,6 +428,10 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused { error = IOWorkLoopTest(newValue); assert(KERN_SUCCESS == error); + error = IOServiceTest(newValue); + assert(KERN_SUCCESS == error); + error = OSCollectionTest(newValue); + assert(KERN_SUCCESS == error); error = IOMemoryDescriptorTest(newValue); assert(KERN_SUCCESS == error); } @@ -250,7 +441,7 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused } SYSCTL_PROC(_kern, OID_AUTO, iokittest, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, - 0, 0, sysctl_iokittest, "I", ""); + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, 0, sysctl_iokittest, "I", ""); diff --git a/iokit/bsddev/IOKitBSDInit.cpp b/iokit/bsddev/IOKitBSDInit.cpp index 27ef74433..7c0e5e9a7 100644 --- a/iokit/bsddev/IOKitBSDInit.cpp +++ b/iokit/bsddev/IOKitBSDInit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2018 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,6 +38,7 @@ extern "C" { #include #include +#include #include #include #include @@ -57,30 +58,33 @@ extern void mdevremoveall(void); extern int mdevgetrange(int devid, uint64_t *base, uint64_t *size); extern void di_root_ramfile(IORegistryEntry * entry); - #if CONFIG_EMBEDDED #define IOPOLLED_COREFILE (CONFIG_KDP_INTERACTIVE_DEBUGGING) #if defined(XNU_TARGET_OS_BRIDGE) + #define kIOCoreDumpSize 150ULL*1024ULL*1024ULL // leave free space on volume: #define kIOCoreDumpFreeSize 150ULL*1024ULL*1024ULL #define kIOCoreDumpPath "/private/var/internal/kernelcore" -#else -#define kIOCoreDumpSize 350ULL*1024ULL*1024ULL + +#else /* defined(XNU_TARGET_OS_BRIDGE) */ +#define kIOCoreDumpMinSize 350ULL*1024ULL*1024ULL +#define kIOCoreDumpLargeSize 500ULL*1024ULL*1024ULL // leave free space on volume: #define kIOCoreDumpFreeSize 350ULL*1024ULL*1024ULL #define kIOCoreDumpPath "/private/var/vm/kernelcore" -#endif -#elif DEVELOPMENT +#endif /* defined(XNU_TARGET_OS_BRIDGE) */ + +#elif DEVELOPMENT /* CONFIG_EMBEDDED */ #define IOPOLLED_COREFILE 1 // no sizing #define kIOCoreDumpSize 0ULL #define kIOCoreDumpFreeSize 0ULL -#else +#else /* CONFIG_EMBEDDED */ #define IOPOLLED_COREFILE 0 -#endif +#endif /* CONFIG_EMBEDDED */ #if IOPOLLED_COREFILE @@ -764,7 +768,7 @@ kern_return_t IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout ) #include IOPolledFileIOVars * gIOPolledCoreFileVars; - +kern_return_t gIOPolledCoreFileOpenRet = kIOReturnNotReady; #if IOPOLLED_COREFILE static IOReturn @@ -772,6 +776,7 @@ IOOpenPolledCoreFile(const char * filename) { IOReturn err; unsigned int debug; + uint64_t corefile_size_bytes = 0; if (gIOPolledCoreFileVars) return (kIOReturnBusy); if (!IOPolledInterface::gMetaClass.getInstanceCount()) return (kIOReturnUnsupported); @@ -780,15 +785,89 @@ IOOpenPolledCoreFile(const char * filename) PE_parse_boot_argn("debug", &debug, sizeof (debug)); if (DB_DISABLE_LOCAL_CORE & debug) return 
(kIOReturnUnsupported); - err = IOPolledFileOpen(filename, kIOCoreDumpSize, kIOCoreDumpFreeSize, - NULL, 0, - &gIOPolledCoreFileVars, NULL, NULL, 0); - if (kIOReturnSuccess != err) return (err); +#if CONFIG_EMBEDDED + unsigned int requested_corefile_size = 0; + if (PE_parse_boot_argn("corefile_size_mb", &requested_corefile_size, sizeof(requested_corefile_size))) { + IOLog("Boot-args specify %d MB kernel corefile\n", requested_corefile_size); + + corefile_size_bytes = (requested_corefile_size * 1024ULL * 1024ULL); + } +#endif + + + do { +#if defined(kIOCoreDumpLargeSize) + if (0 == corefile_size_bytes) + { + // If no custom size was requested and we're on a device with >3GB of DRAM, attempt + // to allocate a large corefile otherwise use a small file. + if (max_mem > (3 * 1024ULL * 1024ULL * 1024ULL)) + { + corefile_size_bytes = kIOCoreDumpLargeSize; + err = IOPolledFileOpen(filename, + kIOPolledFileCreate, + corefile_size_bytes, kIOCoreDumpFreeSize, + NULL, 0, + &gIOPolledCoreFileVars, NULL, NULL, 0); + if (kIOReturnSuccess == err) + { + break; + } + else if (kIOReturnNoSpace == err) + { + IOLog("Failed to open corefile of size %llu MB (low disk space)", + (corefile_size_bytes / (1024ULL * 1024ULL))); + if (corefile_size_bytes == kIOCoreDumpMinSize) + { + gIOPolledCoreFileOpenRet = err; + return (err); + } + // Try to open a smaller corefile (set size and fall-through) + corefile_size_bytes = kIOCoreDumpMinSize; + } + else + { + IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n", + (corefile_size_bytes / (1024ULL * 1024ULL)), err); + gIOPolledCoreFileOpenRet = err; + return (err); + } + } + else + { + corefile_size_bytes = kIOCoreDumpMinSize; + } + } +#else /* defined(kIOCoreDumpLargeSize) */ + if (0 == corefile_size_bytes) + { + corefile_size_bytes = kIOCoreDumpSize; + } +#endif /* defined(kIOCoreDumpLargeSize) */ + err = IOPolledFileOpen(filename, + kIOPolledFileCreate, + corefile_size_bytes, kIOCoreDumpFreeSize, + NULL, 0, + 
&gIOPolledCoreFileVars, NULL, NULL, 0); + if (kIOReturnSuccess != err) + { + IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n", + (corefile_size_bytes / (1024ULL * 1024ULL)), err); + gIOPolledCoreFileOpenRet = err; + return (err); + } + } while (false); err = IOPolledFilePollersSetup(gIOPolledCoreFileVars, kIOPolledPreflightCoreDumpState); if (kIOReturnSuccess != err) { - IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0); + IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0); + IOLog("IOPolledFilePollersSetup for corefile failed with error: 0x%x\n", err); + gIOPolledCoreFileOpenRet = err; + } + else + { + IOLog("Opened corefile of size %llu MB\n", (corefile_size_bytes / (1024ULL * 1024ULL))); } return (err); @@ -797,6 +876,7 @@ IOOpenPolledCoreFile(const char * filename) static void IOClosePolledCoreFile(void) { + gIOPolledCoreFileOpenRet = kIOReturnNotOpen; IOPolledFilePollersClose(gIOPolledCoreFileVars, kIOPolledPostflightCoreDumpState); IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0); } @@ -940,7 +1020,6 @@ IOBSDMountChange(struct mount * mp, uint32_t op) #endif /* IOPOLLED_COREFILE */ } - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ extern "C" boolean_t diff --git a/iokit/conf/Makefile.x86_64 b/iokit/conf/Makefile.x86_64 index 61de9d584..587fe4264 100644 --- a/iokit/conf/Makefile.x86_64 +++ b/iokit/conf/Makefile.x86_64 @@ -9,7 +9,7 @@ UNCONFIGURED_HIB_FILES= \ HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) # Unconfigured __HIB files must be Mach-O for "setsegname" -IOHibernateRestoreKernel.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG) +IOHibernateRestoreKernel.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG) ###################################################################### #END Machine dependent Makefile fragment for x86_64 diff --git a/iokit/conf/files b/iokit/conf/files index bc356a362..ac6b06e0a 100644 
--- a/iokit/conf/files +++ b/iokit/conf/files @@ -107,3 +107,4 @@ iokit/Kernel/IOPowerConnection.cpp optional iokitcpp # System Management iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp optional iokitcpp + diff --git a/libkdd/kcdata.h b/libkdd/kcdata.h index 702bfacbc..e36c55352 100644 --- a/libkdd/kcdata.h +++ b/libkdd/kcdata.h @@ -436,47 +436,49 @@ struct kcdata_type_definition { * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes * in STACKSHOT_KCTYPE_* types. */ -#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ -#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ +#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ +#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ #define STACKSHOT_KCCONTAINER_TASK 0x903u #define STACKSHOT_KCCONTAINER_THREAD 0x904u -#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ -#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ -#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ -#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ -#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ -#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ -#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ 
+#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ +#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ +#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ +#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ +#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ +#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ +#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ +#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ +#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */ +#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ +#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ +#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ +#define 
STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* uint64_t */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ +#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ +#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u /* struct instrs_cycles_snapshot */ +#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u /* struct stack_snapshot_stacktop */ +#define STACKSHOT_KCTYPE_ASID 0x925u /* uint32_t */ +#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u /* uint64_t */ +#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u /* task_delta_snapshot_v2 */ #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */ -#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ -#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ -#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times */ -#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ -#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ -#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ -#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* 
uint64_t */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ -#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u /* struct instrs_cycles_snapshot */ - -#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ - struct stack_snapshot_frame32 { uint32_t lr; uint32_t sp; @@ -537,6 +539,10 @@ enum task_snapshot_flags { kTaskUUIDInfoMissing = 0x200000, /* some UUID info was paged out */ kTaskUUIDInfoTriedFault = 0x400000, /* tried to fault in UUID info */ kTaskSharedRegionInfoUnavailable = 0x800000, /* shared region info unavailable */ + kTaskTALEngaged = 0x1000000, + /* 0x2000000 unused */ + kTaskIsDirtyTracked = 0x4000000, + kTaskAllowIdleExit = 0x8000000, }; enum thread_snapshot_flags { @@ -785,6 +791,12 @@ struct stackshot_cpu_times { uint64_t system_usec; } __attribute__((packed)); +struct stackshot_cpu_times_v2 { + uint64_t user_usec; + uint64_t system_usec; + uint64_t runnable_usec; +} __attribute__((packed)); + struct stackshot_duration { uint64_t stackshot_duration; uint64_t stackshot_duration_outer; @@ -813,6 +825,12 @@ typedef struct stackshot_thread_waitinfo { #define STACKSHOT_WAITOWNER_SUSPENDED (UINT64_MAX - 7) /* workloop is suspended */ +struct stack_snapshot_stacktop { + uint64_t sp; + uint8_t stack_contents[8]; +}; + + /**************** definitions for crashinfo *********************/ /* @@ -866,6 +884,22 @@ struct crashinfo_proc_uniqidentifierinfo { #define TASK_CRASHINFO_UDATA_PTRS 0x81C /* uint64_t */ #define TASK_CRASHINFO_MEMORY_LIMIT 0x81D /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_INTERNAL 0x81E /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_INTERNAL_COMPRESSED 0x81F /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_IOKIT_MAPPED 0x820 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING 0x821 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING_COMPRESSED 0x822 
/* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE 0x823 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE_COMPRESSED 0x824 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PAGE_TABLE 0x825 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT 0x826 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT_LIFETIME_MAX 0x827 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE 0x828 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED 0x829 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_WIRED_MEM 0x82A /* uint64_t */ + + + #define TASK_CRASHINFO_END KCDATA_TYPE_BUFFER_END /**************** definitions for os reasons *********************/ @@ -963,7 +997,7 @@ kcdata_iter_type(kcdata_iter_t iter) static inline uint32_t kcdata_calc_padding(uint32_t size) { - /* calculate number of bits to add to size to get something divisible by 16 */ + /* calculate number of bytes to add to size to get something divisible by 16 */ return (-size) & 0xf; } diff --git a/libkdd/kcdtypes.c b/libkdd/kcdtypes.c index 0a36b5aa7..c9a2809d5 100644 --- a/libkdd/kcdtypes.c +++ b/libkdd/kcdtypes.c @@ -121,15 +121,15 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s break; } - case KCDATA_TYPE_TYPEDEFINTION: { - i = 0; - setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_type_identifier), "typeID"); - setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_num_elements), "numOfFields"); - setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, offsetof(struct kcdata_type_definition, kct_name), KCDATA_DESC_MAXLEN, "name"); - // Note "fields" is an array of run time defined length. So we populate fields at parsing time. 
- setup_type_definition(retval, type_id, i, "typedef"); - break; - } + case KCDATA_TYPE_TYPEDEFINTION: { + i = 0; + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_type_identifier), "typeID"); + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_num_elements), "numOfFields"); + setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, offsetof(struct kcdata_type_definition, kct_name), KCDATA_DESC_MAXLEN, "name"); + // Note "fields" is an array of run time defined length. So we populate fields at parsing time. + setup_type_definition(retval, type_id, i, "typedef"); + break; + } case KCDATA_TYPE_CONTAINER_BEGIN: { i = 0; @@ -536,8 +536,9 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s case STACKSHOT_KCTYPE_CPU_TIMES: { i = 0; - _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times, user_usec); - _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times, system_usec); + _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times_v2, user_usec); + _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times_v2, system_usec); + _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times_v2, runnable_usec); setup_type_definition(retval, type_id, i, "cpu_times"); break; } @@ -614,6 +615,14 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s break; } + case STACKSHOT_KCTYPE_USER_STACKTOP: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct stack_snapshot_stacktop, sp); + _SUBTYPE_ARRAY(KC_ST_UINT8, struct stack_snapshot_stacktop, stack_contents, 8); + setup_type_definition(retval, type_id, i, "user_stacktop"); + break; + } + case TASK_CRASHINFO_PROC_STARTTIME: { i = 0; _SUBTYPE(KC_ST_INT64, struct timeval64, tv_sec); @@ -784,8 +793,8 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_dirty); _SUBTYPE(KC_ST_UINT32, struct codesigning_exit_reason_info, 
ceri_page_shadow_depth); setup_type_definition(retval, type_id, i, "exit_reason_codesigning_info"); - break; + } case EXIT_REASON_WORKLOOP_ID: { i = 0; @@ -801,8 +810,28 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s break; } + case STACKSHOT_KCTYPE_ASID: { + i = 0; + setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "ts_asid"); + setup_type_definition(retval, type_id, i, "ts_asid"); + break; } + case STACKSHOT_KCTYPE_PAGE_TABLES: { + i = 0; + setup_subtype_description(&subtypes[i++], KC_ST_UINT64, 0, "ts_pagetable"); + setup_type_definition(retval, type_id, i, "ts_pagetable"); + break; + } + + case STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT: { + i = 0; + _SUBTYPE(KC_ST_UINT64, struct user64_dyld_uuid_info, imageLoadAddress); + _SUBTYPE_ARRAY(KC_ST_UINT8, struct user64_dyld_uuid_info, imageUUID, 16); + setup_type_definition(retval, type_id, i, "system_shared_cache_layout"); + break; + } + default: retval = NULL; break; diff --git a/libkdd/kdd.xcodeproj/project.pbxproj b/libkdd/kdd.xcodeproj/project.pbxproj index f9e1e5425..a16b8bdca 100644 --- a/libkdd/kdd.xcodeproj/project.pbxproj +++ b/libkdd/kdd.xcodeproj/project.pbxproj @@ -6,6 +6,21 @@ objectVersion = 46; objects = { +/* Begin PBXAggregateTarget section */ + 08CFD8441FBB9E39008D51F6 /* Default */ = { + isa = PBXAggregateTarget; + buildConfigurationList = 08CFD8471FBB9E39008D51F6 /* Build configuration list for PBXAggregateTarget "Default" */; + buildPhases = ( + ); + dependencies = ( + 08CFD8491FBB9E42008D51F6 /* PBXTargetDependency */, + 08CFD84B1FBB9E43008D51F6 /* PBXTargetDependency */, + ); + name = Default; + productName = Default; + }; +/* End PBXAggregateTarget section */ + /* Begin PBXBuildFile section */ 045F7F121D2ADE7C00B4808B /* stackshot-with-waitinfo in Resources */ = {isa = PBXBuildFile; fileRef = 04C64AC91D25C43400C6C781 /* stackshot-with-waitinfo */; }; 045F7F131D2ADE8000B4808B /* stackshot-with-waitinfo.plist.gz in Resources */ = {isa = PBXBuildFile; 
fileRef = 04C64ACA1D25C43400C6C781 /* stackshot-with-waitinfo.plist.gz */; }; @@ -18,6 +33,8 @@ 084085AC1FA3CE3D005BAD16 /* kdd.h in Headers */ = {isa = PBXBuildFile; fileRef = 084085AA1FA3CE32005BAD16 /* kdd.h */; settings = {ATTRIBUTES = (Public, ); }; }; 0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */ = {isa = PBXBuildFile; fileRef = 0843EE911BF6AFB700CD4150 /* stackshot-sample */; }; 0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */; }; + 084422F82048BABB008A085B /* stackshot-sample-asid in Resources */ = {isa = PBXBuildFile; fileRef = 084422F62048B801008A085B /* stackshot-sample-asid */; }; + 084422F92048BABB008A085B /* stackshot-sample-asid.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 084422F72048B801008A085B /* stackshot-sample-asid.plist.gz */; }; 08603F371BF69EDE007D3784 /* Tests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 08603F361BF69EDE007D3784 /* Tests.swift */; }; 08603F391BF69EDE007D3784 /* libkdd.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C91C93C71ACB58B700119B60 /* libkdd.a */; }; 0860F87A1BFC3857007E1301 /* stackshot-sample-tailspin-2 in Resources */ = {isa = PBXBuildFile; fileRef = 0860F8781BFC3845007E1301 /* stackshot-sample-tailspin-2 */; }; @@ -37,6 +54,8 @@ 08A4C94C1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = 08A4C94B1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m */; }; 08A4C94F1C470F1C00D5F010 /* nested-sample in Resources */ = {isa = PBXBuildFile; fileRef = 08A4C94D1C470F0900D5F010 /* nested-sample */; }; 08A4C9501C470F1C00D5F010 /* nested-sample.plist in Resources */ = {isa = PBXBuildFile; fileRef = 08A4C94E1C470F0900D5F010 /* nested-sample.plist */; }; + 08AD0BF01FBE370000CB41B2 /* stackshot-sample-stacktop in Resources */ = {isa = PBXBuildFile; fileRef = 08AD0BEE1FBE370000CB41B2 /* stackshot-sample-stacktop */; }; + 
08AD0BF11FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08AD0BEF1FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz */; }; 08B480781BF8297500B4AAE0 /* stackshot-sample-new-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */; }; 08B480791BF8297500B4AAE0 /* stackshot-sample-new-arrays.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */; }; 08B4807A1BF8297500B4AAE0 /* stackshot-sample-old-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B480761BF8294E00B4AAE0 /* stackshot-sample-old-arrays */; }; @@ -59,6 +78,8 @@ 08F2AC0B1FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08F2AC091FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz */; }; 1368F0851C87E06A00940FC6 /* exitreason-codesigning.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 1368F0841C87E06300940FC6 /* exitreason-codesigning.plist.gz */; }; 1368F0861C87E06C00940FC6 /* exitreason-codesigning in Resources */ = {isa = PBXBuildFile; fileRef = 1368F0831C87E06300940FC6 /* exitreason-codesigning */; }; + 13739E8520DB18B600D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */; }; + 13739E8620DB18B600D8D9B9 /* stackshot-with-shared-cache-layout in Resources */ = {isa = PBXBuildFile; fileRef = 13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */; }; 13A79CAA1CF8C5D600FFC181 /* stackshot-with-kcid in Resources */ = {isa = PBXBuildFile; fileRef = 13A79CA81CF8C5D200FFC181 /* stackshot-with-kcid */; }; 13A79CAB1CF8C5D600FFC181 /* stackshot-with-kcid.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13A79CA91CF8C5D200FFC181 /* stackshot-with-kcid.plist.gz */; }; 13CC08441CB97F8D00EA6069 /* 
stackshot-fault-stats in Resources */ = {isa = PBXBuildFile; fileRef = 13CC08421CB97F8A00EA6069 /* stackshot-fault-stats */; }; @@ -83,6 +104,8 @@ C91C93E51ACB598700119B60 /* KCDBasicTypeDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = C91C93E11ACB598700119B60 /* KCDBasicTypeDescription.m */; }; C91C93E61ACB598700119B60 /* KCDStructTypeDescription.h in Headers */ = {isa = PBXBuildFile; fileRef = C91C93E21ACB598700119B60 /* KCDStructTypeDescription.h */; }; C91C93E71ACB598700119B60 /* KCDStructTypeDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = C91C93E31ACB598700119B60 /* KCDStructTypeDescription.m */; }; + C95E4D1A204F42C500FD2229 /* stackshot-sample-cpu-times.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = C95E4D18204F42C400FD2229 /* stackshot-sample-cpu-times.plist.gz */; }; + C95E4D1B204F42C500FD2229 /* stackshot-sample-cpu-times in Resources */ = {isa = PBXBuildFile; fileRef = C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */; }; C9C5C68C1ACDAFDB00BE0E5E /* kcdtypes.c in Sources */ = {isa = PBXBuildFile; fileRef = C9C5C68B1ACDAFDB00BE0E5E /* kcdtypes.c */; }; C9D7B53F1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = C9D7B53D1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz */; }; C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */ = {isa = PBXBuildFile; fileRef = C9D7B53E1D1B41D700F1019D /* xnupost_testconfig-sample */; }; @@ -92,20 +115,27 @@ /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ - 08603F3A1BF69EDE007D3784 /* PBXContainerItemProxy */ = { + 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */ = { isa = PBXContainerItemProxy; containerPortal = C91C93BF1ACB58B700119B60 /* Project object */; proxyType = 1; remoteGlobalIDString = C91C93C61ACB58B700119B60; remoteInfo = libkdd; }; - 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */ = { + 08CFD8481FBB9E42008D51F6 /* PBXContainerItemProxy */ = { isa = 
PBXContainerItemProxy; containerPortal = C91C93BF1ACB58B700119B60 /* Project object */; proxyType = 1; remoteGlobalIDString = C91C93C61ACB58B700119B60; remoteInfo = libkdd; }; + 08CFD84A1FBB9E43008D51F6 /* PBXContainerItemProxy */ = { + isa = PBXContainerItemProxy; + containerPortal = C91C93BF1ACB58B700119B60 /* Project object */; + proxyType = 1; + remoteGlobalIDString = 0864FCEE1FA3C0B7001B7B0B; + remoteInfo = kdd.framework; + }; /* End PBXContainerItemProxy section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -132,6 +162,8 @@ 084085AE1FA3D156005BAD16 /* module.modulemap */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = "sourcecode.module-map"; path = module.modulemap; sourceTree = ""; }; 0843EE911BF6AFB700CD4150 /* stackshot-sample */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample"; path = "tests/stackshot-sample"; sourceTree = SOURCE_ROOT; }; 0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = archive.gzip; name = "stackshot-sample.plist.gz"; path = "tests/stackshot-sample.plist.gz"; sourceTree = SOURCE_ROOT; }; + 084422F62048B801008A085B /* stackshot-sample-asid */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-asid"; sourceTree = ""; }; + 084422F72048B801008A085B /* stackshot-sample-asid.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-asid.plist.gz"; sourceTree = ""; }; 08603F341BF69EDE007D3784 /* tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 08603F361BF69EDE007D3784 /* Tests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tests.swift; sourceTree = ""; }; 08603F381BF69EDE007D3784 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; @@ -150,6 +182,8 @@ 
08A4C94B1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDEmbeddedBufferDescription.m; sourceTree = ""; }; 08A4C94D1C470F0900D5F010 /* nested-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "nested-sample"; path = "tests/nested-sample"; sourceTree = SOURCE_ROOT; }; 08A4C94E1C470F0900D5F010 /* nested-sample.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = "nested-sample.plist"; path = "tests/nested-sample.plist"; sourceTree = SOURCE_ROOT; }; + 08AD0BEE1FBE370000CB41B2 /* stackshot-sample-stacktop */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-stacktop"; sourceTree = ""; }; + 08AD0BEF1FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-stacktop.plist.gz"; sourceTree = ""; }; 08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample-new-arrays"; path = "tests/stackshot-sample-new-arrays"; sourceTree = SOURCE_ROOT; }; 08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-new-arrays.plist.gz"; path = "tests/stackshot-sample-new-arrays.plist.gz"; sourceTree = SOURCE_ROOT; }; 08B480761BF8294E00B4AAE0 /* stackshot-sample-old-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample-old-arrays"; path = "tests/stackshot-sample-old-arrays"; sourceTree = SOURCE_ROOT; }; @@ -172,6 +206,8 @@ 08F2AC091FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-delta-thread-policy.plist.gz"; sourceTree = ""; }; 1368F0831C87E06300940FC6 /* exitreason-codesigning */ = {isa = PBXFileReference; 
lastKnownFileType = file; name = "exitreason-codesigning"; path = "tests/exitreason-codesigning"; sourceTree = SOURCE_ROOT; }; 1368F0841C87E06300940FC6 /* exitreason-codesigning.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "exitreason-codesigning.plist.gz"; path = "tests/exitreason-codesigning.plist.gz"; sourceTree = SOURCE_ROOT; }; + 13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-with-shared-cache-layout.plist.gz"; sourceTree = ""; }; + 13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-with-shared-cache-layout"; sourceTree = ""; }; 13A79CA81CF8C5D200FFC181 /* stackshot-with-kcid */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-with-kcid"; path = "tests/stackshot-with-kcid"; sourceTree = SOURCE_ROOT; }; 13A79CA91CF8C5D200FFC181 /* stackshot-with-kcid.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-with-kcid.plist.gz"; path = "tests/stackshot-with-kcid.plist.gz"; sourceTree = SOURCE_ROOT; }; 13AF287B1C4A0D6A000795E2 /* corpse-twr-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "corpse-twr-sample"; path = "tests/corpse-twr-sample"; sourceTree = SOURCE_ROOT; }; @@ -197,6 +233,8 @@ C91C93E11ACB598700119B60 /* KCDBasicTypeDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDBasicTypeDescription.m; sourceTree = ""; }; C91C93E21ACB598700119B60 /* KCDStructTypeDescription.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KCDStructTypeDescription.h; sourceTree = ""; }; C91C93E31ACB598700119B60 /* KCDStructTypeDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDStructTypeDescription.m; sourceTree = ""; }; 
+ C95E4D18204F42C400FD2229 /* stackshot-sample-cpu-times.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-cpu-times.plist.gz"; sourceTree = ""; }; + C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-cpu-times"; sourceTree = ""; }; C9C5C68B1ACDAFDB00BE0E5E /* kcdtypes.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kcdtypes.c; sourceTree = ""; }; C9D7B53D1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "xnupost_testconfig-sample.plist.gz"; sourceTree = ""; }; C9D7B53E1D1B41D700F1019D /* xnupost_testconfig-sample */ = {isa = PBXFileReference; lastKnownFileType = file; path = "xnupost_testconfig-sample"; sourceTree = ""; }; @@ -245,6 +283,14 @@ 08603F351BF69EDE007D3784 /* tests */ = { isa = PBXGroup; children = ( + 13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */, + 13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */, + C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */, + C95E4D18204F42C400FD2229 /* stackshot-sample-cpu-times.plist.gz */, + 084422F62048B801008A085B /* stackshot-sample-asid */, + 084422F72048B801008A085B /* stackshot-sample-asid.plist.gz */, + 08AD0BEE1FBE370000CB41B2 /* stackshot-sample-stacktop */, + 08AD0BEF1FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz */, 08F2AC081FA136EB00271A11 /* stackshot-sample-delta-thread-policy */, 08F2AC091FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz */, 18C577C51F96DB7100C67EB3 /* stackshot-sample-thread-groups-flags.plist.gz */, @@ -423,7 +469,6 @@ buildRules = ( ); dependencies = ( - 08603F3B1BF69EDE007D3784 /* PBXTargetDependency */, ); name = tests; productName = Tests; @@ -505,6 +550,10 @@ CreatedOnToolsVersion = 9.1; ProvisioningStyle = Automatic; }; + 08CFD8441FBB9E39008D51F6 = { + 
CreatedOnToolsVersion = 9.0.1; + ProvisioningStyle = Automatic; + }; C91C93C61ACB58B700119B60 = { CreatedOnToolsVersion = 7.0; }; @@ -522,6 +571,7 @@ projectDirPath = ""; projectRoot = ""; targets = ( + 08CFD8441FBB9E39008D51F6 /* Default */, C91C93C61ACB58B700119B60 /* libkdd */, 086395B11BF5655D005ED913 /* kdd */, 08603F331BF69EDE007D3784 /* tests */, @@ -535,6 +585,9 @@ isa = PBXResourcesBuildPhase; buildActionMask = 2147483647; files = ( + 13739E8620DB18B600D8D9B9 /* stackshot-with-shared-cache-layout in Resources */, + 084422F82048BABB008A085B /* stackshot-sample-asid in Resources */, + 084422F92048BABB008A085B /* stackshot-sample-asid.plist.gz in Resources */, 08F2AC0A1FA136EB00271A11 /* stackshot-sample-delta-thread-policy in Resources */, 18C577C61F96DB7100C67EB3 /* stackshot-sample-thread-groups-flags.plist.gz in Resources */, 18C577C31F96DB5200C67EB3 /* stackshot-sample-thread-groups-flags in Resources */, @@ -544,11 +597,13 @@ 088C36E11EF323C300ABB2E0 /* stackshot-sample-thread-policy.plist.gz in Resources */, 045F7F131D2ADE8000B4808B /* stackshot-with-waitinfo.plist.gz in Resources */, 045F7F121D2ADE7C00B4808B /* stackshot-with-waitinfo in Resources */, + C95E4D1A204F42C500FD2229 /* stackshot-sample-cpu-times.plist.gz in Resources */, 08A4C94F1C470F1C00D5F010 /* nested-sample in Resources */, 1862B0341E7A083F0005ADF4 /* stackshot-sample-thread-groups in Resources */, 08A4C9501C470F1C00D5F010 /* nested-sample.plist in Resources */, 13D6C5D21C4DDDBE005E617C /* test-twr-sample in Resources */, 13D6C5D01C4DDDB6005E617C /* corpse-twr-sample in Resources */, + C95E4D1B204F42C500FD2229 /* stackshot-sample-cpu-times in Resources */, C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */, 081725D51C3F476500371A54 /* stackshot-sample-duration in Resources */, 08F2AC0B1FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz in Resources */, @@ -564,6 +619,7 @@ 0860F87B1BFC3857007E1301 /* stackshot-sample-tailspin-2.plist.gz in Resources */, 
08CF18FF1BF9B7B100D05813 /* stackshot-sample-tailspin in Resources */, 1368F0861C87E06C00940FC6 /* exitreason-codesigning in Resources */, + 13739E8520DB18B600D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz in Resources */, 13DBA26A1CAB1BA000227EB2 /* stackshot-sample-sharedcachev2 in Resources */, C9D7B53F1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz in Resources */, 13DBA2681CAB1AD600227EB2 /* stackshot-sample-sharedcachev2.plist.gz in Resources */, @@ -577,6 +633,7 @@ 08C9D83E1BFFF8E100DF6C05 /* exitreason-sample.plist.gz in Resources */, 18E592981E9451A20018612A /* stackshot-sample-coalitions in Resources */, 08B4808B1BF9474A00B4AAE0 /* corpse-sample in Resources */, + 08AD0BF01FBE370000CB41B2 /* stackshot-sample-stacktop in Resources */, 13D6C5D11C4DDDB8005E617C /* corpse-twr-sample.plist.gz in Resources */, 08B4808C1BF9474A00B4AAE0 /* corpse-sample.plist.gz in Resources */, 08B480881BF92E0500B4AAE0 /* kcdata.py in Resources */, @@ -591,6 +648,7 @@ 08B4807B1BF8297500B4AAE0 /* stackshot-sample-old-arrays.plist.gz in Resources */, 0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */, 0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */, + 08AD0BF11FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz in Resources */, 18E592991E9451A20018612A /* stackshot-sample-coalitions.plist.gz in Resources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -668,15 +726,20 @@ /* End PBXSourcesBuildPhase section */ /* Begin PBXTargetDependency section */ - 08603F3B1BF69EDE007D3784 /* PBXTargetDependency */ = { + 086395BB1BF565AB005ED913 /* PBXTargetDependency */ = { isa = PBXTargetDependency; target = C91C93C61ACB58B700119B60 /* libkdd */; - targetProxy = 08603F3A1BF69EDE007D3784 /* PBXContainerItemProxy */; + targetProxy = 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */; }; - 086395BB1BF565AB005ED913 /* PBXTargetDependency */ = { + 08CFD8491FBB9E42008D51F6 /* PBXTargetDependency */ = { isa = PBXTargetDependency; target = 
C91C93C61ACB58B700119B60 /* libkdd */; - targetProxy = 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */; + targetProxy = 08CFD8481FBB9E42008D51F6 /* PBXContainerItemProxy */; + }; + 08CFD84B1FBB9E43008D51F6 /* PBXTargetDependency */ = { + isa = PBXTargetDependency; + target = 0864FCEE1FA3C0B7001B7B0B /* kdd.framework */; + targetProxy = 08CFD84A1FBB9E43008D51F6 /* PBXContainerItemProxy */; }; /* End PBXTargetDependency section */ @@ -689,13 +752,14 @@ COMBINE_HIDPI_IMAGES = YES; ENABLE_TESTABILITY = YES; INFOPLIST_FILE = tests/Info.plist; + INSTALL_PATH = /AppleInternal/XCTests/com.apple.libkdd; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks"; PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests; PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = macosx; SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h; SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - SWIFT_VERSION = 3.0; + SWIFT_VERSION = 4.0; }; name = Debug; }; @@ -706,12 +770,13 @@ CODE_SIGN_IDENTITY = "-"; COMBINE_HIDPI_IMAGES = YES; INFOPLIST_FILE = tests/Info.plist; + INSTALL_PATH = /AppleInternal/XCTests/com.apple.libkdd; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks"; PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests; PRODUCT_NAME = "$(TARGET_NAME)"; SDKROOT = macosx; SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h; - SWIFT_VERSION = 3.0; + SWIFT_VERSION = 4.0; }; name = Release; }; @@ -767,7 +832,7 @@ GCC_C_LANGUAGE_STANDARD = gnu11; INFOPLIST_FILE = kdd.framework/Info.plist; INSTALLHDRS_SCRIPT_PHASE = YES; - INSTALL_PATH = /AppleInternal/Ariadne/Frameworks/; + INSTALL_PATH = /AppleInternal/Library/Frameworks/; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/Frameworks"; MACOSX_DEPLOYMENT_TARGET = 10.13; MODULEMAP_FILE = "$(SRCROOT)/kdd.framework/module.modulemap"; @@ -810,7 +875,7 @@ GCC_C_LANGUAGE_STANDARD = gnu11; INFOPLIST_FILE = kdd.framework/Info.plist; INSTALLHDRS_SCRIPT_PHASE = YES; - 
INSTALL_PATH = /AppleInternal/Ariadne/Frameworks/; + INSTALL_PATH = /AppleInternal/Library/Frameworks/; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/Frameworks"; MACOSX_DEPLOYMENT_TARGET = 10.13; MODULEMAP_FILE = "$(SRCROOT)/kdd.framework/module.modulemap"; @@ -824,6 +889,147 @@ }; name = Release; }; + 08CFD8451FBB9E39008D51F6 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Debug; + }; + 08CFD8461FBB9E39008D51F6 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Release; + }; + 08CFD84C1FBB9E72008D51F6 /* ReleaseHost */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_IMPLICIT_SIGN_CONVERSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + HEADER_SEARCH_PATHS = "$(SRCROOT)"; + MTL_ENABLE_DEBUG_INFO = NO; + OTHER_CFLAGS = ""; + SDKROOT = macosx.internal; + }; + name = ReleaseHost; + }; + 08CFD84D1FBB9E72008D51F6 /* ReleaseHost */ = { + isa = 
XCBuildConfiguration; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = ReleaseHost; + }; + 08CFD84E1FBB9E72008D51F6 /* ReleaseHost */ = { + isa = XCBuildConfiguration; + buildSettings = { + COMBINE_HIDPI_IMAGES = YES; + EXECUTABLE_PREFIX = lib; + OTHER_CFLAGS = "-I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders"; + PRODUCT_NAME = kdd; + }; + name = ReleaseHost; + }; + 08CFD84F1FBB9E72008D51F6 /* ReleaseHost */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "-"; + MACOSX_DEPLOYMENT_TARGET = 10.11; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = macosx; + }; + name = ReleaseHost; + }; + 08CFD8501FBB9E72008D51F6 /* ReleaseHost */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "-"; + COMBINE_HIDPI_IMAGES = YES; + INFOPLIST_FILE = tests/Info.plist; + INSTALL_PATH = /AppleInternal/XCTests/com.apple.libkdd; + LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks"; + PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests; + PRODUCT_NAME = "$(TARGET_NAME)"; + SDKROOT = macosx; + SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h; + SWIFT_VERSION = 4.0; + }; + name = ReleaseHost; + }; + 08CFD8511FBB9E72008D51F6 /* ReleaseHost */ = { + isa = XCBuildConfiguration; + buildSettings = { + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_ENABLE_MODULES = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + 
CODE_SIGN_IDENTITY = "-"; + CODE_SIGN_STYLE = Automatic; + COMBINE_HIDPI_IMAGES = YES; + CURRENT_PROJECT_VERSION = 1; + DEFINES_MODULE = YES; + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + DYLIB_INSTALL_NAME_BASE = "@rpath"; + FRAMEWORK_VERSION = A; + GCC_C_LANGUAGE_STANDARD = gnu11; + INFOPLIST_FILE = kdd.framework/Info.plist; + INSTALLHDRS_SCRIPT_PHASE = YES; + INSTALL_PATH = /AppleInternal/Ariadne/Frameworks/; + LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/Frameworks"; + MACOSX_DEPLOYMENT_TARGET = 10.13; + MODULEMAP_FILE = "$(SRCROOT)/kdd.framework/module.modulemap"; + PRODUCT_BUNDLE_IDENTIFIER = "test.kdd-framework"; + PRODUCT_NAME = kdd; + SDKROOT = macosx.internal; + SKIP_INSTALL = NO; + SUPPORTS_TEXT_BASED_API = YES; + VERSIONING_SYSTEM = "apple-generic"; + VERSION_INFO_PREFIX = ""; + }; + name = ReleaseHost; + }; C91C93D81ACB58B700119B60 /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { @@ -932,6 +1138,7 @@ buildConfigurations = ( 08603F3C1BF69EDE007D3784 /* Debug */, 08603F3D1BF69EDE007D3784 /* Release */, + 08CFD8501FBB9E72008D51F6 /* ReleaseHost */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; @@ -941,6 +1148,7 @@ buildConfigurations = ( 086395B71BF5655D005ED913 /* Debug */, 086395B81BF5655D005ED913 /* Release */, + 08CFD84F1FBB9E72008D51F6 /* ReleaseHost */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; @@ -950,6 +1158,17 @@ buildConfigurations = ( 0864FD001FA3C0B7001B7B0B /* Debug */, 0864FD011FA3C0B7001B7B0B /* Release */, + 08CFD8511FBB9E72008D51F6 /* ReleaseHost */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 08CFD8471FBB9E39008D51F6 /* Build configuration list for PBXAggregateTarget "Default" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 08CFD8451FBB9E39008D51F6 /* Debug */, + 08CFD8461FBB9E39008D51F6 /* Release */, + 08CFD84D1FBB9E72008D51F6 /* ReleaseHost */, ); 
defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; @@ -959,6 +1178,7 @@ buildConfigurations = ( C91C93D81ACB58B700119B60 /* Debug */, C91C93D91ACB58B700119B60 /* Release */, + 08CFD84C1FBB9E72008D51F6 /* ReleaseHost */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; @@ -968,6 +1188,7 @@ buildConfigurations = ( C91C93DB1ACB58B700119B60 /* Debug */, C91C93DC1ACB58B700119B60 /* Release */, + 08CFD84E1FBB9E72008D51F6 /* ReleaseHost */, ); defaultConfigurationIsVisible = 0; defaultConfigurationName = Release; diff --git a/libkdd/tests/Tests.swift b/libkdd/tests/Tests.swift index 7a8a6ffee..e073f85a4 100644 --- a/libkdd/tests/Tests.swift +++ b/libkdd/tests/Tests.swift @@ -1208,7 +1208,7 @@ class Tests: XCTestCase { // check that we agree with sample file guard let sampledata = self.dataWithResource(name) - else { XCTFail(); return } + else { XCTFail("failed to open bundle resource named " + name); return } var dict : NSDictionary? dict = try? self.parseBuffer(sampledata) as NSDictionary @@ -1233,14 +1233,16 @@ class Tests: XCTestCase { self.dataWithResource(name + ".plist") else {XCTFail(); return} - var dict2 = try? PropertyListSerialization.propertyList(from: plistdata as Data, options: [], format: nil) - if dict2 == nil { - dict2 = try? PropertyListSerialization.propertyList(from:decompress(plistdata) as Data, options:[], format: nil) + var opt_dict2 = try? PropertyListSerialization.propertyList(from: plistdata as Data, options: [], format: nil) + if opt_dict2 == nil { + opt_dict2 = try? PropertyListSerialization.propertyList(from:decompress(plistdata) as Data, options:[], format: nil) } + guard let dict2 = opt_dict2 + else { XCTFail(); return} - XCTAssert(dict2 != nil) + XCTAssertEqual(dict, dict2 as! NSDictionary); - XCTAssert(dict == dict2 as? NSDictionary) + //XCTAssert(dict == dict2 as? 
NSDictionary) // check that we agree with python @@ -1378,6 +1380,26 @@ class Tests: XCTestCase { self.testSampleStackshot("stackshot-sample-instrs-cycles") } + func testStackshotWithStacktop() { + self.testSampleStackshot("stackshot-sample-stacktop") + } + + func testStackshotWithASID() { + self.testSampleStackshot("stackshot-sample-asid") + } + + func testStackshotWithPageTables() { + self.testSampleStackshot("stackshot-sample-asid-pagetable") + } + + func testStackshotCPUTimes() { + self.testSampleStackshot("stackshot-sample-cpu-times") + } + + func testStackshotWithSharedCacheLayout() { + self.testSampleStackshot("stackshot-with-shared-cache-layout") + } + func testTrivial() { } } diff --git a/libkdd/tests/stackshot-sample-asid b/libkdd/tests/stackshot-sample-asid new file mode 100644 index 000000000..048e7c4c2 Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid differ diff --git a/libkdd/tests/stackshot-sample-asid-pagetable b/libkdd/tests/stackshot-sample-asid-pagetable new file mode 100644 index 000000000..5f278e96e Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid-pagetable differ diff --git a/libkdd/tests/stackshot-sample-asid-pagetable.plist.gz b/libkdd/tests/stackshot-sample-asid-pagetable.plist.gz new file mode 100644 index 000000000..8542e5e53 Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid-pagetable.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-asid.plist.gz b/libkdd/tests/stackshot-sample-asid.plist.gz new file mode 100644 index 000000000..4c371a261 Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-cpu-times b/libkdd/tests/stackshot-sample-cpu-times new file mode 100644 index 000000000..f7d7f843a Binary files /dev/null and b/libkdd/tests/stackshot-sample-cpu-times differ diff --git a/libkdd/tests/stackshot-sample-cpu-times.plist.gz b/libkdd/tests/stackshot-sample-cpu-times.plist.gz new file mode 100644 index 
000000000..f0092d93e Binary files /dev/null and b/libkdd/tests/stackshot-sample-cpu-times.plist.gz differ diff --git a/libkdd/tests/stackshot-sample-stacktop b/libkdd/tests/stackshot-sample-stacktop new file mode 100644 index 000000000..3d3bbed94 Binary files /dev/null and b/libkdd/tests/stackshot-sample-stacktop differ diff --git a/libkdd/tests/stackshot-sample-stacktop.plist.gz b/libkdd/tests/stackshot-sample-stacktop.plist.gz new file mode 100644 index 000000000..079ca7d3b Binary files /dev/null and b/libkdd/tests/stackshot-sample-stacktop.plist.gz differ diff --git a/libkdd/tests/stackshot-with-shared-cache-layout b/libkdd/tests/stackshot-with-shared-cache-layout new file mode 100644 index 000000000..8f218d18c Binary files /dev/null and b/libkdd/tests/stackshot-with-shared-cache-layout differ diff --git a/libkdd/tests/stackshot-with-shared-cache-layout.plist.gz b/libkdd/tests/stackshot-with-shared-cache-layout.plist.gz new file mode 100644 index 000000000..54808ef83 Binary files /dev/null and b/libkdd/tests/stackshot-with-shared-cache-layout.plist.gz differ diff --git a/libkern/OSKextVersion.c b/libkern/OSKextVersion.c index ea6a8e53c..236984072 100644 --- a/libkern/OSKextVersion.c +++ b/libkern/OSKextVersion.c @@ -38,18 +38,18 @@ #include #define VERS_MAJOR_DIGITS (4) -#define VERS_MINOR_DIGITS (2) -#define VERS_REVISION_DIGITS (2) +#define VERS_MINOR_DIGITS (4) +#define VERS_REVISION_DIGITS (4) #define VERS_STAGE_DIGITS (1) #define VERS_STAGE_LEVEL_DIGITS (3) #define VERS_MAJOR_MAX (9999) #define VERS_STAGE_LEVEL_MAX (255) -#define VERS_MAJOR_MULT (100000000) -#define VERS_MINOR_MULT (1000000) -#define VERS_REVISION_MULT (10000) -#define VERS_STAGE_MULT (1000) +#define VERS_MAJOR_MULT (1000000000000) +#define VERS_MINOR_MULT (100000000) +#define VERS_REVISION_MULT (10000) +#define VERS_STAGE_MULT (1000) typedef enum { diff --git a/libkern/c++/OSCollection.cpp b/libkern/c++/OSCollection.cpp index 53b3b7b96..260d9b361 100644 --- a/libkern/c++/OSCollection.cpp 
+++ b/libkern/c++/OSCollection.cpp @@ -102,3 +102,37 @@ OSCollection * OSCollection::copyCollection(OSDictionary *cycleDict) return this; } } + +bool OSCollection::iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object)) +{ + uint64_t iteratorStore[2]; + unsigned int initialUpdateStamp; + bool done; + + assert(iteratorSize() < sizeof(iteratorStore)); + + if (!initIterator(&iteratorStore[0])) return (false); + + initialUpdateStamp = updateStamp; + done = false; + do + { + OSObject * object; + if (!getNextObjectForIterator(&iteratorStore[0], &object)) break; + done = callback(refcon, object); + } + while (!done && (initialUpdateStamp == updateStamp)); + + return initialUpdateStamp == updateStamp; +} + +static bool OSCollectionIterateObjectsBlock(void * refcon, OSObject * object) +{ + bool (^block)(OSObject * object) = (typeof(block)) refcon; + return (block(object)); +} + +bool OSCollection::iterateObjects(bool (^block)(OSObject * object)) +{ + return (iterateObjects((void *) block, OSCollectionIterateObjectsBlock)); +} diff --git a/libkern/c++/OSData.cpp b/libkern/c++/OSData.cpp index fda3dd7c9..e37bb128b 100644 --- a/libkern/c++/OSData.cpp +++ b/libkern/c++/OSData.cpp @@ -289,7 +289,9 @@ bool OSData::appendBytes(const void *bytes, unsigned int inLength) if (capacity == EXTERNAL) return false; - newSize = length + inLength; + if (os_add_overflow(length, inLength, &newSize)) + return false; + if ( (newSize > capacity) && newSize > ensureCapacity(newSize) ) return false; @@ -313,7 +315,9 @@ bool OSData::appendByte(unsigned char byte, unsigned int inLength) if (capacity == EXTERNAL) return false; - newSize = length + inLength; + if (os_add_overflow(length, inLength, &newSize)) + return false; + if ( (newSize > capacity) && newSize > ensureCapacity(newSize) ) return false; diff --git a/libkern/c++/OSDictionary.cpp b/libkern/c++/OSDictionary.cpp index 27224c707..868152ba1 100644 --- a/libkern/c++/OSDictionary.cpp +++ 
b/libkern/c++/OSDictionary.cpp @@ -723,3 +723,31 @@ OSArray * OSDictionary::copyKeys(void) } return (array); } + +bool OSDictionary::iterateObjects(void * refcon, bool (*callback)(void * refcon, const OSSymbol * key, OSObject * object)) +{ + unsigned int initialUpdateStamp; + bool done; + + initialUpdateStamp = updateStamp; + done = false; + for (unsigned int i = 0; i < count; i++) + { + done = callback(refcon, dictionary[i].key, EXT_CAST(dictionary[i].value)); + if (done) break; + if (initialUpdateStamp != updateStamp) break; + } + + return initialUpdateStamp == updateStamp; +} + +static bool OSDictionaryIterateObjectsBlock(void * refcon, const OSSymbol * key, OSObject * object) +{ + bool (^block)(const OSSymbol * key, OSObject * object) = (typeof(block)) refcon; + return (block(key, object)); +} + +bool OSDictionary::iterateObjects(bool (^block)(const OSSymbol * key, OSObject * object)) +{ + return (iterateObjects((void *)block, &OSDictionaryIterateObjectsBlock)); +} diff --git a/libkern/c++/OSKext.cpp b/libkern/c++/OSKext.cpp index feb99abad..3f448512c 100644 --- a/libkern/c++/OSKext.cpp +++ b/libkern/c++/OSKext.cpp @@ -48,7 +48,6 @@ extern "C" { #include #include #include -// 04/18/11 - gab: #include #include @@ -81,12 +80,17 @@ extern "C" { extern "C" { extern int IODTGetLoaderInfo(const char * key, void ** infoAddr, int * infoSize); extern void IODTFreeLoaderInfo(const char * key, void * infoAddr, int infoSize); -extern void OSRuntimeUnloadCPPForSegment(kernel_segment_command_t * segment); -extern void OSRuntimeUnloadCPP(kmod_info_t * ki, void * data); extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); /* osfmk/machine/pmap.h */ +extern int dtrace_keep_kernel_symbols(void); } +extern unsigned long gVirtBase; +extern unsigned long gPhysBase; +#if CONFIG_EMBEDDED +extern vm_offset_t segLOWESTTEXT; +#endif /* CONFIG_EMBEDDED */ + static OSReturn _OSKextCreateRequest( const char * predicate, OSDictionary ** requestP); @@ -110,10 +114,6 @@ static bool 
_OSKextInUnloadedPrelinkedKexts(const OSSymbol * theBundleID); // So few pad slots, though.... static bool _OSArrayContainsCString(OSArray * array, const char * cString); -#if CONFIG_KEC_FIPS -static void * GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDict); -#endif // CONFIG_KEC_FIPS - /* Prelinked arm kexts do not have VM entries because the method we use to * fake an entry (see libsa/bootstrap.cpp:readPrelinkedExtensions()) does * not work on ARM. To get around that, we must free prelinked kext @@ -327,6 +327,23 @@ kmod_info_t g_kernel_kmod_info = { /* stop */ 0 }; +/* Set up a fake kmod_info struct for statically linked kexts that don't have one. */ + +kmod_info_t invalid_kmod_info = { + /* next */ 0, + /* info_version */ KMOD_INFO_VERSION, + /* id */ UINT32_MAX, + /* name */ "invalid", + /* version */ "0", + /* reference_count */ -1, + /* reference_list */ NULL, + /* address */ 0, + /* size */ 0, + /* hdr_size */ 0, + /* start */ 0, + /* stop */ 0 +}; + extern "C" { // symbol 'kmod' referenced in: model_dep.c, db_trace.c, symbols.c, db_low_trace.c, // dtrace.c, dtrace_glue.h, OSKext.cpp, locore.s, lowmem_vectors.s, @@ -352,6 +369,25 @@ static u_long last_unloaded_strlen = 0; static void * last_unloaded_address = NULL; static u_long last_unloaded_size = 0; +// Statically linked kmods described by several mach-o sections: +// +// kPrelinkInfoSegment:kBuiltinInfoSection +// Array of pointers to kmod_info_t structs. +// +// kPrelinkInfoSegment:kBuiltinInfoSection +// Array of pointers to an embedded mach-o header. +// +// __DATA:kBuiltinInitSection, kBuiltinTermSection +// Structors for all kmods. Has to be filtered by proc address. 
+// + +static uint32_t gBuiltinKmodsCount; +static kernel_section_t * gBuiltinKmodsSectionInfo; +static kernel_section_t * gBuiltinKmodsSectionStart; + +static const OSSymbol * gIOSurfaceIdentifier; +vm_tag_t gIOSurfaceTag; + /********************************************************************* * sKextInnerLock protects against cross-calls with IOService and * IOCatalogue, and owns the variables declared immediately below. @@ -687,6 +723,10 @@ OSKext::initialize(void) } PE_parse_boot_argn("keepsyms", &sKeepSymbols, sizeof(sKeepSymbols)); +#if CONFIG_DTRACE + if (dtrace_keep_kernel_symbols()) + sKeepSymbols = true; +#endif /* CONFIG_DTRACE */ #if KASAN_DYNAMIC_BLACKLIST /* needed for function lookup */ sKeepSymbols = true; @@ -717,6 +757,7 @@ OSKext::initialize(void) sKernelKext->version = OSKextParseVersionString(osrelease); sKernelKext->compatibleVersion = sKernelKext->version; sKernelKext->linkedExecutable = kernelExecutable; + sKernelKext->interfaceUUID = sKernelKext->copyUUID(); sKernelKext->flags.hasAllDependencies = 1; sKernelKext->flags.kernelComponent = 1; @@ -783,6 +824,27 @@ OSKext::initialize(void) OSSafeReleaseNULL(kernelCPUType); OSSafeReleaseNULL(kernelCPUSubtype); + gBuiltinKmodsSectionInfo = getsectbyname(kPrelinkInfoSegment, kBuiltinInfoSection); + if (gBuiltinKmodsSectionInfo) { + uint32_t count; + + assert(gBuiltinKmodsSectionInfo->addr); + assert(gBuiltinKmodsSectionInfo->size); + gBuiltinKmodsCount = (gBuiltinKmodsSectionInfo->size / sizeof(kmod_info_t *)); + + gBuiltinKmodsSectionStart = getsectbyname(kPrelinkInfoSegment, kBuiltinStartSection); + assert(gBuiltinKmodsSectionStart); + assert(gBuiltinKmodsSectionStart->addr); + assert(gBuiltinKmodsSectionStart->size); + count = (gBuiltinKmodsSectionStart->size / sizeof(uintptr_t)); + // one extra pointer for the end of last kmod + assert(count == (gBuiltinKmodsCount + 1)); + + vm_kernel_builtinkmod_text = ((uintptr_t *)gBuiltinKmodsSectionStart->addr)[0]; + vm_kernel_builtinkmod_text_end = 
((uintptr_t *)gBuiltinKmodsSectionStart->addr)[count - 1]; + } + gIOSurfaceIdentifier = OSSymbol::withCStringNoCopy("com.apple.iokit.IOSurface"); + timestamp = __OSAbsoluteTimePtr(&last_loaded_timestamp); *timestamp = 0; timestamp = __OSAbsoluteTimePtr(&last_unloaded_timestamp); @@ -801,8 +863,8 @@ OSKext::initialize(void) } /********************************************************************* -* This could be in OSKextLib.cpp but we need to hold a lock -* while removing all the segments and sKextLock will do. +* This is expected to be called exactly once, from exactly one thread +* context, during kernel bootstrap. *********************************************************************/ /* static */ OSReturn @@ -810,8 +872,6 @@ OSKext::removeKextBootstrap(void) { OSReturn result = kOSReturnError; - static bool alreadyDone = false; - const char * dt_kernel_header_name = "Kernel-__HEADER"; const char * dt_kernel_symtab_name = "Kernel-__SYMTAB"; kernel_mach_header_t * dt_mach_header = NULL; @@ -828,17 +888,6 @@ OSKext::removeKextBootstrap(void) int segment_size = 0; #endif - /* This must be the very first thing done by this function. - */ - IORecursiveLockLock(sKextLock); - - /* If we already did this, it's a success. - */ - if (alreadyDone) { - result = kOSReturnSuccess; - goto finish; - } - OSKextLog(/* kext */ NULL, kOSKextLogProgressLevel | kOSKextLogGeneralFlag, @@ -870,7 +919,6 @@ OSKext::removeKextBootstrap(void) } #if __arm__ || __arm64__ -#if !(defined(KERNEL_INTEGRITY_KTRR)) /* Free the memory that was set up by bootx. */ dt_segment_name = "Kernel-__KLD"; @@ -882,7 +930,6 @@ OSKext::removeKextBootstrap(void) IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress, (int)segment_size); } -#endif /* !(defined(KERNEL_INTEGRITY_KTRR)) */ #elif __i386__ || __x86_64__ /* On x86, use the mapping data from the segment load command to * unload KLD directly. 
@@ -943,7 +990,7 @@ OSKext::removeKextBootstrap(void) kOSKextLogErrorLevel | kOSKextLogGeneralFlag | kOSKextLogArchiveFlag, "Can't copy __LINKEDIT segment for VM reassign."); - goto finish; + return result; } seg_copy_offset = (vm_map_offset_t) seg_copy; @@ -978,7 +1025,7 @@ OSKext::removeKextBootstrap(void) kOSKextLogGeneralFlag | kOSKextLogArchiveFlag, "Can't create __LINKEDIT VM entry at %p, length 0x%llx (error 0x%x).", seg_data, seg_length, mem_result); - goto finish; + return result; } /* And copy it back. @@ -1018,15 +1065,8 @@ OSKext::removeKextBootstrap(void) seg_to_remove = NULL; - alreadyDone = true; result = kOSReturnSuccess; -finish: - - /* This must be the very last thing done before returning. - */ - IORecursiveLockUnlock(sKextLock); - return result; } @@ -1503,12 +1543,12 @@ OSKext::initWithPrelinkedInfoDict( goto finish; } - data = (void *) ((intptr_t) (addressNum->unsigned64BitValue()) + vm_kernel_slide); + data = (void *) ml_static_slide((intptr_t) (addressNum->unsigned64BitValue())); length = (uint32_t) (lengthNum->unsigned32BitValue()); #if KASLR_KEXT_DEBUG IOLog("kaslr: unslid 0x%lx slid 0x%lx length %u - prelink executable \n", - (unsigned long)VM_KERNEL_UNSLIDE(data), + (unsigned long)ml_static_unslide(data), (unsigned long)data, length); #endif @@ -1521,11 +1561,11 @@ OSKext::initWithPrelinkedInfoDict( */ addressNum = OSDynamicCast(OSNumber, anInfoDict->getObject(kPrelinkExecutableSourceKey)); if (addressNum) { - srcData = (void *) ((intptr_t) (addressNum->unsigned64BitValue()) + vm_kernel_slide); + srcData = (void *) ml_static_slide((intptr_t) (addressNum->unsigned64BitValue())); #if KASLR_KEXT_DEBUG IOLog("kaslr: unslid 0x%lx slid 0x%lx - prelink executable source \n", - (unsigned long)VM_KERNEL_UNSLIDE(srcData), + (unsigned long)ml_static_unslide(srcData), (unsigned long)srcData); #endif @@ -1583,14 +1623,14 @@ OSKext::initWithPrelinkedInfoDict( } if (addressNum->unsigned64BitValue() != 0) { - kmod_info = (kmod_info_t *) (intptr_t) 
(addressNum->unsigned64BitValue() + vm_kernel_slide); - kmod_info->address += vm_kernel_slide; + kmod_info = (kmod_info_t *) ml_static_slide((intptr_t) (addressNum->unsigned64BitValue())); + kmod_info->address = ml_static_slide(kmod_info->address); #if KASLR_KEXT_DEBUG IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info \n", - (unsigned long)VM_KERNEL_UNSLIDE(kmod_info), + (unsigned long)ml_static_unslide(kmod_info), (unsigned long)kmod_info); IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info->address \n", - (unsigned long)VM_KERNEL_UNSLIDE(kmod_info->address), + (unsigned long)ml_static_unslide(kmod_info->address), (unsigned long)kmod_info->address); #endif } @@ -1598,6 +1638,23 @@ OSKext::initWithPrelinkedInfoDict( anInfoDict->removeObject(kPrelinkKmodInfoKey); } + if ((addressNum = OSDynamicCast(OSNumber, anInfoDict->getObject("ModuleIndex")))) + { + uintptr_t builtinTextStart; + uintptr_t builtinTextEnd; + + flags.builtin = true; + builtinKmodIdx = addressNum->unsigned32BitValue(); + assert(builtinKmodIdx < gBuiltinKmodsCount); + + builtinTextStart = ((uintptr_t *)gBuiltinKmodsSectionStart->addr)[builtinKmodIdx]; + builtinTextEnd = ((uintptr_t *)gBuiltinKmodsSectionStart->addr)[builtinKmodIdx + 1]; + + kmod_info = ((kmod_info_t **)gBuiltinKmodsSectionInfo->addr)[builtinKmodIdx]; + kmod_info->address = builtinTextStart; + kmod_info->size = builtinTextEnd - builtinTextStart; + } + /* If the plist has a UUID for an interface, save that off. 
*/ if (isInterface()) { @@ -3345,13 +3402,17 @@ OSKext::lookupKextWithAddress(vm_address_t address) OSData * OSKext::copyKextUUIDForAddress(OSNumber *address) { - OSData *uuid = NULL; + OSData * uuid = NULL; + OSKextActiveAccount * active; + OSKext * kext = NULL; + uint32_t baseIdx; + uint32_t lim; if (!address) { return NULL; } - uintptr_t addr = (uintptr_t)address->unsigned64BitValue() + vm_kernel_slide; + uintptr_t addr = ml_static_slide((uintptr_t)address->unsigned64BitValue()); #if CONFIG_MACF /* Is the calling process allowed to query kext info? */ @@ -3373,18 +3434,36 @@ OSKext::copyKextUUIDForAddress(OSNumber *address) } #endif - if (((vm_offset_t)addr >= vm_kernel_stext) && ((vm_offset_t)addr < vm_kernel_etext)) { - /* address in xnu proper */ - unsigned long uuid_len = 0; - uuid = OSData::withBytes(getuuidfromheader(&_mh_execute_header, &uuid_len), uuid_len); - } else { - IOLockLock(sKextSummariesLock); - OSKextLoadedKextSummary *summary = OSKext::summaryForAddress(addr); - if (summary) { - uuid = OSData::withBytes(summary->uuid, sizeof(uuid_t)); - } - IOLockUnlock(sKextSummariesLock); - } + IOSimpleLockLock(sKextAccountsLock); + // bsearch sKextAccounts list + for (baseIdx = 0, lim = sKextAccountsCount; lim; lim >>= 1) + { + active = &sKextAccounts[baseIdx + (lim >> 1)]; + if ((addr >= active->address) && (addr < active->address_end)) + { + kext = active->account->kext; + if (kext) kext->retain(); + break; + } + else if (addr > active->address) + { + // move right + baseIdx += (lim >> 1) + 1; + lim--; + } + // else move left + } + IOSimpleLockUnlock(sKextAccountsLock); + + if (kext) + { + uuid = kext->copyTextUUID(); + kext->release(); + } + else if (((vm_offset_t)addr >= vm_kernel_stext) && ((vm_offset_t)addr < vm_kernel_etext)) + { + uuid = sKernelKext->copyTextUUID(); + } return uuid; } @@ -3923,6 +4002,8 @@ OSKext::getExecutable(void) OSData * extractedExecutable = NULL; // must release OSData * mkextExecutableRef = NULL; // do not release + if 
(flags.builtin) return (sKernelKext->linkedExecutable); + result = OSDynamicCast(OSData, infoDict->getObject(_kOSKextExecutableKey)); if (result) { goto finish; @@ -4124,10 +4205,7 @@ OSKext::copyUUID(void) { OSData * result = NULL; OSData * theExecutable = NULL; // do not release - const kernel_mach_header_t * header = NULL; - const struct load_command * load_cmd = NULL; - const struct uuid_command * uuid_cmd = NULL; - uint32_t i; + const kernel_mach_header_t * header; /* An interface kext doesn't have a linked executable with an LC_UUID, * we create one when it's linked. @@ -4138,6 +4216,8 @@ OSKext::copyUUID(void) goto finish; } + if (flags.builtin || isInterface()) return (sKernelKext->copyUUID()); + /* For real kexts, try to get the UUID from the linked executable, * or if is hasn't been linked yet, the unrelocated executable. */ @@ -4150,6 +4230,34 @@ OSKext::copyUUID(void) } header = (const kernel_mach_header_t *)theExecutable->getBytesNoCopy(); + result = copyMachoUUID(header); + +finish: + return result; +} + +/********************************************************************* +*********************************************************************/ +OSData * +OSKext::copyTextUUID(void) +{ + if (flags.builtin) + { + return (copyMachoUUID((const kernel_mach_header_t *)kmod_info->address)); + } + return (copyUUID()); +} + +/********************************************************************* +*********************************************************************/ +OSData * +OSKext::copyMachoUUID(const kernel_mach_header_t * header) +{ + OSData * result = NULL; + const struct load_command * load_cmd = NULL; + const struct uuid_command * uuid_cmd = NULL; + uint32_t i; + load_cmd = (const struct load_command *)&header[1]; if (header->magic != MH_MAGIC_KERNEL) { @@ -4799,7 +4907,9 @@ OSKext::load( pendingPgoHead.next = &pendingPgoHead; pendingPgoHead.prev = &pendingPgoHead; - uuid_generate(instance_uuid); + // The kernel PRNG is not initialized when the first 
kext is + // loaded, so use early random + uuid_generate_early_random(instance_uuid); account = IONew(OSKextAccount, 1); if (!account) { result = KERN_MEMORY_ERROR; @@ -4810,6 +4920,10 @@ OSKext::load( account->site.refcount = 0; account->site.flags = VM_TAG_KMOD; account->kext = this; + if (gIOSurfaceIdentifier == bundleID) { + vm_tag_alloc(&account->site); + gIOSurfaceTag = account->site.tag; + } flags.loaded = true; @@ -4964,6 +5078,8 @@ OSKext::lookupSection(const char *segname, const char *secname) kernel_segment_command_t * seg = NULL; kernel_section_t * sec = NULL; + if (!linkedExecutable) return (NULL); + mh = (kernel_mach_header_t *)linkedExecutable->getBytesNoCopy(); for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { @@ -5010,7 +5126,7 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides) int reloc_size; vm_offset_t new_kextsize; - if (linkedExecutable == NULL || vm_kernel_slide == 0) { + if (linkedExecutable == NULL || flags.builtin) { result = kOSReturnSuccess; goto finish; } @@ -5022,12 +5138,13 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides) if (!seg->vmaddr) { continue; } - seg->vmaddr += vm_kernel_slide; - + + seg->vmaddr = ml_static_slide(seg->vmaddr); + #if KASLR_KEXT_DEBUG IOLog("kaslr: segname %s unslid 0x%lx slid 0x%lx \n", seg->segname, - (unsigned long)VM_KERNEL_UNSLIDE(seg->vmaddr), + (unsigned long)ml_static_unslide(seg->vmaddr), (unsigned long)seg->vmaddr); #endif @@ -5041,12 +5158,12 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides) linkeditSeg = seg; } for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) { - sec->addr += vm_kernel_slide; + sec->addr = ml_static_slide(sec->addr); #if KASLR_KEXT_DEBUG IOLog("kaslr: sectname %s unslid 0x%lx slid 0x%lx \n", sec->sectname, - (unsigned long)VM_KERNEL_UNSLIDE(sec->addr), + (unsigned long)ml_static_unslide(sec->addr), (unsigned long)sec->addr); #endif } @@ -5065,13 +5182,13 @@ OSKext::slidePrelinkedExecutable(bool 
doCoalesedSlides) if (sym[i].n_type & N_STAB) { continue; } - sym[i].n_value += vm_kernel_slide; + sym[i].n_value = ml_static_slide(sym[i].n_value); #if KASLR_KEXT_DEBUG #define MAX_SYMS_TO_LOG 5 if ( i < MAX_SYMS_TO_LOG ) { IOLog("kaslr: LC_SYMTAB unslid 0x%lx slid 0x%lx \n", - (unsigned long)VM_KERNEL_UNSLIDE(sym[i].n_value), + (unsigned long)ml_static_unslide(sym[i].n_value), (unsigned long)sym[i].n_value); } #endif @@ -5130,13 +5247,14 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides) if (reloc[i].r_pcrel != 0) { continue; } - *((uintptr_t *)(relocBase + reloc[i].r_address)) += vm_kernel_slide; + uintptr_t *relocAddr = (uintptr_t*)(relocBase + reloc[i].r_address); + *relocAddr = ml_static_slide(*relocAddr); #if KASLR_KEXT_DEBUG #define MAX_DYSYMS_TO_LOG 5 if ( i < MAX_DYSYMS_TO_LOG ) { IOLog("kaslr: LC_DYSYMTAB unslid 0x%lx slid 0x%lx \n", - (unsigned long)VM_KERNEL_UNSLIDE(*((uintptr_t *)(relocBase + reloc[i].r_address))), + (unsigned long)ml_static_unslide(*((uintptr_t *)(relocAddr))), (unsigned long)*((uintptr_t *)(relocBase + reloc[i].r_address))); } #endif @@ -5272,7 +5390,7 @@ OSKext::loadExecutable() } /* all callers must be entitled */ - if (FALSE == IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-management")) { + if (FALSE == IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management")) { OSKextLog(this, kOSKextLogErrorLevel | kOSKextLogLoadFlag, "Not entitled to link kext '%s'", @@ -5502,7 +5620,7 @@ OSKext::loadExecutable() "Kext %s executable loaded; %u pages at 0x%lx (load tag %u).", kmod_info->name, (unsigned)kmod_info->size / PAGE_SIZE, - (unsigned long)VM_KERNEL_UNSLIDE(kmod_info->address), + (unsigned long)ml_static_unslide(kmod_info->address), (unsigned)kmod_info->id); } @@ -5513,8 +5631,10 @@ OSKext::loadExecutable() } #if KASAN - kasan_load_kext((vm_offset_t)linkedExecutable->getBytesNoCopy(), - linkedExecutable->getLength(), getIdentifierCString()); + if (linkedExecutable) { + 
kasan_load_kext((vm_offset_t)linkedExecutable->getBytesNoCopy(), + linkedExecutable->getLength(), getIdentifierCString()); + } #else if (lookupSection(KASAN_GLOBAL_SEGNAME, KASAN_GLOBAL_SECTNAME)) { OSKextLog(this, @@ -5669,6 +5789,7 @@ OSKext::jettisonDATASegmentPadding(void) vm_offset_t dataSegEnd, lastSecEnd; vm_size_t padSize; + if (flags.builtin) return; mh = (kernel_mach_header_t *)kmod_info->address; dataSeg = getsegbynamefromheader(mh, SEG_DATA); @@ -5774,6 +5895,9 @@ OSKext::registerWithDTrace(void) if (forceInit == kOSBooleanTrue) { modflag |= KMOD_DTRACE_FORCE_INIT; } + if (flags.builtin) { + modflag |= KMOD_DTRACE_STATIC_KEXT; + } (void)(*dtrace_modload)(kmod_info, modflag); flags.dtraceInitialized = true; @@ -5872,7 +5996,7 @@ OSKext::setVMAttributes(bool protect, bool wire) vm_map_offset_t end = 0; OSReturn result = kOSReturnError; - if (isInterface() || !declaresExecutable()) { + if (isInterface() || !declaresExecutable() || flags.builtin) { result = kOSReturnSuccess; goto finish; } @@ -5981,6 +6105,8 @@ OSKext::validateKextMapping(bool startFlag) mach_msg_type_number_t count; vm_region_submap_short_info_data_64_t info; + if (flags.builtin) return (kOSReturnSuccess); + count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; bzero(&info, sizeof(info)); @@ -6041,8 +6167,8 @@ OSKext::validateKextMapping(bool startFlag) getIdentifierCString(), whichOp, whichOp, - (void *)VM_KERNEL_UNSLIDE(address), - (void *)VM_KERNEL_UNSLIDE(kmod_info->address)); + (void *)ml_static_unslide(address), + (void *)ml_static_unslide(kmod_info->address)); result = kOSKextReturnBadData; goto finish; } @@ -6060,9 +6186,9 @@ OSKext::validateKextMapping(bool startFlag) getIdentifierCString(), whichOp, whichOp, - (void *)VM_KERNEL_UNSLIDE(address), - (void *)VM_KERNEL_UNSLIDE(kmod_info->address), - (void *)(VM_KERNEL_UNSLIDE(kmod_info->address) + kmod_info->size)); + (void *)ml_static_unslide(address), + (void *)ml_static_unslide(kmod_info->address), + (void 
*)(ml_static_unslide(kmod_info->address) + kmod_info->size)); result = kOSKextReturnBadData; goto finish; } @@ -6084,7 +6210,7 @@ OSKext::validateKextMapping(bool startFlag) kOSKextLogLoadFlag, "Kext %s - bad %s pointer %p.", getIdentifierCString(), - whichOp, (void *)VM_KERNEL_UNSLIDE(address)); + whichOp, (void *)ml_static_unslide(address)); result = kOSKextReturnBadData; goto finish; } @@ -6164,14 +6290,14 @@ OSKextLogKextInfo(OSKext *aKext, uint64_t address, uint64_t size, firehose_trace stamp = firehose_tracepoint_time(firehose_activity_flags_default); trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_metadata, _firehose_tracepoint_type_metadata_kext, (firehose_tracepoint_flags_t)0, code); - uuid_data = aKext->copyUUID(); + uuid_data = aKext->copyTextUUID(); if (uuid_data) { memcpy(uuid_info->ftui_uuid, uuid_data->getBytesNoCopy(), sizeof(uuid_info->ftui_uuid)); OSSafeReleaseNULL(uuid_data); } uuid_info->ftui_size = size; - uuid_info->ftui_address = VM_KERNEL_UNSLIDE(address); + uuid_info->ftui_address = ml_static_unslide(address); firehose_trace_metadata(firehose_stream_metadata, trace_id, stamp, uuid_info, uuid_info_len); return; @@ -6267,33 +6393,10 @@ OSKext::start(bool startDependenciesFlag) // Drop a log message so logd can grab the needed information to decode this kext OSKextLogKextInfo(this, kmod_info->address, kmod_info->size, firehose_tracepoint_code_load); - -#if !CONFIG_STATIC_CPPINIT - result = OSRuntimeInitializeCPP(kmod_info, NULL); + result = OSRuntimeInitializeCPP(this); if (result == KERN_SUCCESS) { -#endif - -#if CONFIG_KEC_FIPS - kmodStartData = GetAppleTEXTHashForKext(this, this->infoDict); - -#if 0 - if (kmodStartData) { - OSKextLog(this, - kOSKextLogErrorLevel | - kOSKextLogGeneralFlag, - "Kext %s calling module start function. kmodStartData %p. 
arch %s", - getIdentifierCString(), kmodStartData, ARCHNAME); - } -#endif -#endif // CONFIG_KEC_FIPS result = startfunc(kmod_info, kmodStartData); - -#if !CONFIG_STATIC_CPPINIT - if (result != KERN_SUCCESS) { - (void) OSRuntimeFinalizeCPP(kmod_info, NULL); - } } -#endif flags.starting = 0; @@ -6426,11 +6529,9 @@ OSKext::stop(void) flags.stopping = 1; result = stopfunc(kmod_info, /* userData */ NULL); -#if !CONFIG_STATIC_CPPINIT if (result == KERN_SUCCESS) { - result = OSRuntimeFinalizeCPP(kmod_info, NULL); + result = OSRuntimeFinalizeCPP(this); } -#endif flags.stopping = 0; @@ -6571,6 +6672,7 @@ OSKext::unload(void) if (metaClasses) { metaClasses->flushCollection(); } + (void) OSRuntimeFinalizeCPP(this); /* Remove the kext from the list of loaded kexts, patch the gap * in the kmod_info_t linked list, and reset "kmod" to point to the @@ -8634,14 +8736,10 @@ OSKext::copyLoadedKextInfoByUUID( OSKext *thisKext = NULL; // do not release Boolean includeThis = true; uuid_t thisKextUUID; + uuid_t thisKextTextUUID; OSData *uuid_data; uuid_string_t uuid_key; - if (kextInfo) { - kextInfo->release(); - kextInfo = NULL; - } - thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); if (!thisKext) { continue; @@ -8657,6 +8755,13 @@ OSKext::copyLoadedKextInfoByUUID( uuid_unparse(thisKextUUID, uuid_key); + uuid_data = thisKext->copyTextUUID(); + if (!uuid_data) { + continue; + } + memcpy(&thisKextTextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextTextUUID)); + OSSafeReleaseNULL(uuid_data); + /* Skip current kext if we have a list of UUIDs and * it isn't in the list. */ @@ -8670,8 +8775,15 @@ OSKext::copyLoadedKextInfoByUUID( uuid_t uuid; uuid_parse(wantedUUID->getCStringNoCopy(), uuid); - if (0 == uuid_compare(uuid, thisKextUUID)) { + if ((0 == uuid_compare(uuid, thisKextUUID)) + || (0 == uuid_compare(uuid, thisKextTextUUID))) { includeThis = true; + /* Only need to find the first kext if multiple match, + * ie. 
asking for the kernel uuid does not need to find + * interface kexts or builtin static kexts. + */ + kextIdentifiers->removeObject(idIndex); + uuid_unparse(uuid, uuid_key); break; } @@ -8685,14 +8797,17 @@ OSKext::copyLoadedKextInfoByUUID( kextInfo = thisKext->copyInfo(infoKeys); if (kextInfo) { result->setObject(uuid_key, kextInfo); + kextInfo->release(); + } + + if (kextIdentifiers && !kextIdentifiers->getCount()) { + break; } } finish: IORecursiveLockUnlock(sKextLock); - if (kextInfo) kextInfo->release(); - return result; } @@ -8705,10 +8820,8 @@ OSKext::copyLoadedKextInfo( OSArray * infoKeys) { OSDictionary * result = NULL; - OSDictionary * kextInfo = NULL; // must release - uint32_t count, i; uint32_t idCount = 0; - uint32_t idIndex = 0; + bool onlyLoaded; IORecursiveLockLock(sKextLock); @@ -8746,8 +8859,9 @@ OSKext::copyLoadedKextInfo( infoKeys = NULL; } - count = sLoadedKexts->getCount(); - result = OSDictionary::withCapacity(count); + onlyLoaded = (!infoKeys || !_OSArrayContainsCString(infoKeys, kOSBundleAllPrelinkedKey)); + + result = OSDictionary::withCapacity(128); if (!result) { goto finish; } @@ -8790,28 +8904,31 @@ OSKext::copyLoadedKextInfo( vm_slinkedit, vm_elinkedit); #endif - for (i = 0; i < count; i++) { - OSKext * thisKext = NULL; // do not release - Boolean includeThis = true; + sKextsByID->iterateObjects(^bool(const OSSymbol * thisKextID, OSObject * obj) + { + OSKext * thisKext = NULL; // do not release + Boolean includeThis = true; + OSDictionary * kextInfo = NULL; // must release - if (kextInfo) { - kextInfo->release(); - kextInfo = NULL; - } - thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i)); + thisKext = OSDynamicCast(OSKext, obj); if (!thisKext) { - continue; + return (false);; + } + + /* Skip current kext if not yet started and caller didn't request all. 
+ */ + if (onlyLoaded && (-1U == sLoadedKexts->getNextIndexOfObject(thisKext, 0))) { + return (false);; } /* Skip current kext if we have a list of bundle IDs and * it isn't in the list. */ if (kextIdentifiers) { - const OSString * thisKextID = thisKext->getIdentifier(); includeThis = false; - for (idIndex = 0; idIndex < idCount; idIndex++) { + for (uint32_t idIndex = 0; idIndex < idCount; idIndex++) { const OSString * thisRequestID = OSDynamicCast(OSString, kextIdentifiers->getObject(idIndex)); if (thisKextID->isEqualTo(thisRequestID)) { @@ -8822,20 +8939,20 @@ OSKext::copyLoadedKextInfo( } if (!includeThis) { - continue; + return (false); } kextInfo = thisKext->copyInfo(infoKeys); if (kextInfo) { result->setObject(thisKext->getIdentifier(), kextInfo); + kextInfo->release(); } - } + return (false); + }); finish: IORecursiveLockUnlock(sKextLock); - if (kextInfo) kextInfo->release(); - return result; } @@ -8951,10 +9068,10 @@ OSKext::copyInfo(OSArray * infoKeys) __FUNCTION__, segp->vmaddr, vm_kext_base, vm_kext_top); } #endif - segp->vmaddr = VM_KERNEL_UNSLIDE(segp->vmaddr); + segp->vmaddr = ml_static_unslide(segp->vmaddr); for (secp = firstsect(segp); secp != NULL; secp = nextsect(segp, secp)) { - secp->addr = VM_KERNEL_UNSLIDE(secp->addr); + secp->addr = ml_static_unslide(secp->addr); } } lcp = (struct load_command *)((caddr_t)lcp + lcp->cmdsize); @@ -9096,6 +9213,8 @@ OSKext::copyInfo(OSArray * infoKeys) } result->setObject(kOSBundleExecutablePathKey, executablePathString); + } else if (flags.builtin) { + result->setObject(kOSBundleExecutablePathKey, bundleID); } } @@ -9105,6 +9224,13 @@ OSKext::copyInfo(OSArray * infoKeys) uuid = copyUUID(); if (uuid) { result->setObject(kOSBundleUUIDKey, uuid); + uuid->release(); + } + } + if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleTextUUIDKey)) { + uuid = copyTextUUID(); + if (uuid) { + result->setObject(kOSBundleTextUUIDKey, uuid); uuid->release(); } } @@ -9152,7 +9278,7 @@ OSKext::copyInfo(OSArray * 
infoKeys) _OSArrayContainsCString(infoKeys, kOSBundleExecLoadSizeKey) || _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey)) { - if (isInterface() || linkedExecutable) { + if (isInterface() || flags.builtin || linkedExecutable) { /* These go to userspace via serialization, so we don't want any doubts * about their size. */ @@ -9168,21 +9294,27 @@ OSKext::copyInfo(OSArray * infoKeys) * xxx - leaving in # when we have a linkedExecutable...a kernelcomp * xxx - shouldn't have one! */ - if (linkedExecutable /* && !isInterface() */) { + + if (flags.builtin || linkedExecutable) { kernel_mach_header_t *mh = NULL; kernel_segment_command_t *seg = NULL; - loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy(); + if (flags.builtin) { + loadAddress = kmod_info->address; + loadSize = kmod_info->size; + } else { + loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy(); + loadSize = linkedExecutable->getLength(); + } mh = (kernel_mach_header_t *)loadAddress; - loadAddress = VM_KERNEL_UNSLIDE(loadAddress); - loadSize = linkedExecutable->getLength(); + loadAddress = ml_static_unslide(loadAddress); /* Walk through the kext, looking for the first executable * segment in case we were asked for its size/address. 
*/ for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) { if (seg->initprot & VM_PROT_EXECUTE) { - execLoadAddress = VM_KERNEL_UNSLIDE(seg->vmaddr); + execLoadAddress = ml_static_unslide(seg->vmaddr); execLoadSize = seg->vmsize; break; } @@ -9208,6 +9340,23 @@ OSKext::copyInfo(OSArray * infoKeys) result->setObject(kOSBundleLoadAddressKey, scratchNumber); OSSafeReleaseNULL(scratchNumber); } +#if CONFIG_EMBEDDED + if ((!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleCacheLoadAddressKey)) + && loadAddress && loadSize) { + scratchNumber = OSNumber::withNumber( + (unsigned long long)ml_static_unslide((uintptr_t)segLOWESTTEXT), + /* numBits */ 8 * sizeof(loadAddress)); + if (!scratchNumber) { + goto finish; + } + result->setObject(kOSBundleCacheLoadAddressKey, scratchNumber); + OSSafeReleaseNULL(scratchNumber); + } + if ((!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleKextsInKernelTextKey)) + && (this == sKernelKext) && gBuiltinKmodsCount) { + result->setObject(kOSBundleKextsInKernelTextKey, kOSBooleanTrue); + } +#endif /* CONFIG_EMBEDDED */ if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleExecLoadAddressKey)) { scratchNumber = OSNumber::withNumber( (unsigned long long)(execLoadAddress), @@ -9358,7 +9507,6 @@ OSKext::copyInfo(OSArray * infoKeys) OSSafeReleaseNULL(cpuSubtypeNumber); OSSafeReleaseNULL(executablePathString); if (executablePathCString) kfree(executablePathCString, executablePathCStringSize); - OSSafeReleaseNULL(uuid); OSSafeReleaseNULL(scratchNumber); OSSafeReleaseNULL(dependencyLoadTags); OSSafeReleaseNULL(metaClassIterator); @@ -10913,10 +11061,10 @@ bool ScanForAddrInObject(OSObject * theObject, /* static */ void OSKext::printKextsInBacktrace( - vm_offset_t * addr, - unsigned int cnt, - int (* printf_func)(const char *fmt, ...), - uint32_t flags) + vm_offset_t * addr __unused, + unsigned int cnt __unused, + int (* printf_func)(const char *fmt, ...) 
__unused, + uint32_t flags __unused) { addr64_t summary_page = 0; addr64_t last_summary_page = 0; @@ -11038,24 +11186,46 @@ OSKext::summaryForAddress(const uintptr_t addr) /* static */ void * -OSKext::kextForAddress(const void *addr) +OSKext::kextForAddress(const void *address) { - void *image = NULL; + void * image = NULL; + OSKextActiveAccount * active; + OSKext * kext = NULL; + uint32_t baseIdx; + uint32_t lim; + uintptr_t addr = (uintptr_t) address; - if (((vm_offset_t)(uintptr_t)addr >= vm_kernel_stext) && - ((vm_offset_t)(uintptr_t)addr < vm_kernel_etext)) { - return (void *)&_mh_execute_header; - } - - if (!sKextSummariesLock) { + if (!addr) { return NULL; } - IOLockLock(sKextSummariesLock); - OSKextLoadedKextSummary *summary = OSKext::summaryForAddress((uintptr_t)addr); - if (summary) { - image = (void *)summary->address; - } - IOLockUnlock(sKextSummariesLock); + + if (sKextAccountsCount) + { + IOSimpleLockLock(sKextAccountsLock); + // bsearch sKextAccounts list + for (baseIdx = 0, lim = sKextAccountsCount; lim; lim >>= 1) + { + active = &sKextAccounts[baseIdx + (lim >> 1)]; + if ((addr >= active->address) && (addr < active->address_end)) + { + kext = active->account->kext; + if (kext && kext->kmod_info) image = (void *) kext->kmod_info->address; + break; + } + else if (addr > active->address) + { + // move right + baseIdx += (lim >> 1) + 1; + lim--; + } + // else move left + } + IOSimpleLockUnlock(sKextAccountsLock); + } + if (!image && (addr >= vm_kernel_stext) && (addr < vm_kernel_etext)) + { + image = (void *) &_mh_execute_header; + } return image; } @@ -11108,7 +11278,7 @@ void OSKext::printSummary( (void) uuid_unparse(summary->uuid, uuid); if (kPrintKextsUnslide & flags) { - tmpAddr = VM_KERNEL_UNSLIDE(summary->address); + tmpAddr = ml_static_unslide(summary->address); } else { tmpAddr = summary->address; @@ -11129,7 +11299,7 @@ void OSKext::printSummary( if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)kmod_ref)) == 0) { (*printf_func)(" kmod 
dependency scan stopped " "due to missing dependency page: %p\n", - (kPrintKextsUnslide & flags) ? (void *)VM_KERNEL_UNSLIDE(kmod_ref) : kmod_ref); + (kPrintKextsUnslide & flags) ? (void *)ml_static_unslide((vm_offset_t)kmod_ref) : kmod_ref); break; } rinfo = kmod_ref->info; @@ -11137,7 +11307,7 @@ void OSKext::printSummary( if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)rinfo)) == 0) { (*printf_func)(" kmod dependency scan stopped " "due to missing kmod page: %p\n", - (kPrintKextsUnslide & flags) ? (void *)VM_KERNEL_UNSLIDE(rinfo) : rinfo); + (kPrintKextsUnslide & flags) ? (void *)ml_static_unslide((vm_offset_t)rinfo) : rinfo); break; } @@ -11149,7 +11319,7 @@ void OSKext::printSummary( findSummaryUUID(rinfo->id, uuid); if (kPrintKextsUnslide & flags) { - tmpAddr = VM_KERNEL_UNSLIDE(rinfo->address); + tmpAddr = ml_static_unslide(rinfo->address); } else { tmpAddr = rinfo->address; @@ -11718,8 +11888,16 @@ OSKext::updateLoadedKextSummary(OSKextLoadedKextSummary *summary) OSSafeReleaseNULL(uuid); } - summary->address = kmod_info->address; - summary->size = kmod_info->size; + if (flags.builtin) { +// this value will stop lldb from parsing the mach-o header +// summary->address = UINT64_MAX; +// summary->size = 0; + summary->address = kmod_info->address; + summary->size = kmod_info->size; + } else { + summary->address = kmod_info->address; + summary->size = kmod_info->size; + } summary->version = getVersion(); summary->loadTag = kmod_info->id; summary->flags = 0; @@ -11737,8 +11915,9 @@ OSKext::updateActiveAccount(OSKextActiveAccount *accountp) kernel_mach_header_t *hdr = NULL; kernel_segment_command_t *seg = NULL; - hdr = (kernel_mach_header_t *)kmod_info->address; + bzero(accountp, sizeof(*accountp)); + hdr = (kernel_mach_header_t *)kmod_info->address; if (getcommandfromheader(hdr, LC_SEGMENT_SPLIT_INFO)) { /* If this kext supports split segments, use the first * executable segment as the range for instructions @@ -11750,8 +11929,6 @@ 
OSKext::updateActiveAccount(OSKextActiveAccount *accountp) } } } - - bzero(accountp, sizeof(*accountp)); if (seg) { accountp->address = seg->vmaddr; if (accountp->address) { @@ -11768,6 +11945,7 @@ OSKext::updateActiveAccount(OSKextActiveAccount *accountp) accountp->address_end = kmod_info->address + kmod_info->size; } } + accountp->account = this->account; } @@ -11794,8 +11972,8 @@ OSKextGetAllocationSiteForCaller(uintptr_t address) if (!site->tag) vm_tag_alloc_locked(site, &releasesite); break; } - else if (address > active->address) - { + else if (address > active->address) + { // move right baseIdx += (lim >> 1) + 1; lim--; @@ -11833,75 +12011,6 @@ OSKextFreeSite(vm_allocation_site_t * site) /********************************************************************* *********************************************************************/ - -#if CONFIG_KEC_FIPS - -#if PRAGMA_MARK -#pragma mark Kernel External Components for FIPS compliance -#endif - -/********************************************************************* - * Kernel External Components for FIPS compliance (KEC_FIPS) - *********************************************************************/ -static void * -GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDict) -{ - AppleTEXTHash_t my_ath = {2, 0, NULL}; - AppleTEXTHash_t * my_athp = NULL; // do not release - OSData * segmentHash = NULL; // do not release - - if (theKext == NULL || theInfoDict == NULL) { - return(NULL); - } - - // Get the part of the plist associate with kAppleTextHashesKey and let - // the crypto library do further parsing (slice/architecture) - segmentHash = OSDynamicCast(OSData, theInfoDict->getObject(kAppleTextHashesKey)); - // Support for ATH v1 while rolling out ATH v2 without revision locking submissions - // Remove this when v2 PLIST are supported - if (segmentHash == NULL) { - // If this fails, we may be dealing with a v1 PLIST - OSDictionary * textHashDict = NULL; // do not release - textHashDict = 
OSDynamicCast(OSDictionary, theInfoDict->getObject(kAppleTextHashesKey)); - if (textHashDict == NULL) { - return(NULL); - } - my_ath.ath_version=1; - segmentHash = OSDynamicCast(OSData,textHashDict->getObject(ARCHNAME)); - } // end of v2 rollout - - if (segmentHash == NULL) { - return(NULL); - } - - // KEC_FIPS type kexts never unload so we don't have to clean up our - // AppleTEXTHash_t - if (kmem_alloc(kernel_map, (vm_offset_t *) &my_athp, - sizeof(AppleTEXTHash_t), VM_KERN_MEMORY_OSKEXT) != KERN_SUCCESS) { - return(NULL); - } - - memcpy(my_athp, &my_ath, sizeof(my_ath)); - my_athp->ath_length = segmentHash->getLength(); - if (my_athp->ath_length > 0) { - my_athp->ath_hash = (void *)segmentHash->getBytesNoCopy(); - } - -#if 0 - OSKextLog(theKext, - kOSKextLogErrorLevel | - kOSKextLogGeneralFlag, - "Kext %s ath_version %d ath_length %d ath_hash %p", - theKext->getIdentifierCString(), - my_athp->ath_version, - my_athp->ath_length, - my_athp->ath_hash); -#endif - - return( (void *) my_athp ); -} - -#endif // CONFIG_KEC_FIPS #if CONFIG_IMAGEBOOT int OSKextGetUUIDForName(const char *name, uuid_t uuid) diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index 71b316c8c..f7c0594d8 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -148,6 +148,66 @@ void OSMetaClassBase::_RESERVEDOSMetaClassBase6() { panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 6); } #endif + +/********************************************************************* +*********************************************************************/ + +#if defined(__arm__) || defined(__arm64__) + + + +/* +IHI0059A "C++ Application Binary Interface Standard for the ARM 64 - bit Architecture": + +3.2.1 Representation of pointer to member function The generic C++ ABI [GC++ABI] +specifies that a pointer to member function is a pair of words <ptr, adj>.
The +least significant bit of ptr discriminates between (0) the address of a non- +virtual member function and (1) the offset in the class's virtual table of the +address of a virtual function. This encoding cannot work for the AArch64 +instruction set where the architecture reserves all bits of code addresses. This +ABI specifies that adj contains twice the this adjustment, plus 1 if the member +function is virtual. The least significant bit of adj then makes exactly the +same discrimination as the least significant bit of ptr does for Itanium. A +pointer to member function is NULL when ptr = 0 and the least significant bit of +adj is zero. +*/ + +OSMetaClassBase::_ptf_t +OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) +{ + typedef long int ptrdiff_t; + struct ptmf_t { + _ptf_t fPFN; + ptrdiff_t delta; + }; + union { + void (OSMetaClassBase::*fIn)(void); + struct ptmf_t pTMF; + } map; + _ptf_t pfn; + + map.fIn = func; + pfn = map.pTMF.fPFN; + + if (map.pTMF.delta & 1) { + // virtual + union { + const OSMetaClassBase *fObj; + _ptf_t **vtablep; + } u; + u.fObj = self; + + // Virtual member function so dereference table + pfn = *(_ptf_t *)(((uintptr_t)*u.vtablep) + (uintptr_t)pfn); + return pfn; + + } else { + // Not virtual, i.e. 
plain member func + return pfn; + } +} + +#endif /* defined(__arm__) || defined(__arm64__) */ /********************************************************************* * These used to be inline in the header but gcc didn't believe us * Now we MUST pull the inline out at least until the compiler is diff --git a/libkern/c++/OSRuntime.cpp b/libkern/c++/OSRuntime.cpp index e6cf48ba7..d0a09ca06 100644 --- a/libkern/c++/OSRuntime.cpp +++ b/libkern/c++/OSRuntime.cpp @@ -42,6 +42,7 @@ __BEGIN_DECLS #include #include #include +#include #include #if PRAGMA_MARK @@ -169,160 +170,189 @@ kern_os_realloc( } #if PRAGMA_MARK -#pragma mark C++ Runtime Load/Unload +#pragma mark Libkern Init #endif /* PRAGMA_MARK */ /********************************************************************* -* kern_os C++ Runtime Load/Unload +* Libkern Init *********************************************************************/ -/********************************************************************* -*********************************************************************/ #if __GNUC__ >= 3 void __cxa_pure_virtual( void ) { panic("%s", __FUNCTION__); } #else void __pure_virtual( void ) { panic("%s", __FUNCTION__); } #endif -typedef void (*structor_t)(void); - -/********************************************************************* -*********************************************************************/ -static boolean_t -sectionIsDestructor(kernel_section_t * section) -{ - boolean_t result; - - result = !strncmp(section->sectname, SECT_MODTERMFUNC, - sizeof(SECT_MODTERMFUNC) - 1); -#if !__LP64__ - result = result || !strncmp(section->sectname, SECT_DESTRUCTOR, - sizeof(SECT_DESTRUCTOR) - 1); -#endif - - return result; -} - -/********************************************************************* -*********************************************************************/ -static boolean_t -sectionIsConstructor(kernel_section_t * section) -{ - boolean_t result; - - result = !strncmp(section->sectname, SECT_MODINITFUNC, - 
sizeof(SECT_MODINITFUNC) - 1); -#if !__LP64__ - result = result || !strncmp(section->sectname, SECT_CONSTRUCTOR, - sizeof(SECT_CONSTRUCTOR) - 1); -#endif - - return result; -} +extern lck_grp_t * IOLockGroup; +extern kmod_info_t g_kernel_kmod_info; +enum { + kOSSectionNamesDefault = 0, + kOSSectionNamesBuiltinKext = 1, + kOSSectionNamesCount = 2, +}; +enum { + kOSSectionNameInitializer = 0, + kOSSectionNameFinalizer = 1, + kOSSectionNameCount = 2 +}; -/********************************************************************* -* OSRuntimeUnloadCPPForSegment() -* -* Given a pointer to a mach object segment, iterate the segment to -* obtain a destructor section for C++ objects, and call each of the -* destructors there. -*********************************************************************/ +static const char * +gOSStructorSectionNames[kOSSectionNamesCount][kOSSectionNameCount] = { + { SECT_MODINITFUNC, SECT_MODTERMFUNC }, + { kBuiltinInitSection, kBuiltinTermSection } +}; -void -OSRuntimeUnloadCPPForSegmentInKmod( - kernel_segment_command_t * segment, - kmod_info_t * kmodInfo) +void OSlibkernInit(void) { + // This must be called before calling OSRuntimeInitializeCPP. 
+ OSMetaClassBase::initialize(); - kernel_section_t * section = NULL; // do not free - OSKext * theKext = NULL; // must release - - if (gKernelCPPInitialized && kmodInfo) { - theKext = OSKext::lookupKextWithIdentifier(kmodInfo->name); + g_kernel_kmod_info.address = (vm_address_t) &_mh_execute_header; + if (kOSReturnSuccess != OSRuntimeInitializeCPP(NULL)) { + // &g_kernel_kmod_info, gOSSectionNamesStandard, 0, 0)) { + panic("OSRuntime: C++ runtime failed to initialize."); } - for (section = firstsect(segment); - section != 0; - section = nextsect(segment, section)) { - - if (sectionIsDestructor(section)) { - structor_t * destructors = (structor_t *)section->addr; - - if (destructors) { - int num_destructors = section->size / sizeof(structor_t); - int hit_null_destructor = 0; - - for (int i = 0; i < num_destructors; i++) { - if (destructors[i]) { - (*destructors[i])(); - } else if (!hit_null_destructor) { - hit_null_destructor = 1; - OSRuntimeLog(theKext, kOSRuntimeLogSpec, - "Null destructor in kext %s segment %s!", - kmodInfo ? kmodInfo->name : "(unknown)", - section->segname); - } - } - } /* if (destructors) */ - } /* if (strncmp...) */ - } /* for (section...) 
*/ + gKernelCPPInitialized = true; - OSSafeReleaseNULL(theKext); return; } -void -OSRuntimeUnloadCPPForSegment(kernel_segment_command_t * segment) { - OSRuntimeUnloadCPPForSegmentInKmod(segment, NULL); -} +__END_DECLS +#if PRAGMA_MARK +#pragma mark C++ Runtime Load/Unload +#endif /* PRAGMA_MARK */ /********************************************************************* +* kern_os C++ Runtime Load/Unload *********************************************************************/ -void -OSRuntimeUnloadCPP( - kmod_info_t * kmodInfo, - void * data __unused) -{ - if (kmodInfo && kmodInfo->address) { - - kernel_segment_command_t * segment; - kernel_mach_header_t * header; - OSSymbol::checkForPageUnload((void *)kmodInfo->address, - (void *)(kmodInfo->address + kmodInfo->size)); - header = (kernel_mach_header_t *)kmodInfo->address; - segment = firstsegfromheader(header); +typedef void (*structor_t)(void); - for (segment = firstsegfromheader(header); - segment != 0; - segment = nextsegfromheader(header, segment)) { +static bool +OSRuntimeCallStructorsInSection( + OSKext * theKext, + kmod_info_t * kmodInfo, + void * metaHandle, + kernel_segment_command_t * segment, + const char * sectionName, + uintptr_t textStart, + uintptr_t textEnd) +{ + kernel_section_t * section; + bool result = TRUE; - OSRuntimeUnloadCPPForSegmentInKmod(segment, kmodInfo); + for (section = firstsect(segment); + section != NULL; + section = nextsect(segment, section)) + { + if (strncmp(section->sectname, sectionName, sizeof(section->sectname) - 1)) continue; + + structor_t * structors = (structor_t *)section->addr; + if (!structors) continue; + + structor_t structor; + unsigned int num_structors = section->size / sizeof(structor_t); + unsigned int hit_null_structor = 0; + unsigned int firstIndex = 0; + + if (textStart) + { + // bsearch for any in range + unsigned int baseIdx; + unsigned int lim; + uintptr_t value; + firstIndex = num_structors; + for (lim = num_structors, baseIdx = 0; lim; lim >>= 1) + { + 
value = (uintptr_t) structors[baseIdx + (lim >> 1)]; + if (!value) panic("%s: null structor", kmodInfo->name); + if ((value >= textStart) && (value < textEnd)) + { + firstIndex = (baseIdx + (lim >> 1)); + // scan back for the first in range + for (; firstIndex; firstIndex--) + { + value = (uintptr_t) structors[firstIndex - 1]; + if ((value < textStart) || (value >= textEnd)) break; + } + break; + } + if (textStart > value) + { + // move right + baseIdx += (lim >> 1) + 1; + lim--; + } + // else move left + } + baseIdx = (baseIdx + (lim >> 1)); } - } - - return; + for (; + (firstIndex < num_structors) + && (!metaHandle || OSMetaClass::checkModLoad(metaHandle)); + firstIndex++) + { + if ((structor = structors[firstIndex])) + { + if ((textStart && ((uintptr_t) structor < textStart)) + || (textEnd && ((uintptr_t) structor >= textEnd))) break; + + (*structor)(); + } + else if (!hit_null_structor) + { + hit_null_structor = 1; + OSRuntimeLog(theKext, kOSRuntimeLogSpec, + "Null structor in kext %s segment %s!", + kmodInfo->name, section->segname); + } + } + if (metaHandle) result = OSMetaClass::checkModLoad(metaHandle); + break; + } /* for (section...) 
*/ + return (result); } /********************************************************************* *********************************************************************/ kern_return_t OSRuntimeFinalizeCPP( - kmod_info_t * kmodInfo, - void * data __unused) + OSKext * theKext) { - kern_return_t result = KMOD_RETURN_FAILURE; - void * metaHandle = NULL; // do not free - OSKext * theKext = NULL; // must release - - if (gKernelCPPInitialized) { - theKext = OSKext::lookupKextWithIdentifier(kmodInfo->name); - } - - if (theKext && !theKext->isCPPInitialized()) { - result = KMOD_RETURN_SUCCESS; - goto finish; + kern_return_t result = KMOD_RETURN_FAILURE; + void * metaHandle = NULL; // do not free + kernel_mach_header_t * header; + kernel_segment_command_t * segment; + kmod_info_t * kmodInfo; + const char ** sectionNames; + uintptr_t textStart; + uintptr_t textEnd; + + textStart = 0; + textEnd = 0; + sectionNames = gOSStructorSectionNames[kOSSectionNamesDefault]; + if (theKext) { + if (!theKext->isCPPInitialized()) { + result = KMOD_RETURN_SUCCESS; + goto finish; + } + kmodInfo = theKext->kmod_info; + if (!kmodInfo || !kmodInfo->address) { + result = kOSKextReturnInvalidArgument; + goto finish; + } + header = (kernel_mach_header_t *)kmodInfo->address; + if (theKext->flags.builtin) { + header = (kernel_mach_header_t *)g_kernel_kmod_info.address; + textStart = kmodInfo->address; + textEnd = textStart + kmodInfo->size; + sectionNames = gOSStructorSectionNames[kOSSectionNamesBuiltinKext]; + } + } else { + kmodInfo = &g_kernel_kmod_info; + header = (kernel_mach_header_t *)kmodInfo->address; } /* OSKext checks for this condition now, but somebody might call @@ -344,7 +374,21 @@ OSRuntimeFinalizeCPP( * return a failure (it only does actual work on the init path anyhow). 
*/ metaHandle = OSMetaClass::preModLoad(kmodInfo->name); - OSRuntimeUnloadCPP(kmodInfo, 0); + + OSSymbol::checkForPageUnload((void *)kmodInfo->address, + (void *)(kmodInfo->address + kmodInfo->size)); + + header = (kernel_mach_header_t *)kmodInfo->address; + segment = firstsegfromheader(header); + + for (segment = firstsegfromheader(header); + segment != 0; + segment = nextsegfromheader(header, segment)) { + + OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment, + sectionNames[kOSSectionNameFinalizer], textStart, textEnd); + } + (void)OSMetaClass::postModLoad(metaHandle); if (theKext) { @@ -352,43 +396,53 @@ OSRuntimeFinalizeCPP( } result = KMOD_RETURN_SUCCESS; finish: - OSSafeReleaseNULL(theKext); return result; } -// Functions used by the extenTools/kmod library project - /********************************************************************* *********************************************************************/ kern_return_t OSRuntimeInitializeCPP( - kmod_info_t * kmodInfo, - void * data __unused) + OSKext * theKext) { kern_return_t result = KMOD_RETURN_FAILURE; - OSKext * theKext = NULL; // must release kernel_mach_header_t * header = NULL; void * metaHandle = NULL; // do not free bool load_success = true; kernel_segment_command_t * segment = NULL; // do not free kernel_segment_command_t * failure_segment = NULL; // do not free + kmod_info_t * kmodInfo; + const char ** sectionNames; + uintptr_t textStart; + uintptr_t textEnd; + + textStart = 0; + textEnd = 0; + sectionNames = gOSStructorSectionNames[kOSSectionNamesDefault]; + if (theKext) { + if (theKext->isCPPInitialized()) { + result = KMOD_RETURN_SUCCESS; + goto finish; + } - if (!kmodInfo || !kmodInfo->address) { - result = kOSKextReturnInvalidArgument; - goto finish; - } - - if (gKernelCPPInitialized) { - theKext = OSKext::lookupKextWithIdentifier(kmodInfo->name); - } + kmodInfo = theKext->kmod_info; + if (!kmodInfo || !kmodInfo->address) { + result = kOSKextReturnInvalidArgument; + goto 
finish; + } + header = (kernel_mach_header_t *)kmodInfo->address; - if (theKext && theKext->isCPPInitialized()) { - result = KMOD_RETURN_SUCCESS; - goto finish; + if (theKext->flags.builtin) { + header = (kernel_mach_header_t *)g_kernel_kmod_info.address; + textStart = kmodInfo->address; + textEnd = textStart + kmodInfo->size; + sectionNames = gOSStructorSectionNames[kOSSectionNamesBuiltinKext]; + } + } else { + kmodInfo = &g_kernel_kmod_info; + header = (kernel_mach_header_t *)kmodInfo->address; } - header = (kernel_mach_header_t *)kmodInfo->address; - /* Tell the meta class system that we are starting the load */ metaHandle = OSMetaClass::preModLoad(kmodInfo->name); @@ -404,45 +458,15 @@ OSRuntimeInitializeCPP( */ for (segment = firstsegfromheader(header); segment != NULL && load_success; - segment = nextsegfromheader(header, segment)) { - - kernel_section_t * section; - + segment = nextsegfromheader(header, segment)) + { /* Record the current segment in the event of a failure. */ failure_segment = segment; - - for (section = firstsect(segment); - section != NULL; - section = nextsect(segment, section)) { - - if (sectionIsConstructor(section)) { - structor_t * constructors = (structor_t *)section->addr; - - if (constructors) { - int num_constructors = section->size / sizeof(structor_t); - int hit_null_constructor = 0; - - for (int i = 0; - i < num_constructors && - OSMetaClass::checkModLoad(metaHandle); - i++) { - - if (constructors[i]) { - (*constructors[i])(); - } else if (!hit_null_constructor) { - hit_null_constructor = 1; - OSRuntimeLog(theKext, kOSRuntimeLogSpec, - "Null constructor in kext %s segment %s!", - kmodInfo->name, section->segname); - } - } - load_success = OSMetaClass::checkModLoad(metaHandle); - - break; - } /* if (constructors) */ - } /* if (strncmp...) */ - } /* for (section...) 
*/ + load_success = OSRuntimeCallStructorsInSection( + theKext, kmodInfo, metaHandle, segment, + sectionNames[kOSSectionNameInitializer], + textStart, textEnd); } /* for (segment...) */ /* We failed so call all of the destructors. We must do this before @@ -458,7 +482,8 @@ OSRuntimeInitializeCPP( segment != failure_segment && segment != 0; segment = nextsegfromheader(header, segment)) { - OSRuntimeUnloadCPPForSegment(segment); + OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment, + sectionNames[kOSSectionNameFinalizer], textStart, textEnd); } /* for (segment...) */ } @@ -478,46 +503,28 @@ OSRuntimeInitializeCPP( * classes, and there had better not be any created on the C++ init path. */ if (load_success && result != KMOD_RETURN_SUCCESS) { - (void)OSRuntimeFinalizeCPP(kmodInfo, NULL); + (void)OSRuntimeFinalizeCPP(theKext); //kmodInfo, sectionNames, textStart, textEnd); } if (theKext && load_success && result == KMOD_RETURN_SUCCESS) { theKext->setCPPInitialized(true); } finish: - OSSafeReleaseNULL(theKext); return result; } -#if PRAGMA_MARK -#pragma mark Libkern Init -#endif /* PRAGMA_MARK */ -/********************************************************************* -* Libkern Init -*********************************************************************/ - /********************************************************************* +Unload a kernel segment. *********************************************************************/ -extern lck_grp_t * IOLockGroup; -extern kmod_info_t g_kernel_kmod_info; -void OSlibkernInit(void) +void +OSRuntimeUnloadCPPForSegment( + kernel_segment_command_t * segment) { - // This must be called before calling OSRuntimeInitializeCPP. 
- OSMetaClassBase::initialize(); - - g_kernel_kmod_info.address = (vm_address_t) &_mh_execute_header; - if (kOSReturnSuccess != OSRuntimeInitializeCPP(&g_kernel_kmod_info, 0)) { - panic("OSRuntime: C++ runtime failed to initialize."); - } - - gKernelCPPInitialized = true; - - return; + OSRuntimeCallStructorsInSection(NULL, &g_kernel_kmod_info, NULL, segment, + gOSStructorSectionNames[kOSSectionNamesDefault][kOSSectionNameFinalizer], 0, 0); } -__END_DECLS - #if PRAGMA_MARK #pragma mark C++ Allocators & Deallocators #endif /* PRAGMA_MARK */ @@ -526,9 +533,6 @@ __END_DECLS *********************************************************************/ void * operator new(size_t size) -#if __cplusplus >= 201103L - noexcept -#endif { void * result; @@ -548,9 +552,6 @@ operator delete(void * addr) void * operator new[](unsigned long sz) -#if __cplusplus >= 201103L - noexcept -#endif { if (sz == 0) sz = 1; return kern_os_malloc(sz); @@ -584,4 +585,3 @@ __throw_length_error(const char *msg __unused) } }; - diff --git a/libkern/c++/OSSerialize.cpp b/libkern/c++/OSSerialize.cpp index e2d93058e..a82a37891 100644 --- a/libkern/c++/OSSerialize.cpp +++ b/libkern/c++/OSSerialize.cpp @@ -37,6 +37,7 @@ __END_DECLS #include #include #include +#include #include #define super OSObject @@ -290,6 +291,35 @@ OSSerializer * OSSerializer::forTarget( void * target, return( thing ); } +bool OSSerializer::callbackToBlock(void * target __unused, void * ref, + OSSerialize * serializer) +{ + return ((OSSerializerBlock)ref)(serializer); +} + +OSSerializer * OSSerializer::withBlock( + OSSerializerBlock callback) +{ + OSSerializer * serializer; + OSSerializerBlock block; + + block = Block_copy(callback); + if (!block) return (0); + + serializer = (OSSerializer::forTarget(NULL, &OSSerializer::callbackToBlock, block)); + + if (!serializer) Block_release(block); + + return (serializer); +} + +void OSSerializer::free(void) +{ + if (callback == &callbackToBlock) Block_release(ref); + + super::free(); +} + bool 
OSSerializer::serialize( OSSerialize * s ) const { return( (*callback)(target, ref, s) ); diff --git a/libkern/c++/OSSerializeBinary.cpp b/libkern/c++/OSSerializeBinary.cpp index ffaef6a68..3de4336af 100644 --- a/libkern/c++/OSSerializeBinary.cpp +++ b/libkern/c++/OSSerializeBinary.cpp @@ -75,6 +75,7 @@ bool OSSerialize::addBinary(const void * bits, size_t size) if (newCapacity >= capacity) { newCapacity = (((newCapacity - 1) / capacityIncrement) + 1) * capacityIncrement; + if (newCapacity < capacity) return (false); if (newCapacity > ensureCapacity(newCapacity)) return (false); } @@ -99,6 +100,7 @@ bool OSSerialize::addBinaryObject(const OSMetaClassBase * o, uint32_t key, if (newCapacity >= capacity) { newCapacity = (((newCapacity - 1) / capacityIncrement) + 1) * capacityIncrement; + if (newCapacity < capacity) return (false); if (newCapacity > ensureCapacity(newCapacity)) return (false); } diff --git a/libkern/c++/OSUnserialize.cpp b/libkern/c++/OSUnserialize.cpp index 6b32a76ee..3d82a99ce 100644 --- a/libkern/c++/OSUnserialize.cpp +++ b/libkern/c++/OSUnserialize.cpp @@ -185,11 +185,7 @@ static OSObject *parsedObject; #define YYSTYPE object_t * -extern "C" { -extern void *kern_os_malloc(size_t size); -extern void *kern_os_realloc(void * addr, size_t size); -extern void kern_os_free(void * addr); -} /* extern "C" */ +#include #define malloc(s) kern_os_malloc(s) #define realloc(a, s) kern_os_realloc(a, s) diff --git a/libkern/c++/OSUnserialize.y b/libkern/c++/OSUnserialize.y index 450ce0811..86f396784 100644 --- a/libkern/c++/OSUnserialize.y +++ b/libkern/c++/OSUnserialize.y @@ -99,11 +99,7 @@ static OSObject *parsedObject; #define YYSTYPE object_t * -extern "C" { -extern void *kern_os_malloc(size_t size); -extern void *kern_os_realloc(void * addr, size_t size); -extern void kern_os_free(void * addr); -} /* extern "C" */ +#include #define malloc(s) kern_os_malloc(s) #define realloc(a, s) kern_os_realloc(a, s) diff --git a/libkern/c++/OSUnserializeXML.cpp 
b/libkern/c++/OSUnserializeXML.cpp index 60f1bb238..6905c3979 100644 --- a/libkern/c++/OSUnserializeXML.cpp +++ b/libkern/c++/OSUnserializeXML.cpp @@ -219,12 +219,7 @@ static object_t *buildData(parser_state_t *state, object_t *o); static object_t *buildNumber(parser_state_t *state, object_t *o); static object_t *buildBoolean(parser_state_t *state, object_t *o); -extern "C" { -extern void *kern_os_malloc(size_t size); -extern void *kern_os_realloc(void * addr, size_t size); -extern void kern_os_free(void * addr); - -} /* extern "C" */ +#include #define malloc(s) kern_os_malloc(s) #define realloc(a, s) kern_os_realloc(a, s) @@ -1635,11 +1630,11 @@ int yynerrs; { (yyval) = retrieveObject(STATE, (yyvsp[(1) - (1)])->idref); if ((yyval)) { STATE->retrievedObjectCount++; + (yyval)->object->retain(); if (STATE->retrievedObjectCount > MAX_REFED_OBJECTS) { yyerror("maximum object reference count"); YYERROR; } - (yyval)->object->retain(); } else { yyerror("forward reference detected"); YYERROR; diff --git a/libkern/c++/OSUnserializeXML.y b/libkern/c++/OSUnserializeXML.y index 5bd216770..4f1c3cc97 100644 --- a/libkern/c++/OSUnserializeXML.y +++ b/libkern/c++/OSUnserializeXML.y @@ -122,12 +122,7 @@ static object_t *buildData(parser_state_t *state, object_t *o); static object_t *buildNumber(parser_state_t *state, object_t *o); static object_t *buildBoolean(parser_state_t *state, object_t *o); -extern "C" { -extern void *kern_os_malloc(size_t size); -extern void *kern_os_realloc(void * addr, size_t size); -extern void kern_os_free(void * addr); - -} /* extern "C" */ +#include #define malloc(s) kern_os_malloc(s) #define realloc(a, s) kern_os_realloc(a, s) diff --git a/libkern/conf/Makefile.template b/libkern/conf/Makefile.template index c448bb853..b38b73fe1 100644 --- a/libkern/conf/Makefile.template +++ b/libkern/conf/Makefile.template @@ -37,6 +37,10 @@ inflate.o_CWARNFLAGS_ADD = -Wno-cast-qual trees.o_CWARNFLAGS_ADD = -Wno-cast-qual uncompr.o_CWARNFLAGS_ADD = -Wno-cast-qual +# 
libclosure +runtime.cpo_CXXWARNFLAGS_ADD = -Wno-cast-qual + + # warnings in bison-generated code OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-uninitialized -Wno-unreachable-code -Wno-unreachable-code-break OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code diff --git a/libkern/conf/files b/libkern/conf/files index 5867b9373..5181c0143 100644 --- a/libkern/conf/files +++ b/libkern/conf/files @@ -54,12 +54,13 @@ libkern/uuid/uuid.c standard libkern/os/log.c standard libkern/os/object.c standard libkern/os/internal.c standard +libkern/os/refcnt.c standard libkern/kernel_mach_header.c standard libkern/zlib/adler32.c optional zlib libkern/zlib/compress.c optional zlib -libkern/zlib/crc32.c optional zlib +libkern/zlib/z_crc32.c optional zlib libkern/zlib/deflate.c optional zlib libkern/zlib/infback.c optional zlib libkern/zlib/inffast.c optional zlib @@ -80,6 +81,8 @@ libkern/crypto/corecrypto_rand.c optional crypto libkern/crypto/corecrypto_rsa.c optional crypto libkern/crypto/corecrypto_chacha20poly1305.c optional crypto +libkern/img4/interface.c standard + libkern/stack_protector.c standard libkern/kxld/kxld.c optional config_kxld @@ -101,3 +104,6 @@ libkern/kxld/kxld_uuid.c optional config_kxld libkern/kxld/kxld_versionmin.c optional config_kxld libkern/kxld/kxld_vtable.c optional config_kxld libkern/kxld/kxld_stubs.c standard + +libkern/libclosure/runtime.cpp optional config_blocks +libkern/libclosure/libclosuredata.c optional config_blocks diff --git a/libkern/conf/files.arm64 b/libkern/conf/files.arm64 new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/libkern/conf/files.arm64 @@ -0,0 +1 @@ + diff --git a/libkern/firehose/firehose_types_private.h b/libkern/firehose/firehose_types_private.h index ea1f91279..d99a5819f 100644 --- a/libkern/firehose/firehose_types_private.h +++ b/libkern/firehose/firehose_types_private.h @@ -73,7 +73,7 @@ OS_ENUM(firehose_stream, uint8_t, firehose_stream_special = 1, firehose_stream_memory = 2, 
firehose_stream_metadata = 3, - firehose_stream_memory_high_traffic = 4, + firehose_stream_signpost = 4, firehose_stream_memory_wifi = 5, firehose_stream_memory_baseband = 6, @@ -92,6 +92,7 @@ OS_ENUM(firehose_tracepoint_namespace, uint8_t, firehose_tracepoint_namespace_log = 0x04, firehose_tracepoint_namespace_metadata = 0x05, firehose_tracepoint_namespace_signpost = 0x06, + firehose_tracepoint_namespace_loss = 0x07, ); /*! @@ -203,6 +204,7 @@ OS_ENUM(_firehose_tracepoint_flags_log, uint16_t, _firehose_tracepoint_flags_log_has_subsystem = 0x0200, _firehose_tracepoint_flags_log_has_rules = 0x0400, _firehose_tracepoint_flags_log_has_oversize = 0x0800, + _firehose_tracepoint_flags_log_has_context_data = 0x1000, ); /*! @@ -239,12 +241,15 @@ OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t, * * @abstract * Flags for Log tracepoints (namespace signpost). + * + * When flags are shared with the log type, they should have the same values. */ OS_ENUM(_firehose_tracepoint_flags_signpost, uint16_t, _firehose_tracepoint_flags_signpost_has_private_data = 0x0100, _firehose_tracepoint_flags_signpost_has_subsystem = 0x0200, _firehose_tracepoint_flags_signpost_has_rules = 0x0400, _firehose_tracepoint_flags_signpost_has_oversize = 0x0800, + _firehose_tracepoint_flags_signpost_has_context_data = 0x1000, ); /* MIG firehose push reply structure */ diff --git a/libkern/firehose/private.h b/libkern/firehose/private.h index d4c4e1040..399c95a54 100644 --- a/libkern/firehose/private.h +++ b/libkern/firehose/private.h @@ -21,7 +21,7 @@ #ifndef __FIREHOSE_FIREHOSE_PRIVATE__ #define __FIREHOSE_FIREHOSE_PRIVATE__ -#define FIREHOSE_SPI_VERSION 20170907 +#define FIREHOSE_SPI_VERSION 20180416 #include "firehose_types_private.h" #include "tracepoint_private.h" diff --git a/libkern/firehose/tracepoint_private.h b/libkern/firehose/tracepoint_private.h index a0c712212..76531fd28 100644 --- a/libkern/firehose/tracepoint_private.h +++ b/libkern/firehose/tracepoint_private.h @@ 
-114,6 +114,21 @@ typedef struct firehose_tracepoint_s { #define FIREHOSE_TRACE_ID_SET_CODE(tid, code) \ ((tid).ftid._code = code) +/*! + * @typedef firehose_loss_payload_s + * + * @abstract + * The payload for tracepoints in the loss namespace, generated by the firehose + * itself when unreliable tracepoints are lost. + */ +typedef struct firehose_loss_payload_s { + uint64_t start_stamp; /* may (rarely!) disagree with the tracepoint stamp */ + uint64_t end_stamp; +#define FIREHOSE_LOSS_COUNT_WIDTH 6 /* as many bits as can be spared */ +#define FIREHOSE_LOSS_COUNT_MAX ((1u << FIREHOSE_LOSS_COUNT_WIDTH) - 1) + uint32_t count; +} firehose_loss_payload_s, *firehose_loss_payload_t; + __BEGIN_DECLS #if __has_feature(address_sanitizer) diff --git a/libkern/gen/OSDebug.cpp b/libkern/gen/OSDebug.cpp index f11263631..cbcdd5728 100644 --- a/libkern/gen/OSDebug.cpp +++ b/libkern/gen/OSDebug.cpp @@ -43,6 +43,7 @@ #include #include + extern int etext; __BEGIN_DECLS // From osmfk/kern/thread.h but considered to be private diff --git a/libkern/img4/interface.c b/libkern/img4/interface.c new file mode 100644 index 000000000..3863334d8 --- /dev/null +++ b/libkern/img4/interface.c @@ -0,0 +1,18 @@ +#include +#include +#include + +#if defined(SECURITY_READ_ONLY_LATE) +SECURITY_READ_ONLY_LATE(const img4_interface_t *) img4if = NULL; +#else +const img4_interface_t *img4if = NULL; +#endif + +void +img4_interface_register(const img4_interface_t *i4) +{ + if (img4if) { + panic("img4 interface already set"); + } + img4if = i4; +} diff --git a/libkern/kmod/cplus_start.c b/libkern/kmod/cplus_start.c index eb77e72a9..f0b5f0f64 100644 --- a/libkern/kmod/cplus_start.c +++ b/libkern/kmod/cplus_start.c @@ -42,7 +42,3 @@ The linkline must look like this. *.o -lkmodc++ kmod_info.o -lkmod */ - -/* The following preprocessor test must match exactly with the architectures - * that define the CONFIG_STATIC_CPPINIT config option. 
- */ diff --git a/libkern/kmod/cplus_stop.c b/libkern/kmod/cplus_stop.c index eb77e72a9..f0b5f0f64 100644 --- a/libkern/kmod/cplus_stop.c +++ b/libkern/kmod/cplus_stop.c @@ -42,7 +42,3 @@ The linkline must look like this. *.o -lkmodc++ kmod_info.o -lkmod */ - -/* The following preprocessor test must match exactly with the architectures - * that define the CONFIG_STATIC_CPPINIT config option. - */ diff --git a/libkern/kxld/Makefile b/libkern/kxld/Makefile index 81160694c..bebe89829 100644 --- a/libkern/kxld/Makefile +++ b/libkern/kxld/Makefile @@ -66,6 +66,7 @@ ifeq ($(strip $(SDK_DIR)),) SDK_DIR := / endif + DEFINES = -DPRIVATE CFLAGS=-std=c99 -Wall -Wextra -Werror -pedantic -Wformat=2 -Wcast-align \ -Wwrite-strings -Wshorten-64-to-32 -Wshadow -Winit-self -Wpointer-arith \ @@ -140,6 +141,7 @@ $(LIBKXLDSYM_ARCHIVE): $(LIBKXLDOBJ_ARCHIVE) @mkdir -p $(SYMROOT) install -c -m 644 $< $@ + $(LIBKXLDOBJ_DYLIB): $(OBJS) $(CC) $(LDFLAGS) -o $@ $^ diff --git a/libkern/libclosure/libclosuredata.c b/libkern/libclosure/libclosuredata.c new file mode 100644 index 000000000..27e906f31 --- /dev/null +++ b/libkern/libclosure/libclosuredata.c @@ -0,0 +1,24 @@ +/* + * data.c + * libclosure + * + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * + * @APPLE_LLVM_LICENSE_HEADER@ + * + */ + +/******************** +NSBlock support + +We allocate space and export a symbol to be used as the Class for the on-stack and malloc'ed copies until ObjC arrives on the scene. These data areas are set up by Foundation to link in as real classes post facto. + +We keep these in a separate file so that we can include the runtime code in test subprojects but not include the data so that compiled code that sees the data in libSystem doesn't get confused by a second copy. Somehow these don't get unified in a common block. 
+**********************/ + +void * _NSConcreteStackBlock[32] = { 0 }; +void * _NSConcreteMallocBlock[32] = { 0 }; +void * _NSConcreteAutoBlock[32] = { 0 }; +void * _NSConcreteFinalizingBlock[32] = { 0 }; +void * _NSConcreteGlobalBlock[32] = { 0 }; +void * _NSConcreteWeakBlockVariable[32] = { 0 }; diff --git a/libkern/libclosure/runtime.cpp b/libkern/libclosure/runtime.cpp new file mode 100644 index 000000000..42e379848 --- /dev/null +++ b/libkern/libclosure/runtime.cpp @@ -0,0 +1,540 @@ +/* + * runtime.c + * libclosure + * + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * + * @APPLE_LLVM_LICENSE_HEADER@ + */ + + +#ifndef KERNEL + +#include "Block_private.h" +#include +#include +#include +#include + +#else /* !KERNEL */ + +#include +#include + +#define malloc(s) kern_os_malloc((s)) +#define free(a) kern_os_free((a)) + +#endif /* KERNEL */ + +#include +#include +#ifndef os_assumes +#define os_assumes(_x) (_x) +#endif +#ifndef os_assert +#define os_assert(_x) assert(_x) +#endif + +#if TARGET_OS_WIN32 +#define _CRT_SECURE_NO_WARNINGS 1 +#include +static __inline bool OSAtomicCompareAndSwapLong(long oldl, long newl, long volatile *dst) +{ + // fixme barrier is overkill -- see objc-os.h + long original = InterlockedCompareExchange(dst, newl, oldl); + return (original == oldl); +} + +static __inline bool OSAtomicCompareAndSwapInt(int oldi, int newi, int volatile *dst) +{ + // fixme barrier is overkill -- see objc-os.h + int original = InterlockedCompareExchange(dst, newi, oldi); + return (original == oldi); +} +#else +#define OSAtomicCompareAndSwapLong(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New) +#define OSAtomicCompareAndSwapInt(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New) +#endif + + +/******************************************************************************* +Internal Utilities +********************************************************************************/ + +static int32_t latching_incr_int(volatile 
int32_t *where) { + while (1) { + int32_t old_value = *where; + if ((old_value & BLOCK_REFCOUNT_MASK) == BLOCK_REFCOUNT_MASK) { + return BLOCK_REFCOUNT_MASK; + } + if (OSAtomicCompareAndSwapInt(old_value, old_value+2, where)) { + return old_value+2; + } + } +} + +static bool latching_incr_int_not_deallocating(volatile int32_t *where) { + while (1) { + int32_t old_value = *where; + if (old_value & BLOCK_DEALLOCATING) { + // if deallocating we can't do this + return false; + } + if ((old_value & BLOCK_REFCOUNT_MASK) == BLOCK_REFCOUNT_MASK) { + // if latched, we're leaking this block, and we succeed + return true; + } + if (OSAtomicCompareAndSwapInt(old_value, old_value+2, where)) { + // otherwise, we must store a new retained value without the deallocating bit set + return true; + } + } +} + + +// return should_deallocate? +static bool latching_decr_int_should_deallocate(volatile int32_t *where) { + while (1) { + int32_t old_value = *where; + if ((old_value & BLOCK_REFCOUNT_MASK) == BLOCK_REFCOUNT_MASK) { + return false; // latched high + } + if ((old_value & BLOCK_REFCOUNT_MASK) == 0) { + return false; // underflow, latch low + } + int32_t new_value = old_value - 2; + bool result = false; + if ((old_value & (BLOCK_REFCOUNT_MASK|BLOCK_DEALLOCATING)) == 2) { + new_value = old_value - 1; + result = true; + } + if (OSAtomicCompareAndSwapInt(old_value, new_value, where)) { + return result; + } + } +} + + +/************************************************************************** +Framework callback functions and their default implementations. 
+***************************************************************************/ +#if !TARGET_OS_WIN32 +#pragma mark Framework Callback Routines +#endif + +static void _Block_retain_object_default(const void *ptr __unused) { } + +static void _Block_release_object_default(const void *ptr __unused) { } + +static void _Block_destructInstance_default(const void *aBlock __unused) {} + +static void (*_Block_retain_object)(const void *ptr) = _Block_retain_object_default; +static void (*_Block_release_object)(const void *ptr) = _Block_release_object_default; +static void (*_Block_destructInstance) (const void *aBlock) = _Block_destructInstance_default; + + +/************************************************************************** +Callback registration from ObjC runtime and CoreFoundation +***************************************************************************/ + +void _Block_use_RR2(const Block_callbacks_RR *callbacks) { + _Block_retain_object = callbacks->retain; + _Block_release_object = callbacks->release; + _Block_destructInstance = callbacks->destructInstance; +} + +/**************************************************************************** +Accessors for block descriptor fields +*****************************************************************************/ +#if 0 +static struct Block_descriptor_1 * _Block_descriptor_1(struct Block_layout *aBlock) +{ + return aBlock->descriptor; +} +#endif + +static struct Block_descriptor_2 * _Block_descriptor_2(struct Block_layout *aBlock) +{ + if (! (aBlock->flags & BLOCK_HAS_COPY_DISPOSE)) return NULL; + uint8_t *desc = (uint8_t *)aBlock->descriptor; + desc += sizeof(struct Block_descriptor_1); + return __IGNORE_WCASTALIGN((struct Block_descriptor_2 *)desc); +} + +static struct Block_descriptor_3 * _Block_descriptor_3(struct Block_layout *aBlock) +{ + if (! 
(aBlock->flags & BLOCK_HAS_SIGNATURE)) return NULL; + uint8_t *desc = (uint8_t *)aBlock->descriptor; + desc += sizeof(struct Block_descriptor_1); + if (aBlock->flags & BLOCK_HAS_COPY_DISPOSE) { + desc += sizeof(struct Block_descriptor_2); + } + return __IGNORE_WCASTALIGN((struct Block_descriptor_3 *)desc); +} + +static void _Block_call_copy_helper(void *result, struct Block_layout *aBlock) +{ + struct Block_descriptor_2 *desc = _Block_descriptor_2(aBlock); + if (!desc) return; + + (*desc->copy)(result, aBlock); // do fixup +} + +static void _Block_call_dispose_helper(struct Block_layout *aBlock) +{ + struct Block_descriptor_2 *desc = _Block_descriptor_2(aBlock); + if (!desc) return; + + (*desc->dispose)(aBlock); +} + +/******************************************************************************* +Internal Support routines for copying +********************************************************************************/ + +#if !TARGET_OS_WIN32 +#pragma mark Copy/Release support +#endif + +// Copy, or bump refcount, of a block. If really copying, call the copy helper if present. +void *_Block_copy(const void *arg) { + struct Block_layout *aBlock; + + if (!arg) return NULL; + + // The following would be better done as a switch statement + aBlock = (struct Block_layout *)arg; + if (aBlock->flags & BLOCK_NEEDS_FREE) { + // latches on high + latching_incr_int(&aBlock->flags); + return aBlock; + } + else if (aBlock->flags & BLOCK_IS_GLOBAL) { + return aBlock; + } + else { + // Its a stack block. Make a copy. + struct Block_layout *result = (typeof(result)) malloc(aBlock->descriptor->size); + if (!result) return NULL; + memmove(result, aBlock, aBlock->descriptor->size); // bitcopy first +#if __has_feature(ptrauth_calls) + // Resign the invoke pointer as it uses address authentication. 
+ result->invoke = aBlock->invoke; +#endif + // reset refcount + result->flags &= ~(BLOCK_REFCOUNT_MASK|BLOCK_DEALLOCATING); // XXX not needed + result->flags |= BLOCK_NEEDS_FREE | 2; // logical refcount 1 + _Block_call_copy_helper(result, aBlock); + // Set isa last so memory analysis tools see a fully-initialized object. + result->isa = _NSConcreteMallocBlock; + return result; + } +} + + +// Runtime entry points for maintaining the sharing knowledge of byref data blocks. + +// A closure has been copied and its fixup routine is asking us to fix up the reference to the shared byref data +// Closures that aren't copied must still work, so everyone always accesses variables after dereferencing the forwarding ptr. +// We ask if the byref pointer that we know about has already been copied to the heap, and if so, increment and return it. +// Otherwise we need to copy it and update the stack forwarding pointer +static struct Block_byref *_Block_byref_copy(const void *arg) { + struct Block_byref *src = (struct Block_byref *)arg; + + if ((src->forwarding->flags & BLOCK_REFCOUNT_MASK) == 0) { + // src points to stack + struct Block_byref *copy = (struct Block_byref *)malloc(src->size); + copy->isa = NULL; + // byref value 4 is logical refcount of 2: one for caller, one for stack + copy->flags = src->flags | BLOCK_BYREF_NEEDS_FREE | 4; + copy->forwarding = copy; // patch heap copy to point to itself + src->forwarding = copy; // patch stack to point to heap copy + copy->size = src->size; + + if (src->flags & BLOCK_BYREF_HAS_COPY_DISPOSE) { + // Trust copy helper to copy everything of interest + // If more than one field shows up in a byref block this is wrong XXX + struct Block_byref_2 *src2 = (struct Block_byref_2 *)(src+1); + struct Block_byref_2 *copy2 = (struct Block_byref_2 *)(copy+1); + copy2->byref_keep = src2->byref_keep; + copy2->byref_destroy = src2->byref_destroy; + + if (src->flags & BLOCK_BYREF_LAYOUT_EXTENDED) { + struct Block_byref_3 *src3 = (struct 
Block_byref_3 *)(src2+1); + struct Block_byref_3 *copy3 = (struct Block_byref_3*)(copy2+1); + copy3->layout = src3->layout; + } + + (*src2->byref_keep)(copy, src); + } + else { + // Bitwise copy. + // This copy includes Block_byref_3, if any. + memmove(copy+1, src+1, src->size - sizeof(*src)); + } + } + // already copied to heap + else if ((src->forwarding->flags & BLOCK_BYREF_NEEDS_FREE) == BLOCK_BYREF_NEEDS_FREE) { + latching_incr_int(&src->forwarding->flags); + } + + return src->forwarding; +} + +static void _Block_byref_release(const void *arg) { + struct Block_byref *byref = (struct Block_byref *)arg; + + // dereference the forwarding pointer since the compiler isn't doing this anymore (ever?) + byref = byref->forwarding; + + if (byref->flags & BLOCK_BYREF_NEEDS_FREE) { + __assert_only int32_t refcount = byref->flags & BLOCK_REFCOUNT_MASK; + os_assert(refcount); + if (latching_decr_int_should_deallocate(&byref->flags)) { + if (byref->flags & BLOCK_BYREF_HAS_COPY_DISPOSE) { + struct Block_byref_2 *byref2 = (struct Block_byref_2 *)(byref+1); + (*byref2->byref_destroy)(byref); + } + free(byref); + } + } +} + + +/************************************************************ + * + * API supporting SPI + * _Block_copy, _Block_release, and (old) _Block_destroy + * + ***********************************************************/ + +#if !TARGET_OS_WIN32 +#pragma mark SPI/API +#endif + + +// API entry point to release a copied Block +void _Block_release(const void *arg) { + struct Block_layout *aBlock = (struct Block_layout *)arg; + if (!aBlock) return; + if (aBlock->flags & BLOCK_IS_GLOBAL) return; + if (! 
(aBlock->flags & BLOCK_NEEDS_FREE)) return; + + if (latching_decr_int_should_deallocate(&aBlock->flags)) { + _Block_call_dispose_helper(aBlock); + _Block_destructInstance(aBlock); + free(aBlock); + } +} + +bool _Block_tryRetain(const void *arg) { + struct Block_layout *aBlock = (struct Block_layout *)arg; + return latching_incr_int_not_deallocating(&aBlock->flags); +} + +bool _Block_isDeallocating(const void *arg) { + struct Block_layout *aBlock = (struct Block_layout *)arg; + return (aBlock->flags & BLOCK_DEALLOCATING) != 0; +} + + +/************************************************************ + * + * SPI used by other layers + * + ***********************************************************/ + +size_t Block_size(void *aBlock) { + return ((struct Block_layout *)aBlock)->descriptor->size; +} + +bool _Block_use_stret(void *aBlock) { + struct Block_layout *layout = (struct Block_layout *)aBlock; + + int requiredFlags = BLOCK_HAS_SIGNATURE | BLOCK_USE_STRET; + return (layout->flags & requiredFlags) == requiredFlags; +} + +// Checks for a valid signature, not merely the BLOCK_HAS_SIGNATURE bit. +bool _Block_has_signature(void *aBlock) { + return _Block_signature(aBlock) ? true : false; +} + +const char * _Block_signature(void *aBlock) +{ + struct Block_descriptor_3 *desc3 = _Block_descriptor_3((struct Block_layout *)aBlock); + if (!desc3) return NULL; + + return desc3->signature; +} + +const char * _Block_layout(void *aBlock) +{ + // Don't return extended layout to callers expecting old GC layout + struct Block_layout *layout = (struct Block_layout *)aBlock; + if (layout->flags & BLOCK_HAS_EXTENDED_LAYOUT) return NULL; + + struct Block_descriptor_3 *desc3 = _Block_descriptor_3((struct Block_layout *)aBlock); + if (!desc3) return NULL; + + return desc3->layout; +} + +const char * _Block_extended_layout(void *aBlock) +{ + // Don't return old GC layout to callers expecting extended layout + struct Block_layout *layout = (struct Block_layout *)aBlock; + if (! 
(layout->flags & BLOCK_HAS_EXTENDED_LAYOUT)) return NULL; + + struct Block_descriptor_3 *desc3 = _Block_descriptor_3((struct Block_layout *)aBlock); + if (!desc3) return NULL; + + // Return empty string (all non-object bytes) instead of NULL + // so callers can distinguish "empty layout" from "no layout". + if (!desc3->layout) return ""; + else return desc3->layout; +} + +#if !TARGET_OS_WIN32 +#pragma mark Compiler SPI entry points +#endif + + +/******************************************************* + +Entry points used by the compiler - the real API! + + +A Block can reference four different kinds of things that require help when the Block is copied to the heap. +1) C++ stack based objects +2) References to Objective-C objects +3) Other Blocks +4) __block variables + +In these cases helper functions are synthesized by the compiler for use in Block_copy and Block_release, called the copy and dispose helpers. The copy helper emits a call to the C++ const copy constructor for C++ stack based objects and for the rest calls into the runtime support function _Block_object_assign. The dispose helper has a call to the C++ destructor for case 1 and a call into _Block_object_dispose for the rest. + +The flags parameter of _Block_object_assign and _Block_object_dispose is set to + * BLOCK_FIELD_IS_OBJECT (3), for the case of an Objective-C Object, + * BLOCK_FIELD_IS_BLOCK (7), for the case of another Block, and + * BLOCK_FIELD_IS_BYREF (8), for the case of a __block variable. +If the __block variable is marked weak the compiler also or's in BLOCK_FIELD_IS_WEAK (16) + +So the Block copy/dispose helpers should only ever generate the four flag values of 3, 7, 8, and 24. + +When a __block variable is either a C++ object, an Objective-C object, or another Block then the compiler also generates copy/dispose helper functions. Similarly to the Block copy helper, the "__block" copy helper (formerly and still a.k.a. 
"byref" copy helper) will do a C++ copy constructor (not a const one though!) and the dispose helper will do the destructor. And similarly the helpers will call into the same two support functions with the same values for objects and Blocks with the additional BLOCK_BYREF_CALLER (128) bit of information supplied. + +So the __block copy/dispose helpers will generate flag values of 3 or 7 for objects and Blocks respectively, with BLOCK_FIELD_IS_WEAK (16) or'ed as appropriate and always 128 or'd in, for the following set of possibilities: + __block id 128+3 (0x83) + __block (^Block) 128+7 (0x87) + __weak __block id 128+3+16 (0x93) + __weak __block (^Block) 128+7+16 (0x97) + + +********************************************************/ + +// +// When Blocks or Block_byrefs hold objects then their copy routine helpers use this entry point +// to do the assignment. +// +void _Block_object_assign(void *destArg, const void *object, const int flags) { + const void **dest = (const void **)destArg; + switch (os_assumes(flags & BLOCK_ALL_COPY_DISPOSE_FLAGS)) { + case BLOCK_FIELD_IS_OBJECT: + /******* + id object = ...; + [^{ object; } copy]; + ********/ + + _Block_retain_object(object); + *dest = object; + break; + + case BLOCK_FIELD_IS_BLOCK: + /******* + void (^object)(void) = ...; + [^{ object; } copy]; + ********/ + + *dest = _Block_copy(object); + break; + + case BLOCK_FIELD_IS_BYREF | BLOCK_FIELD_IS_WEAK: + case BLOCK_FIELD_IS_BYREF: + /******* + // copy the onstack __block container to the heap + // Note this __weak is old GC-weak/MRC-unretained. + // ARC-style __weak is handled by the copy helper directly. + __block ... x; + __weak __block ... x; + [^{ x; } copy]; + ********/ + + *dest = _Block_byref_copy(object); + break; + + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT: + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK: + /******* + // copy the actual field held in the __block container + // Note this is MRC unretained __block only. 
+ // ARC retained __block is handled by the copy helper directly. + __block id object; + __block void (^object)(void); + [^{ object; } copy]; + ********/ + + *dest = object; + break; + + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT | BLOCK_FIELD_IS_WEAK: + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK | BLOCK_FIELD_IS_WEAK: + /******* + // copy the actual field held in the __block container + // Note this __weak is old GC-weak/MRC-unretained. + // ARC-style __weak is handled by the copy helper directly. + __weak __block id object; + __weak __block void (^object)(void); + [^{ object; } copy]; + ********/ + + *dest = object; + break; + + default: + break; + } +} + +// When Blocks or Block_byrefs hold objects their destroy helper routines call this entry point +// to help dispose of the contents +void _Block_object_dispose(const void *object, const int flags) { + switch (os_assumes(flags & BLOCK_ALL_COPY_DISPOSE_FLAGS)) { + case BLOCK_FIELD_IS_BYREF | BLOCK_FIELD_IS_WEAK: + case BLOCK_FIELD_IS_BYREF: + // get rid of the __block data structure held in a Block + _Block_byref_release(object); + break; + case BLOCK_FIELD_IS_BLOCK: + _Block_release(object); + break; + case BLOCK_FIELD_IS_OBJECT: + _Block_release_object(object); + break; + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT: + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK: + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT | BLOCK_FIELD_IS_WEAK: + case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK | BLOCK_FIELD_IS_WEAK: + break; + default: + break; + } +} + + +// Workaround for dylib with no __DATA segment fails to rebase +__attribute__((used)) +static int let_there_be_data = 42; + +#undef malloc +#undef free + diff --git a/libkern/libkern/Block.h b/libkern/libkern/Block.h new file mode 100644 index 000000000..5509250df --- /dev/null +++ b/libkern/libkern/Block.h @@ -0,0 +1,66 @@ +/* + * Block.h + * + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. 
+ * + * @APPLE_LLVM_LICENSE_HEADER@ + * + */ + +#ifndef _Block_H_ +#define _Block_H_ + +#if !defined(BLOCK_EXPORT) +# if defined(__cplusplus) +# define BLOCK_EXPORT extern "C" +# else +# define BLOCK_EXPORT extern +# endif +#endif + +#include +#ifndef KERNEL +#include +#endif /* KERNEL */ + +#if __cplusplus +extern "C" { +#endif + +// Create a heap based copy of a Block or simply add a reference to an existing one. +// This must be paired with Block_release to recover memory, even when running +// under Objective-C Garbage Collection. +BLOCK_EXPORT void *_Block_copy(const void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); + +// Lose the reference, and if heap based and last reference, recover the memory +BLOCK_EXPORT void _Block_release(const void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); + + +// Used by the compiler. Do not call this function yourself. +BLOCK_EXPORT void _Block_object_assign(void *, const void *, const int) + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); + +// Used by the compiler. Do not call this function yourself. +BLOCK_EXPORT void _Block_object_dispose(const void *, const int) + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); + +// Used by the compiler. Do not use these variables yourself. +BLOCK_EXPORT void * _NSConcreteGlobalBlock[32] + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); +BLOCK_EXPORT void * _NSConcreteStackBlock[32] + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); + + +#if __cplusplus +} +#endif + +// Type correct macros + +#define Block_copy(...) ((__typeof(__VA_ARGS__))_Block_copy((const void *)(__VA_ARGS__))) +#define Block_release(...) 
_Block_release((const void *)(__VA_ARGS__)) + + +#endif diff --git a/libkern/libkern/Block_private.h b/libkern/libkern/Block_private.h new file mode 100644 index 000000000..d122f92d5 --- /dev/null +++ b/libkern/libkern/Block_private.h @@ -0,0 +1,458 @@ +/* + * Block_private.h + * + * SPI for Blocks + * + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * + * @APPLE_LLVM_LICENSE_HEADER@ + * + */ + +#ifndef _BLOCK_PRIVATE_H_ +#define _BLOCK_PRIVATE_H_ + +#include +#include +#ifndef KERNEL +#include +#endif + +#include +#include +#ifdef KERNEL +#include +#else +#include +#endif + + +#ifdef KERNEL +#include +struct Block_byref; +#else +#include +#endif + +#if __has_include() +#include +#endif + +#if __has_feature(ptrauth_calls) && __cplusplus < 201103L + +// C ptrauth or old C++ ptrauth + +#define _Block_set_function_pointer(field, value) \ + ((value) \ + ? ((field) = \ + (__typeof__(field)) \ + ptrauth_auth_and_resign((void*)(value), \ + ptrauth_key_function_pointer, 0, \ + ptrauth_key_block_function, &(field))) \ + : ((field) = 0)) + +#define _Block_get_function_pointer(field) \ + ((field) \ + ? (__typeof__(field)) \ + ptrauth_auth_function((void*)(field), \ + ptrauth_key_block_function, &(field)) \ + : (__typeof__(field))0) + +#else + +// C++11 ptrauth or no ptrauth + +#define _Block_set_function_pointer(field, value) \ + (field) = (value) + +#define _Block_get_function_pointer(field) \ + (field) + +#endif + + +#if __has_feature(ptrauth_calls) && __cplusplus >= 201103L + +// StorageSignedFunctionPointer stores a function pointer of type +// Fn but signed with the given ptrauth key and with the address of its +// storage as extra data. +// Function pointers inside block objects are signed this way. +template +class StorageSignedFunctionPointer { + uintptr_t bits; + + public: + + // Authenticate function pointer fn as a C function pointer. + // Re-sign it with our key and the storage address as extra data. + // DOES NOT actually write to our storage. 
+ uintptr_t prepareWrite(Fn fn) const + { + if (fn == nullptr) { + return 0; + } else { + return (uintptr_t) + ptrauth_auth_and_resign(fn, ptrauth_key_function_pointer, 0, + Key, &bits); + } + } + + // Authenticate otherBits at otherStorage. + // Re-sign it with our storage address. + // DOES NOT actually write to our storage. + uintptr_t prepareWrite(const StorageSignedFunctionPointer& other) const + { + if (other.bits == 0) { + return 0; + } else { + return (uintptr_t) + ptrauth_auth_and_resign((void*)other.bits, Key, &other.bits, + Key, &bits); + } + } + + // Authenticate ptr as if it were stored at our storage address. + // Re-sign it as a C function pointer. + // DOES NOT actually read from our storage. + Fn completeReadFn(uintptr_t ptr) const + { + if (ptr == 0) { + return nullptr; + } else { + return ptrauth_auth_function((Fn)ptr, Key, &bits); + } + } + + // Authenticate ptr as if it were at our storage address. + // Return it as a dereferenceable pointer. + // DOES NOT actually read from our storage. 
+ void* completeReadRaw(uintptr_t ptr) const + { + if (ptr == 0) { + return nullptr; + } else { + return ptrauth_auth_data((void*)ptr, Key, &bits); + } + } + + StorageSignedFunctionPointer() { } + + StorageSignedFunctionPointer(Fn value) + : bits(prepareWrite(value)) { } + + StorageSignedFunctionPointer(const StorageSignedFunctionPointer& value) + : bits(prepareWrite(value)) { } + + StorageSignedFunctionPointer& + operator = (Fn rhs) { + bits = prepareWrite(rhs); + return *this; + } + + StorageSignedFunctionPointer& + operator = (const StorageSignedFunctionPointer& rhs) { + bits = prepareWrite(rhs); + return *this; + } + + operator Fn () const { + return completeReadFn(bits); + } + + explicit operator void* () const { + return completeReadRaw(bits); + } + + explicit operator bool () const { + return completeReadRaw(bits) != nullptr; + } +}; + +using BlockCopyFunction = StorageSignedFunctionPointer + ; + +using BlockDisposeFunction = StorageSignedFunctionPointer + ; + +using BlockInvokeFunction = StorageSignedFunctionPointer + ; + +using BlockByrefKeepFunction = StorageSignedFunctionPointer + ; + +using BlockByrefDestroyFunction = StorageSignedFunctionPointer + ; + +// c++11 and ptrauth_calls +#elif !__has_feature(ptrauth_calls) +// not ptrauth_calls + +typedef void(*BlockCopyFunction)(void *, const void *); +typedef void(*BlockDisposeFunction)(const void *); +typedef void(*BlockInvokeFunction)(void *, ...); +typedef void(*BlockByrefKeepFunction)(struct Block_byref*, struct Block_byref*); +typedef void(*BlockByrefDestroyFunction)(struct Block_byref *); + +#else +// ptrauth_calls but not c++11 + +typedef uintptr_t BlockCopyFunction; +typedef uintptr_t BlockDisposeFunction; +typedef uintptr_t BlockInvokeFunction; +typedef uintptr_t BlockByrefKeepFunction; +typedef uintptr_t BlockByrefDestroyFunction; + +#endif + + +// Values for Block_layout->flags to describe block objects +enum { + BLOCK_DEALLOCATING = (0x0001), // runtime + BLOCK_REFCOUNT_MASK = (0xfffe), // 
runtime + BLOCK_NEEDS_FREE = (1 << 24), // runtime + BLOCK_HAS_COPY_DISPOSE = (1 << 25), // compiler + BLOCK_HAS_CTOR = (1 << 26), // compiler: helpers have C++ code + BLOCK_IS_GC = (1 << 27), // runtime + BLOCK_IS_GLOBAL = (1 << 28), // compiler + BLOCK_USE_STRET = (1 << 29), // compiler: undefined if !BLOCK_HAS_SIGNATURE + BLOCK_HAS_SIGNATURE = (1 << 30), // compiler + BLOCK_HAS_EXTENDED_LAYOUT=(1 << 31) // compiler +}; + +#define BLOCK_DESCRIPTOR_1 1 +struct Block_descriptor_1 { + uintptr_t reserved; + uintptr_t size; +}; + +#define BLOCK_DESCRIPTOR_2 1 +struct Block_descriptor_2 { + // requires BLOCK_HAS_COPY_DISPOSE + BlockCopyFunction copy; + BlockDisposeFunction dispose; +}; + +#define BLOCK_DESCRIPTOR_3 1 +struct Block_descriptor_3 { + // requires BLOCK_HAS_SIGNATURE + const char *signature; + const char *layout; // contents depend on BLOCK_HAS_EXTENDED_LAYOUT +}; + +struct Block_layout { + void *isa; + volatile int32_t flags; // contains ref count + int32_t reserved; + BlockInvokeFunction invoke; + struct Block_descriptor_1 *descriptor; + // imported variables +}; + + +// Values for Block_byref->flags to describe __block variables +enum { + // Byref refcount must use the same bits as Block_layout's refcount. 
+ // BLOCK_DEALLOCATING = (0x0001), // runtime + // BLOCK_REFCOUNT_MASK = (0xfffe), // runtime + + BLOCK_BYREF_LAYOUT_MASK = (0xf << 28), // compiler + BLOCK_BYREF_LAYOUT_EXTENDED = ( 1 << 28), // compiler + BLOCK_BYREF_LAYOUT_NON_OBJECT = ( 2 << 28), // compiler + BLOCK_BYREF_LAYOUT_STRONG = ( 3 << 28), // compiler + BLOCK_BYREF_LAYOUT_WEAK = ( 4 << 28), // compiler + BLOCK_BYREF_LAYOUT_UNRETAINED = ( 5 << 28), // compiler + + BLOCK_BYREF_IS_GC = ( 1 << 27), // runtime + + BLOCK_BYREF_HAS_COPY_DISPOSE = ( 1 << 25), // compiler + BLOCK_BYREF_NEEDS_FREE = ( 1 << 24), // runtime +}; + +struct Block_byref { + void *isa; + struct Block_byref *forwarding; + volatile int32_t flags; // contains ref count + uint32_t size; +}; + +struct Block_byref_2 { + // requires BLOCK_BYREF_HAS_COPY_DISPOSE + BlockByrefKeepFunction byref_keep; + BlockByrefDestroyFunction byref_destroy; +}; + +struct Block_byref_3 { + // requires BLOCK_BYREF_LAYOUT_EXTENDED + const char *layout; +}; + + +// Extended layout encoding. + +// Values for Block_descriptor_3->layout with BLOCK_HAS_EXTENDED_LAYOUT +// and for Block_byref_3->layout with BLOCK_BYREF_LAYOUT_EXTENDED + +// If the layout field is less than 0x1000, then it is a compact encoding +// of the form 0xXYZ: X strong pointers, then Y byref pointers, +// then Z weak pointers. + +// If the layout field is 0x1000 or greater, it points to a +// string of layout bytes. Each byte is of the form 0xPN. +// Operator P is from the list below. Value N is a parameter for the operator. +// Byte 0x00 terminates the layout; remaining block data is non-pointer bytes. + +enum { + BLOCK_LAYOUT_ESCAPE = 0, // N=0 halt, rest is non-pointer. N!=0 reserved. 
+ BLOCK_LAYOUT_NON_OBJECT_BYTES = 1, // N bytes non-objects + BLOCK_LAYOUT_NON_OBJECT_WORDS = 2, // N words non-objects + BLOCK_LAYOUT_STRONG = 3, // N words strong pointers + BLOCK_LAYOUT_BYREF = 4, // N words byref pointers + BLOCK_LAYOUT_WEAK = 5, // N words weak pointers + BLOCK_LAYOUT_UNRETAINED = 6, // N words unretained pointers + BLOCK_LAYOUT_UNKNOWN_WORDS_7 = 7, // N words, reserved + BLOCK_LAYOUT_UNKNOWN_WORDS_8 = 8, // N words, reserved + BLOCK_LAYOUT_UNKNOWN_WORDS_9 = 9, // N words, reserved + BLOCK_LAYOUT_UNKNOWN_WORDS_A = 0xA, // N words, reserved + BLOCK_LAYOUT_UNUSED_B = 0xB, // unspecified, reserved + BLOCK_LAYOUT_UNUSED_C = 0xC, // unspecified, reserved + BLOCK_LAYOUT_UNUSED_D = 0xD, // unspecified, reserved + BLOCK_LAYOUT_UNUSED_E = 0xE, // unspecified, reserved + BLOCK_LAYOUT_UNUSED_F = 0xF, // unspecified, reserved +}; + + +// Runtime support functions used by compiler when generating copy/dispose helpers + +// Values for _Block_object_assign() and _Block_object_dispose() parameters +enum { + // see function implementation for a more complete description of these fields and combinations + BLOCK_FIELD_IS_OBJECT = 3, // id, NSObject, __attribute__((NSObject)), block, ... + BLOCK_FIELD_IS_BLOCK = 7, // a block variable + BLOCK_FIELD_IS_BYREF = 8, // the on stack structure holding the __block variable + BLOCK_FIELD_IS_WEAK = 16, // declared __weak, only used in byref copy helpers + BLOCK_BYREF_CALLER = 128, // called from __block (byref) copy/dispose support routines. 
+}; + +enum { + BLOCK_ALL_COPY_DISPOSE_FLAGS = + BLOCK_FIELD_IS_OBJECT | BLOCK_FIELD_IS_BLOCK | BLOCK_FIELD_IS_BYREF | + BLOCK_FIELD_IS_WEAK | BLOCK_BYREF_CALLER +}; + + +// Function pointer accessors + +static inline __typeof__(void (*)(void *, ...)) +_Block_get_invoke_fn(struct Block_layout *block) +{ + return (void (*)(void *, ...))_Block_get_function_pointer(block->invoke); +} + +static inline void +_Block_set_invoke_fn(struct Block_layout *block, void (*fn)(void *, ...)) +{ + _Block_set_function_pointer(block->invoke, fn); +} + + +static inline __typeof__(void (*)(void *, const void *)) +_Block_get_copy_fn(struct Block_descriptor_2 *desc) +{ + return (void (*)(void *, const void *))_Block_get_function_pointer(desc->copy); +} + +static inline void +_Block_set_copy_fn(struct Block_descriptor_2 *desc, + void (*fn)(void *, const void *)) +{ + _Block_set_function_pointer(desc->copy, fn); +} + + +static inline __typeof__(void (*)(const void *)) +_Block_get_dispose_fn(struct Block_descriptor_2 *desc) +{ + return (void (*)(const void *))_Block_get_function_pointer(desc->dispose); +} + +static inline void +_Block_set_dispose_fn(struct Block_descriptor_2 *desc, + void (*fn)(const void *)) +{ + _Block_set_function_pointer(desc->dispose, fn); +} + + +// Other support functions + + +// runtime entry to get total size of a closure +BLOCK_EXPORT size_t Block_size(void *aBlock); + +// indicates whether block was compiled with compiler that sets the ABI related metadata bits +BLOCK_EXPORT bool _Block_has_signature(void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); + +// returns TRUE if return value of block is on the stack, FALSE otherwise +BLOCK_EXPORT bool _Block_use_stret(void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); + +// Returns a string describing the block's parameter and return types. +// The encoding scheme is the same as Objective-C @encode. +// Returns NULL for blocks compiled with some compilers. 
+BLOCK_EXPORT const char * _Block_signature(void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); + +// Returns a string describing the block's GC layout. +// This uses the GC skip/scan encoding. +// May return NULL. +BLOCK_EXPORT const char * _Block_layout(void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); + +// Returns a string describing the block's layout. +// This uses the "extended layout" form described above. +// May return NULL. +BLOCK_EXPORT const char * _Block_extended_layout(void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_7_0); + +// Callable only from the ARR weak subsystem while in exclusion zone +BLOCK_EXPORT bool _Block_tryRetain(const void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); + +// Callable only from the ARR weak subsystem while in exclusion zone +BLOCK_EXPORT bool _Block_isDeallocating(const void *aBlock) + __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3); + + +// the raw data space for runtime classes for blocks +// class+meta used for stack, malloc, and collectable based blocks +BLOCK_EXPORT void * _NSConcreteMallocBlock[32] + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); +BLOCK_EXPORT void * _NSConcreteAutoBlock[32] + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); +BLOCK_EXPORT void * _NSConcreteFinalizingBlock[32] + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); +BLOCK_EXPORT void * _NSConcreteWeakBlockVariable[32] + __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); +// declared in Block.h +// BLOCK_EXPORT void * _NSConcreteGlobalBlock[32]; +// BLOCK_EXPORT void * _NSConcreteStackBlock[32]; + + +struct Block_callbacks_RR { + size_t size; // size == sizeof(struct Block_callbacks_RR) + void (*retain)(const void *); + void (*release)(const void *); + void (*destructInstance)(const void *); +}; +typedef struct Block_callbacks_RR Block_callbacks_RR; + +BLOCK_EXPORT void _Block_use_RR2(const Block_callbacks_RR *callbacks); + + +#endif diff --git 
a/libkern/libkern/Makefile b/libkern/libkern/Makefile index e1c7b3a4a..a7b66fe8a 100644 --- a/libkern/libkern/Makefile +++ b/libkern/libkern/Makefile @@ -7,9 +7,10 @@ include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ - machine \ - c++ \ - crypto + machine \ + c++ \ + crypto \ + img4 INSTINC_SUBDIRS_X86_64 = \ i386 INSTINC_SUBDIRS_X86_64H = \ @@ -42,7 +43,9 @@ KERNELFILES = \ sysctl.h \ tree.h \ zconf.h \ - zlib.h + zlib.h \ + crc.h \ + Block.h PRIVATE_KERNELFILES = \ OSKextLibPrivate.h \ @@ -50,7 +53,8 @@ PRIVATE_KERNELFILES = \ kext_request_keys.h \ mkext.h \ prelink.h \ - section_keywords.h + section_keywords.h \ + Block_private.h PRIVATE_DATAFILES = \ ${PRIVATE_KERNELFILES} \ diff --git a/libkern/libkern/OSKextLibPrivate.h b/libkern/libkern/OSKextLibPrivate.h index fd08744ed..147ab96f6 100644 --- a/libkern/libkern/OSKextLibPrivate.h +++ b/libkern/libkern/OSKextLibPrivate.h @@ -130,6 +130,7 @@ typedef uint8_t OSKextExcludeLevel; #define kOSBundlePathKey "OSBundlePath" #define kOSBundleExecutablePathKey "OSBundleExecutablePath" #define kOSBundleUUIDKey "OSBundleUUID" +#define kOSBundleTextUUIDKey "OSBundleTextUUID" #define kOSBundleStartedKey "OSBundleStarted" #define kOSBundlePrelinkedKey "OSBundlePrelinked" #define kOSBundleLoadTagKey "OSBundleLoadTag" @@ -140,6 +141,11 @@ typedef uint8_t OSKextExcludeLevel; #define kOSBundleWiredSizeKey "OSBundleWiredSize" #define kOSBundleDependenciesKey "OSBundleDependencies" #define kOSBundleRetainCountKey "OSBundleRetainCount" +#define kOSBundleCacheLoadAddressKey "OSBundleCacheLoadAddress" +// Kernel TEXT encompasses kexts +#define kOSBundleKextsInKernelTextKey "OSBundleKextsInKernelText" +// OSKextCopyLoadedKextInfo includes non-started kexts when present: +#define kOSBundleAllPrelinkedKey "OSBundleAllPrelinked" /* Dictionary of metaclass info keyed by classname. 
*/ @@ -934,6 +940,10 @@ extern void OSKextFreeSite(vm_allocation_site_t * site); extern int OSKextGetUUIDForName(const char *, uuid_t); #endif +extern vm_tag_t gIOSurfaceTag; + +extern void *OSKextKextForAddress(const void *addr); + #endif /* XNU_KERNEL_PRIVATE */ __END_DECLS diff --git a/libkern/libkern/OSRuntime.h b/libkern/libkern/OSRuntime.h new file mode 100644 index 000000000..bf7232a1c --- /dev/null +++ b/libkern/libkern/OSRuntime.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 1999-2012 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _OS_OSRUNTIME_H +#define _OS_OSRUNTIME_H + +#include + +__BEGIN_DECLS + +extern void *kern_os_malloc(size_t size) __attribute__((alloc_size(1))); +extern void *kern_os_realloc(void * addr, size_t size) __attribute__((alloc_size(2))); +extern void kern_os_free(void * address); + +__END_DECLS + +#endif /* _OS_OSRUNTIME_H */ diff --git a/libkern/libkern/c++/OSCollection.h b/libkern/libkern/c++/OSCollection.h index 91deba1fa..f162bbdc3 100644 --- a/libkern/libkern/c++/OSCollection.h +++ b/libkern/libkern/c++/OSCollection.h @@ -448,6 +448,46 @@ class OSCollection : public OSObject virtual OSCollection *copyCollection(OSDictionary * cycleDict = 0); OSMetaClassDeclareReservedUsed(OSCollection, 1); + /*! + * @function iterateObjects + * + * @abstract + * Invoke a callback for each member of the collection. + * + * @param refcon A reference constant for the callback. + * @param callback The callback function, + * called with the refcon and each member object + * of the collection in turn, on the caller's thread. + * The callback should return true to terminate + * the iteration early, false otherwise. + * + * @result + * False if the collection iteration was made invalid + * (see OSCollectionIterator::isValid()) otherwise true. + */ + bool iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object)); + +#ifdef __BLOCKS__ + + /*! + * @function iterateObjects + * + * @abstract + * Invoke a block for each member of the collection. + * + * @param block The block, + * called with each member object + * of the collection in turn, on the caller's thread. + * The block should return true to terminate + * the iteration early, false otherwise. + * + * @result + * False if the collection iteration was made invalid + * (see OSCollectionIterator::isValid()) otherwise true. 
+ */ + bool iterateObjects(bool (^block)(OSObject * object)); + +#endif /* __BLOCKS__ */ OSMetaClassDeclareReservedUnused(OSCollection, 2); OSMetaClassDeclareReservedUnused(OSCollection, 3); diff --git a/libkern/libkern/c++/OSDictionary.h b/libkern/libkern/c++/OSDictionary.h index c5438e9d3..5168ca4d8 100644 --- a/libkern/libkern/c++/OSDictionary.h +++ b/libkern/libkern/c++/OSDictionary.h @@ -925,6 +925,48 @@ class OSDictionary : public OSCollection OSArray * copyKeys(void); #endif /* XNU_KERNEL_PRIVATE */ + + /*! + * @function iterateObjects + * + * @abstract + * Invoke a callback for each member of the dictionary. + * + * @param refcon A reference constant for the callback. + * @param callback The callback function, + * called with the refcon and each member key & object + * of the dictionary in turn, on the caller's thread. + * The callback should return true to terminate + * the iteration early, false otherwise. + * + * @result + * False if the dictionary iteration was made invalid + * (see OSCollectionIterator::isValid()) otherwise true. + */ + bool iterateObjects(void * refcon, bool (*callback)(void * refcon, const OSSymbol * key, OSObject * object)); + +#ifdef __BLOCKS__ + + /*! + * @function iterateObjects + * + * @abstract + * Invoke a block for each member of the dictionary. + * + * @param block The block, + * called with each member key & object + * of the dictionary in turn, on the caller's thread. + * The block should return true to terminate + * the iteration early, false otherwise. + * + * @result + * False if the dictionary iteration was made invalid + * (see OSCollectionIterator::isValid()) otherwise true. 
+ */ + bool iterateObjects(bool (^block)(const OSSymbol * key, OSObject * object)); + +#endif /* __BLOCKS__ */ + OSMetaClassDeclareReservedUnused(OSDictionary, 0); OSMetaClassDeclareReservedUnused(OSDictionary, 1); OSMetaClassDeclareReservedUnused(OSDictionary, 2); diff --git a/libkern/libkern/c++/OSKext.h b/libkern/libkern/c++/OSKext.h index 2930a5fc7..2abc2929c 100644 --- a/libkern/libkern/c++/OSKext.h +++ b/libkern/libkern/c++/OSKext.h @@ -88,11 +88,11 @@ void OSKextVLog( void OSKextRemoveKextBootstrap(void); kern_return_t OSRuntimeInitializeCPP( - kmod_info_t * kmodInfo, - void * data); + OSKext * kext); kern_return_t OSRuntimeFinalizeCPP( - kmod_info_t * kmodInfo, - void * data); + OSKext * kext); +void OSRuntimeUnloadCPPForSegment( + kernel_segment_command_t * segment); kern_return_t is_io_catalog_send_data( mach_port_t masterPort, @@ -212,11 +212,11 @@ class OSKext : public OSObject __unused thread_call_param_t p1); friend kern_return_t OSRuntimeInitializeCPP( - kmod_info_t * kmodInfo, - void * data); + OSKext * kext); friend kern_return_t OSRuntimeFinalizeCPP( - kmod_info_t * kmodInfo, - void * data); + OSKext * kext); + friend void OSRuntimeUnloadCPPForSegment( + kernel_segment_command_t * segment); friend kern_return_t is_io_catalog_send_data( mach_port_t masterPort, @@ -275,6 +275,7 @@ class OSKext : public OSObject unsigned int interface:1; unsigned int kernelComponent:1; unsigned int prelinked:1; + unsigned int builtin:1; unsigned int loaded:1; unsigned int dtraceInitialized:1; unsigned int starting:1; @@ -292,6 +293,7 @@ class OSKext : public OSObject struct list_head pendingPgoHead; uuid_t instance_uuid; OSKextAccount * account; + uint32_t builtinKmodIdx; #if PRAGMA_MARK /**************************************/ @@ -307,6 +309,10 @@ class OSKext : public OSObject static OSDictionary * copyKexts(void); static OSReturn removeKextBootstrap(void); static void willShutdown(void); // called by IOPMrootDomain on shutdown + static void 
reportOSMetaClassInstances( + const char * kextIdentifier, + OSKextLogSpec msgLogSpec); + #endif /* XNU_KERNEL_PRIVATE */ private: @@ -500,9 +506,7 @@ class OSKext : public OSObject OSMetaClass * aClass); virtual bool hasOSMetaClassInstances(void); virtual OSSet * getMetaClasses(void); - static void reportOSMetaClassInstances( - const char * kextIdentifier, - OSKextLogSpec msgLogSpec); + virtual void reportOSMetaClassInstances( OSKextLogSpec msgLogSpec); @@ -565,12 +569,14 @@ class OSKext : public OSObject void updateLoadedKextSummary(OSKextLoadedKextSummary *summary); void updateActiveAccount(OSKextActiveAccount *accountp); +#ifdef XNU_KERNEL_PRIVATE +public: +#endif /* XNU_KERNEL_PRIVATE */ + /* C++ Initialization. */ virtual void setCPPInitialized(bool initialized=true); - - #if PRAGMA_MARK /**************************************/ #pragma mark Public Functions @@ -645,6 +651,8 @@ class OSKext : public OSObject virtual OSKextLoadTag getLoadTag(void); virtual void getSizeInfo(uint32_t *loadSize, uint32_t *wiredSize); virtual OSData * copyUUID(void); + OSData * copyTextUUID(void); + OSData * copyMachoUUID(const kernel_mach_header_t * header); virtual OSArray * copyPersonalitiesArray(void); /* This removes personalities naming the kext (by CFBundleIdentifier), diff --git a/libkern/libkern/c++/OSMetaClass.h b/libkern/libkern/c++/OSMetaClass.h index 6098d6b79..f05a9b858 100644 --- a/libkern/libkern/c++/OSMetaClass.h +++ b/libkern/libkern/c++/OSMetaClass.h @@ -318,87 +318,14 @@ class OSMetaClassBase * @abstract Release an object if not NULL, then set it to NULL. * @param inst Instance of an OSObject, may be NULL. */ -#define OSSafeReleaseNULL(inst) do { if (inst) (inst)->release(); (inst) = NULL; } while (0) +#define OSSafeReleaseNULL(inst) do { if (inst != NULL) (inst)->release(); (inst) = NULL; } while (0) typedef void (*_ptf_t)(void); -#if APPLE_KEXT_LEGACY_ABI - -// Arcane evil code interprets a C++ pointer to function as specified in the -// -fapple-kext ABI, i.e. 
the gcc-2.95 generated code. IT DOES NOT ALLOW -// the conversion of functions that are from MULTIPLY inherited classes. - -static inline _ptf_t -_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) -{ - union { - void (OSMetaClassBase::*fIn)(void); - struct { // Pointer to member function 2.95 - unsigned short fToff; - short fVInd; - union { - _ptf_t fPFN; - short fVOff; - } u; - } fptmf2; - } map; - - map.fIn = func; - if (map.fptmf2.fToff) { - panic("Multiple inheritance is not supported"); - return 0; - } else if (map.fptmf2.fVInd < 0) { - // Not virtual, i.e. plain member func - return map.fptmf2.u.fPFN; - } else { - union { - const OSMetaClassBase *fObj; - _ptf_t **vtablep; - } u; - u.fObj = self; - - // Virtual member function so dereference vtable - return (*u.vtablep)[map.fptmf2.fVInd - 1]; - } -} - -#else /* !APPLE_KEXT_LEGACY_ABI */ #if defined(__arm__) || defined(__arm64__) -typedef long int ptrdiff_t; -/* - * Ugly reverse engineered ABI. Where does it come from? Nobody knows. - * gcc 4.2-built ARM kernel panics with multiple inheritance (no, really) - */ -static inline _ptf_t -_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) -{ - struct ptmf_t { - _ptf_t fPFN; - ptrdiff_t delta; - }; - union { - void (OSMetaClassBase::*fIn)(void); - struct ptmf_t pTMF; - } map; + static _ptf_t _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)); - map.fIn = func; - - if (map.pTMF.delta & 1) { - // virtual - union { - const OSMetaClassBase *fObj; - _ptf_t **vtablep; - } u; - u.fObj = self; - - // Virtual member function so dereference table - return *(_ptf_t *)(((uintptr_t)*u.vtablep) + (uintptr_t)map.pTMF.fPFN); - } else { - // Not virtual, i.e. 
plain member func - return map.pTMF.fPFN; - } -} #elif defined(__i386__) || defined(__x86_64__) // Slightly less arcane and slightly less evil code to do @@ -436,7 +363,6 @@ _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void)) #error Unknown architecture. #endif /* __arm__ */ -#endif /* !APPLE_KEXT_LEGACY_ABI */ /*! * @define OSMemberFunctionCast diff --git a/libkern/libkern/c++/OSSerialize.h b/libkern/libkern/c++/OSSerialize.h index 59e12d108..53de72aa6 100644 --- a/libkern/libkern/c++/OSSerialize.h +++ b/libkern/libkern/c++/OSSerialize.h @@ -310,11 +310,15 @@ class OSSerialize : public OSObject OSMetaClassDeclareReservedUnused(OSSerialize, 7); }; -// xx-review: this whole class seems to be unused! typedef bool (*OSSerializerCallback)(void * target, void * ref, OSSerialize * serializer); +#ifdef __BLOCKS__ +typedef bool (^OSSerializerBlock)(OSSerialize * serializer); +#endif /* __BLOCKS__ */ + + class OSSerializer : public OSObject { OSDeclareDefaultStructors(OSSerializer) @@ -330,6 +334,18 @@ class OSSerializer : public OSObject OSSerializerCallback callback, void * ref = 0); +#ifdef __BLOCKS__ + static OSSerializer * withBlock( + OSSerializerBlock callback); +#endif + + virtual void free( void ) APPLE_KEXT_OVERRIDE; + +#if XNU_KERNEL_PRIVATE + static bool callbackToBlock(void * target, void * ref, + OSSerialize * serializer); +#endif /* XNU_KERNEL_PRIVATE */ + virtual bool serialize(OSSerialize * serializer) const APPLE_KEXT_OVERRIDE; }; diff --git a/osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c b/libkern/libkern/crc.h similarity index 77% rename from osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c rename to libkern/libkern/crc.h index ddac576e3..bf7f42b61 100644 --- a/osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c +++ b/libkern/libkern/crc.h @@ -1,11 +1,5 @@ /* - * ccmode_factory_ctr_crypt.c - * corecrypto - * - * Created on 05/19/2015 - * - * Copyright (c) 2015 Apple Inc. All rights reserved. 
- * + * Copyright (c) 2017-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,10 +26,17 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include "ccmode_internal.h" +#ifndef _LIBKERN_CRC_H_ +#define _LIBKERN_CRC_H_ + +#include +#include + +__BEGIN_DECLS + +uint16_t crc16(uint16_t crc, const void *bufp, size_t len); +uint32_t crc32(uint32_t crc, const void *bufp, size_t len); + +__END_DECLS -void ccmode_factory_ctr_crypt(struct ccmode_ctr *ctr, - const struct ccmode_ecb *ecb) { - struct ccmode_ctr ctr_crypt = CCMODE_FACTORY_CTR_CRYPT(ecb); - *ctr = ctr_crypt; -} +#endif /* _LIBKERN_CRC_H_ */ diff --git a/libkern/libkern/img4/Makefile b/libkern/libkern/img4/Makefile new file mode 100644 index 000000000..1ae8a810b --- /dev/null +++ b/libkern/libkern/img4/Makefile @@ -0,0 +1,24 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +DATAFILES = +PRIVATE_DATAFILES = +KERNELFILES = +PRIVATE_KERNELFILES = interface.h + +INSTALL_MI_LIST = ${DATAFILES} +INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} +INSTALL_KF_MI_LIST = ${KERNELFILES} +INSTALL_KF_MI_LCL_LIST = ${PRIVATE_KERNELFILES} +EXPORT_MI_LIST = ${INSTALL_KF_MI_LCL_LIST} + +INSTALL_MI_DIR = libkern/img4 +EXPORT_MI_DIR = libkern/img4 + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/libkern/libkern/img4/interface.h b/libkern/libkern/img4/interface.h new file mode 100644 index 000000000..f88d89b61 --- /dev/null +++ b/libkern/libkern/img4/interface.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/*! + * @header + * Interfaces to register the AppleImage4 interface with xnu-proper to avoid a + * build-time layering inversion. + */ +#ifndef __IMG4_INTERFACE_H +#define __IMG4_INTERFACE_H + +#include +#include + +/* + * We rely on img4.h's logic for either including sys/types.h or declaring + * errno_t ourselves. + */ +#include + +/*! + * @const IMG4_INTERFACE_VERSION + * The version of the interface supported by the implementation. As new + * functions are added to the interface, this value will be incremented so that + * it can be tested at build-time and not require rev-locked submissions of xnu + * and AppleImage4. + */ +#define IMG4_INTERFACE_VERSION (1u) + +/*! + * @typedef img4_init_t + * A type describing the img4_init() function. + */ +typedef errno_t (*img4_init_t)( + img4_t *i4, + img4_flags_t flags, + const uint8_t *bytes, + size_t len, + img4_destructor_t destructor +); + +/*! 
+ * @typedef img4_set_custom_tag_handler_t + * A type describing the img4_set_custom_tag_handler() function. + */ +typedef void (*img4_set_custom_tag_handler_t)( + img4_t *i4, + const img4_custom_tag_t *tags, + size_t tags_cnt +); + +/*! + * @typedef img4_get_trusted_payload_t + * A type describing the img4_get_trusted_payload() function. + */ +typedef errno_t (*img4_get_trusted_payload_t)( + img4_t *i4, + img4_tag_t tag, + const img4_environment_t *env, + void *ctx, + const uint8_t **bytes, + size_t *len +); + +/*! + * @typedef img4_get_trusted_external_payload_t + * A type describing the img4_get_trusted_external_payload() function. + */ +typedef errno_t (*img4_get_trusted_external_payload_t)( + img4_t *img4, + img4_payload_t *payload, + const img4_environment_t *env, + void *ctx, + const uint8_t **bytes, + size_t *len +); + +/*! + * @typedef img4_get_entitlement_bool_t + * A type describing the img4_get_entitlement_bool() function. + */ +typedef bool (*img4_get_entitlement_bool_t)( + img4_t *i4, + img4_tag_t entitlement +); + +/*! + * @typedef img4_get_object_entitlement_bool_t + * A type describing the img4_get_object_entitlement_bool() function. + */ +typedef bool (*img4_get_object_entitlement_bool_t)( + img4_t *i4, + img4_tag_t object, + img4_tag_t entitlement +); + +/*! + * @typedef img4_destroy_t + * A type describing the img4_destroy() function. + */ +typedef void (*img4_destroy_t)( + img4_t *i4 +); + +/*! + * @typedef img4_interface_t + * A structure describing the interface to the AppleImage4 kext. + * + * @property i4if_version + * The version of the structure supported by the implementation. + * + * @property i4if_init + * A pointer to the img4_init function. + * + * @property i4if_set_custom_tag_handler + * A pointer to the img4_set_custom_tag_handler function. + * + * @property i4if_get_trusted_payload + * A pointer to the img4_get_trusted_payload function. 
+ * + * @property i4if_get_trusted_external_payload + * A pointer to the img4_get_trusted_external_payload function. 
+ * + * @property i4if_get_entitlement_bool + * A pointer to the img4_get_entitlement_bool function. + * + * @property i4if_get_object_entitlement_bool + * A pointer to the img4_get_object_entitlement_bool function. + * + * @property i4if_destroy + * A pointer to the img4_destroy function. + * + * @property i4if_v1 + * All members added in version 1 of the structure. + * + * @property environment_platform + * The IMG4_ENVIRONMENT_PLATFORM global. + */ +typedef struct _img4_interface { + const uint32_t i4if_version; + const img4_init_t i4if_init; + const img4_set_custom_tag_handler_t i4if_set_custom_tag_handler; + const img4_get_trusted_payload_t i4if_get_trusted_payload; + const img4_get_trusted_external_payload_t i4if_get_trusted_external_payload; + const img4_get_entitlement_bool_t i4if_get_entitlement_bool; + const img4_get_object_entitlement_bool_t i4if_get_object_entitlement_bool; + const img4_destroy_t i4if_destroy; + struct { + const img4_environment_t *environment_platform; + } i4if_v1; + void *__reserved[23]; +} img4_interface_t; + +__BEGIN_DECLS; + +/*! + * @const img4if + * The AppleImage4 interface that was registered. + */ +extern const img4_interface_t *img4if; + +/*! + * @function img4_interface_register + * Registers the AppleImage4 kext interface with xnu. + * + * @param i4 + * The interface to register. + * + * @discussion + * This routine may only be called once and must be called before late-const has + * been applied to kernel memory. 
+ */ +OS_EXPORT OS_NONNULL1 +void +img4_interface_register(const img4_interface_t *i4); + +__END_DECLS; + +#endif // __IMG4_INTERFACE_H diff --git a/libkern/libkern/prelink.h b/libkern/libkern/prelink.h index 929ab17bc..38996dbb9 100644 --- a/libkern/libkern/prelink.h +++ b/libkern/libkern/prelink.h @@ -37,6 +37,12 @@ #define kPrelinkInfoSegment "__PRELINK_INFO" #define kPrelinkInfoSection "__info" +#define kBuiltinInfoSection "__kmod_info" +#define kBuiltinStartSection "__kmod_start" + +// __DATA segment +#define kBuiltinInitSection "__kmod_init" +#define kBuiltinTermSection "__kmod_term" #define kPrelinkBundlePathKey "_PrelinkBundlePath" #define kPrelinkExecutableRelativePathKey "_PrelinkExecutableRelativePath" diff --git a/libkern/libkern/version.h.template b/libkern/libkern/version.h.template index 57b97d48a..281f8e410 100644 --- a/libkern/libkern/version.h.template +++ b/libkern/libkern/version.h.template @@ -59,6 +59,10 @@ */ #define VERSION_PRERELEASE_LEVEL ###KERNEL_VERSION_PRERELEASE_LEVEL### +/* OSBUILD_CONFIG, osbuild_config is a one-word string describing the build + * configuration of the kernel, e.g., development or release */ +#define OSBUILD_CONFIG "###KERNEL_BUILD_CONFIG###" + /* OSTYPE, ostype, is a string as returned by uname -s */ #define OSTYPE "Darwin" @@ -89,6 +93,9 @@ extern const int version_stage; /* Build-time value of VERSION_PRERELEASE_LEVEL */ extern const int version_prerelease_level; +/* Build-time value of CURRENT_KERNEL_CONFIG */ +extern const char osbuild_config[]; + /* Build-time value of OSTYPE */ extern const char ostype[]; diff --git a/libkern/os/Makefile b/libkern/os/Makefile index 390b9b861..dc30508ab 100644 --- a/libkern/os/Makefile +++ b/libkern/os/Makefile @@ -17,7 +17,8 @@ KERNELFILES = \ PRIVATE_KERNELFILES = \ object_private.h \ - reason_private.h + reason_private.h \ + refcnt.h PRIVATE_DATAFILES = \ reason_private.h diff --git a/libkern/os/log.c b/libkern/os/log.c index 143269862..d4a8a3e7a 100644 --- 
a/libkern/os/log.c +++ b/libkern/os/log.c @@ -34,6 +34,15 @@ #include "log_encode.h" +/* on embedded, with no kext loading or unloads, + * make the kernel use the libtrace shared cache path for logging + */ +#define FIREHOSE_USES_SHARED_CACHE NO_KEXTD + +#if FIREHOSE_USES_SHARED_CACHE +extern vm_offset_t segLOWESTTEXT; +#endif + struct os_log_s { int a; }; @@ -280,6 +289,13 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type, uint8_t pubdata[OS_LOG_BUFFER_MAX_SIZE]; va_list args_copy; + if (addr == NULL) { + return; + } + +#if FIREHOSE_USES_SHARED_CACHE + dso = (void *) segLOWESTTEXT; +#else /* FIREHOSE_USES_SHARED_CACHE */ if (dso == NULL) { dso = (void *) OSKextKextForAddress(format); if (dso == NULL) { @@ -291,14 +307,11 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type, return; } - if (addr == NULL) { - return; - } - void *dso_addr = (void *) OSKextKextForAddress(addr); if (dso != dso_addr) { return; } +#endif /* FIREHOSE_USES_SHARED_CACHE */ memset(&context, 0, sizeof(context)); memset(buffer, 0, OS_LOG_BUFFER_MAX_SIZE); @@ -326,10 +339,18 @@ static inline size_t _os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)], void *dso, const void *address, firehose_tracepoint_flags_t *flags) { - kernel_mach_header_t *mh = dso; +#if FIREHOSE_USES_SHARED_CACHE + *flags = _firehose_tracepoint_flags_pc_style_shared_cache; + memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso }, + sizeof(uint32_t)); + return sizeof(uint32_t); + +#else /* FIREHOSE_USES_SHARED_CACHE */ + kernel_mach_header_t *mh = dso; if (mh->filetype == MH_EXECUTE) { *flags = _firehose_tracepoint_flags_pc_style_main_exe; + memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso }, sizeof(uint32_t)); return sizeof(uint32_t); @@ -342,6 +363,7 @@ _os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)], return sizeof(uintptr_t); #endif } +#endif /* !FIREHOSE_USES_SHARED_CACHE */ } @@ -616,3 +638,260 @@ 
__firehose_critical_region_leave(void) { return; } +#ifdef CONFIG_XNUPOST + +#include +#define TESTOSLOGFMT(fn_name) "%u^%llu/%llu^kernel^0^test^" fn_name +#define TESTOSLOGPFX "TESTLOG:%u#" +#define TESTOSLOG(fn_name) TESTOSLOGPFX TESTOSLOGFMT(fn_name "#") + +extern u_int32_t RandomULong(void); +extern uint32_t find_pattern_in_buffer(char * pattern, uint32_t len, int expected_count); +void test_oslog_default_helper(uint32_t uniqid, uint64_t count); +void test_oslog_info_helper(uint32_t uniqid, uint64_t count); +void test_oslog_debug_helper(uint32_t uniqid, uint64_t count); +void test_oslog_error_helper(uint32_t uniqid, uint64_t count); +void test_oslog_fault_helper(uint32_t uniqid, uint64_t count); +void _test_log_loop(void * arg __unused, wait_result_t wres __unused); +void test_oslog_handleOSLogCtl(int32_t * in, int32_t * out, int32_t len); +kern_return_t test_stresslog_dropmsg(uint32_t uniqid); + +kern_return_t test_os_log(void); +kern_return_t test_os_log_parallel(void); + +#define GENOSLOGHELPER(fname, ident, callout_f) \ + void fname(uint32_t uniqid, uint64_t count) \ + { \ + int32_t datalen = 0; \ + uint32_t checksum = 0; \ + char databuffer[256]; \ + T_LOG("Doing os_log of %llu TESTLOG msgs for fn " ident, count); \ + for (uint64_t i = 0; i < count; i++) \ + { \ + datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT(ident), uniqid, i + 1, count); \ + checksum = crc32(0, databuffer, datalen); \ + callout_f(OS_LOG_DEFAULT, TESTOSLOG(ident), checksum, uniqid, i + 1, count); \ + /*T_LOG(TESTOSLOG(ident), checksum, uniqid, i + 1, count);*/ \ + } \ + } + +GENOSLOGHELPER(test_oslog_info_helper, "oslog_info_helper", os_log_info); +GENOSLOGHELPER(test_oslog_fault_helper, "oslog_fault_helper", os_log_fault); +GENOSLOGHELPER(test_oslog_debug_helper, "oslog_debug_helper", os_log_debug); +GENOSLOGHELPER(test_oslog_error_helper, "oslog_error_helper", os_log_error); +GENOSLOGHELPER(test_oslog_default_helper, "oslog_default_helper", os_log); + +kern_return_t 
test_os_log() +{ + char databuffer[256]; + uint32_t uniqid = RandomULong(); + uint32_t match_count = 0; + uint32_t checksum = 0; + uint32_t total_msg = 0; + uint32_t saved_msg = 0; + uint32_t dropped_msg = 0; + int datalen = 0; + uint64_t a = mach_absolute_time(); + uint64_t seqno = 1; + uint64_t total_seqno = 2; + + os_log_t log_handle = os_log_create("com.apple.xnu.test.t1", "kpost"); + + T_ASSERT_EQ_PTR(&_os_log_default, log_handle, "os_log_create returns valid value."); + T_ASSERT_EQ_INT(TRUE, os_log_info_enabled(log_handle), "os_log_info is enabled"); + T_ASSERT_EQ_INT(TRUE, os_log_debug_enabled(log_handle), "os_log_debug is enabled"); + T_ASSERT_EQ_PTR(&_os_log_default, OS_LOG_DEFAULT, "ensure OS_LOG_DEFAULT is _os_log_default"); + + total_msg = oslog_p_total_msgcount; + saved_msg = oslog_p_saved_msgcount; + dropped_msg = oslog_p_dropped_msgcount; + T_LOG("oslog internal counters total %u , saved %u, dropped %u", total_msg, saved_msg, dropped_msg); + + T_LOG("Validating with uniqid %u u64 %llu", uniqid, a); + T_ASSERT_NE_UINT(0, uniqid, "random number should not be zero"); + T_ASSERT_NE_ULLONG(0, a, "absolute time should not be zero"); + + datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("printf_only"), uniqid, seqno, total_seqno); + checksum = crc32(0, databuffer, datalen); + printf(TESTOSLOG("printf_only") "mat%llu\n", checksum, uniqid, seqno, total_seqno, a); + + seqno += 1; + datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("printf_only"), uniqid, seqno, total_seqno); + checksum = crc32(0, databuffer, datalen); + printf(TESTOSLOG("printf_only") "mat%llu\n", checksum, uniqid, seqno, total_seqno, a); + + datalen = snprintf(databuffer, sizeof(databuffer), "kernel^0^test^printf_only#mat%llu", a); + match_count = find_pattern_in_buffer(databuffer, datalen, total_seqno); + T_EXPECT_EQ_UINT(match_count, 2, "verify printf_only goes to systemlog buffer"); + + uint32_t logging_config = atm_get_diagnostic_config(); + T_LOG("checking 
atm_diagnostic_config 0x%X", logging_config); + + if ((logging_config & ATM_TRACE_OFF) || (logging_config & ATM_TRACE_DISABLE)) + { + T_LOG("ATM_TRACE_OFF / ATM_TRACE_DISABLE is set. Would not see oslog messages. skipping the rest of test."); + return KERN_SUCCESS; + } + + /* for enabled logging printfs should be saved in oslog as well */ + T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 2, "atleast 2 msgs should be seen by oslog system"); + + a = mach_absolute_time(); + total_seqno = 1; + seqno = 1; + total_msg = oslog_p_total_msgcount; + saved_msg = oslog_p_saved_msgcount; + dropped_msg = oslog_p_dropped_msgcount; + datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("oslog_info"), uniqid, seqno, total_seqno); + checksum = crc32(0, databuffer, datalen); + os_log_info(log_handle, TESTOSLOG("oslog_info") "mat%llu", checksum, uniqid, seqno, total_seqno, a); + T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 1, "total message count in buffer"); + + datalen = snprintf(databuffer, sizeof(databuffer), "kernel^0^test^oslog_info#mat%llu", a); + match_count = find_pattern_in_buffer(databuffer, datalen, total_seqno); + T_EXPECT_EQ_UINT(match_count, 1, "verify oslog_info does not go to systemlog buffer"); + + total_msg = oslog_p_total_msgcount; + test_oslog_info_helper(uniqid, 10); + T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_info_helper: Should have seen 10 msgs"); + + total_msg = oslog_p_total_msgcount; + test_oslog_debug_helper(uniqid, 10); + T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_debug_helper:Should have seen 10 msgs"); + + total_msg = oslog_p_total_msgcount; + test_oslog_error_helper(uniqid, 10); + T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_error_helper:Should have seen 10 msgs"); + + total_msg = oslog_p_total_msgcount; + test_oslog_default_helper(uniqid, 10); + T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_default_helper:Should have seen 10 
msgs"); + + total_msg = oslog_p_total_msgcount; + test_oslog_fault_helper(uniqid, 10); + T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_fault_helper:Should have seen 10 msgs"); + + T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount, + oslog_p_dropped_msgcount); + + return KERN_SUCCESS; +} + +static uint32_t _test_log_loop_count = 0; +void _test_log_loop(void * arg __unused, wait_result_t wres __unused) +{ + uint32_t uniqid = RandomULong(); + test_oslog_debug_helper(uniqid, 100); + (void)hw_atomic_add(&_test_log_loop_count, 100); +} + +kern_return_t test_os_log_parallel(void) +{ + thread_t thread[2]; + kern_return_t kr; + uint32_t uniqid = RandomULong(); + + printf("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount, + oslog_p_dropped_msgcount); + + kr = kernel_thread_start(_test_log_loop, NULL, &thread[0]); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully"); + + kr = kernel_thread_start(_test_log_loop, NULL, &thread[1]); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully"); + + test_oslog_info_helper(uniqid, 100); + + /* wait until other thread has also finished */ + while (_test_log_loop_count < 200) + { + delay(1000); + } + + thread_deallocate(thread[0]); + thread_deallocate(thread[1]); + + T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount, + oslog_p_dropped_msgcount); + T_PASS("parallel_logging tests is now complete"); + + return KERN_SUCCESS; +} + +void test_oslog_handleOSLogCtl(int32_t * in, int32_t * out, int32_t len) +{ + if (!in || !out || len != 4) + return; + switch (in[0]) { + case 1: + { + /* send out counters */ + out[1] = oslog_p_total_msgcount; + out[2] = oslog_p_saved_msgcount; + out[3] = oslog_p_dropped_msgcount; + out[0] = KERN_SUCCESS; + break; + } + case 2: + { + /* mini stress run */ + out[0] 
= test_os_log_parallel(); + break; + } + case 3: + { + /* drop msg tests */ + out[1] = RandomULong(); + out[0] = test_stresslog_dropmsg(out[1]); + break; + } + case 4: + { + /* invoke log helpers */ + uint32_t uniqid = in[3]; + int32_t msgcount = in[2]; + if (uniqid == 0 || msgcount == 0) + { + out[0] = KERN_INVALID_VALUE; + return; + } + + switch (in[1]) { + case OS_LOG_TYPE_INFO: test_oslog_info_helper(uniqid, msgcount); break; + case OS_LOG_TYPE_DEBUG: test_oslog_debug_helper(uniqid, msgcount); break; + case OS_LOG_TYPE_ERROR: test_oslog_error_helper(uniqid, msgcount); break; + case OS_LOG_TYPE_FAULT: test_oslog_fault_helper(uniqid, msgcount); break; + case OS_LOG_TYPE_DEFAULT: + default: test_oslog_default_helper(uniqid, msgcount); break; + } + out[0] = KERN_SUCCESS; + break; + /* end of case 4 */ + } + default: + { + out[0] = KERN_INVALID_VALUE; + break; + } + } + return; +} + +kern_return_t test_stresslog_dropmsg(uint32_t uniqid) +{ + uint32_t total, saved, dropped; + total = oslog_p_total_msgcount; + saved = oslog_p_saved_msgcount; + dropped = oslog_p_dropped_msgcount; + uniqid = RandomULong(); + test_oslog_debug_helper(uniqid, 100); + while ((oslog_p_dropped_msgcount - dropped) == 0) + { + test_oslog_debug_helper(uniqid, 100); + } + printf("test_stresslog_dropmsg: logged %u msgs, saved %u and caused a drop of %u msgs. 
\n", oslog_p_total_msgcount - total, + oslog_p_saved_msgcount - saved, oslog_p_dropped_msgcount - dropped); + return KERN_SUCCESS; +} + +#endif diff --git a/libkern/os/log_encode.h b/libkern/os/log_encode.h index 4f8afae5c..d214bab21 100644 --- a/libkern/os/log_encode.h +++ b/libkern/os/log_encode.h @@ -27,6 +27,11 @@ #include "log_encode_types.h" #include +#if __has_feature(ptrauth_calls) +#include +#include +#endif /* __has_feature(ptrauth_calls) */ + #ifdef KERNEL #define isdigit(ch) (((ch) >= '0') && ((ch) <= '9')) extern boolean_t doprnt_hide_pointers; @@ -156,13 +161,21 @@ _os_log_encode_arg(void *arg, uint16_t arg_len, os_log_value_type_t ctype, bool unsigned long long value = 0; memcpy(&value, arg, arg_len); +#if __has_feature(ptrauth_calls) + /** + * Strip out the pointer authentication code before + * checking whether the pointer is a kernel address. + */ + value = (unsigned long long)VM_KERNEL_STRIP_PTR(value); +#endif /* __has_feature(ptrauth_calls) */ + if (value >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && value <= VM_MAX_KERNEL_ADDRESS) { is_private = true; bzero(arg, arg_len); } } #endif - + content->type = ctype; content->flags = (is_private ? 
OS_LOG_CONTENT_FLAG_PRIVATE : 0); diff --git a/libkern/os/reason_private.h b/libkern/os/reason_private.h index 477bceeed..56a68f1f1 100644 --- a/libkern/os/reason_private.h +++ b/libkern/os/reason_private.h @@ -37,6 +37,7 @@ OS_ENUM(os_reason_libsystem_code, uint64_t, OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK = 1, OS_REASON_LIBSYSTEM_CODE_FAULT = 2, /* generated by os_log_fault */ + OS_REASON_LIBSYSTEM_CODE_SECINIT_INITIALIZER = 3, ); #ifndef KERNEL diff --git a/libkern/os/refcnt.c b/libkern/os/refcnt.c new file mode 100644 index 000000000..539659869 --- /dev/null +++ b/libkern/os/refcnt.c @@ -0,0 +1,298 @@ +#include +#include +#include +#include +#include +#include +#include "refcnt.h" + +#define OS_REFCNT_MAX_COUNT ((os_ref_count_t)0x0FFFFFFFUL) + +#if OS_REFCNT_DEBUG +os_refgrp_decl(static, global_ref_group, "all", NULL); +static bool ref_debug_enable = false; +static const size_t ref_log_nrecords = 1000000; + +#define REFLOG_BTDEPTH 10 +#define REFLOG_RETAIN 1 +#define REFLOG_RELEASE 2 + +#define __debug_only +#else +# define __debug_only __unused +#endif /* OS_REFCNT_DEBUG */ + +static const char * +ref_grp_name(struct os_refcnt __debug_only *rc) +{ +#if OS_REFCNT_DEBUG + if (rc && rc->ref_group && rc->ref_group->grp_name) { + return rc->ref_group->grp_name; + } +#endif + return ""; +} + +static void +os_ref_check_underflow(struct os_refcnt *rc, os_ref_count_t count) +{ + if (__improbable(count == 0)) { + panic("os_refcnt: underflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc)); + __builtin_unreachable(); + } +} + +static void +os_ref_assert_referenced(struct os_refcnt *rc, os_ref_count_t count) +{ + if (__improbable(count == 0)) { + panic("os_refcnt: used unsafely when zero (rc=%p, grp=%s)\n", rc, ref_grp_name(rc)); + __builtin_unreachable(); + } +} + +static void +os_ref_check_overflow(struct os_refcnt *rc, os_ref_count_t count) +{ + if (__improbable(count >= OS_REFCNT_MAX_COUNT)) { + panic("os_refcnt: overflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc)); + 
__builtin_unreachable(); + } +} + +static void +os_ref_check_retain(struct os_refcnt *rc, os_ref_count_t count) +{ + os_ref_assert_referenced(rc, count); + os_ref_check_overflow(rc, count); +} + +#if OS_REFCNT_DEBUG +static void +ref_log_op(struct os_refgrp *grp, void *elem, int op) +{ + if (!ref_debug_enable || grp == NULL) { + return; + } + + if (grp->grp_log == NULL) { + ref_log_op(grp->grp_parent, elem, op); + return; + } + + uintptr_t bt[REFLOG_BTDEPTH]; + uint32_t nframes = backtrace(bt, REFLOG_BTDEPTH); + btlog_add_entry((btlog_t *)grp->grp_log, elem, op, (void **)bt, nframes); +} + +static void +ref_log_drop(struct os_refgrp *grp, void *elem) +{ + if (!ref_debug_enable || grp == NULL) { + return; + } + + if (grp->grp_log == NULL) { + ref_log_drop(grp->grp_parent, elem); + return; + } + + btlog_remove_entries_for_element(grp->grp_log, elem); +} + +static void +ref_log_init(struct os_refgrp *grp) +{ + if (grp->grp_log != NULL) { + return; + } + + char grpbuf[128]; + char *refgrp = grpbuf; + if (!PE_parse_boot_argn("rlog", refgrp, sizeof(grpbuf))) { + return; + } + + const char *g; + while ((g = strsep(&refgrp, ",")) != NULL) { + if (strcmp(g, grp->grp_name) == 0) { + /* enable logging on this refgrp */ + grp->grp_log = btlog_create(ref_log_nrecords, REFLOG_BTDEPTH, true); + assert(grp->grp_log); + ref_debug_enable = true; + return; + } + } + +} + +/* + * attach a new refcnt to a group + */ +static void +ref_attach_to_group(struct os_refcnt *rc, struct os_refgrp *grp, os_ref_count_t init_count) +{ + if (grp == NULL) { + return; + } + + if (atomic_fetch_add_explicit(&grp->grp_children, 1, memory_order_relaxed) == 0) { + /* First reference count object in this group. Check if we should enable + * refcount logging. 
*/ + ref_log_init(grp); + } + + atomic_fetch_add_explicit(&grp->grp_count, init_count, memory_order_relaxed); + atomic_fetch_add_explicit(&grp->grp_retain_total, init_count, memory_order_relaxed); + + if (grp == &global_ref_group) { + return; + } + + if (grp->grp_parent == NULL) { + grp->grp_parent = &global_ref_group; + } + + ref_attach_to_group(rc, grp->grp_parent, init_count); +} + +static inline void +ref_retain_group(struct os_refgrp *grp) +{ + if (grp) { + atomic_fetch_add_explicit(&grp->grp_count, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&grp->grp_retain_total, 1, memory_order_relaxed); + ref_retain_group(grp->grp_parent); + } +} + +static inline void +ref_release_group(struct os_refgrp *grp, bool final) +{ + if (grp) { + atomic_fetch_sub_explicit(&grp->grp_count, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&grp->grp_release_total, 1, memory_order_relaxed); + if (final) { + atomic_fetch_sub_explicit(&grp->grp_children, 1, memory_order_relaxed); + } + + ref_release_group(grp->grp_parent, final); + } +} +#endif + +#undef os_ref_init_count +void +os_ref_init_count(struct os_refcnt *rc, struct os_refgrp __debug_only *grp, os_ref_count_t count) +{ + atomic_init(&rc->ref_count, count); + +#if OS_REFCNT_DEBUG + assert(count > 0); + if (grp) { + rc->ref_group = grp; + } else { + rc->ref_group = &global_ref_group; + } + + ref_attach_to_group(rc, rc->ref_group, count); + + for (os_ref_count_t i = 0; i < count; i++) { + ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); + } +#endif +} + +void +os_ref_retain(struct os_refcnt *rc) +{ + os_ref_count_t old = atomic_fetch_add_explicit(&rc->ref_count, 1, memory_order_relaxed); + os_ref_check_retain(rc, old); + +#if OS_REFCNT_DEBUG + ref_retain_group(rc->ref_group); + ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); +#endif +} + +bool +os_ref_retain_try(struct os_refcnt *rc) +{ + os_ref_count_t cur = os_ref_get_count(rc); + + while (1) { + if (__improbable(cur == 0)) { + return false; + } + + 
os_ref_check_retain(rc, cur); + + if (atomic_compare_exchange_weak_explicit(&rc->ref_count, &cur, cur+1, + memory_order_relaxed, memory_order_relaxed)) { +#if OS_REFCNT_DEBUG + ref_retain_group(rc->ref_group); + ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); +#endif + return true; + } + } +} + +os_ref_count_t +os_ref_release_explicit(struct os_refcnt *rc, memory_order release_order, memory_order dealloc_order) +{ +#if OS_REFCNT_DEBUG + /* + * Care not to use 'rc' after the decrement because it might be deallocated + * under us. + */ + struct os_refgrp *grp = rc->ref_group; + ref_log_op(grp, (void *)rc, REFLOG_RELEASE); +#endif + + os_ref_count_t val = atomic_fetch_sub_explicit(&rc->ref_count, 1, release_order); + os_ref_check_underflow(rc, val); + if (__improbable(--val == 0)) { + atomic_load_explicit(&rc->ref_count, dealloc_order); +#if OS_REFCNT_DEBUG + ref_log_drop(grp, (void *)rc); /* rc is only used as an identifier */ +#endif + } + +#if OS_REFCNT_DEBUG + ref_release_group(grp, !val); +#endif + + return val; +} + +void +os_ref_retain_locked(struct os_refcnt *rc) +{ + os_ref_count_t val = rc->ref_count; + os_ref_check_retain(rc, val); + rc->ref_count = ++val; + +#if OS_REFCNT_DEBUG + ref_retain_group(rc->ref_group); + ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN); +#endif +} + +os_ref_count_t +os_ref_release_locked(struct os_refcnt *rc) +{ + os_ref_count_t val = rc->ref_count; + os_ref_check_underflow(rc, val); + rc->ref_count = --val; + +#if OS_REFCNT_DEBUG + ref_release_group(rc->ref_group, !val); + ref_log_op(rc->ref_group, (void *)rc, REFLOG_RELEASE); + if (val == 0) { + ref_log_drop(rc->ref_group, (void *)rc); + } +#endif + return val; +} + diff --git a/libkern/os/refcnt.h b/libkern/os/refcnt.h new file mode 100644 index 000000000..6148059ee --- /dev/null +++ b/libkern/os/refcnt.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _OS_REFCNT_H_ +#define _OS_REFCNT_H_ + +/* + * os_refcnt reference counting API + * + * Two flavors are provided: atomic and locked. Atomic internally uses C11 atomic + * operations and requires no external synchronization, whereas the locked flavor + * assumes the refcnt object is locked by the caller. It is NOT safe to + * mix-and-match locked and atomic calls. 
+ */ + +#include +#include +#include + +struct os_refcnt; +struct os_refgrp; +typedef struct os_refcnt os_refcnt_t; + +/* type of the internal counter */ +typedef uint32_t os_ref_count_t; + +#if DEVELOPMENT || DEBUG +# define OS_REFCNT_DEBUG 1 +#else +# define OS_REFCNT_DEBUG 0 +#endif + +/* + * Debugging is keyed off ref_group, so leave that field for kexts so that the + * combination of dev/debug kernel and release kext works. + */ +#if XNU_KERNEL_PRIVATE +# define OS_REFCNT_HAS_GROUP OS_REFCNT_DEBUG +#else +# define OS_REFCNT_HAS_GROUP 1 +#endif + +struct os_refcnt { + _Atomic os_ref_count_t ref_count; +#if OS_REFCNT_HAS_GROUP + struct os_refgrp *ref_group; +#endif +}; + +#if OS_REFCNT_DEBUG +struct os_refgrp { + const char *const grp_name; + _Atomic os_ref_count_t grp_children; /* number of refcount objects in group */ + _Atomic os_ref_count_t grp_count; /* current reference count of group */ + _Atomic uint64_t grp_retain_total; + _Atomic uint64_t grp_release_total; + struct os_refgrp *grp_parent; + void *grp_log; /* refcount logging context */ +}; +#endif + +#if __has_attribute(diagnose_if) +# define os_error_if(cond, msg) __attribute__((diagnose_if((cond), (msg), "error"))) +#else +# define os_error_if(...) 
+#endif + +__BEGIN_DECLS + +/* + * os_ref_init: initialize an os_refcnt with a count of 1 + * os_ref_init_count: initialize an os_refcnt with a specific count >= 1 + */ +#define os_ref_init(rc, grp) os_ref_init_count((rc), (grp), 1) +void os_ref_init_count(struct os_refcnt *, struct os_refgrp *, os_ref_count_t count) + os_error_if(count == 0, "Reference count must be non-zero initialized"); + +#if OS_REFCNT_DEBUG +# define os_refgrp_decl(qual, var, name, parent) \ + qual struct os_refgrp __attribute__((section("__DATA,__refgrps"))) var = { \ + .grp_name = (name), \ + .grp_children = ATOMIC_VAR_INIT(0), \ + .grp_count = ATOMIC_VAR_INIT(0), \ + .grp_retain_total = ATOMIC_VAR_INIT(0), \ + .grp_release_total = ATOMIC_VAR_INIT(0), \ + .grp_parent = (parent), \ + .grp_log = NULL, \ + } + +/* Create a default group based on the init() callsite if no explicit group + * is provided. */ +# define os_ref_init_count(rc, grp, count) ({ \ + os_refgrp_decl(static, __grp, __func__, NULL); \ + (os_ref_init_count)((rc), (grp) ? (grp) : &__grp, (count)); \ + }) +#else +# define os_refgrp_decl(...) +# define os_ref_init_count(rc, grp, count) (os_ref_init_count)((rc), NULL, (count)) +#endif /* OS_REFCNT_DEBUG */ + +/* + * os_ref_retain: acquire a reference (increment reference count by 1) atomically. + * + * os_ref_release: release a reference (decrement reference count) atomically and + * return the new count. Memory is synchronized such that the dealloc block + * (i.e. code handling the final release() == 0 call) sees up-to-date memory + * with respect to all prior release()s on the same refcnt object. This + * memory ordering is sufficient for most use cases. + * + * os_ref_release_relaxed: same as release() but with weaker relaxed memory ordering. + * This can be used when the dealloc block is already synchronized with other + * accesses to the object (for example, with a lock). + * + * os_ref_release_live: release a reference that is guaranteed not to be the last one. 
+ */ +void os_ref_retain(struct os_refcnt *); + +os_ref_count_t os_ref_release_explicit(struct os_refcnt *rc, + memory_order release_order, memory_order dealloc_order) OS_WARN_RESULT; + +static inline os_ref_count_t OS_WARN_RESULT +os_ref_release(struct os_refcnt *rc) +{ + return os_ref_release_explicit(rc, memory_order_release, memory_order_acquire); +} + +static inline os_ref_count_t OS_WARN_RESULT +os_ref_release_relaxed(struct os_refcnt *rc) +{ + return os_ref_release_explicit(rc, memory_order_relaxed, memory_order_relaxed); +} + +static inline void +os_ref_release_live(struct os_refcnt *rc) +{ + if (__improbable(os_ref_release_explicit(rc, + memory_order_release, memory_order_relaxed) == 0)) { + panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc); + __builtin_unreachable(); + } +} + + +/* + * os_ref_retain_try: a variant of atomic retain that fails for objects with a + * zero reference count. The caller must therefore ensure that the object + * remains alive for any possible retain_try() caller, usually by using a + * lock protecting both the retain and dealloc paths. This variant is useful + * for objects stored in a collection, because no lock is required on the + * release() side until the object is deallocated. + */ +bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT; + + +/* + * os_ref_retain_locked: acquire a reference on an object protected by a held + * lock. The caller must ensure mutual exclusivity of retain_locked() and + * release_locked() calls on the same object. + * + * os_ref_release_locked: release a reference on an object protected by a held + * lock. + */ +void os_ref_retain_locked(struct os_refcnt *); +os_ref_count_t os_ref_release_locked(struct os_refcnt *) OS_WARN_RESULT; + + +/* + * os_ref_get_count: return the current reference count. This is unsafe for + * synchronization. 
+ */ +static inline os_ref_count_t +os_ref_get_count(struct os_refcnt *rc) +{ + return atomic_load_explicit(&rc->ref_count, memory_order_relaxed); +} + +__END_DECLS + +#endif diff --git a/libkern/os/trace_internal.h b/libkern/os/trace_internal.h index 1ee1c984e..dc40b5fa0 100644 --- a/libkern/os/trace_internal.h +++ b/libkern/os/trace_internal.h @@ -26,6 +26,7 @@ #include #include +#include #include __BEGIN_DECLS @@ -34,6 +35,7 @@ OS_ALWAYS_INLINE inline uint32_t _os_trace_offset(const void *dso, const void *addr, _firehose_tracepoint_flags_activity_t flags __unused) { + assert((uintptr_t)addr >= (uintptr_t)dso); return (uint32_t) ((uintptr_t)addr - (uintptr_t)dso); } diff --git a/libkern/uuid/uuid.c b/libkern/uuid/uuid.c index ce69ad766..eec3f4943 100644 --- a/libkern/uuid/uuid.c +++ b/libkern/uuid/uuid.c @@ -82,13 +82,25 @@ uuid_copy(uuid_t dst, const uuid_t src) memcpy(dst, src, sizeof(uuid_t)); } +static void +uuid_random_setflags(uuid_t out) +{ + out[6] = (out[6] & 0x0F) | 0x40; + out[8] = (out[8] & 0x3F) | 0x80; +} + void uuid_generate_random(uuid_t out) { read_random(out, sizeof(uuid_t)); + uuid_random_setflags(out); +} - out[6] = (out[6] & 0x0F) | 0x40; - out[8] = (out[8] & 0x3F) | 0x80; +void +uuid_generate_early_random(uuid_t out) +{ + read_frandom(out, sizeof(uuid_t)); + uuid_random_setflags(out); } void diff --git a/libkern/zlib/crc32.c b/libkern/zlib/z_crc32.c similarity index 100% rename from libkern/zlib/crc32.c rename to libkern/zlib/z_crc32.c diff --git a/libsa/bootstrap.cpp b/libsa/bootstrap.cpp index a73df0946..fecfc43db 100644 --- a/libsa/bootstrap.cpp +++ b/libsa/bootstrap.cpp @@ -29,10 +29,6 @@ extern "C" { #include #include #include - -#if CONFIG_EMBEDDED -extern uuid_t kernelcache_uuid; -#endif } #include @@ -104,6 +100,7 @@ static const char * sKernelComponentNames[] = { "com.apple.kpi.bsd", "com.apple.kpi.dsep", "com.apple.kpi.iokit", + "com.apple.kpi.kasan", "com.apple.kpi.libkern", "com.apple.kpi.mach", "com.apple.kpi.private", @@ -246,9 
+243,7 @@ KLDBootstrap::readPrelinkedExtensions( OSDictionary * prelinkInfoDict = NULL; // do not release OSString * errorString = NULL; // must release OSKext * theKernel = NULL; // must release -#if CONFIG_EMBEDDED OSData * kernelcacheUUID = NULL; // do not release -#endif kernel_segment_command_t * prelinkTextSegment = NULL; // see code kernel_segment_command_t * prelinkInfoSegment = NULL; // see code @@ -374,19 +369,19 @@ KLDBootstrap::readPrelinkedExtensions( ramDiskBoot = IORamDiskBSDRoot(); #endif /* NO_KEXTD */ -#if CONFIG_EMBEDDED /* Copy in the kernelcache UUID */ kernelcacheUUID = OSDynamicCast(OSData, prelinkInfoDict->getObject(kPrelinkInfoKCIDKey)); - if (!kernelcacheUUID) { - bzero(&kernelcache_uuid, sizeof(kernelcache_uuid)); - } else if (kernelcacheUUID->getLength() != sizeof(kernelcache_uuid)) { - panic("kernelcacheUUID length is %d, expected %lu", kernelcacheUUID->getLength(), - sizeof(kernelcache_uuid)); - } else { - memcpy((void *)&kernelcache_uuid, (const void *)kernelcacheUUID->getBytesNoCopy(), kernelcacheUUID->getLength()); + if (kernelcacheUUID) { + if (kernelcacheUUID->getLength() != sizeof(kernelcache_uuid)) { + panic("kernelcacheUUID length is %d, expected %lu", kernelcacheUUID->getLength(), + sizeof(kernelcache_uuid)); + } else { + kernelcache_uuid_valid = TRUE; + memcpy((void *)&kernelcache_uuid, (const void *)kernelcacheUUID->getBytesNoCopy(), kernelcacheUUID->getLength()); + uuid_unparse_upper(kernelcache_uuid, kernelcache_uuid_string); + } } -#endif /* CONFIG_EMBEDDED */ infoDictArray = OSDynamicCast(OSArray, prelinkInfoDict->getObject(kPrelinkInfoDictionaryKey)); @@ -454,7 +449,7 @@ KLDBootstrap::readPrelinkedExtensions( infoDict->getObject(kPrelinkExecutableSizeKey)); if (addressNum && lengthNum) { #if __arm__ || __arm64__ - vm_offset_t data = (vm_offset_t) ((addressNum->unsigned64BitValue()) + vm_kernel_slide); + vm_offset_t data = ml_static_slide(addressNum->unsigned64BitValue()); vm_size_t length = (vm_size_t) 
(lengthNum->unsigned32BitValue()); ml_static_mfree(data, length); #else @@ -493,7 +488,7 @@ KLDBootstrap::readPrelinkedExtensions( slideAddrSegIndex = __whereIsAddr( (vm_offset_t)slideAddr, &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS ); if (slideAddrSegIndex >= 0) { - addrToSlideSegIndex = __whereIsAddr( (vm_offset_t)(*slideAddr + vm_kernel_slide), &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS ); + addrToSlideSegIndex = __whereIsAddr(ml_static_slide((vm_offset_t)(*slideAddr)), &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS ); if (addrToSlideSegIndex < 0) { badSlideTarget++; continue; @@ -505,7 +500,7 @@ KLDBootstrap::readPrelinkedExtensions( } slidKextAddrCount++; - *(slideAddr) += vm_kernel_slide; + *slideAddr = ml_static_slide(*slideAddr); } // for ... /* All kexts are now slid, set VM protections for them */ diff --git a/libsyscall/Libsyscall.xcconfig b/libsyscall/Libsyscall.xcconfig index 105571937..737c41068 100644 --- a/libsyscall/Libsyscall.xcconfig +++ b/libsyscall/Libsyscall.xcconfig @@ -28,7 +28,7 @@ GCC_TREAT_WARNINGS_AS_ERRORS = YES GCC_WARN_ABOUT_MISSING_NEWLINE = YES CODE_SIGN_IDENTITY = - DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion) -DYLIB_LDFLAGS = -umbrella System -all_load -lCrashReporterClient +DYLIB_LDFLAGS = -umbrella System -all_load DYLIB_LDFLAGS[sdk=iphoneos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=watchos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 DYLIB_LDFLAGS[sdk=tvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000 diff --git a/libsyscall/Libsyscall.xcodeproj/project.pbxproj b/libsyscall/Libsyscall.xcodeproj/project.pbxproj index b489c7ed0..e138202c1 100644 --- a/libsyscall/Libsyscall.xcodeproj/project.pbxproj +++ b/libsyscall/Libsyscall.xcodeproj/project.pbxproj @@ -119,6 +119,9 @@ 40DF0F741E5CD7BB0035A864 /* cpu_copy_in_cksum_gen.c in Sources */ = {isa = PBXBuildFile; fileRef = 40DF0F731E5CD7B30035A864 /* cpu_copy_in_cksum_gen.c */; settings = {COMPILER_FLAGS = "-fno-builtin"; }; }; 
435F3CAA1B06B7BA005ED9EF /* work_interval.c in Sources */ = {isa = PBXBuildFile; fileRef = 435F3CA91B06B7BA005ED9EF /* work_interval.c */; }; 467DAFD4157E8AF200CE68F0 /* guarded_open_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */; }; + 4BCDD8AF20741A5B00FA37A3 /* mach_right.h in Headers */ = {isa = PBXBuildFile; fileRef = 4BCDD8AE20741A4700FA37A3 /* mach_right.h */; }; + 4BCDD8B020741BC400FA37A3 /* mach_right.h in Headers */ = {isa = PBXBuildFile; fileRef = 4BCDD8AE20741A4700FA37A3 /* mach_right.h */; }; + 4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BCDD8B120741C2F00FA37A3 /* mach_right.c */; }; 4BDD5F1D1891AB2F004BF300 /* mach_approximate_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */; }; 4BDD5F1E1891AB2F004BF300 /* mach_approximate_time.s in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */; }; 726D915520ACD7FC0039A2FE /* mach_bridge_remote_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */; }; @@ -138,6 +141,8 @@ 929FD46F1C5711DB0087B9C8 /* mach_timebase_info.c in Sources */ = {isa = PBXBuildFile; fileRef = 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */; }; 978228281B8678DC008385AC /* pselect-darwinext.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228271B8678CB008385AC /* pselect-darwinext.c */; }; 978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */; }; + 9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; }; + 9C6DA3D320A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; }; 9CCF28271E68E993002EE6CD /* 
pid_shutdown_networking.c in Sources */ = {isa = PBXBuildFile; fileRef = 9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */; }; A50845861DDA69AC0041C0E0 /* thread_self_restrict.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */; }; A50845871DDA69C90041C0E0 /* thread_self_restrict.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */; }; @@ -184,6 +189,8 @@ C6D3EFC816542C510052CF30 /* exc_catcher.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A091611F8E7A800E4693F /* exc_catcher.h */; }; C6D3EFC916542C510052CF30 /* _libkernel_init.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A08B211F8B05900E4693F /* _libkernel_init.h */; settings = {ATTRIBUTES = (Private, ); }; }; C6D3F03016542C980052CF30 /* dummy.c in Sources */ = {isa = PBXBuildFile; fileRef = C6D3F02F16542C980052CF30 /* dummy.c */; }; + C9001753206B00AC0070D674 /* port_descriptions.c in Sources */ = {isa = PBXBuildFile; fileRef = C9001751206B00850070D674 /* port_descriptions.c */; }; + C9001754206B00D00070D674 /* port_descriptions.h in Headers */ = {isa = PBXBuildFile; fileRef = C9001752206B008B0070D674 /* port_descriptions.h */; }; C962B16C18DBA2C80031244A /* setpriority.c in Sources */ = {isa = PBXBuildFile; fileRef = C962B16B18DBA2C80031244A /* setpriority.c */; }; C962B16E18DBB43F0031244A /* thread_act.c in Sources */ = {isa = PBXBuildFile; fileRef = C962B16D18DBB43F0031244A /* thread_act.c */; }; C99A4F501305B2BD0054B7B7 /* __get_cpu_capabilities.s in Sources */ = {isa = PBXBuildFile; fileRef = C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */; }; @@ -506,6 +513,8 @@ 40DF0F731E5CD7B30035A864 /* cpu_copy_in_cksum_gen.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = cpu_copy_in_cksum_gen.c; path = skywalk/cpu_copy_in_cksum_gen.c; sourceTree = ""; }; 435F3CA91B06B7BA005ED9EF /* work_interval.c */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.c.c; path = work_interval.c; sourceTree = ""; }; 467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = guarded_open_np.c; sourceTree = ""; }; + 4BCDD8AE20741A4700FA37A3 /* mach_right.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = mach_right.h; sourceTree = ""; }; + 4BCDD8B120741C2F00FA37A3 /* mach_right.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = mach_right.c; sourceTree = ""; }; 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_approximate_time.c; sourceTree = ""; }; 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mach_approximate_time.s; sourceTree = ""; }; 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = mach_bridge_remote_time.c; path = wrappers/mach_bridge_remote_time.c; sourceTree = ""; }; @@ -521,6 +530,7 @@ 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_timebase_info.c; sourceTree = ""; }; 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext-cancel.c"; sourceTree = ""; }; 978228271B8678CB008385AC /* pselect-darwinext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext.c"; sourceTree = ""; }; + 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach_sync_ipc.h; sourceTree = ""; }; 9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */ = {isa = PBXFileReference; fileEncoding 
= 4; lastKnownFileType = sourcecode.c.c; path = pid_shutdown_networking.c; sourceTree = ""; }; A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_self_restrict.h; sourceTree = ""; }; A59CB95516669DB700B064B3 /* stack_logging_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = stack_logging_internal.h; sourceTree = ""; }; @@ -539,6 +549,8 @@ C6C40121174154D9000AE69F /* gethostuuid_private.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = gethostuuid_private.h; sourceTree = ""; }; C6D3F02E16542C510052CF30 /* libsystem_Libsyscall_headers_Sim.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsystem_Libsyscall_headers_Sim.a; sourceTree = BUILT_PRODUCTS_DIR; }; C6D3F02F16542C980052CF30 /* dummy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dummy.c; sourceTree = ""; }; + C9001751206B00850070D674 /* port_descriptions.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = port_descriptions.c; sourceTree = ""; }; + C9001752206B008B0070D674 /* port_descriptions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = port_descriptions.h; sourceTree = ""; }; C93B50491C487698009DD6AB /* __kdebug_trace_string.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __kdebug_trace_string.s; sourceTree = ""; }; C962B16B18DBA2C80031244A /* setpriority.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = setpriority.c; sourceTree = ""; }; C962B16D18DBB43F0031244A /* thread_act.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = thread_act.c; sourceTree = ""; }; @@ -672,6 +684,7 @@ 08FB7795FE84155DC02AAC07 /* mach */ = { isa = PBXGroup; 
children = ( + C9001751206B00850070D674 /* port_descriptions.c */, 247A08FF11F8E18000E4693F /* abort.h */, C9D9BCC5114B00600000D8B9 /* clock_priv.defs */, C9D9BCC6114B00600000D8B9 /* clock_reply.defs */, @@ -706,6 +719,7 @@ C9D9BCF1114B00600000D8B9 /* mach_msg.c */, 291D3C261354FDD100D46061 /* mach_port.c */, C9D9BCF2114B00600000D8B9 /* mach_port.defs */, + 4BCDD8B120741C2F00FA37A3 /* mach_right.c */, C9D9BCF3114B00600000D8B9 /* mach_traps.s */, 291D3C271354FDD100D46061 /* mach_vm.c */, E4216C301822D404006F2632 /* mach_voucher.defs */, @@ -1001,11 +1015,14 @@ C9D9BCD8114B00600000D8B9 /* mach */ = { isa = PBXGroup; children = ( + C9001752206B008B0070D674 /* port_descriptions.h */, C9D9BCD9114B00600000D8B9 /* errorlib.h */, C9D9BCDA114B00600000D8B9 /* mach.h */, C9D9BCDB114B00600000D8B9 /* mach_error.h */, C9D9BCDC114B00600000D8B9 /* mach_init.h */, C9D9BCDD114B00600000D8B9 /* mach_interface.h */, + 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */, + 4BCDD8AE20741A4700FA37A3 /* mach_right.h */, C9D9BCDF114B00600000D8B9 /* port_obj.h */, C9D9BCE0114B00600000D8B9 /* sync.h */, 928336A21B8412C100873B90 /* thread_state.h */, @@ -1092,9 +1109,11 @@ C6D3EFC616542C510052CF30 /* SYS.h in Headers */, C6D3EFC716542C510052CF30 /* abort.h in Headers */, C6D3EFC816542C510052CF30 /* exc_catcher.h in Headers */, + 4BCDD8B020741BC400FA37A3 /* mach_right.h in Headers */, C6D3EFC916542C510052CF30 /* _libkernel_init.h in Headers */, E453AF3A17013F4C00F2C94C /* stack_logging_internal.h in Headers */, E453AF3817013F1400F2C94C /* spawn.h in Headers */, + 9C6DA3D320A3D09F0090330B /* mach_sync_ipc.h in Headers */, E453AF3917013F1B00F2C94C /* spawn_private.h in Headers */, E453AF3617013CBF00F2C94C /* libproc.h in Headers */, E453AF3717013CC200F2C94C /* libproc_internal.h in Headers */, @@ -1113,6 +1132,7 @@ C9D9BD26114B00600000D8B9 /* mach.h in Headers */, C9D9BD27114B00600000D8B9 /* mach_error.h in Headers */, C9D9BD28114B00600000D8B9 /* mach_init.h in Headers */, + C9001754206B00D00070D674 
/* port_descriptions.h in Headers */, 9299E14A1B841E74005B7350 /* thread_state.h in Headers */, C6C40122174155E3000AE69F /* gethostuuid_private.h in Headers */, C9D9BD29114B00600000D8B9 /* mach_interface.h in Headers */, @@ -1127,9 +1147,11 @@ 24D1158311E671B20063D54D /* SYS.h in Headers */, 247A090011F8E18000E4693F /* abort.h in Headers */, 247A091711F8E7A800E4693F /* exc_catcher.h in Headers */, + 4BCDD8AF20741A5B00FA37A3 /* mach_right.h in Headers */, 24B028F511FF5C3500CA64A9 /* _libkernel_init.h in Headers */, A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */, E4D45C3F16FB20D30002AF25 /* spawn.h in Headers */, + 9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */, E4D45C4016FB20DC0002AF25 /* spawn_private.h in Headers */, E4D45C2F16F868ED0002AF25 /* libproc.h in Headers */, E4D45C3016F868ED0002AF25 /* libproc_internal.h in Headers */, @@ -1205,7 +1227,7 @@ 08FB7793FE84155DC02AAC07 /* Project object */ = { isa = PBXProject; attributes = { - LastUpgradeCheck = 0500; + LastUpgradeCheck = 1000; }; buildConfigurationList = 1DEB914E08733D8E0010E9CD /* Build configuration list for PBXProject "Libsyscall" */; compatibilityVersion = "Xcode 3.2"; @@ -1307,6 +1329,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + C9001753206B00AC0070D674 /* port_descriptions.c in Sources */, 726D915520ACD7FC0039A2FE /* mach_bridge_remote_time.c in Sources */, 403C7CEE1E1F4E4400D6FEEF /* os_packet.c in Sources */, E214BDC81C2E358300CEE8A3 /* clonefile.c in Sources */, @@ -1363,6 +1386,7 @@ 978228281B8678DC008385AC /* pselect-darwinext.c in Sources */, 2485235511582D8F0051B413 /* mach_legacy.c in Sources */, 242AB66611EBDC1200107336 /* errno.c in Sources */, + 4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */, E4D45C2E16F868ED0002AF25 /* libproc.c in Sources */, 24A7C5BC11FF8DA6007669EB /* accept.c in Sources */, 24A7C5BD11FF8DA6007669EB /* bind.c in Sources */, @@ -1483,6 +1507,7 @@ isa = XCBuildConfiguration; baseConfigurationReference 
= C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { + CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = NO; INSTALL_PATH = /usr/local/lib/dyld; STRIP_INSTALLED_PRODUCT = NO; @@ -1493,8 +1518,30 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { + CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = NO; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = NO; GCC_WARN_ABOUT_RETURN_TYPE = YES; + GCC_WARN_UNDECLARED_SELECTOR = YES; GCC_WARN_UNINITIALIZED_AUTOS = YES; GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_PARAMETER = YES; @@ -1507,7 +1554,7 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { - ARCHS = "$(ARCHS_STANDARD_32_64_BIT)"; + CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = YES; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; MAP_PLATFORM = "$(MAP_PLATFORM_$(PLATFORM_NAME))"; @@ -1527,6 +1574,7 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { + CLANG_ENABLE_OBJC_WEAK = YES; OTHER_LDFLAGS = "$(DYLIB_LDFLAGS)"; VERSION_INFO_PREFIX = "___"; }; @@ -1536,6 +1584,7 @@ 
isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { + CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = NO; INSTALLHDRS_COPY_PHASE = NO; PRODUCT_NAME = Build; @@ -1547,6 +1596,7 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { + CLANG_ENABLE_OBJC_WEAK = YES; PRODUCT_NAME = "$(TARGET_NAME)"; }; name = Release; @@ -1555,6 +1605,7 @@ isa = XCBuildConfiguration; baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */; buildSettings = { + CLANG_ENABLE_OBJC_WEAK = YES; COPY_PHASE_STRIP = NO; PRODUCT_NAME = Libsyscall_headers_Sim; SKIP_INSTALL = YES; diff --git a/libsyscall/custom/SYS.h b/libsyscall/custom/SYS.h index ff93f852a..a62cc1f99 100644 --- a/libsyscall/custom/SYS.h +++ b/libsyscall/custom/SYS.h @@ -228,9 +228,9 @@ LEAF(pseudo, 0) ;\ #endif #define MI_ENTRY_POINT(name) \ + .text ;\ .align 2 ;\ .globl name ;\ - .text ;\ name: /* load the syscall number into r12 and trap */ @@ -425,6 +425,18 @@ pseudo: ;\ #include #include +#if defined(__arm64__) && !defined(__LP64__) +#define ZERO_EXTEND(argnum) uxtw x ## argnum, w ## argnum +#else +#define ZERO_EXTEND(argnum) +#endif + +#if defined(__arm64__) && !defined(__LP64__) +#define SIGN_EXTEND(argnum) sxtw x ## argnum, w ## argnum +#else +#define SIGN_EXTEND(argnum) +#endif + /* * ARM64 system call interface: * diff --git a/libsyscall/custom/__fork.s b/libsyscall/custom/__fork.s index dc517a1a2..ffcbe5ab4 100644 --- a/libsyscall/custom/__fork.s +++ b/libsyscall/custom/__fork.s @@ -132,6 +132,7 @@ Lparent: #include MI_ENTRY_POINT(___fork) + ARM64_STACK_PROLOG PUSH_FRAME // ARM moves a 1 in to r1 here, but I can't see why. 
mov x16, #SYS_fork // Syscall code @@ -144,14 +145,14 @@ MI_ENTRY_POINT(___fork) mov w0, #0 str w0, [x9] // Clear cached current pid POP_FRAME // And done - ret + ARM64_STACK_EPILOG Lbotch: MI_CALL_EXTERNAL(_cerror) // Handle error mov w0, #-1 // Return value is -1 Lparent: POP_FRAME // Return - ret + ARM64_STACK_EPILOG #else #error Unsupported architecture diff --git a/libsyscall/custom/__getpid.s b/libsyscall/custom/__getpid.s index a048f48aa..a8daa7398 100644 --- a/libsyscall/custom/__getpid.s +++ b/libsyscall/custom/__getpid.s @@ -159,7 +159,7 @@ MI_ENTRY_POINT(___getpid) MI_GET_ADDRESS(x9, __current_pid) // Get address of cached value ldr w0, [x9] // Load it cmp w0, #0 // See if there's a cached value - b.ls L_notcached // If not, make syscall + b.le L_notcached // If not, make syscall ret // Else, we're done L_notcached: SYSCALL_NONAME(getpid, 0, cerror_nocancel) diff --git a/libsyscall/custom/__sigreturn.s b/libsyscall/custom/__sigreturn.s index a6a24404e..62238b9ed 100644 --- a/libsyscall/custom/__sigreturn.s +++ b/libsyscall/custom/__sigreturn.s @@ -30,19 +30,19 @@ #if defined(__x86_64__) -__SYSCALL(___sigreturn, sigreturn, 2) +__SYSCALL(___sigreturn, sigreturn, 3) #elif defined(__i386__) -__SYSCALL_INT(___sigreturn, sigreturn, 2) +__SYSCALL_INT(___sigreturn, sigreturn, 3) #elif defined(__arm__) -__SYSCALL(___sigreturn, sigreturn, 2) +__SYSCALL(___sigreturn, sigreturn, 3) #elif defined(__arm64__) -__SYSCALL(___sigreturn, sigreturn, 2) +__SYSCALL(___sigreturn, sigreturn, 3) #else #error Unsupported architecture diff --git a/libsyscall/mach/mach/mach.h b/libsyscall/mach/mach/mach.h index 00abb7216..fbe13755a 100644 --- a/libsyscall/mach/mach/mach.h +++ b/libsyscall/mach/mach/mach.h @@ -129,6 +129,7 @@ extern mach_msg_return_t mach_msg_server_importance(boolean_t (*) mach_msg_size_t, mach_port_t, mach_msg_options_t); + /* * Prototypes for compatibility */ diff --git a/libsyscall/mach/mach/mach_right.h b/libsyscall/mach/mach/mach_right.h new file mode 100644 
index 000000000..2a7522ee8 --- /dev/null +++ b/libsyscall/mach/mach/mach_right.h @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef __MACH_RIGHT_H +#define __MACH_RIGHT_H + +#include +#include +#include +#include +#include +#include + +__BEGIN_DECLS; + +/*! + * @typedef mach_right_recv_t + * A type representing the receive right to a Mach port. + */ +typedef struct _mach_right_recv { + mach_port_t mrr_name; +} mach_right_recv_t; + +/*! + * @const MACH_RIGHT_RECV_NULL + * A convenience initializer for a receive right object. 
+ */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define MACH_RIGHT_RECV_NULL ((mach_right_recv_t){MACH_PORT_NULL}) +#elif defined(__cplusplus) && __cplusplus >= 201103L +#define MACH_RIGHT_RECV_NULL (mach_right_recv_t{MACH_PORT_NULL}) +#elif defined(__cplusplus) +#define MACH_RIGHT_RECV_NULL \ + (mach_right_recv_t((mach_right_recv_t){MACH_PORT_NULL})) +#else +#define MACH_RIGHT_RECV_NULL {MACH_PORT_NULL} +#endif + +/*! + * @typedef mach_right_send_t + * A type representing a send right to a Mach port. + */ +typedef struct _mach_right_send { + mach_port_t mrs_name; +} mach_right_send_t; + +/*! + * @const MACH_RIGHT_SEND_NULL + * A convenience initializer for a send right object. + */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define MACH_RIGHT_SEND_NULL ((mach_right_send_t){MACH_PORT_NULL}) +#elif defined(__cplusplus) && __cplusplus >= 201103L +#define MACH_RIGHT_SEND_NULL (mach_right_send_t{MACH_PORT_NULL}) +#elif defined(__cplusplus) +#define MACH_RIGHT_SEND_NULL \ + (mach_right_send_t((mach_right_send_t){MACH_PORT_NULL})) +#else +#define MACH_RIGHT_SEND_NULL {MACH_PORT_NULL} +#endif + +/*! + * @typedef mach_right_send_once_t + * A type representing a send-once right to a Mach port. + */ +typedef struct _mach_right_send_once { + mach_port_t mrso_name; +} mach_right_send_once_t; + +/*! + * @const MACH_RIGHT_SEND_ONCE_NULL + * A convenience initializer for a send-once right object. + */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define MACH_RIGHT_SEND_ONCE_NULL ((mach_right_send_once_t){MACH_PORT_NULL}) +#elif defined(__cplusplus) && __cplusplus >= 201103L +#define MACH_RIGHT_SEND_ONCE_NULL (mach_right_send_once_t{MACH_PORT_NULL}) +#elif defined(__cplusplus) +#define MACH_RIGHT_SEND_ONCE_NULL \ + (mach_right_send_once_t((mach_right_send_once_t){MACH_PORT_NULL})) +#else +#define MACH_RIGHT_SEND_ONCE_NULL {MACH_PORT_NULL} +#endif + +/*! 
+ * @function mach_right_recv + * Wraps a port name as a receive right object. + * + * @param pn + * The port name. If this name is valid but does not represent a receive right, + * the behavior of mach_right_recv_* implementations is undefined. + * + * @result + * A new receive right object. + */ +OS_ALWAYS_INLINE OS_WARN_RESULT +static inline mach_right_recv_t +mach_right_recv(mach_port_name_t pn) +{ + mach_right_recv_t mrr = {pn}; + return mrr; +} + +/*! + * @function mach_right_send + * Wraps a port name as a send right object. + * + * @param pn + * The port name. If this name is valid but does not represent a send right, the + * behavior of mach_right_send_* implementations is undefined. + * + * @result + * A new send right object. + */ +OS_ALWAYS_INLINE OS_WARN_RESULT +static inline mach_right_send_t +mach_right_send(mach_port_name_t pn) +{ + mach_right_send_t mrs = {pn}; + return mrs; +} + +/*! + * @function mach_right_send_valid + * Checks if the given send right object is valid. + * + * @param mrs + * The send right object to check. + * + * @result + * A Boolean indicating whether the right is valid. + */ +OS_ALWAYS_INLINE OS_WARN_RESULT +static inline bool +mach_right_send_valid(mach_right_send_t mrs) +{ + return MACH_PORT_VALID(mrs.mrs_name); +} + +/*! + * @function mach_right_send_once + * Wraps a port name as a send-once right object. + * + * @param pn + * The port name. If this name is valid but does not represent a send-once + * right, the behavior of mach_right_send_once_* implementations is undefined. + * + * @result + * A new send-once right object. + */ +OS_ALWAYS_INLINE OS_WARN_RESULT +static inline mach_right_send_once_t +mach_right_send_once(mach_port_name_t pn) +{ + mach_right_send_once_t mrso = {pn}; + return mrso; +} + +/*! + * @function mach_right_send_once_valid + * Checks if the given send-once right object is valid. + * + * @param mrso + * The send-once right object to check. 
+ * + * @result + * A Boolean indicating whether the right is valid. + */ +OS_ALWAYS_INLINE OS_WARN_RESULT +static inline bool +mach_right_send_once_valid(mach_right_send_once_t mrso) +{ + return MACH_PORT_VALID(mrso.mrso_name); +} + +/*! + * @typedef mach_right_flags_t + * Flags influencing the behavior of a constructed Mach port. + * + * @const MACH_RIGHT_RECV_FLAG_INIT + * No flags set. This value is suitable for initialization purposes. + * + * @const MACH_RIGHT_RECV_FLAG_UNGUARDED + * The given context should not serve as a guard for the underlying port's + * destruction. + */ +OS_ENUM(mach_right_flags, uint64_t, + MACH_RIGHT_RECV_FLAG_INIT = 0, + MACH_RIGHT_RECV_FLAG_UNGUARDED = (1 << 0), +); + +/*! + * @function mach_right_recv_construct + * Allocates a new Mach port and returns the receive right to the caller. + * + * @param flags + * Flags to influence the behavior of the new port. + * + * @param sr + * If non-NULL, will be filled in with the name of a send right which + * corresponds to the new port. The caller is responsible for disposing of this + * send right with {@link mach_right_send_release}. + * + * @param ctx + * Context to be associated with the new port. By default, this context must be + * passed to {@link mach_right_recv_destruct} in order to destroy the underlying + * port. This requirement may be elided with the + * {@link MACH_RIGHT_RECV_FLAG_UNGUARDED} flag. + * + * @result + * A new port handle which refers to the receive right for the newly-created + * port. The caller is responsible for disposing of this handle with + * {@link mach_right_recv_destruct}. + * + * @discussion + * The implementation will abort on any failure to allocate a new port object in + * the kernel. Thus the caller may assert that a new, valid receive right is + * always returned. + */ +OS_EXPORT OS_WARN_RESULT +mach_right_recv_t +mach_right_recv_construct(mach_right_flags_t flags, + mach_right_send_t *_Nullable sr, uintptr_t ctx); + +/*! 
+ * @function mach_right_recv_destruct + * Closes the port referred to by the given receive right. + * + * @param r + * The receive right for the port to manipulate. + * + * @param s + * A pointer to the send right to dispose of. If NULL is given, no attempt will + * be made to clean up any send right associated with the port. If the name of + * the given send right does not match the name of the given receive right, the + * implementation's behavior is undefined. + * + * @param ctx + * The context which guards the underlying port destruction. If the receive + * right was created with {@link MACH_RIGHT_RECV_FLAG_UNGUARDED}, this parameter is + * ignored. + * + * @discussion + * If a send right is passed, the implementation performs the moral equivalent + * of + * + * mach_right_recv_destruct(r, NULL, ctx); + * mach_right_send_release(*s); + * + * except in a more efficient manner, requiring only one system call. + * + * The implementation will abort on any failure to dispose of the port. As such, + * this routine should only be used on ports that are known to be under the + * caller's complete control. + */ +OS_EXPORT +void +mach_right_recv_destruct(mach_right_recv_t r, mach_right_send_t *_Nullable s, + uintptr_t ctx); + +/*! + * @function mach_right_send_create + * Creates a send right to the port referenced by the given receive right. + * + * @param r + * The receive right for the port for which to create the send right. + * + * @result + * The name of the new send right. The caller is responsible for disposing of + * this send right with {@link mach_right_send_release}. + * + * This operation will increment the make-send count of the port referenced by + * the given receive right. + * + * @discussion + * The implementation will abort on any failure to create the send right. As + * such, this routine should only be used on ports that are known to be under + * the caller's complete control. 
+ */ +OS_EXPORT OS_WARN_RESULT +mach_right_send_t +mach_right_send_create(mach_right_recv_t r); + +/*! + * @function mach_right_send_retain + * Increments the user reference count for the given send right. + * + * @param s + * The send right to manipulate. + * + * @result + * If the reference count was successfully incremented, the given port name is + * returned. If either MACH_PORT_NULL or MACH_PORT_DEAD are given, the given + * value is returned. If the given send right became a dead name before or + * during the attempt to retain the send right, MACH_PORT_DEAD is returned. + * + * If the implementation encounters any other failure condition, it will abort. + */ +OS_EXPORT OS_WARN_RESULT +mach_right_send_t +mach_right_send_retain(mach_right_send_t s); + +/*! + * @function mach_right_send_release + * Decrements the user reference count for the given send right. + * + * @param s + * The send right to manipulate. + * + * @discussion + * If the given send right became a dead name before or during the attempt to + * release it, the implementation will dispose of that dead name. + * + * If the implementation encounters any other failure condition, it will abort. + */ +OS_EXPORT +void +mach_right_send_release(mach_right_send_t s); + +/*! + * @function mach_right_send_once_create + * Creates a send-once right from the given receive right. + * + * @param r + * The receive right for the port for which to create the send-once right. + * + * @result + * The newly-created send-once right. + * + * @discussion + * The implementation will abort on any failure to allocate a new send-once + * right, and therefore the caller should only provide a receive right which is + * under its complete control. The caller may assert that a new, valid send-once + * right is always returned. + * + * The returned send-once right will never share a name with the given receive + * right. 
A send-once right must be consumed either by using it to send a + * message or by consuming it with {@link mach_right_send_once_consume}. + * + * The returned right does not support retain/release semantics despite the + * presence of "create" in the name. + */ +OS_EXPORT OS_WARN_RESULT +mach_right_send_once_t +mach_right_send_once_create(mach_right_recv_t r); + +/*! + * @function mach_right_send_once_consume + * Consumes the given send-once right. + * + * @param so + * The send-once right to manipulate. + * + * @discussion + * If the given send-once right became a dead name before or during the attempt + * to release it, the implementation will dispose of that dead name. + * + * If the implementation encounters any other failure condition, it will abort. + * + * This operation will cause a send-once notification to be delivered to the + * port to which the send-once right refers unless the right is a dead name, in + * which case there are no side effects. + */ +OS_EXPORT +void +mach_right_send_once_consume(mach_right_send_once_t so); + +__END_DECLS; + +#endif // __MACH_RIGHT_H diff --git a/libsyscall/mach/mach/mach_sync_ipc.h b/libsyscall/mach/mach/mach_sync_ipc.h new file mode 100644 index 000000000..032e7acb1 --- /dev/null +++ b/libsyscall/mach/mach/mach_sync_ipc.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ + +#ifndef _MACH_SYNC_IPC_H_ +#define _MACH_SYNC_IPC_H_ + +#include + +__BEGIN_DECLS + +/*! + * @function mach_sync_ipc_link_monitoring_start + * + * @abstract + * Starts monitoring the sync IPC priority inversion avoidance + * facility of the current thread. + * A subsequent call to mach_sync_ipc_link_monitoring_stop() will + * validate that the facility took effect for all synchronous IPC + * performed from this thread between the calls to start and stop. + * + * @discussion + * In case of success, a port right is returned, which has to be + * deallocated by passing it to mach_sync_ipc_link_monitoring_stop(). + * + * @param port + * Pointer to a mach_port_t that will be populated in case of success. + * + * @result + * KERN_SUCCESS in case of success, specific error otherwise. + * If the call is not supported, KERN_NOT_SUPPORTED is returned. + */ +extern kern_return_t mach_sync_ipc_link_monitoring_start(mach_port_t* port); + +/*! + * @function mach_sync_ipc_link_monitoring_stop + * + * @abstract + * Stops monitoring the sync IPC priority inversion avoidance facility + * of the current thread started by a call to mach_sync_ipc_link_monitoring_start(). + * + * Returns whether the facility took effect for all synchronous IPC performed + * from this thread between the calls to start and stop. + * + * Reasons for this function to return false include: + * -remote message event handler did not reply to the message itself + * -remote message was not received by a workloop (xpc connection or dispatch mach channel) + * + * @discussion + * To be called after mach_sync_ipc_link_monitoring_start(). 
If + * mach_sync_ipc_link_monitoring_start() didn't return an error this + * function must be called to deallocate the port right that was returned. + * + * @param port + * mach_port_t returned by mach_sync_ipc_link_monitoring_start(). + * + * @param in_effect + * Pointer to boolean_t value that will be populated in the case of success. + * Indicates whether the sync IPC priority inversion avoidance facility took + * effect for all synchronous IPC performed from this thread between the calls + * to start and stop. + * + * @result + * KERN_SUCCESS in case of no errors, specific error otherwise. + * If the call is not supported, KERN_NOT_SUPPORTED is returned. + */ +extern kern_return_t mach_sync_ipc_link_monitoring_stop(mach_port_t port, boolean_t* in_effect); + +typedef enum thread_destruct_special_reply_port_rights { + THREAD_SPECIAL_REPLY_PORT_ALL, + THREAD_SPECIAL_REPLY_PORT_RECEIVE_ONLY, + THREAD_SPECIAL_REPLY_PORT_SEND_ONLY, +} thread_destruct_special_reply_port_rights_t; + +extern kern_return_t thread_destruct_special_reply_port(mach_port_name_t port, thread_destruct_special_reply_port_rights_t rights); + +extern mach_port_t mig_get_special_reply_port(void); + +extern void mig_dealloc_special_reply_port(mach_port_t migport); + + +__END_DECLS + +#endif /* _MACH_SYNC_IPC_H_ */ diff --git a/libsyscall/mach/mach/port_descriptions.h b/libsyscall/mach/mach/port_descriptions.h new file mode 100644 index 000000000..e237e2757 --- /dev/null +++ b/libsyscall/mach/mach/port_descriptions.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _MACH_PORT_DESCRIPTIONS_ +#define _MACH_PORT_DESCRIPTIONS_ + +#include + +__BEGIN_DECLS + +/* + * Returns a string describing the host special port offset provided, or NULL if + * the provided offset is not a host special port offset. + */ +const char *mach_host_special_port_description(int offset); + +/* + * Returns a string describing the task special port offset provided, or NULL if + * the provided offset is not a task special port offset. + */ +const char *mach_task_special_port_description(int offset); + +/* + * Returns the port for the given identifier of a host special port. For + * instance, passing "HOST_PRIV_PORT" would return 1. + * + * Returns -1 on error. + */ +int mach_host_special_port_for_id(const char *id); + +/* + * Returns the port for the given identifier of a task special port. + * + * Returns -1 on error. 
+ */ +int mach_task_special_port_for_id(const char *id); + +__END_DECLS + +#endif /* !defined(_MACH_PORT_DESCRIPTIONS_) */ diff --git a/libsyscall/mach/mach/thread_state.h b/libsyscall/mach/mach/thread_state.h index 42abc4ec2..67afb6835 100644 --- a/libsyscall/mach/mach/thread_state.h +++ b/libsyscall/mach/mach/thread_state.h @@ -35,30 +35,29 @@ #ifndef KERNEL /* * Gets all register values in the target thread with pointer-like contents. - * There's no guarantee that the returned values are valid pointers, but all + * + * There is no guarantee that the returned values are valid pointers, but all * valid pointers will be returned. The order and count of the provided * register values is unspecified and may change; registers with values that * are not valid pointers may be omitted, so the number of pointers returned * may vary from call to call. * - * sp is an out parameter that will contain the stack pointer - * length is an in/out parameter for the length of the values array - * values is an array of pointers + * sp is an out parameter that will contain the stack pointer. + * length is an in/out parameter for the length of the values array. + * values is an array of pointers. * * This may only be called on threads in the current task. If the current * platform defines a stack red zone, the stack pointer returned will be * adjusted to account for red zone. * - * If length is insufficient KERN_INSUFFICIENT_BUFFER_SIZE will be returned and - * length set to the amount of memory required. Callers MUST NOT assume that - * any particular size of buffer will be sufficient and should retry with an - * aproproately sized buffer upon this error. + * If length is insufficient, KERN_INSUFFICIENT_BUFFER_SIZE will be returned + * and length set to the amount of memory required. Callers MUST NOT assume + * that any particular size of buffer will be sufficient and should retry with + * an appropriately sized buffer upon this error. 
*/ -__OSX_UNAVAILABLE -__IOS_UNAVAILABLE -__TVOS_AVAILABLE(9.0) -__WATCHOS_UNAVAILABLE -kern_return_t thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *length, uintptr_t *values); +__API_AVAILABLE(macosx(10.14), ios(12.0), tvos(9.0), watchos(5.0)) +kern_return_t thread_get_register_pointer_values(thread_t thread, + uintptr_t *sp, size_t *length, uintptr_t *values); #endif #endif /* _MACH_THREAD_STATE_H_ */ diff --git a/libsyscall/mach/mach_msg.c b/libsyscall/mach/mach_msg.c index bdb446c33..4b90d19e5 100644 --- a/libsyscall/mach/mach_msg.c +++ b/libsyscall/mach/mach_msg.c @@ -559,6 +559,7 @@ mach_msg_server( buffers_swapped = FALSE; old_state = voucher_mach_msg_adopt(&bufRequest->Head); + bufReply->Head = (mach_msg_header_t){}; (void) (*demux)(&bufRequest->Head, &bufReply->Head); diff --git a/libsyscall/mach/mach_port.c b/libsyscall/mach/mach_port.c index e2cf670be..3219d7301 100644 --- a/libsyscall/mach/mach_port.c +++ b/libsyscall/mach/mach_port.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include "tsd.h" kern_return_t mach_port_names( @@ -302,9 +304,23 @@ mach_port_get_attributes( { kern_return_t rv; - rv = _kernelrpc_mach_port_get_attributes(task, name, flavor, + rv = _kernelrpc_mach_port_get_attributes_trap(task, name, flavor, port_info_out, port_info_outCnt); +#ifdef __x86_64__ + /* REMOVE once XBS kernel has new trap */ + if (rv == ((1 << 24) | 40)) /* see mach/i386/syscall_sw.h */ + rv = MACH_SEND_INVALID_DEST; +#elif defined(__i386__) + /* REMOVE once XBS kernel has new trap */ + if (rv == (kern_return_t)(-40)) + rv = MACH_SEND_INVALID_DEST; +#endif + + if (rv == MACH_SEND_INVALID_DEST) + rv = _kernelrpc_mach_port_get_attributes(task, name, flavor, + port_info_out, port_info_outCnt); + return (rv); } @@ -407,6 +423,93 @@ mach_port_space_basic_info( return (rv); } +static inline mach_port_t +_tsd_get_special_reply_port() +{ + return (mach_port_t)(uintptr_t)_os_tsd_get_direct(__TSD_MACH_SPECIAL_REPLY); +} + +static 
inline void +_tsd_set_special_reply_port(mach_port_t port) +{ + _os_tsd_set_direct(__TSD_MACH_SPECIAL_REPLY, (void *)(uintptr_t)port); +} + +mach_port_t +mig_get_special_reply_port(void) +{ + mach_port_t srp; + + srp = _tsd_get_special_reply_port(); + if (!MACH_PORT_VALID(srp)) { + srp = thread_get_special_reply_port(); + _tsd_set_special_reply_port(srp); + } + + return srp; +} + +void +mig_dealloc_special_reply_port(mach_port_t migport) +{ + mach_port_t srp = _tsd_get_special_reply_port(); + if (MACH_PORT_VALID(srp)) { + thread_destruct_special_reply_port(srp, THREAD_SPECIAL_REPLY_PORT_ALL); + if (migport != srp) { + mach_port_deallocate(mach_task_self(), migport); + } + _tsd_set_special_reply_port(MACH_PORT_NULL); + } +} + +kern_return_t +mach_sync_ipc_link_monitoring_start(mach_port_t *special_reply_port) +{ + mach_port_t srp; + boolean_t link_broken; + kern_return_t kr; + + *special_reply_port = MACH_PORT_DEAD; + + srp = mig_get_special_reply_port(); + + kr = mach_port_mod_refs(mach_task_self(), srp, MACH_PORT_RIGHT_SEND, 1); + + if (kr != KERN_SUCCESS) { + return kr; + } + + kr = _kernelrpc_mach_port_special_reply_port_reset_link(mach_task_self(), srp, &link_broken); + if (kr != KERN_SUCCESS) { + mach_port_deallocate(mach_task_self(), srp); + return kr; + } + + *special_reply_port = srp; + + return kr; +} + +kern_return_t +mach_sync_ipc_link_monitoring_stop(mach_port_t srp, boolean_t* in_effect) +{ + kern_return_t kr; + boolean_t link_broken = TRUE; + + kr = _kernelrpc_mach_port_special_reply_port_reset_link(mach_task_self(), srp, &link_broken); + + /* + * We return if the sync IPC priority inversion avoidance facility took + * effect, so if the link was broken it didn't take effect. + * Flip the return. 
+ */ + *in_effect = !link_broken; + + mach_port_deallocate(mach_task_self(), srp); + + return kr; +} + kern_return_t mach_port_dnrequest_info( ipc_space_t task, @@ -602,18 +705,29 @@ mach_voucher_extract_attr_recipe( rv = mach_voucher_extract_attr_recipe_trap(voucher, key, recipe, recipe_size); -#ifdef __x86_64__ - /* REMOVE once XBS kernel has new trap */ - if (rv == ((1 << 24) | 72)) /* see mach/i386/syscall_sw.h */ - rv = MACH_SEND_INVALID_DEST; -#elif defined(__i386__) - /* REMOVE once XBS kernel has new trap */ - if (rv == (kern_return_t)(-72)) - rv = MACH_SEND_INVALID_DEST; -#endif - if (rv == MACH_SEND_INVALID_DEST) rv = _kernelrpc_mach_voucher_extract_attr_recipe(voucher, key, recipe, recipe_size); return rv; } + + +kern_return_t +thread_destruct_special_reply_port( + mach_port_name_t port, + thread_destruct_special_reply_port_rights_t rights) +{ + switch (rights) { + case THREAD_SPECIAL_REPLY_PORT_ALL: + return mach_port_destruct(mach_task_self(), port, -1, 0); + + case THREAD_SPECIAL_REPLY_PORT_RECEIVE_ONLY: + return mach_port_destruct(mach_task_self(), port, 0, 0); + + case THREAD_SPECIAL_REPLY_PORT_SEND_ONLY: + return mach_port_deallocate(mach_task_self(), port); + + default: + return KERN_INVALID_ARGUMENT; + } +} diff --git a/libsyscall/mach/mach_right.c b/libsyscall/mach/mach_right.c new file mode 100644 index 000000000..c69133e3a --- /dev/null +++ b/libsyscall/mach/mach_right.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include +#include +#include + + +#pragma mark Utilities +#define _mach_assert(__op, __kr) /* trap unless __kr is KERN_SUCCESS; __op only labels the call site */ \ + do { \ + if ((__kr) != KERN_SUCCESS) { \ + __builtin_trap(); \ + } \ + } while (0) + +#pragma mark API +mach_right_recv_t +mach_right_recv_construct(mach_right_flags_t flags, + mach_right_send_t *_Nullable sr, uintptr_t ctx) +{ + kern_return_t kr = KERN_FAILURE; + mach_port_t p = MACH_PORT_NULL; + mach_port_options_t opts = { + .flags = MPO_CONTEXT_AS_GUARD, + .mpl = { + .mpl_qlimit = MACH_PORT_QLIMIT_BASIC, + }, + }; + + if (flags & MACH_RIGHT_RECV_FLAG_UNGUARDED) { + opts.flags &= (~MPO_CONTEXT_AS_GUARD); + } + if (sr) { + opts.flags |= MPO_INSERT_SEND_RIGHT; + } + + kr = mach_port_construct(mach_task_self(), &opts, ctx, &p); + _mach_assert("construct recv right", kr); + + if (sr) { + sr->mrs_name = p; + } + + return mach_right_recv(p); +} + +void +mach_right_recv_destruct(mach_right_recv_t r, mach_right_send_t *s, + uintptr_t ctx) +{ + kern_return_t kr = KERN_FAILURE; + mach_port_delta_t srd = 0;
+ + if (s) { + if (r.mrr_name != s->mrs_name) { + _os_set_crash_log_cause_and_message(s->mrs_name, + "api misuse: bad send right"); + __builtin_trap(); + } + + srd = -1; + } + + kr = mach_port_destruct(mach_task_self(), r.mrr_name, srd, ctx); + _mach_assert("destruct recv right", kr); +} + +mach_right_send_t +mach_right_send_create(mach_right_recv_t r) +{ + kern_return_t kr = KERN_FAILURE; + + kr = mach_port_insert_right(mach_task_self(), r.mrr_name, r.mrr_name, + MACH_MSG_TYPE_MAKE_SEND); + _mach_assert("create send right", kr); + + return mach_right_send(r.mrr_name); +} + +mach_right_send_t +mach_right_send_retain(mach_right_send_t s) +{ + kern_return_t kr = KERN_FAILURE; + mach_right_send_t rs = MACH_RIGHT_SEND_NULL; + + kr = mach_port_mod_refs(mach_task_self(), s.mrs_name, + MACH_PORT_RIGHT_SEND, 1); + switch (kr) { + case 0: + rs = s; + break; + case KERN_INVALID_RIGHT: + rs.mrs_name = MACH_PORT_DEAD; + break; + case KERN_INVALID_NAME: + // mach_port_mod_refs() will return success when given either + // MACH_PORT_DEAD or MACH_PORT_NULL with send or send-once right + // operations, so this is always fatal. 
+ default: + _mach_assert("retain send right", kr); + } + + return rs; +} + +void +mach_right_send_release(mach_right_send_t s) +{ + kern_return_t kr = KERN_FAILURE; + + kr = mach_port_mod_refs(mach_task_self(), s.mrs_name, + MACH_PORT_RIGHT_SEND, -1); + switch (kr) { + case 0: + break; + case KERN_INVALID_RIGHT: + kr = mach_port_mod_refs(mach_task_self(), s.mrs_name, + MACH_PORT_RIGHT_DEAD_NAME, -1); + _mach_assert("release dead name", kr); + break; + default: + _mach_assert("release send right", kr); + } +} + +mach_right_send_once_t +mach_right_send_once_create(mach_right_recv_t r) +{ + mach_msg_type_name_t right = 0; + mach_port_t so = MACH_PORT_NULL; + kern_return_t kr = mach_port_extract_right(mach_task_self(), r.mrr_name, + MACH_MSG_TYPE_MAKE_SEND_ONCE, &so, &right); + _mach_assert("create send-once right", kr); + + return mach_right_send_once(so); +} + +void +mach_right_send_once_consume(mach_right_send_once_t so) +{ + kern_return_t kr = KERN_FAILURE; + + kr = mach_port_mod_refs(mach_task_self(), so.mrso_name, + MACH_PORT_RIGHT_SEND_ONCE, -1); + switch (kr) { + case 0: + break; + case KERN_INVALID_RIGHT: + kr = mach_port_mod_refs(mach_task_self(), so.mrso_name, + MACH_PORT_RIGHT_DEAD_NAME, -1); + _mach_assert("release dead name", kr); + break; + default: + _mach_assert("consume send-once right", kr); + } +} diff --git a/libsyscall/mach/mig_reply_port.c b/libsyscall/mach/mig_reply_port.c index 934c1aa20..ee7e867cd 100644 --- a/libsyscall/mach/mig_reply_port.c +++ b/libsyscall/mach/mig_reply_port.c @@ -36,13 +36,13 @@ __XNU_PRIVATE_EXTERN mach_port_t _task_reply_port = MACH_PORT_NULL; static inline mach_port_t _mig_get_reply_port() { - return _os_tsd_get_direct(__TSD_MIG_REPLY); + return (mach_port_t)(uintptr_t)_os_tsd_get_direct(__TSD_MIG_REPLY); } static inline void _mig_set_reply_port(mach_port_t port) { - _os_tsd_set_direct(__TSD_MIG_REPLY, port); + _os_tsd_set_direct(__TSD_MIG_REPLY, (void *)(uintptr_t)port); } /* diff --git 
a/libsyscall/mach/port_descriptions.c b/libsyscall/mach/port_descriptions.c new file mode 100644 index 000000000..a5d8a93e1 --- /dev/null +++ b/libsyscall/mach/port_descriptions.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include + +const char * +mach_host_special_port_description(int port) +{ + int port_index = (int)port; + + if (port_index < 0 || port_index > HOST_MAX_SPECIAL_PORT) { + return NULL; + } + + static const char *hsp_descs[] = { + [HOST_PORT] = "host (restricted)", + [HOST_PRIV_PORT] = "host private (restricted)", + [HOST_IO_MASTER_PORT] = "I/O master (restricted)", + + [HOST_DYNAMIC_PAGER_PORT] = "dynamic pager", + [HOST_AUDIT_CONTROL_PORT] = "audit control", + [HOST_USER_NOTIFICATION_PORT] = "user notification", + [HOST_AUTOMOUNTD_PORT] = "automounter", + [HOST_LOCKD_PORT] = "lockd", + [HOST_KTRACE_BACKGROUND_PORT] = "ktrace background notification", + [HOST_SEATBELT_PORT] = "seatbelt", + [HOST_KEXTD_PORT] = "kextd", + [HOST_LAUNCHCTL_PORT] = "launchctl", + [HOST_UNFREED_PORT] = "fairplay", + [HOST_AMFID_PORT] = "amfi", + [HOST_GSSD_PORT] = "gssd", + [HOST_TELEMETRY_PORT] = "telemetry", + [HOST_ATM_NOTIFICATION_PORT] = "atm notification", + [HOST_COALITION_PORT] = "coalition notification", + [HOST_SYSDIAGNOSE_PORT] = "sysdiagnose notification", + [HOST_XPC_EXCEPTION_PORT] = "XPC exception", + [HOST_CONTAINERD_PORT] = "container manager", + [HOST_NODE_PORT] = "node", + [HOST_RESOURCE_NOTIFY_PORT] = "resource notify", + [HOST_CLOSURED_PORT] = "closured", + [HOST_SYSPOLICYD_PORT] = "syspolicyd", + }; + _Static_assert(HOST_SYSPOLICYD_PORT == HOST_MAX_SPECIAL_PORT, + "all host special ports must have descriptions"); + + return hsp_descs[port_index]; +} + +const char * +mach_task_special_port_description(int port) +{ + int port_index = (int)port; + + if (port_index < 0 || port_index > TASK_MAX_SPECIAL_PORT) { + return NULL; + } + + static const char *tsp_descs[] = { + [TASK_KERNEL_PORT] = "kernel", + [TASK_HOST_PORT] = "host", + [TASK_NAME_PORT] = "name", + [TASK_BOOTSTRAP_PORT] = "bootstrap", + [TASK_SEATBELT_PORT] = "seatbelt", + [TASK_ACCESS_PORT] = "access", + 
[TASK_DEBUG_CONTROL_PORT] = "debug control", + [TASK_RESOURCE_NOTIFY_PORT] = "resource notify", + }; + _Static_assert(TASK_RESOURCE_NOTIFY_PORT == TASK_MAX_SPECIAL_PORT, + "all task special ports must have descriptions"); + + return tsp_descs[port_index]; +} + +static int +port_for_id_internal(const char *id, const char **ids, int nids) +{ + if (!id) { + errno = EINVAL; + return -1; + } + + for (int i = 0; i < nids; i++) { + if (ids[i] && strcmp(ids[i], id) == 0) { + return i; + } + } + + errno = ENOENT; + return -1; +} + +int +mach_host_special_port_for_id(const char *id) +{ + static const char *hsp_ids[] = { +#define SP_ENTRY(id) [id] = #id + SP_ENTRY(HOST_PORT), + SP_ENTRY(HOST_PRIV_PORT), + SP_ENTRY(HOST_IO_MASTER_PORT), + SP_ENTRY(HOST_DYNAMIC_PAGER_PORT), + SP_ENTRY(HOST_AUDIT_CONTROL_PORT), + SP_ENTRY(HOST_USER_NOTIFICATION_PORT), + SP_ENTRY(HOST_AUTOMOUNTD_PORT), + SP_ENTRY(HOST_LOCKD_PORT), + SP_ENTRY(HOST_KTRACE_BACKGROUND_PORT), + SP_ENTRY(HOST_SEATBELT_PORT), + SP_ENTRY(HOST_KEXTD_PORT), + SP_ENTRY(HOST_LAUNCHCTL_PORT), + SP_ENTRY(HOST_UNFREED_PORT), + SP_ENTRY(HOST_AMFID_PORT), + SP_ENTRY(HOST_GSSD_PORT), + SP_ENTRY(HOST_TELEMETRY_PORT), + SP_ENTRY(HOST_ATM_NOTIFICATION_PORT), + SP_ENTRY(HOST_COALITION_PORT), + SP_ENTRY(HOST_SYSDIAGNOSE_PORT), + SP_ENTRY(HOST_XPC_EXCEPTION_PORT), + SP_ENTRY(HOST_CONTAINERD_PORT), + SP_ENTRY(HOST_NODE_PORT), + SP_ENTRY(HOST_RESOURCE_NOTIFY_PORT), + SP_ENTRY(HOST_CLOSURED_PORT), + SP_ENTRY(HOST_SYSPOLICYD_PORT), + }; + + return port_for_id_internal(id, hsp_ids, + sizeof(hsp_ids) / sizeof(hsp_ids[0])); +} + +int +mach_task_special_port_for_id(const char *id) +{ + static const char *tsp_ids[] = { + SP_ENTRY(TASK_KERNEL_PORT), + SP_ENTRY(TASK_HOST_PORT), + SP_ENTRY(TASK_NAME_PORT), + SP_ENTRY(TASK_BOOTSTRAP_PORT), + SP_ENTRY(TASK_SEATBELT_PORT), + SP_ENTRY(TASK_ACCESS_PORT), + SP_ENTRY(TASK_DEBUG_CONTROL_PORT), + SP_ENTRY(TASK_RESOURCE_NOTIFY_PORT), +#undef SP_ENTRY + }; + + return port_for_id_internal(id, tsp_ids, + 
sizeof(tsp_ids) / sizeof(tsp_ids[0])); +} diff --git a/osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c b/libsyscall/os/thread_self_restrict.h similarity index 74% rename from osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c rename to libsyscall/os/thread_self_restrict.h index 6b54e209f..153f516a0 100644 --- a/osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c +++ b/libsyscall/os/thread_self_restrict.h @@ -1,11 +1,5 @@ /* - * ccmode_ctr_setctr.c - * corecrypto - * - * Created on 2/1/2017 - * - * Copyright (c) 2017 Apple Inc. All rights reserved. - * + * Copyright (c) 2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,12 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include "ccmode_internal.h" +#ifndef OS_THREAD_SELF_RESTRICT_H +#define OS_THREAD_SELF_RESTRICT_H +#endif /* OS_THREAD_SELF_RESTRICT_H */ -int ccmode_ctr_setctr(CC_UNUSED const struct ccmode_ctr *mode, ccctr_ctx *ctx, const void *ctr) -{ - CCMODE_CTR_KEY_PAD_OFFSET(ctx) = CCMODE_CTR_KEY_ECB(ctx)->block_size; - CC_MEMCPY(CCMODE_CTR_KEY_CTR(ctx), ctr, CCMODE_CTR_KEY_ECB(ctx)->block_size); - - return 0; -} diff --git a/libsyscall/os/tsd.h b/libsyscall/os/tsd.h index d49087f14..474c97aec 100644 --- a/libsyscall/os/tsd.h +++ b/libsyscall/os/tsd.h @@ -29,8 +29,6 @@ #ifndef OS_TSD_H #define OS_TSD_H -#include - /* The low nine slots of the TSD are reserved for libsyscall usage. 
*/ #define __TSD_RESERVED_BASE 0 #define __TSD_RESERVED_MAX 9 @@ -42,12 +40,20 @@ #define __TSD_THREAD_QOS_CLASS 4 #define __TSD_RETURN_TO_KERNEL 5 /* slot 6 is reserved for Windows/WINE compatibility reasons */ +#define __TSD_PTR_MUNGE 7 +#define __TSD_MACH_SPECIAL_REPLY 8 #define __TSD_SEMAPHORE_CACHE 9 +#ifndef __ASSEMBLER__ + +#include + #ifdef __arm__ #include #endif +extern void _thread_set_tsd_base(void *tsd_base); + __attribute__((always_inline)) static __inline__ unsigned int _os_cpu_number(void) @@ -147,6 +153,61 @@ _os_tsd_set_direct(unsigned long slot, void *val) } #endif -extern void _thread_set_tsd_base(void *tsd_base); +__attribute__((always_inline, pure)) +static __inline__ uintptr_t +_os_ptr_munge_token(void) +{ + return (uintptr_t)_os_tsd_get_direct(__TSD_PTR_MUNGE); +} -#endif +__attribute__((always_inline, pure)) +static __inline__ uintptr_t +_os_ptr_munge(uintptr_t ptr) +{ + return ptr ^ _os_ptr_munge_token(); +} +#define _OS_PTR_MUNGE(_ptr) _os_ptr_munge((uintptr_t)(_ptr)) +#define _OS_PTR_UNMUNGE(_ptr) _os_ptr_munge((uintptr_t)(_ptr)) + +#else // __ASSEMBLER__ + +#define _OS_TSD_OFFSET(_key) \ + ((__POINTER_WIDTH__/__CHAR_BIT__)*_key) + +#if defined(__i386__) || defined(__x86_64__) + +#define _OS_PTR_MUNGE(_reg) \ + xor %gs:_OS_TSD_OFFSET(__TSD_PTR_MUNGE), _reg + +#define _OS_PTR_UNMUNGE(_reg) \ + _OS_PTR_MUNGE(_reg) + +#elif defined(__arm__) || defined(__arm64__) + +#if defined(__arm__) + +#define _OS_PTR_MUNGE_TOKEN(_reg, _token) \ + mrc p15, 0, _reg, c13, c0, 3; \ + bic _reg, _reg, #3; \ + ldr _token, [ _reg, #_OS_TSD_OFFSET(__TSD_PTR_MUNGE) ] + +#elif defined(__arm64__) + +#define _OS_PTR_MUNGE_TOKEN(_reg, _token) \ + mrs _reg, TPIDRRO_EL0 %% \ + and _reg, _reg, #~0x7 %% \ + ldr _token, [ _reg, #_OS_TSD_OFFSET(__TSD_PTR_MUNGE) ] + +#endif // defined(__arm64__) + +#define _OS_PTR_MUNGE(_regdest, _regsrc, _token) \ + eor _regdest, _regsrc, _token + +#define _OS_PTR_UNMUNGE(_regdest, _regsrc, _token) \ + _OS_PTR_MUNGE(_regdest, _regsrc, 
_token) + +#endif // defined(__arm__) || defined(__arm64__) + +#endif // __ASSEMBLER__ + +#endif // OS_TSD_H diff --git a/libsyscall/wrappers/__commpage_gettimeofday.c b/libsyscall/wrappers/__commpage_gettimeofday.c index d7ecb2573..e1e7e001d 100644 --- a/libsyscall/wrappers/__commpage_gettimeofday.c +++ b/libsyscall/wrappers/__commpage_gettimeofday.c @@ -95,7 +95,11 @@ __commpage_gettimeofday_internal(struct timeval *tp, uint64_t *tbr_out) if (delta >= Ticks_per_sec) return(1); - tp->tv_sec = TimeStamp_sec; + if (TimeStamp_sec > __LONG_MAX__) { + return(1); + } + + tp->tv_sec = (__darwin_time_t)TimeStamp_sec; over = multi_overflow(Tick_scale, delta); if(over){ diff --git a/libsyscall/wrappers/_libc_funcptr.c b/libsyscall/wrappers/_libc_funcptr.c index 63eb09ca6..9c65ef3ff 100644 --- a/libsyscall/wrappers/_libc_funcptr.c +++ b/libsyscall/wrappers/_libc_funcptr.c @@ -27,6 +27,8 @@ */ #include "_libkernel_init.h" +#include "strings.h" + extern _libkernel_functions_t _libkernel_functions; extern void mig_os_release(void* ptr); @@ -82,6 +84,157 @@ _pthread_clear_qos_tsd(mach_port_t thread_port) } } +/* + * Upcalls to optimized libplatform string functions + */ + +static const struct _libkernel_string_functions + _libkernel_generic_string_functions = { + .bzero = _libkernel_bzero, + .memmove = _libkernel_memmove, + .memset = _libkernel_memset, + .strchr = _libkernel_strchr, + .strcmp = _libkernel_strcmp, + .strcpy = _libkernel_strcpy, + .strlcpy = _libkernel_strlcpy, + .strlen = _libkernel_strlen, +}; +static _libkernel_string_functions_t _libkernel_string_functions = + &_libkernel_generic_string_functions; + +kern_return_t +__libkernel_platform_init(_libkernel_string_functions_t fns) +{ + _libkernel_string_functions = fns; + return KERN_SUCCESS; +} + +__attribute__((visibility("hidden"))) +void +bzero(void *s, size_t n) +{ + return _libkernel_string_functions->bzero(s, n); +} + +__attribute__((visibility("hidden"))) +void * +memchr(const void *s, int c, size_t n) +{ + 
return _libkernel_string_functions->memchr(s, c, n); +} + +__attribute__((visibility("hidden"))) +int +memcmp(const void *s1, const void *s2, size_t n) +{ + return _libkernel_string_functions->memcmp(s1, s2, n); +} + +__attribute__((visibility("hidden"))) +void * +memmove(void *dst, const void *src, size_t n) +{ + return _libkernel_string_functions->memmove(dst, src, n); +} + +__attribute__((visibility("hidden"))) +void * +memcpy(void *dst, const void *src, size_t n) +{ + return _libkernel_string_functions->memmove(dst, src, n); +} + +__attribute__((visibility("hidden"))) +void * +memccpy(void *__restrict dst, const void *__restrict src, int c, size_t n) +{ + return _libkernel_string_functions->memccpy(dst, src, c, n); +} + +__attribute__((visibility("hidden"))) +void * +memset(void *b, int c, size_t len) +{ + return _libkernel_string_functions->memset(b, c, len); +} + +__attribute__((visibility("hidden"))) +char * +strchr(const char *s, int c) +{ + return _libkernel_string_functions->strchr(s, c); +} + +__attribute__((visibility("hidden"))) +char * +index(const char *s, int c) +{ + return _libkernel_string_functions->strchr(s, c); +} + +__attribute__((visibility("hidden"))) +int +strcmp(const char *s1, const char *s2) +{ + return _libkernel_string_functions->strcmp(s1, s2); +} + +__attribute__((visibility("hidden"))) +char * +strcpy(char * restrict dst, const char * restrict src) +{ + return _libkernel_string_functions->strcpy(dst, src); +} + +__attribute__((visibility("hidden"))) +size_t +strlcat(char * restrict dst, const char * restrict src, size_t maxlen) +{ + return _libkernel_string_functions->strlcat(dst, src, maxlen); +} + +__attribute__((visibility("hidden"))) +size_t +strlcpy(char * restrict dst, const char * restrict src, size_t maxlen) +{ + return _libkernel_string_functions->strlcpy(dst, src, maxlen); +} + +__attribute__((visibility("hidden"))) +size_t +strlen(const char *str) +{ + return _libkernel_string_functions->strlen(str); +} + 
+__attribute__((visibility("hidden"))) +int +strncmp(const char *s1, const char *s2, size_t n) +{ + return _libkernel_string_functions->strncmp(s1, s2, n); +} + +__attribute__((visibility("hidden"))) +char * +strncpy(char * restrict dst, const char * restrict src, size_t maxlen) +{ + return _libkernel_string_functions->strncpy(dst, src, maxlen); +} + +__attribute__((visibility("hidden"))) +size_t +strnlen(const char *s, size_t maxlen) +{ + return _libkernel_string_functions->strnlen(s, maxlen); +} + +__attribute__((visibility("hidden"))) +char * +strstr(const char *s, const char *find) +{ + return _libkernel_string_functions->strstr(s, find); +} + /* * mach/mach.h voucher_mach_msg API */ diff --git a/libsyscall/wrappers/_libkernel_init.c b/libsyscall/wrappers/_libkernel_init.c index 3eb67853d..31e6cb47a 100644 --- a/libsyscall/wrappers/_libkernel_init.c +++ b/libsyscall/wrappers/_libkernel_init.c @@ -46,6 +46,7 @@ void* (*_dlsym)(void*, const char*) __attribute__((visibility("hidden"))); __attribute__((visibility("hidden"))) _libkernel_functions_t _libkernel_functions; + void __libkernel_init(_libkernel_functions_t fns, const char *envp[] __attribute__((unused)), diff --git a/libsyscall/wrappers/_libkernel_init.h b/libsyscall/wrappers/_libkernel_init.h index 68a7067e4..b081ebc90 100644 --- a/libsyscall/wrappers/_libkernel_init.h +++ b/libsyscall/wrappers/_libkernel_init.h @@ -67,6 +67,28 @@ typedef const struct _libkernel_functions { /* Subsequent versions must only add pointers! 
*/ } *_libkernel_functions_t; +typedef const struct _libkernel_string_functions { + /* The following functions are included in version 1 of this structure */ + unsigned long version; + void (*bzero)(void *s, size_t n); + void * (*memchr)(const void *s, int c, size_t n); + int (*memcmp)(const void *s1, const void *s2, size_t n); + void * (*memmove)(void *dst, const void *src, size_t n); + void * (*memccpy)(void *__restrict dst, const void *__restrict src, int c, size_t n); + void * (*memset)(void *b, int c, size_t len); + char * (*strchr)(const char *s, int c); + int (*strcmp)(const char *s1, const char *s2); + char * (*strcpy)(char * restrict dst, const char * restrict src); + size_t (*strlcat)(char * restrict dst, const char * restrict src, size_t maxlen); + size_t (*strlcpy)(char * restrict dst, const char * restrict src, size_t maxlen); + size_t (*strlen)(const char *str); + int (*strncmp)(const char *s1, const char *s2, size_t n); + char * (*strncpy)(char * restrict dst, const char * restrict src, size_t maxlen); + size_t (*strnlen)(const char *s, size_t maxlen); + char * (*strstr)(const char *s, const char *find); + /* Subsequent versions must only add pointers! 
*/ +} *_libkernel_string_functions_t; + typedef const struct _libkernel_voucher_functions { /* The following functions are included in version 1 of this structure */ unsigned long version; @@ -83,6 +105,8 @@ struct ProgramVars; /* forward reference */ void __libkernel_init(_libkernel_functions_t fns, const char *envp[], const char *apple[], const struct ProgramVars *vars); +kern_return_t __libkernel_platform_init(_libkernel_string_functions_t fns); + kern_return_t __libkernel_voucher_init(_libkernel_voucher_functions_t fns); #endif // __LIBKERNEL_INIT_H diff --git a/libsyscall/wrappers/cancelable/fcntl-base.c b/libsyscall/wrappers/cancelable/fcntl-base.c index ba5350824..fc98ea7ae 100644 --- a/libsyscall/wrappers/cancelable/fcntl-base.c +++ b/libsyscall/wrappers/cancelable/fcntl-base.c @@ -28,8 +28,6 @@ int __FCNTL(int, int, void *); * Stub function to account for the differences in the size of the third * argument when int and void * are different sizes. Also add pthread * cancelability. - * - * This is for LP64 only. */ int fcntl(int fd, int cmd, ...) 
diff --git a/libsyscall/wrappers/cancelable/fcntl-cancel.c b/libsyscall/wrappers/cancelable/fcntl-cancel.c index e5db000a6..3354657ed 100644 --- a/libsyscall/wrappers/cancelable/fcntl-cancel.c +++ b/libsyscall/wrappers/cancelable/fcntl-cancel.c @@ -21,7 +21,7 @@ * @APPLE_LICENSE_HEADER_END@ */ -#if defined(__LP64__) || defined(__arm__) +#if !defined(__i386__) #include #define __FCNTL __fcntl diff --git a/libsyscall/wrappers/cancelable/fcntl.c b/libsyscall/wrappers/cancelable/fcntl.c index f31bff7ef..830a79f5d 100644 --- a/libsyscall/wrappers/cancelable/fcntl.c +++ b/libsyscall/wrappers/cancelable/fcntl.c @@ -21,7 +21,7 @@ * @APPLE_LICENSE_HEADER_END@ */ -#if defined(__LP64__) || defined(__arm__) +#if !defined(__i386__) #undef __DARWIN_NON_CANCELABLE #define __DARWIN_NON_CANCELABLE 1 diff --git a/libsyscall/wrappers/coalition.c b/libsyscall/wrappers/coalition.c index 627da2261..ecc36b1ec 100644 --- a/libsyscall/wrappers/coalition.c +++ b/libsyscall/wrappers/coalition.c @@ -29,7 +29,7 @@ /* Syscall entry points */ int __coalition(uint32_t operation, uint64_t *cid, uint32_t flags); -int __coalition_info(uint32_t operation, uint64_t *cid, void *buffer, size_t bufsize); +int __coalition_info(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize); int coalition_create(uint64_t *cid_out, uint32_t flags) { diff --git a/libsyscall/wrappers/getiopolicy_np.c b/libsyscall/wrappers/getiopolicy_np.c index 583321622..335717f2b 100644 --- a/libsyscall/wrappers/getiopolicy_np.c +++ b/libsyscall/wrappers/getiopolicy_np.c @@ -33,7 +33,7 @@ getiopolicy_np(int iotype, int scope) int policy, error; struct _iopol_param_t iop_param; - if (iotype != IOPOL_TYPE_DISK || + if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES) || (scope != IOPOL_SCOPE_PROCESS && scope != IOPOL_SCOPE_THREAD)) { errno = EINVAL; policy = -1; diff --git a/libsyscall/wrappers/init_cpu_capabilities.c b/libsyscall/wrappers/init_cpu_capabilities.c index 8414c1dfa..70271d320 100644 --- 
a/libsyscall/wrappers/init_cpu_capabilities.c +++ b/libsyscall/wrappers/init_cpu_capabilities.c @@ -35,7 +35,7 @@ int _cpu_capabilities = 0; void _init_cpu_capabilities( void ) { - _cpu_capabilities = _get_cpu_capabilities(); + _cpu_capabilities = (int)_get_cpu_capabilities(); } #elif defined(__arm__) || defined(__arm64__) diff --git a/libsyscall/wrappers/ioctl.c b/libsyscall/wrappers/ioctl.c index eced7e7e1..a0f12a27e 100644 --- a/libsyscall/wrappers/ioctl.c +++ b/libsyscall/wrappers/ioctl.c @@ -21,7 +21,7 @@ * @APPLE_LICENSE_HEADER_END@ */ -#if defined(__LP64__) || defined(__arm__) +#if !defined(__i386__) #include #include @@ -29,8 +29,6 @@ int __ioctl(int, unsigned long, void *); /* * Stub function to account for the third argument being void * - * - * This is for LP64 only. */ int ioctl(int d, unsigned long request, ...) diff --git a/libsyscall/wrappers/libproc/libproc.c b/libsyscall/wrappers/libproc/libproc.c index 39958a1a5..5cc1f7258 100644 --- a/libsyscall/wrappers/libproc/libproc.c +++ b/libsyscall/wrappers/libproc/libproc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2010 Apple Inc. All rights reserved. + * Copyright (c) 2006-2018 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -499,7 +499,7 @@ proc_set_cpumon_params_fatal(pid_t pid, int percentage, int interval) return (ret); } - if ((ret = proc_rlimit_control(pid, RLIMIT_CPU_USAGE_MONITOR, CPUMON_MAKE_FATAL)) != 0) { + if ((ret = proc_rlimit_control(pid, RLIMIT_CPU_USAGE_MONITOR, (void *)(uintptr_t)CPUMON_MAKE_FATAL)) != 0) { /* Failed to set termination, back out the CPU monitor settings. 
*/ (void)proc_disable_cpumon(pid); } @@ -592,6 +592,12 @@ proc_setcpu_percentage(pid_t pid, int action, int percentage) return(errno); } +int +proc_reset_footprint_interval(pid_t pid) +{ + return (proc_rlimit_control(pid, RLIMIT_FOOTPRINT_INTERVAL, (void *)(uintptr_t)FOOTPRINT_INTERVAL_RESET)); +} + int proc_clear_cpulimits(pid_t pid) { @@ -712,7 +718,7 @@ proc_pidbind(int pid, uint64_t threadid, int bind) int proc_can_use_foreground_hw(int pid, uint32_t *reason) { - return __proc_info(PROC_INFO_CALL_CANUSEFGHW, pid, 0, NULL, reason, sizeof(*reason)); + return __proc_info(PROC_INFO_CALL_CANUSEFGHW, pid, 0, 0, reason, sizeof(*reason)); } #endif /* TARGET_OS_EMBEDDED */ diff --git a/libsyscall/wrappers/libproc/libproc_internal.h b/libsyscall/wrappers/libproc/libproc_internal.h index f18366427..513fda9ba 100644 --- a/libsyscall/wrappers/libproc/libproc_internal.h +++ b/libsyscall/wrappers/libproc/libproc_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Inc. All rights reserved. + * Copyright (c) 2010-2018 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -150,6 +150,8 @@ int proc_set_wakemon_params(pid_t pid, int rate_hz, int flags) __OSX_AVAILABLE_S int proc_get_wakemon_params(pid_t pid, int *rate_hz, int *flags) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); int proc_disable_wakemon(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0); +int proc_reset_footprint_interval(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_14, __IPHONE_12_0); + /* request trace buffer collection */ int proc_trace_log(pid_t pid, uint64_t uniqueid) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0); diff --git a/libsyscall/wrappers/skywalk/cpu_copy_in_cksum.s b/libsyscall/wrappers/skywalk/cpu_copy_in_cksum.s new file mode 100644 index 000000000..a32c0a210 --- /dev/null +++ b/libsyscall/wrappers/skywalk/cpu_copy_in_cksum.s @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * extern int cpu_copy_in_cksum(const void *src, void *dst, uint32_t len, + * uint32_t initial_sum); + * + * input : + * src : source starting address + * dst : destination starting address + * len : byte stream length + * initial_sum : 32bit sum + * + * output : + * the source byte stream is copied into the destination buffer + * the function returns the final 16bit checksum + */ + diff --git a/libsyscall/wrappers/skywalk/cpu_copy_in_cksum_gen.c b/libsyscall/wrappers/skywalk/cpu_copy_in_cksum_gen.c new file mode 100644 index 000000000..d21226b24 --- /dev/null +++ b/libsyscall/wrappers/skywalk/cpu_copy_in_cksum_gen.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + diff --git a/libsyscall/wrappers/skywalk/cpu_in_cksum.s b/libsyscall/wrappers/skywalk/cpu_in_cksum.s new file mode 100644 index 000000000..9410d93ac --- /dev/null +++ b/libsyscall/wrappers/skywalk/cpu_in_cksum.s @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * extern uint32_t os_cpu_in_cksum(const void *data, uint32_t len, + * uint32_t initial_sum); + * + * input : + * data : starting address + * len : byte stream length + * initial_sum : 32-bit sum + * + * output : + * This function returns the partial 16-bit checksum accumulated in + * a 32-bit variable (without 1's complement); caller is responsible + * for folding the 32-bit sum into 16-bit and performing the 1's + * complement if applicable + */ + diff --git a/libsyscall/wrappers/skywalk/cpu_in_cksum_gen.c b/libsyscall/wrappers/skywalk/cpu_in_cksum_gen.c new file mode 100644 index 000000000..d21226b24 --- /dev/null +++ b/libsyscall/wrappers/skywalk/cpu_in_cksum_gen.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + diff --git a/libsyscall/wrappers/skywalk/os_channel.c b/libsyscall/wrappers/skywalk/os_channel.c new file mode 100644 index 000000000..4aee6e02d --- /dev/null +++ b/libsyscall/wrappers/skywalk/os_channel.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + diff --git a/libsyscall/wrappers/skywalk/os_nexus.c b/libsyscall/wrappers/skywalk/os_nexus.c new file mode 100644 index 000000000..121ec4a73 --- /dev/null +++ b/libsyscall/wrappers/skywalk/os_nexus.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2015-2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + diff --git a/libsyscall/wrappers/skywalk/os_packet.c b/libsyscall/wrappers/skywalk/os_packet.c new file mode 100644 index 000000000..6eda01c17 --- /dev/null +++ b/libsyscall/wrappers/skywalk/os_packet.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2015-2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + diff --git a/libsyscall/wrappers/spawn/posix_spawn.c b/libsyscall/wrappers/spawn/posix_spawn.c index 3a149790c..20083809a 100644 --- a/libsyscall/wrappers/spawn/posix_spawn.c +++ b/libsyscall/wrappers/spawn/posix_spawn.c @@ -123,6 +123,9 @@ posix_spawnattr_init(posix_spawnattr_t *attr) (*psattrp)->psa_memlimit_active = -1; (*psattrp)->psa_memlimit_inactive = -1; + /* Default is no thread limit */ + (*psattrp)->psa_thread_limit = 0; + /* Default is no CPU usage monitor active. */ (*psattrp)->psa_cpumonitor_percent = 0; (*psattrp)->psa_cpumonitor_interval = 0; @@ -150,6 +153,8 @@ posix_spawnattr_init(posix_spawnattr_t *attr) /* Default is no change to role */ (*psattrp)->psa_darwin_role = POSIX_SPAWN_DARWIN_ROLE_NONE; + + (*psattrp)->psa_max_addr = 0; } return (err); @@ -1415,6 +1420,23 @@ posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr, return (0); } +int +posix_spawnattr_set_threadlimit_ext(posix_spawnattr_t * __restrict attr, + int thread_limit) +{ + _posix_spawnattr_t psattr; + + if (attr == NULL || *attr == NULL) + return EINVAL; + + psattr = *(_posix_spawnattr_t *)attr; + + psattr->psa_thread_limit = thread_limit; + + return (0); + +} + /* * posix_spawnattr_set_importancewatch_port_np @@ -1752,7 +1774,20 @@ posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict attr, return 0; } +int +posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr) +{ + _posix_spawnattr_t psattr; + if (attr == NULL || *attr == NULL) { + return EINVAL; + } + + psattr = *(_posix_spawnattr_t *)attr; + psattr->psa_max_addr = max_addr; + + return 0; +} /* * posix_spawn diff --git a/libsyscall/wrappers/spawn/spawn_private.h b/libsyscall/wrappers/spawn/spawn_private.h index bebd58e60..e0ea9d495 100644 --- a/libsyscall/wrappers/spawn/spawn_private.h +++ 
b/libsyscall/wrappers/spawn/spawn_private.h @@ -48,6 +48,9 @@ int posix_spawnattr_setjetsam(posix_spawnattr_t * __restrict attr, int posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr, short flags, int priority, int memlimit_active, int memlimit_inactive) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); +int posix_spawnattr_set_threadlimit_ext(posix_spawnattr_t * __restrict attr, + int thread_limit); + #define POSIX_SPAWN_IMPORTANCE_PORT_COUNT 32 int posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr, int count, mach_port_t portarray[]) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0); @@ -68,5 +71,6 @@ int posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict, uid int posix_spawnattr_set_persona_uid_np(const posix_spawnattr_t * __restrict, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); int posix_spawnattr_set_persona_gid_np(const posix_spawnattr_t * __restrict, gid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); int posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict, int, gid_t *, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0); +int posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr) __OSX_AVAILABLE_STARTING(__MAC_10_14, __IPHONE_12_0); #endif /* !defined _SPAWN_PRIVATE_H_*/ diff --git a/libsyscall/wrappers/stackshot.c b/libsyscall/wrappers/stackshot.c index 0e065edd2..3a7489e02 100644 --- a/libsyscall/wrappers/stackshot.c +++ b/libsyscall/wrappers/stackshot.c @@ -140,9 +140,9 @@ stackshot_capture_with_config(stackshot_config_t *stackshot_config) return EINVAL; } - s_config->sc_out_buffer_addr = &s_config->sc_buffer; - s_config->sc_out_size_addr = &s_config->sc_size; - ret = __stack_snapshot_with_config(STACKSHOT_CONFIG_TYPE, s_config, sizeof(stackshot_config_t)); + s_config->sc_out_buffer_addr = (uintptr_t)&s_config->sc_buffer; + s_config->sc_out_size_addr = (uintptr_t)&s_config->sc_size; + ret = 
__stack_snapshot_with_config(STACKSHOT_CONFIG_TYPE, (uintptr_t)s_config, sizeof(stackshot_config_t)); if (ret != 0) { ret = errno; diff --git a/libsyscall/wrappers/string/index.c b/libsyscall/wrappers/string/index.c index 8c4d3e05e..fcbc147da 100644 --- a/libsyscall/wrappers/string/index.c +++ b/libsyscall/wrappers/string/index.c @@ -31,8 +31,7 @@ __attribute__((visibility("hidden"))) char * -index -(const char *p, int ch) +_libkernel_strchr(const char *p, int ch) { char c; diff --git a/libsyscall/wrappers/string/memcpy.c b/libsyscall/wrappers/string/memcpy.c index ef30a90ba..53d527b6a 100644 --- a/libsyscall/wrappers/string/memcpy.c +++ b/libsyscall/wrappers/string/memcpy.c @@ -48,7 +48,7 @@ typedef int word; /* "word" used for optimal copy speed */ */ __attribute__((visibility("hidden"))) -void * memcpy(void *dst0, const void *src0, size_t length) +void * _libkernel_memmove(void *dst0, const void *src0, size_t length) { char *dst = dst0; const char *src = src0; @@ -113,13 +113,6 @@ void * memcpy(void *dst0, const void *src0, size_t length) return (dst0); } -__attribute__((visibility("hidden"))) -void * -memmove(void *s1, const void *s2, size_t n) -{ - return memcpy(s1, s2, n); -} - __attribute__((visibility("hidden"))) void bcopy(const void *s1, void *s2, size_t n) diff --git a/libsyscall/wrappers/string/memset.c b/libsyscall/wrappers/string/memset.c index cab6587d6..82c1eb0a9 100644 --- a/libsyscall/wrappers/string/memset.c +++ b/libsyscall/wrappers/string/memset.c @@ -40,9 +40,9 @@ // a recursive call to bzero. 
__attribute__((visibility("hidden"))) void -bzero(void *dst0, size_t length) +_libkernel_bzero(void *dst0, size_t length) { - return (void)memset(dst0, 0, length); + return (void)_libkernel_memset(dst0, 0, length); } #define RETURN return (dst0) @@ -51,7 +51,7 @@ bzero(void *dst0, size_t length) __attribute__((visibility("hidden"))) void * -memset(void *dst0, int c0, size_t length) +_libkernel_memset(void *dst0, int c0, size_t length) { size_t t; u_int c; diff --git a/libsyscall/wrappers/string/strcmp.c b/libsyscall/wrappers/string/strcmp.c index cffe07883..cfe403516 100644 --- a/libsyscall/wrappers/string/strcmp.c +++ b/libsyscall/wrappers/string/strcmp.c @@ -37,7 +37,7 @@ */ __attribute__((visibility("hidden"))) int -strcmp(const char *s1, const char *s2) +_libkernel_strcmp(const char *s1, const char *s2) { while (*s1 == *s2++) if (*s1++ == '\0') diff --git a/libsyscall/wrappers/string/strcpy.c b/libsyscall/wrappers/string/strcpy.c index 026d098c7..e67282e06 100644 --- a/libsyscall/wrappers/string/strcpy.c +++ b/libsyscall/wrappers/string/strcpy.c @@ -25,11 +25,11 @@ __attribute__((visibility("hidden"))) char * -strcpy(char * restrict dst, const char * restrict src) { - const size_t length = strlen(src); +_libkernel_strcpy(char * restrict dst, const char * restrict src) { + const size_t length = _libkernel_strlen(src); // The stpcpy() and strcpy() functions copy the string src to dst // (including the terminating '\0' character). - memcpy(dst, src, length+1); + _libkernel_memmove(dst, src, length+1); // The strcpy() and strncpy() functions return dst. 
return dst; } diff --git a/libsyscall/wrappers/string/strings.h b/libsyscall/wrappers/string/strings.h index 540711260..a8222044a 100644 --- a/libsyscall/wrappers/string/strings.h +++ b/libsyscall/wrappers/string/strings.h @@ -72,8 +72,17 @@ char *strsep(char **, const char *); void bcopy(const void *, void *, size_t); void bzero(void *, size_t); char *index(const char *, int); +char *strchr(const char *, int); #include "string.h" -#endif /* _STRINGS_H_ */ +void *_libkernel_memmove(void *, const void *, size_t); +void *_libkernel_memset(void *, int, size_t); +int _libkernel_strcmp(const char *, const char *); +char *_libkernel_strcpy(char *, const char *); +size_t _libkernel_strlen(const char *); +size_t _libkernel_strlcpy(char *, const char *, size_t); +void _libkernel_bzero(void *, size_t); +char *_libkernel_strchr(const char *, int); +#endif /* _STRINGS_H_ */ diff --git a/libsyscall/wrappers/string/strlcpy.c b/libsyscall/wrappers/string/strlcpy.c index 72e4b2bd8..1be4fe333 100644 --- a/libsyscall/wrappers/string/strlcpy.c +++ b/libsyscall/wrappers/string/strlcpy.c @@ -25,12 +25,12 @@ __attribute__((visibility("hidden"))) size_t -strlcpy(char * restrict dst, const char * restrict src, size_t maxlen) { - const size_t srclen = strlen(src); +_libkernel_strlcpy(char * restrict dst, const char * restrict src, size_t maxlen) { + const size_t srclen = _libkernel_strlen(src); if (srclen < maxlen) { - memcpy(dst, src, srclen+1); + _libkernel_memmove(dst, src, srclen+1); } else if (maxlen != 0) { - memcpy(dst, src, maxlen-1); + _libkernel_memmove(dst, src, maxlen-1); dst[maxlen-1] = '\0'; } return srclen; diff --git a/libsyscall/wrappers/string/strlen.c b/libsyscall/wrappers/string/strlen.c index 6854e0343..9054ac39e 100644 --- a/libsyscall/wrappers/string/strlen.c +++ b/libsyscall/wrappers/string/strlen.c @@ -75,7 +75,7 @@ static const unsigned long mask80 = 0x8080808080808080; __attribute__((visibility("hidden"))) size_t -strlen(const char *str) +_libkernel_strlen(const 
char *str) { const char *p; const unsigned long *lp; diff --git a/libsyscall/wrappers/terminate_with_reason.c b/libsyscall/wrappers/terminate_with_reason.c index 3bb8a6683..6277f4be0 100644 --- a/libsyscall/wrappers/terminate_with_reason.c +++ b/libsyscall/wrappers/terminate_with_reason.c @@ -73,7 +73,7 @@ static void abort_with_payload_wrapper_internal(uint32_t reason_namespace, uint6 /* If sending a SIGABRT failed, we fall back to SIGKILL */ terminate_with_payload(getpid(), reason_namespace, reason_code, payload, payload_size, - reason_string, reason_flags); + reason_string, reason_flags | OS_REASON_FLAG_ABORT); __builtin_unreachable(); } diff --git a/libsyscall/wrappers/thread_register_state.c b/libsyscall/wrappers/thread_register_state.c index e1181d251..d2a9f3268 100644 --- a/libsyscall/wrappers/thread_register_state.c +++ b/libsyscall/wrappers/thread_register_state.c @@ -74,8 +74,6 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt #if defined(__i386__) if (sp) *sp = state.__esp; - push_register_value(state.__eip); - push_register_value(state.__eax); push_register_value(state.__ebx); push_register_value(state.__ecx); @@ -91,8 +89,6 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt *sp = 0; } - push_register_value(state.__rip); - push_register_value(state.__rax); push_register_value(state.__rbx); push_register_value(state.__rcx); @@ -110,7 +106,6 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt #elif defined(__arm__) if (sp) *sp = state.__sp; - push_register_value(state.__pc); push_register_value(state.__lr); for (int i = 0; i < 13; i++){ @@ -118,14 +113,14 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt } #elif defined(__arm64__) if (sp) { - if (state.__sp > 128) - *sp = state.__sp - 128 /* redzone */; + uintptr_t __sp = arm_thread_state64_get_sp(state); + if (__sp > 128) + *sp = __sp - 128 /* redzone */; else *sp = 0; } - 
push_register_value(state.__pc); - push_register_value(state.__lr); + push_register_value(arm_thread_state64_get_lr(state)); for (int i = 0; i < 29; i++){ push_register_value(state.__x[i]); diff --git a/libsyscall/wrappers/varargs_wrappers.s b/libsyscall/wrappers/varargs_wrappers.s index 6a22a5395..fae37a483 100644 --- a/libsyscall/wrappers/varargs_wrappers.s +++ b/libsyscall/wrappers/varargs_wrappers.s @@ -36,112 +36,172 @@ * sem_t* __sem_open(const char *name, int oflag, int mode, int value); */ MI_ENTRY_POINT(_sem_open) + ARM64_STACK_PROLOG PUSH_FRAME +#if __LP64__ ldp x2, x3, [fp, #16] +#else + ldp w2, w3, [fp, #16] +#endif MI_CALL_EXTERNAL(___sem_open) +#if !__LP64__ + /* xnu returns a 64-bit '-1' on failure, but pointers must have the high + * 32-bits set to zero. The following instruction is equivalent to + * masking off the top 32-bits. + */ + mov w0, w0 +#endif POP_FRAME - ret + ARM64_STACK_EPILOG /* * int open(const char *name, int oflag, ...); * int __open(const char *name, int oflag, int mode, int value); */ MI_ENTRY_POINT(_open) + ARM64_STACK_PROLOG PUSH_FRAME +#if __LP64__ ldr x2, [fp, #16] +#else + ldr w2, [fp, #16] +#endif MI_CALL_EXTERNAL(___open) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int open_nocancel(const char *name, int oflag, ...); * int __open_nocancel(const char *name, int oflag, int mode); */ MI_ENTRY_POINT(_open$NOCANCEL) + ARM64_STACK_PROLOG PUSH_FRAME +#if __LP64__ ldr x2, [fp, #16] +#else + ldr w2, [fp, #16] +#endif MI_CALL_EXTERNAL(___open_nocancel) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int openat(int fd,const char *name, int oflag, ...); * int __openat(int fd, const char *name, int oflag, int mode, int value); */ MI_ENTRY_POINT(_openat) + ARM64_STACK_PROLOG PUSH_FRAME +#if __LP64__ ldr x3, [fp, #16] +#else + ldr w3, [fp, #16] +#endif MI_CALL_EXTERNAL(___openat) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int openat_nocancel(int fd, const char *name, int oflag, ...); * int __openat_nocancel(int fd, const char *name, int oflag, 
int mode); */ MI_ENTRY_POINT(_openat$NOCANCEL) + ARM64_STACK_PROLOG PUSH_FRAME +#if __LP64__ ldr x3, [fp, #16] +#else + ldr w3, [fp, #16] +#endif MI_CALL_EXTERNAL(___openat_nocancel) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int shm_open(const char *, int, ...); * int __shm_open(const char*, int oflag, int mode); */ MI_ENTRY_POINT(_shm_open) + ARM64_STACK_PROLOG PUSH_FRAME - ldr x2, [fp, #16] +#if __LP64__ + ldr x2, [fp, #16] +#else + ldr w2, [fp, #16] +#endif MI_CALL_EXTERNAL(___shm_open) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int msgsys(int, ...); * int __msgsys(int which, int a2, int a3, int a4, int a5); */ MI_ENTRY_POINT(_msgsys) + ARM64_STACK_PROLOG PUSH_FRAME - ldp x1, x2, [fp, #16] - ldp x3, x4, [fp, #32] +#if __LP64__ + ldp x1, x2, [fp, #16] + ldp x3, x4, [fp, #32] +#else + ldp w1, w2, [fp, #16] + ldp w3, w4, [fp, #24] +#endif MI_CALL_EXTERNAL(___msgsys) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int semsys(int, ...); * int __semsys(int which, int a2, int a3, int a4, int a5); */ MI_ENTRY_POINT(_semsys) + ARM64_STACK_PROLOG PUSH_FRAME - ldp x1, x2, [fp, #16] - ldp x3, x4, [fp, #32] +#if __LP64__ + ldp x1, x2, [fp, #16] + ldp x3, x4, [fp, #32] +#else + ldp w1, w2, [fp, #16] + ldp w3, w4, [fp, #24] +#endif MI_CALL_EXTERNAL(___semsys) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int semctl(int, int, int, ...); * int __semctl(int semid, int semnum, int cmd, semun_t arg); */ MI_ENTRY_POINT(_semctl) + ARM64_STACK_PROLOG PUSH_FRAME - ldr x3, [fp, #16] +#if __LP64__ + ldr x3, [fp, #16] +#else + ldr w3, [fp, #16] +#endif MI_CALL_EXTERNAL(___semctl) POP_FRAME - ret + ARM64_STACK_EPILOG /* * int shmsys(int, ...); * int __shmsys(int which, int a2, int a3, int a4); */ MI_ENTRY_POINT(_shmsys) + ARM64_STACK_PROLOG PUSH_FRAME - ldp x1, x2, [fp, #16] - ldr x3, [fp, #32] +#if __LP64__ + ldp x1, x2, [fp, #16] + ldr x3, [fp, #32] +#else + ldp w1, w2, [fp, #16] + ldr w3, [fp, #24] +#endif MI_CALL_EXTERNAL(___shmsys) POP_FRAME - ret + ARM64_STACK_EPILOG #endif /* defined(__arm64__) */ diff --git 
a/libsyscall/xcodescripts/create-syscalls.pl b/libsyscall/xcodescripts/create-syscalls.pl index 81dfc8a8a..a4e17d689 100755 --- a/libsyscall/xcodescripts/create-syscalls.pl +++ b/libsyscall/xcodescripts/create-syscalls.pl @@ -94,6 +94,17 @@ 'uuid_t' => 4, ); +# Types that potentially have different sizes in user-space compared to +# kernel-space as well as whether the value should be sign/zero-extended when +# passing the user/kernel boundary. +my %UserKernelMismatchTypes = ( + 'long' => 'SIGN_EXTEND', + 'size_t' => 'ZERO_EXTEND', + 'u_long' => 'ZERO_EXTEND', + 'user_size_t' => 'ZERO_EXTEND', + 'user_ssize_t' => 'SIGN_EXTEND' +); + # Moving towards storing all data in this hash, then we always know # if data is aliased or not, or promoted or not. my %Symbols = ( @@ -106,6 +117,7 @@ nargs => 0, bytes => 0, aliases => {}, + mismatch_args => {}, # Arguments that might need to be zero/sign-extended }, ); @@ -178,12 +190,15 @@ sub readMaster { $args =~ s/\s+$//; my $argbytes = 0; my $nargs = 0; + my %mismatch_args; if($args ne '' && $args ne 'void') { my @a = split(',', $args); $nargs = scalar(@a); - # Calculate the size of all the arguments (only used for i386) + my $index = 0; for my $type (@a) { $type =~ s/\s*\w+$//; # remove the argument name + + # Calculate the size of all the arguments (only used for i386) if($type =~ /\*$/) { $argbytes += 4; # a pointer type } else { @@ -192,6 +207,12 @@ sub readMaster { die "$MyName: $name: unknown type '$type'\n" unless defined($b); $argbytes += $b; } + # Determine which arguments might need to be zero/sign-extended + if(exists $UserKernelMismatchTypes{$type}) { + $mismatch_args{$index} = $UserKernelMismatchTypes{$type}; + } + + $index++; } } $Symbols{$name} = { @@ -203,6 +224,7 @@ sub readMaster { nargs => $nargs, bytes => $argbytes, aliases => {}, + mismatch_args => \%mismatch_args, # Arguments that might need to be zero/sign-extended except => [], }; } @@ -301,23 +323,47 @@ sub writeStubForSymbol { my ($f, $symbol) = @_; my 
@conditions; + my $has_arm64 = 0; for my $subarch (@Architectures) { (my $arch = $subarch) =~ s/arm(v.*)/arm/; $arch =~ s/x86_64(.*)/x86_64/; $arch =~ s/arm64(.*)/arm64/; push(@conditions, "defined(__${arch}__)") unless grep { $_ eq $arch } @{$$symbol{except}}; + + if($arch eq 'arm64') { # string compare: numeric == would numify both operands to 0 and match every architecture + $has_arm64 = 1 unless grep { $_ eq $arch } @{$$symbol{except}}; + } } my %is_cancel; for (@Cancelable) { $is_cancel{$_} = 1 }; - + print $f "#define __SYSCALL_32BIT_ARG_BYTES $$symbol{bytes}\n"; print $f "#include \"SYS.h\"\n\n"; + if (scalar(@conditions)) { printf $f "#ifndef SYS_%s\n", $$symbol{syscall}; printf $f "#error \"SYS_%s not defined. The header files libsyscall is building against do not match syscalls.master.\"\n", $$symbol{syscall}; - printf $f "#endif\n\n"; - my $nc = ($is_cancel{$$symbol{syscall}} ? "cerror" : "cerror_nocancel"); + printf $f "#endif\n\n"; + } + + my $nc = ($is_cancel{$$symbol{syscall}} ? "cerror" : "cerror_nocancel"); + + if($has_arm64) { + printf $f "#if defined(__arm64__)\n"; + printf $f "MI_ENTRY_POINT(%s)\n", $$symbol{asm_sym}; + if(keys %{$$symbol{mismatch_args}}) { + while(my($argnum, $extend) = each %{$$symbol{mismatch_args}}) { + printf $f "%s(%d)\n", $extend, $argnum; + } + } + + printf $f "SYSCALL_NONAME(%s, %d, %s)\n", $$symbol{syscall}, $$symbol{nargs}, $nc; + printf $f "ret\n"; + printf $f "#else\n"; + } + + if (scalar(@conditions)) { printf $f "#if " . join(" || ", @conditions) . "\n"; printf $f "__SYSCALL2(%s, %s, %d, %s)\n", $$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs}, $nc; if (!$$symbol{is_private} && (scalar(@conditions) < scalar(@Architectures))) { @@ -329,6 +375,10 @@ sub writeStubForSymbol { # actually this isnt an inconsistency. kernel can expose what it wants but if all our arches # override it we need to honour that. 
} + + if($has_arm64) { + printf $f "#endif\n\n"; + } } sub writeAliasesForSymbol { diff --git a/libsyscall/xcodescripts/mach_install_mig.sh b/libsyscall/xcodescripts/mach_install_mig.sh index 94cc8dbb7..cdd598aee 100755 --- a/libsyscall/xcodescripts/mach_install_mig.sh +++ b/libsyscall/xcodescripts/mach_install_mig.sh @@ -39,6 +39,7 @@ MIG_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach" MIG_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach" SERVER_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/servers" MACH_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach" +MACH_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach" # from old Libsystem makefiles MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 1` @@ -46,16 +47,20 @@ if [[ ( "$MACHINE_ARCH" =~ ^"arm64" || "$MACHINE_ARCH" =~ ^"x86_64" ) && `echo $ then # MACHINE_ARCH needs to be a 32-bit arch to generate vm_map_internal.h correctly. MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 2` - if [[ ( "$MACHINE_ARCH" =~ ^"arm64" || "$MACHINE_ARCH" =~ ^"x86_64" ) && `echo $ARCHS | wc -w` -gt 1 ]] - then - # MACHINE_ARCH needs to be a 32-bit arch to generate vm_map_internal.h correctly. - MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 3` - fi + if [[ ( "$MACHINE_ARCH" =~ ^"arm64" || "$MACHINE_ARCH" =~ ^"x86_64" ) && `echo $ARCHS | wc -w` -gt 2 ]] + then + # MACHINE_ARCH needs to be a 32-bit arch to generate vm_map_internal.h correctly. + MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 3` + fi fi +# MACHINE_ARCH *really* needs to be a 32-bit arch to generate vm_map_internal.h correctly, even if there are no 32-bit targets. if [[ ( "$MACHINE_ARCH" =~ ^"arm64" ) ]] then - # MACHINE_ARCH *really* needs to be a 32-bit arch to generate vm_map_internal.h correctly, even if there are no 32-bit targets. 
- MACHINE_ARCH="armv7" + MACHINE_ARCH="armv7" +fi +if [[ ( "$MACHINE_ARCH" =~ ^"x86_64" ) ]] +then + MACHINE_ARCH="i386" fi SRC="$SRCROOT/mach" @@ -109,12 +114,16 @@ MACH_HDRS="mach.h mach_error.h mach_init.h mach_interface.h + mach_right.h port_obj.h sync.h vm_task.h vm_page_size.h thread_state.h" +MACH_PRIVATE_HDRS="port_descriptions.h + mach_sync_ipc.h" + MIG_FILTERS="watchos_prohibited_mig.txt tvos_prohibited_mig.txt" # install /usr/include/server headers @@ -129,6 +138,12 @@ for hdr in $MACH_HDRS; do install $ASROOT -c -m 444 $SRC/mach/$hdr $MACH_HEADER_DST done +# install /usr/local/include/mach headers +mkdir -p $MACH_PRIVATE_HEADER_DST +for hdr in $MACH_PRIVATE_HDRS; do + install $ASROOT -c -m 444 $SRC/mach/$hdr $MACH_PRIVATE_HEADER_DST +done + # special case because we only have one to do here $MIG -novouchers -arch $MACHINE_ARCH -header "$SERVER_HEADER_DST/netname.h" $SRC/servers/netname.defs @@ -153,7 +168,7 @@ for mig in $MIGS_PRIVATE $MIGS_DUAL_PUBLIC_PRIVATE; do MIG_NAME=`basename $mig .defs` $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig if [ ! -e "$MIG_HEADER_DST/$MIG_NAME.h" ]; then - echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h" + echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h" fi done diff --git a/makedefs/MakeInc.cmd b/makedefs/MakeInc.cmd index a70a2d815..3f6f71317 100644 --- a/makedefs/MakeInc.cmd +++ b/makedefs/MakeInc.cmd @@ -34,7 +34,7 @@ else XCRUN = /usr/bin/xcrun endif -SDKROOT ?= macosx.internal +SDKROOT ?= macosx HOST_SDKROOT ?= macosx # SDKROOT may be passed as a shorthand like "iphoneos.internal". 
We @@ -50,6 +50,9 @@ override SDKROOT = $(SDKROOT_RESOLVED) ifeq ($(HOST_SDKROOT_RESOLVED),) export HOST_SDKROOT_RESOLVED := $(shell $(XCRUN) -sdk $(HOST_SDKROOT) -show-sdk-path) +ifeq ($(strip $(HOST_SDKROOT_RESOLVED)),) +export HOST_SDKROOT_RESOLVED := / +endif endif override HOST_SDKROOT = $(HOST_SDKROOT_RESOLVED) diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index dcba4ab78..73f7cdc57 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -9,7 +9,7 @@ # # Architecture Configuration options # -SUPPORTED_ARCH_CONFIGS := X86_64 X86_64H +SUPPORTED_ARCH_CONFIGS := X86_64 X86_64H ARM ARM64 # # Kernel Configuration options @@ -23,6 +23,9 @@ SUPPORTED_KERNEL_CONFIGS = RELEASE DEVELOPMENT DEBUG PROFILE KASAN SUPPORTED_X86_64_MACHINE_CONFIGS = NONE SUPPORTED_X86_64H_MACHINE_CONFIGS = NONE +SUPPORTED_ARM_MACHINE_CONFIGS = S7002 T8002 T8004 +SUPPORTED_ARM64_MACHINE_CONFIGS = S5L8960X T7000 T7001 S8000 S8001 T8010 T8011 BCM2837 + # # Setup up *_LC variables during recursive invocations @@ -47,26 +50,45 @@ COMPONENT_LIST = osfmk bsd libkern iokit pexpert libsa security san COMPONENT = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH)))) COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST)) +MACHINE_FLAGS_ARM64_S5L8960X = -DARM64_BOARD_CONFIG_S5L8960X +MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000 +MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001 +MACHINE_FLAGS_ARM_S7002 = -DARM_BOARD_CONFIG_S7002 +MACHINE_FLAGS_ARM64_S8000 = -DARM64_BOARD_CONFIG_S8000 +MACHINE_FLAGS_ARM64_S8001 = -DARM64_BOARD_CONFIG_S8001 +MACHINE_FLAGS_ARM_T8002 = -DARM_BOARD_CONFIG_T8002 +MACHINE_FLAGS_ARM_T8004 = -DARM_BOARD_CONFIG_T8004 +MACHINE_FLAGS_ARM64_T8010 = -DARM64_BOARD_CONFIG_T8010 -mcpu=hurricane +MACHINE_FLAGS_ARM64_T8011 = -DARM64_BOARD_CONFIG_T8011 -mcpu=hurricane +MACHINE_FLAGS_ARM64_BCM2837 = -DARM64_BOARD_CONFIG_BCM2837 + # # Deployment 
target flag # ifeq ($(PLATFORM),MacOSX) DEPLOYMENT_TARGET_FLAGS = -mmacosx-version-min=$(SDKVERSION) + DEPLOYMENT_LINKER_FLAGS = -Wl,-macosx_version_min,$(SDKVERSION) else ifeq ($(PLATFORM),WatchOS) - DEPLOYMENT_TARGET_FLAGS = -mwatchos-version-min=$(SDKVERSION) + DEPLOYMENT_TARGET_FLAGS = -mwatchos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_WATCH + DEPLOYMENT_LINKER_FLAGS = else ifeq ($(PLATFORM),tvOS) DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) + DEPLOYMENT_LINKER_FLAGS = else ifeq ($(PLATFORM),AppleTVOS) DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION) else ifeq ($(PLATFORM),BridgeOS) DEPLOYMENT_TARGET_FLAGS = -mbridgeos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_BRIDGE + DEPLOYMENT_LINKER_FLAGS = else ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),) DEPLOYMENT_TARGET_FLAGS = -miphoneos-version-min=$(SDKVERSION) + DEPLOYMENT_LINKER_FLAGS = -Wl,-ios_version_min,$(SDKVERSION) else ifneq ($(filter $(SUPPORTED_SIMULATOR_PLATFORMS),$(PLATFORM)),) DEPLOYMENT_TARGET_FLAGS = + DEPLOYMENT_LINKER_FLAGS = else DEPLOYMENT_TARGET_FLAGS = + DEPLOYMENT_LINKER_FLAGS = endif DEPLOYMENT_TARGET_DEFINES = -DPLATFORM_$(PLATFORM) @@ -176,7 +198,34 @@ endef ARCH_FLAGS_X86_64 = -arch x86_64 ARCH_FLAGS_X86_64H = -arch x86_64h +ifneq ($(filter ARM ARM64,$(CURRENT_ARCH_CONFIG)),) + +ifndef ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG +export ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT KernelMachOArchitecture FROM Targets WHERE KernelPlatform IS \"$(CURRENT_MACHINE_CONFIG_LC)\" LIMIT 1 || echo UNKNOWN ) +endif + +BUILD_STATIC_LINK := 1 +endif + +ARCH_FLAGS_ARM = -arch $(ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG) +ARCH_FLAGS_ARM64 = -arch $(ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG) + +# +# Clang static analyzer flags +# +ANALYZER = $(CC) +ANALYZERPP = $(CXX) +ANALYZERFLAGS = --analyze -D__clang_analyzer__ +ifneq ($(ANALYZE_FORMAT),text) +ANALYZERFLAGS += -Xanalyzer -analyzer-output=html 
+ANALYZERFLAGS += -o $(OBJROOT)/analyzer-html +else +ANALYZERFLAGS += -Xanalyzer -analyzer-output=text +endif +ifneq ($(ANALYZE_VERBOSE),YES) +ANALYZERFLAGS += -Xclang -analyzer-disable-checker -Xclang deadcode.DeadStores +endif # # Default CFLAGS @@ -215,6 +264,11 @@ CFLAGS_X86_64 = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \ CFLAGS_X86_64H = $(CFLAGS_X86_64) +CFLAGS_ARM = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \ + -fno-strict-aliasing -D__API__=v4 + +CFLAGS_ARM64 = -Darm64 -DARM64 -D__ARM64__ -DLP64 -DPAGE_SIZE_FIXED \ + -fno-strict-aliasing -D__API__=v4 -mkernel CFLAGS_RELEASEX86_64 = -O2 CFLAGS_DEVELOPMENTX86_64 = -O2 @@ -235,31 +289,58 @@ CFLAGS_DEVELOPMENTARM = -O2 CFLAGS_DEBUGARM = -O0 CFLAGS_PROFILEARM = -O2 - +CFLAGS_RELEASEARM64 = -O2 +CFLAGS_DEVELOPMENTARM64 = -O2 +CFLAGS_KASANARM64 = $(CFLAGS_DEVELOPMENTARM64) +CFLAGS_DEBUGARM64 = -O0 +CFLAGS_PROFILEARM64 = -O2 # -# KASAN support +# Sanitizers Support (KASan, UBSan) # +SAN=0 + ifeq ($(CURRENT_KERNEL_CONFIG),KASAN) KASAN = 1 endif ifeq ($(KASAN),1) - +SAN=1 BUILD_LTO = 0 +KASAN_SHIFT_ARM64=0xdffffff800000000 KASAN_SHIFT_X86_64=0xdffffe1000000000 KASAN_SHIFT_X86_64H=$(KASAN_SHIFT_X86_64) KASAN_SHIFT=$($(addsuffix $(CURRENT_ARCH_CONFIG),KASAN_SHIFT_)) -KASAN_BLACKLIST=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC) CFLAGS_GEN += -DKASAN=1 -DKASAN_SHIFT=$(KASAN_SHIFT) -fsanitize=address \ -mllvm -asan-globals-live-support \ - -mllvm -asan-mapping-offset=$(KASAN_SHIFT) \ - -fsanitize-blacklist=$(KASAN_BLACKLIST) + -mllvm -asan-mapping-offset=$(KASAN_SHIFT) endif +ifeq ($(UBSAN),1) +SAN=1 +UBSAN_CHECKS = signed-integer-overflow shift pointer-overflow # non-fatal (calls runtime, can return) +UBSAN_CHECKS_FATAL = # fatal (calls runtime, must not return) +UBSAN_CHECKS_TRAP = vla-bound builtin # emit a trap instruction (no runtime support) +UBSAN_DISABLED = bounds object-size + +ifneq ($(KASAN),1) +UBSAN_CHECKS += alignment # UBSan alignment + KASan code size is too large +UBSAN_CHECKS_FATAL += unreachable 
# UBSan unreachable doesn't play nice with ASan (40723397) +endif + +CFLAGS_GEN += -DUBSAN=1 +CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS) $(UBSAN_CHECKS_FATAL) $(UBSAN_CHECKS_TRAP),-fsanitize=$(x)) +CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_FATAL),-fno-sanitize-recover=$(x)) +CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_TRAP),-fsanitize-trap=$(x)) +endif + +ifeq ($(SAN),1) +CFLAGS_GEN += -fsanitize-blacklist=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC) +endif + CFLAGS = $(CFLAGS_GEN) \ $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \ @@ -276,7 +357,7 @@ CFLAGS = $(CFLAGS_GEN) \ OTHER_CXXFLAGS = -CXXFLAGS_GEN = -std=gnu++11 -fapple-kext $(OTHER_CXXFLAGS) +CXXFLAGS_GEN = -std=gnu++1z -fapple-kext $(OTHER_CXXFLAGS) CXXFLAGS = $(CXXFLAGS_GEN) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),CXXFLAGS_)) \ @@ -301,6 +382,8 @@ SFLAGS_PROFILE = SFLAGS_X86_64 = $(CFLAGS_X86_64) SFLAGS_X86_64H = $(CFLAGS_X86_64H) +SFLAGS_ARM = $(CFLAGS_ARM) +SFLAGS_ARM64 = $(CFLAGS_ARM64) SFLAGS = $(SFLAGS_GEN) \ $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \ @@ -330,12 +413,7 @@ LDFLAGS_KERNEL_GEN = \ -Wl,-sectalign,__TEXT,__text,0x1000 \ -Wl,-sectalign,__DATA,__common,0x1000 \ -Wl,-sectalign,__DATA,__bss,0x1000 \ - -Wl,-sectcreate,__PRELINK_TEXT,__text,/dev/null \ - -Wl,-sectcreate,"__PLK_TEXT_EXEC",__text,/dev/null \ - -Wl,-sectcreate,__PRELINK_DATA,__data,/dev/null \ - -Wl,-sectcreate,"__PLK_DATA_CONST",__data,/dev/null \ - -Wl,-sectcreate,"__PLK_LLVM_COV",__llvm_covmap,/dev/null \ - -Wl,-sectcreate,"__PLK_LINKEDIT",__data,/dev/null \ + -Wl,-sectcreate,__PRELINK_TEXT,__text,/dev/null \ -Wl,-sectcreate,__PRELINK_INFO,__info,/dev/null \ -Wl,-new_linker \ -Wl,-pagezero_size,0x0 \ @@ -343,7 +421,8 @@ LDFLAGS_KERNEL_GEN = \ -Wl,-function_starts \ -Wl,-headerpad,152 -LDFLAGS_KERNEL_SDK = -L$(SDKROOT)/usr/local/lib/kernel -lfirehose_kernel +# LDFLAGS_KERNEL_SDK = 
-L$(SDKROOT)/usr/local/lib/kernel -lfirehose_kernel +LDFLAGS_KERNEL_SDK = -L$(SDKROOT)/usr/local/lib/kernel LDFLAGS_KERNEL_RELEASE = LDFLAGS_KERNEL_DEVELOPMENT = @@ -386,9 +465,13 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \ -Wl,-no_zero_fill_sections \ $(LDFLAGS_NOSTRIP_FLAG) +ifeq ($(SAN),1) +LDFLAGS_KERNEL_RELEASEX86_64 += \ + -Wl,-sectalign,__HIB,__cstring,0x1000 +endif + ifeq ($(KASAN),1) LDFLAGS_KERNEL_RELEASEX86_64 += \ - -Wl,-sectalign,__HIB,__cstring,0x1000 \ -Wl,-sectalign,__HIB,__asan_globals,0x1000 \ -Wl,-sectalign,__HIB,__asan_liveness,0x1000 \ -Wl,-sectalign,__HIB,__mod_term_func,0x1000 \ @@ -411,6 +494,99 @@ LDFLAGS_KERNEL_DEVELOPMENTX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) LDFLAGS_KERNEL_KASANX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) LDFLAGS_KERNEL_PROFILEX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H) +# We preload ___udivmoddi4 in order to work around an issue with building +# LTO on armv7. +LDFLAGS_KERNEL_GENARM = \ + -Wl,-pie \ + -Wl,-static \ + -Wl,-image_base,0x80001000 \ + -Wl,-sectalign,__DATA,__const,0x1000 \ + -Wl,-u,___udivmoddi4 + +LDFLAGS_KERNEL_RELEASEARM = \ + $(LDFLAGS_KERNEL_GENARM) + +LDFLAGS_KERNEL_EXPORTS_RELEASEARM = \ + -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp + +LDFLAGS_KERNEL_DEVELOPMENTARM = \ + $(LDFLAGS_KERNEL_GENARM) \ + $(LDFLAGS_NOSTRIP_FLAG) + +LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM = + +LDFLAGS_KERNEL_DEBUGARM = $(LDFLAGS_KERNEL_DEVELOPMENTARM) +LDFLAGS_KERNEL_EXPORTS_DEBUGARM = $(LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM) + +# Offset image base by page to have iBoot load kernel TEXT correctly. +# First page is used for various purposes : sleep token, reset vector. +# We also need a 32MB offset, as this is the minimum block mapping size +# for a 16KB page runtime, and we wish to use the first virtual block +# to map the low globals page. 
We also need another 4MB to account for +# the address space reserved by L4 (because the reservation is not a +# multiple of the block size in alignment/length, we will implicitly map +# it with our block mapping, and we therefore must reflect that the +# first 4MB of the block mapping for xnu do not belong to xnu). +# For the moment, kaliber has a unique memory layout (monitor at the top +# of memory). Support this by breaking 16KB on other platforms and +# mandating 32MB alignment. Image base (i.e. __TEXT) must be 16KB +# aligned since ld64 will link with 16KB alignment for ARM64. +# +# We currently offset by an additional 32MB in order to reclaim memory. +# We need a dedicated virtual page for the low globals. Our bootloader +# may have a significant chunk of memory (up to an L2 entry in size) +# that lies before the kernel. The additional 32MB of virtual padding +# ensures that we have enough virtual address space to map all of that +# memory as part of the V-to-P mapping. +# 23355738 - put __PRELINK_TEXT first. We reserve enough room +# for 0x0000000003000000 = 48MB of kexts +# +# 0xfffffff000000000 (32MB range for low globals) +# 0xfffffff002000000 (32MB range to allow for large page physical slide) +# 0xfffffff004000000 (16KB range to reserve the first available page) +# 0xfffffff004004000 (48MB range for kexts) +# 0xfffffff007004000 (Start of xnu proper). 
+LDFLAGS_KERNEL_GENARM64 = \ + -Wl,-pie \ + -Wl,-static \ + -Wl,-segaddr,__PRELINK_TEXT,0xfffffff004004000 \ + -Wl,-image_base,0xfffffff007004000 \ + -Wl,-sectalign,__DATA,__const,0x4000 \ + -Wl,-rename_section,__DATA,__mod_init_func,__DATA_CONST,__mod_init_func \ + -Wl,-rename_section,__DATA,__mod_term_func,__DATA_CONST,__mod_term_func \ + -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \ + -Wl,-rename_section,__TEXT,__text,__TEXT_EXEC,__text \ + -Wl,-rename_section,__TEXT,__stubs,__TEXT_EXEC,__stubs \ + -Wl,-rename_section,__TEXT,initcode,__TEXT_EXEC,initcode \ + -Wl,-sectcreate,"__PLK_TEXT_EXEC",__text,/dev/null \ + -Wl,-sectcreate,__PRELINK_DATA,__data,/dev/null \ + -Wl,-sectcreate,"__PLK_DATA_CONST",__data,/dev/null \ + -Wl,-sectcreate,"__PLK_LLVM_COV",__llvm_covmap,/dev/null \ + -Wl,-sectcreate,"__PLK_LINKEDIT",__data,/dev/null + + +LDFLAGS_KERNEL_SEGARM64 ?= \ + -Wl,-segment_order,__TEXT:__DATA_CONST:__LINKEDIT:__TEXT_EXEC:__LAST:__KLD:__DATA:__BOOTDATA + +LDFLAGS_KERNEL_RELEASEARM64 = \ + $(LDFLAGS_KERNEL_GENARM64) \ + $(LDFLAGS_KERNEL_SEGARM64) + +LDFLAGS_KERNEL_EXPORTS_RELEASEARM64 = \ + -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp + +LDFLAGS_KERNEL_DEVELOPMENTARM64 = \ + $(LDFLAGS_KERNEL_GENARM64) \ + $(LDFLAGS_KERNEL_SEGARM64) \ + $(LDFLAGS_NOSTRIP_FLAG) + +LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM64 = + +LDFLAGS_KERNEL_KASANARM64 = $(LDFLAGS_KERNEL_DEVELOPMENTARM64) +LDFLAGS_KERNEL_DEBUGARM64 = $(LDFLAGS_KERNEL_DEVELOPMENTARM64) + +LDFLAGS_KERNEL_EXPORTS_KASANARM64 = $(LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM64) +LDFLAGS_KERNEL_EXPORTS_DEBUGARM64 = $(LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM64) LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ $(LDFLAGS_KERNEL_SDK) \ @@ -420,10 +596,15 @@ LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \ $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_))) \ $(DEPLOYMENT_TARGET_FLAGS) + +LDFLAGS_KERNEL_EXPORTS = \ + $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix 
$(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_EXPORTS_))) + # # Default runtime libraries to be linked with the kernel # -LD_KERNEL_LIBS = -lcc_kext +LD_KERNEL_LIBS = -lcc_kext +LD_KERNEL_ARCHIVES = $(LDFLAGS_KERNEL_SDK) -lfirehose_kernel # # DTrace support @@ -575,10 +756,12 @@ XNU_PRIVATE_UNIFDEF = -UMACH_KERNEL_PRIVATE -UBSD_KERNEL_PRIVATE -UIOKIT_KERNEL_ PLATFORM_UNIFDEF = $(foreach x,$(SUPPORTED_PLATFORMS),$(if $(filter $(PLATFORM),$(x)),-DPLATFORM_$(x) $(foreach token,$(PLATFORM_UNIFDEF_BLACKLIST_TOKENS_$(x)),-U$(token)),-UPLATFORM_$(x))) + SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ SINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DPRIVATE -DKERNEL -U_OPEN_SOURCE_ -U__OPEN_SOURCE__ KINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UPRIVATE -DKERNEL -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ +DATA_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -D_OPEN_SOURCE_ -D__OPEN_SOURCE__ # # Compononent Header file destinations @@ -603,6 +786,8 @@ DSYMUTIL_FLAGS_GEN = --minimize DSYMUTIL_FLAGS_X86_64 = --arch=x86_64 DSYMUTIL_FLAGS_X86_64H = --arch=x86_64h +DSYMUTIL_FLAGS_ARM = --arch=arm +DSYMUTIL_FLAGS_ARM64 = DSYMUTIL_FLAGS = $(DSYMUTIL_FLAGS_GEN) \ $($(addsuffix $(CURRENT_ARCH_CONFIG),DSYMUTIL_FLAGS_)) diff --git a/makedefs/MakeInc.kernel b/makedefs/MakeInc.kernel index 0885b3fab..55de6d307 100644 --- a/makedefs/MakeInc.kernel +++ b/makedefs/MakeInc.kernel @@ -26,8 +26,6 @@ ifeq ($(filter $(PLATFORM),$(SUPPORTED_PLATFORMS)),) $(error Unsupported PLATFORM $(PLATFORM)) endif -STATIC_KMODS = $(SRCROOT)/kmods.a - ifeq ($(BUILD_JSON_COMPILATION_DATABASE),1) do_build_setup:: $(_v)$(CAT) > $(OBJPATH)/compile_commands.json 
< /dev/null @@ -41,13 +39,29 @@ endif # 1) $(KERNEL_FILE_NAME).unstripped (raw linked kernel, unstripped) # 2) $(KERNEL_FILE_NAME) (stripped kernel, with optional CTF data) # 3) $(KERNEL_FILE_NAME).dSYM (dSYM) -# +# 4) $(KERNEL_FILE_NAME).link (bits for static linking) + +ifeq ($(BUILD_STATIC_LINK),1) + +KERNEL_STATIC_LINK_TARGETS = \ + $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a + +KERNEL_STATIC_LINK_DST = \ + $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a \ + $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments \ + $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives \ + $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp \ + $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp \ + $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros \ + $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) + +endif do_build_all:: do_build_kernel .PHONY: do_build_kernel -do_build_kernel: $(TARGET)/$(KERNEL_FILE_NAME) $(TARGET)/$(KERNEL_FILE_NAME).unstripped +do_build_kernel: $(TARGET)/$(KERNEL_FILE_NAME) $(TARGET)/$(KERNEL_FILE_NAME).unstripped $(KERNEL_STATIC_LINK_TARGETS) @: ifeq ($(BUILD_DSYM),1) @@ -60,7 +74,7 @@ do_build_kernel_dSYM: $(TARGET)/$(KERNEL_FILE_NAME).dSYM @: .LDFLAGS: ALWAYS - $(_v)$(REPLACECONTENTS) $@ $(LD) $(LDFLAGS_KERNEL) $(LD_KERNEL_LIBS) + $(_v)$(REPLACECONTENTS) $@ $(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) $(LD_KERNEL_LIBS) .CFLAGS: ALWAYS $(_v)$(REPLACECONTENTS) $@ $(KCC) $(CFLAGS) $(INCFLAGS) @@ -90,7 +104,18 @@ $(TARGET)/$(KERNEL_FILE_NAME).unstripped: $(addprefix $(TARGET)/,$(foreach compo $(_v)${MAKE} -f $(firstword $(MAKEFILE_LIST)) version.o @echo "$(ColorL)LD$(Color0) $(ColorLF)$(@F)$(Color0)" $(_v)$(CAT) $(filter 
%.filelist,$+) < /dev/null > link.filelist - $(_v)$(LD) $(LDFLAGS_KERNEL) -filelist link.filelist version.o $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS) + $(_v)$(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist link.filelist version.o $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES) + +$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o $(TARGET)/$(KERNEL_FILE_NAME).unstripped .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) + @echo "$(ColorL)LIBTOOL$(Color0) $(ColorLF)$(@F)$(Color0)" + $(_v)$(MKDIR) $(dir $@) + $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > libtool.filelist + $(_v)$(LIBTOOL) -ca -filelist libtool.filelist $(filter %.o,$+) version.o -o $@ + $(_v)cp $(TARGET)/all-kpi.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp + $(_v)cp $(TARGET)/all-alias.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp + $(_v)echo "$(LD_KERNEL_ARCHIVES)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives + $(_v)echo "$(LDFLAGS_KERNEL) $(LD_KERNEL_LIBS)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments + $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/$(KERNEL_FILE_NAME).link $(OBJROOT)/$(KERNEL_FILE_NAME).link -include version.d version.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) @@ -105,6 +130,7 @@ $(OBJPATH)/version.c: $(SRCROOT)/config/version.c $(NEWVERS) $(SRCROOT)/config/M $(_v)$(CP) $< $@ $(_v)$(NEWVERS) $(OBJPATH)/version.c > /dev/null; + -include lastkerneldataconst.d lastkerneldataconst.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST)) lastkerneldataconst.o: $(SRCROOT)/libsa/lastkerneldataconst.c @@ -194,6 +220,36 @@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NA fi; \ exit $$cmdstatus +ifeq 
($(BUILD_STATIC_LINK),1) + +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a ALWAYS + $(_v)$(MKDIR) $(dir $@) + @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ + +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments ALWAYS + $(_v)$(MKDIR) $(dir $@) + @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ + +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives ALWAYS + $(_v)$(MKDIR) $(dir $@) + @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ + +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp ALWAYS + $(_v)$(MKDIR) $(dir $@) + @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ + +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp ALWAYS + $(_v)$(MKDIR) $(dir $@) + @echo "$(ColorH)INSTALL$(Color0) $(ColorF)$(@F)$(Color0)" + $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ + +# BUILD_STATIC_LINK +endif + $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS $(_v)$(MKDIR) $(dir $@) $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then \ @@ -207,13 +263,20 @@ $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS fi; \ exit $$cmdstatus -$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros: 
$(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros + +$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros \ +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros \ +$(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros: \ +$(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros $(_v)$(MKDIR) $(dir $@) @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" $(_v)$(CP) -r $< $(dir $@) $(_v)$(TOUCH) $@ -$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME): $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) +$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) \ +$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) \ +$(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME): \ +$(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) $(_v)$(MKDIR) $(dir $@) @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\"" $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@ @@ -243,8 +306,9 @@ $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) $(DSTROO .PHONY: do_install_machine_specific_kernel do_install_machine_specific_kernel_dSYM -do_install_machine_specific_kernel: $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME) \ - $(SYMROOT)/$(KERNEL_FILE_NAME) +do_install_machine_specific_kernel: $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME) \ + $(SYMROOT)/$(KERNEL_FILE_NAME) \ + $(KERNEL_STATIC_LINK_DST) @: 
do_install_machine_specific_kernel_dSYM: \ diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index 8c1c5d0c8..d2e05a89f 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -112,7 +112,7 @@ $(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) $(4) $(1): $(dir $(firstword $(1)))% : $(if $(2),%,$$(SOURCE)/%) | $(3)_MKDIR - @echo "$$(ColorH)INSTALLHDR$(Color0) $$(ColorF)$$*$$(Color0)" + @echo "$$(ColorH)INSTALLHDR$$(Color0) $$(ColorF)$$*$$(Color0)" $$(_v)$$(UNIFDEF) $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$; \ if [ $$$$? -eq 2 ]; then \ echo Parse failure for $$<; \ @@ -126,6 +126,43 @@ $(1): $(dir $(firstword $(1)))% : $(if $(2),%,$$(SOURCE)/%) | $(3)_MKDIR $$(RM) ./$(3)/$$*.unifdef.$$$$$$$$ ./$(3)/$$*.strip.$$$$$$$$ endef +# $(1) is the list of install paths +# $(2) is the source path pattern (using % to match with $(5)) or source file +# $(3) is the local temp directory for processing +# $(4) is the unifdef flags +# $(5) is the destination directory (when pattern matching) or empty +# +# $$$$$$$$ is a double-escaped "$$" to represent the current pid +# of the shell process for creating uniquely named temporary files + +define INSTALLPYTHON_RULE_template + +.PHONY: $(3)_MKDIR + +$(3)_MKDIR: + $$(_v)$$(MKDIR) ./$(3) + +# Rebuild if unifdef flags change +$(1): $(3)/.UNIFDEF_FLAGS +$(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR + $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) -t $(4) + +$(1): $(5)% : $(2) | $(3)_MKDIR + @echo "$$(ColorH)INSTALLPY$$(Color0) $$(ColorF)$$*$$(Color0)" + $$(_v)$$(MKDIR) $$(dir $$@) $$(dir ./$(3)/$$*) + $$(_v)$$(UNIFDEF) -t $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*); \ + if [ $$$$? 
-eq 2 ]; then \ + echo Parse failure for $$<; \ + exit 1; \ + fi; \ + $$(INSTALL) $$(DATA_INSTALL_FLAGS) \ + ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*) $$@ || exit 1; \ + $$(PYTHON) $$(LLDBMACROS_SOURCE)/core/syntax_checker.py \ + ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*) $$(_vstdout) || exit 1; \ + $$(RM) ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*) + $$(_v)if [ -n "$(5)" ]; then $$(TOUCH) "$(5)"; fi +endef + # # Machine-independent (public) files # diff --git a/makedefs/MakeInc.top b/makedefs/MakeInc.top index 76fd8500a..6d9bcf146 100644 --- a/makedefs/MakeInc.top +++ b/makedefs/MakeInc.top @@ -87,7 +87,8 @@ endif override DEFAULT_I386_MACHINE_CONFIG := NONE override DEFAULT_X86_64_MACHINE_CONFIG := NONE override DEFAULT_X86_64H_MACHINE_CONFIG := NONE - +override DEFAULT_ARM_MACHINE_CONFIG := T8002 +override DEFAULT_ARM64_MACHINE_CONFIG := S5L8960X # This is typically never specified (TARGET_CONFIGS is used) ifndef MACHINE_CONFIGS @@ -114,6 +115,76 @@ endif # default kernel configuration = DEFAULT_KERNEL_CONFIG # default architecture configuration = system architecture where you are running make. 
+ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),) + +# Defaults for "make all_embedded" +ifeq ($(KERNEL_CONFIGS),DEFAULT) +KERNEL_CONFIGS_EMBEDDED := RELEASE DEVELOPMENT +else +KERNEL_CONFIGS_EMBEDDED := $(KERNEL_CONFIGS) +endif + +ifeq ($(ARCH_CONFIGS),DEFAULT) +ARCH_CONFIGS_EMBEDDED := ARM ARM64 +else +ARCH_CONFIGS_EMBEDDED := $(strip $(shell echo $(ARCH_CONFIGS) | $(TR) a-z A-Z)) +endif + +# Find supported products from the device map +DEVICEMAP_PRODUCTS_ARMV7 := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) \ + -query 'SELECT DISTINCT TargetType \ + FROM Files \ + INNER JOIN Manifests USING (manifestID) \ + INNER JOIN Targets USING (Target) \ + WHERE (KernelMachOArchitecture LIKE "armv7" \ + AND fileType in ("KernelCache", "RestoreKernelCache"))') +DEVICEMAP_PRODUCTS_ARMV7S := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) \ + -query 'SELECT DISTINCT TargetType \ + FROM Files \ + INNER JOIN Manifests USING (manifestID) \ + INNER JOIN Targets USING (Target) \ + WHERE (KernelMachOArchitecture LIKE "armv7s" \ + AND fileType in ("KernelCache", "RestoreKernelCache"))') +DEVICEMAP_PRODUCTS_ARMV7K := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) \ + -query 'SELECT DISTINCT TargetType \ + FROM Files \ + INNER JOIN Manifests USING (manifestID) \ + INNER JOIN Targets USING (Target) \ + WHERE (KernelMachOArchitecture LIKE "armv7k" \ + AND fileType in ("KernelCache", "RestoreKernelCache"))') +DEVICEMAP_PRODUCTS_ARM := $(DEVICEMAP_PRODUCTS_ARMV7) $(DEVICEMAP_PRODUCTS_ARMV7S) $(DEVICEMAP_PRODUCTS_ARMV7K) + + +DEVICEMAP_PRODUCTS_ARM64 := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) \ + -query 'SELECT DISTINCT TargetType \ + FROM Files \ + INNER JOIN Manifests USING (manifestID) \ + INNER JOIN Targets USING (Target) \ + WHERE (KernelMachOArchitecture LIKE "arm64" \ + AND fileType in ("KernelCache", "RestoreKernelCache"))') + + +# Generate a list of mappings of the form "n75:arm;t8002" based on the device map +DEVICEMAP_PRODUCT_SOC_MAPPINGS := $(shell 
$(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT TargetType, KernelMachOArchitecture, KernelPlatform FROM Targets | awk -F\| '{ if ($$2 ~ /armv[0-9][a-z]?/) { print $$1 ":arm;" $$3 } else if ($$2 ~ /arm64[a-z]?/) { print $$1 ":arm64;" $$3 ";" $$4} else { print $$1 ":" $$2 ";" $$3 ";" $$4} }' ) + +# Map a product like "n75" to "arm;t8002" +# $(1) is a product name in lower case +function_lookup_product = $(call function_substitute_word_with_replacement, \ + $(1), \ + $(DEVICEMAP_PRODUCT_SOC_MAPPINGS), \ + unknown_arch_for_$(1);unknown_platform_for_$(1) \ + ) + +# Generate a list of mappings for products that use a different platform for their kernel configuration than their true platform +# of the form "n71m:arm64;s8000;s8003". The 4th element is the true SoC platform, which will get an on-disk copy, while the +# kernel's recursive build system will build the 3rd element as the KernelPlatform +DEVICEMAP_PRODUCT_SOC_ALIASES := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT TargetType, KernelMachOArchitecture, KernelPlatform, Platform FROM Targets WHERE KernelPlatform "!=" Platform | awk -F\| '{ if ($$2 ~ /armv[0-9][a-z]?/) { print $$1 ":arm;" $$3 ";" $$4} else if ($$2 ~ /arm64[a-z]?/) { print $$1 ":arm64;" $$3 ";" $$4} else { print $$1 ":" $$2 ";" $$3 ";" $$4} }' ) + +function_lookup_product_alias = $(call function_substitute_word_with_replacement, \ + $(1), \ + $(DEVICEMAP_PRODUCT_SOC_ALIASES), \ + ) +endif ifeq ($(PLATFORM),MacOSX) diff --git a/osfmk/Makefile b/osfmk/Makefile index 64af5e1e0..b3beb2619 100644 --- a/osfmk/Makefile +++ b/osfmk/Makefile @@ -6,6 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) + INSTINC_SUBDIRS = \ mach \ atm \ @@ -70,7 +71,8 @@ EXPINC_SUBDIRS = \ libsa \ console \ kperf \ - prng + prng \ + tests EXPINC_SUBDIRS_X86_64 = \ diff --git a/osfmk/arm/Makefile b/osfmk/arm/Makefile index 8fd552a10..38c19f380 100644 --- a/osfmk/arm/Makefile +++ 
b/osfmk/arm/Makefile @@ -21,11 +21,11 @@ ARM_HEADER_FILES = \ machine_cpuid.h \ machine_routines.h \ pal_routines.h \ + pmap_public.h \ proc_reg.h \ + simple_lock.h \ smp.h \ - thread.h \ - simple_lock.h - + thread.h INSTALL_MD_DIR = arm diff --git a/osfmk/arm/arm_init.c b/osfmk/arm/arm_init.c index e81af968b..b38086203 100644 --- a/osfmk/arm/arm_init.c +++ b/osfmk/arm/arm_init.c @@ -83,9 +83,10 @@ extern int serial_init(void); extern void sleep_token_buffer_init(void); extern vm_offset_t intstack_top; -extern vm_offset_t fiqstack_top; #if __arm64__ extern vm_offset_t excepstack_top; +#else +extern vm_offset_t fiqstack_top; #endif extern const char version[]; @@ -132,10 +133,74 @@ unsigned int page_shift_user32; /* for page_size as seen by a 32-bit task */ #endif /* __arm64__ */ +/* + * JOP rebasing + */ + + +// Note, the following should come from a header from dyld +static void +rebase_chain(uintptr_t chainStartAddress, uint64_t stepMultiplier, uintptr_t baseAddress __unused, uint64_t slide) +{ + uint64_t delta = 0; + uintptr_t address = chainStartAddress; + do { + uint64_t value = *(uint64_t*)address; + + bool isAuthenticated = (value & (1ULL << 63)) != 0; + bool isRebase = (value & (1ULL << 62)) == 0; + if (isRebase) { + if (isAuthenticated) { + // The new value for a rebase is the low 32-bits of the threaded value plus the slide. + uint64_t newValue = (value & 0xFFFFFFFF) + slide; + // Add in the offset from the mach_header + newValue += baseAddress; + *(uint64_t*)address = newValue; + + } else + { + // Regular pointer which needs to fit in 51-bits of value. + // C++ RTTI uses the top bit, so we'll allow the whole top-byte + // and the bottom 43-bits to be fit in to 51-bits. 
+ uint64_t top8Bits = value & 0x0007F80000000000ULL; + uint64_t bottom43Bits = value & 0x000007FFFFFFFFFFULL; + uint64_t targetValue = ( top8Bits << 13 ) | (((intptr_t)(bottom43Bits << 21) >> 21) & 0x00FFFFFFFFFFFFFF); + targetValue = targetValue + slide; + *(uint64_t*)address = targetValue; + } + } + + // The delta is bits [51..61] + // And bit 62 is to tell us if we are a rebase (0) or bind (1) + value &= ~(1ULL << 62); + delta = ( value & 0x3FF8000000000000 ) >> 51; + address += delta * stepMultiplier; + } while ( delta != 0 ); +} + +// Note, the following method should come from a header from dyld +static bool +rebase_threaded_starts(uint32_t *threadArrayStart, uint32_t *threadArrayEnd, + uintptr_t macho_header_addr, uintptr_t macho_header_vmaddr, size_t slide) +{ + uint32_t threadStartsHeader = *threadArrayStart; + uint64_t stepMultiplier = (threadStartsHeader & 1) == 1 ? 8 : 4; + for (uint32_t* threadOffset = threadArrayStart + 1; threadOffset != threadArrayEnd; ++threadOffset) { + if (*threadOffset == 0xFFFFFFFF) + break; + rebase_chain(macho_header_addr + *threadOffset, stepMultiplier, macho_header_vmaddr, slide); + } + return true; +} + /* * Routine: arm_init * Function: */ + +extern uint32_t __thread_starts_sect_start[] __asm("section$start$__TEXT$__thread_starts"); +extern uint32_t __thread_starts_sect_end[] __asm("section$end$__TEXT$__thread_starts"); + void arm_init( boot_args *args) @@ -146,15 +211,27 @@ arm_init( thread_t thread; processor_t my_master_proc; + // rebase and sign jops + if (&__thread_starts_sect_end[0] != &__thread_starts_sect_start[0]) + { + uintptr_t mh = (uintptr_t) &_mh_execute_header; + uintptr_t slide = mh - VM_KERNEL_LINK_ADDRESS; + rebase_threaded_starts( &__thread_starts_sect_start[0], + &__thread_starts_sect_end[0], + mh, mh - slide, slide); + } + /* If kernel integrity is supported, use a constant copy of the boot args. 
*/ const_boot_args = *args; - BootArgs = &const_boot_args; + BootArgs = args = &const_boot_args; cpu_data_init(&BootCpuData); - PE_init_platform(FALSE, args); /* Get platform expert set up */ + PE_init_platform(FALSE, args); /* Get platform expert set up */ #if __arm64__ + + { unsigned int tmp_16k = 0; @@ -221,11 +298,12 @@ arm_init( #endif BootCpuData.intstack_top = (vm_offset_t) & intstack_top; BootCpuData.istackptr = BootCpuData.intstack_top; - BootCpuData.fiqstack_top = (vm_offset_t) & fiqstack_top; - BootCpuData.fiqstackptr = BootCpuData.fiqstack_top; #if __arm64__ BootCpuData.excepstack_top = (vm_offset_t) & excepstack_top; BootCpuData.excepstackptr = BootCpuData.excepstack_top; +#else + BootCpuData.fiqstack_top = (vm_offset_t) & fiqstack_top; + BootCpuData.fiqstackptr = BootCpuData.fiqstack_top; #endif BootCpuData.cpu_processor = cpu_processor_alloc(TRUE); BootCpuData.cpu_console_buf = (void *)NULL; @@ -312,6 +390,10 @@ arm_init( printf_init(); panic_init(); +#if __arm64__ + /* Enable asynchronous exceptions */ + __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF); +#endif #if __arm64__ && WITH_CLASSIC_S2R sleep_token_buffer_init(); #endif @@ -367,7 +449,7 @@ arm_init( PE_init_platform(TRUE, &BootCpuData); cpu_timebase_init(TRUE); - fiq_context_init(TRUE); + fiq_context_bootstrap(TRUE); /* @@ -407,8 +489,10 @@ arm_init_cpu( machine_set_current_thread(cpu_data_ptr->cpu_active_thread); #if __arm64__ + pmap_clear_user_ttb(); + flush_mmu_tlb(); /* Enable asynchronous exceptions */ - __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF); + __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF); #endif cpu_machine_idle_init(FALSE); @@ -455,10 +539,11 @@ arm_init_cpu( #if CONFIG_TELEMETRY bootprofile_wake_from_sleep(); #endif /* CONFIG_TELEMETRY */ + } #if MONOTONIC && defined(__arm64__) - mt_wake(); + mt_wake_per_core(); #endif /* MONOTONIC && defined(__arm64__) */ - } + slave_main(NULL); } @@ -481,8 +566,10 @@ arm_init_idle_cpu( machine_set_current_thread(cpu_data_ptr->cpu_active_thread); 
#if __arm64__ + pmap_clear_user_ttb(); + flush_mmu_tlb(); /* Enable asynchronous exceptions */ - __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF); + __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF); #endif #if (__ARM_ARCH__ == 7) @@ -496,5 +583,5 @@ arm_init_idle_cpu( fiq_context_init(FALSE); - cpu_idle_exit(); + cpu_idle_exit(TRUE); } diff --git a/osfmk/arm/arm_vm_init.c b/osfmk/arm/arm_vm_init.c index 07bcfb9b2..ebdfe7735 100644 --- a/osfmk/arm/arm_vm_init.c +++ b/osfmk/arm/arm_vm_init.c @@ -43,11 +43,13 @@ #include #include +#include #include #include #include #include +#include #include @@ -77,6 +79,9 @@ vm_offset_t vm_elinkedit; vm_offset_t vm_prelink_sdata; vm_offset_t vm_prelink_edata; +vm_offset_t vm_kernel_builtinkmod_text; +vm_offset_t vm_kernel_builtinkmod_text_end; + unsigned long gVirtBase, gPhysBase, gPhysSize; /* Used by */ vm_offset_t mem_size; /* Size of actual physical memory present @@ -93,6 +98,9 @@ addr64_t vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Highest kernel * virtual address known * to the VM system */ +vm_offset_t segEXTRADATA; +unsigned long segSizeEXTRADATA; +vm_offset_t segLOWESTTEXT; static vm_offset_t segTEXTB; static unsigned long segSizeTEXT; static vm_offset_t segDATAB; @@ -105,6 +113,11 @@ static vm_offset_t segLASTB; static unsigned long segSizeLAST; static vm_offset_t sectCONSTB; static unsigned long sectSizeCONST; +vm_offset_t segBOOTDATAB; +unsigned long segSizeBOOTDATA; +extern vm_offset_t intstack_low_guard; +extern vm_offset_t intstack_high_guard; +extern vm_offset_t fiqstack_high_guard; vm_offset_t segPRELINKTEXTB; unsigned long segSizePRELINKTEXT; @@ -139,6 +152,11 @@ extern vm_offset_t ExceptionVectorsBase; /* the code we want to load there */ #define round_x_table(x) \ (((pmap_paddr_t)(x) + (ARM_PGBYTES<<2) - 1) & ~((ARM_PGBYTES<<2) - 1)) +vm_map_address_t +phystokv(pmap_paddr_t pa) +{ + return (pa - gPhysBase + gVirtBase); +} static void arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, @@ -154,6 
+172,11 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, pa = va - gVirtBase + gPhysBase; + if (pa >= avail_end) + return; + + assert(_end >= va); + if (ARM_TTE_TYPE_TABLE == (tmplate & ARM_TTE_TYPE_MASK)) { /* pick up the existing page table. */ ppte = (pt_entry_t *)phystokv((tmplate & ARM_TTE_TABLE_MASK)); @@ -161,13 +184,17 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, /* TTE must be reincarnated COARSE. */ ppte = (pt_entry_t *)phystokv(avail_start); avail_start += ARM_PGBYTES; - - pmap_init_pte_static_page(kernel_pmap, ppte, pa); + bzero(ppte, ARM_PGBYTES); for (i = 0; i < 4; ++i) tte[i] = pa_to_tte(kvtophys((vm_offset_t)ppte) + (i * 0x400)) | ARM_TTE_TYPE_TABLE; } + vm_offset_t len = _end - va; + if ((pa + len) > avail_end) + _end -= (pa + len - avail_end); + assert((start - gVirtBase + gPhysBase) >= gPhysBase); + /* Apply the desired protections to the specified page range */ for (i = 0; i < (ARM_PGBYTES / sizeof(*ppte)); i++) { if (start <= va && va < _end) { @@ -189,7 +216,7 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, static void arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, - int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, int forceCoarse) + int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, int force_page_granule) { vm_offset_t _end = start + size; vm_offset_t align_start = (start + ARM_TT_L1_PT_OFFMASK) & ~ARM_TT_L1_PT_OFFMASK; @@ -198,7 +225,7 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, arm_vm_page_granular_helper(start, _end, start, pte_prot_APX, pte_prot_XN); while (align_start < align_end) { - if (forceCoarse) { + if (force_page_granule) { arm_vm_page_granular_helper(align_start, align_end, align_start + 1, pte_prot_APX, pte_prot_XN); } else { @@ -221,27 +248,27 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, } static inline void -arm_vm_page_granular_RNX(vm_offset_t 
start, unsigned long size, int forceCoarse) +arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, int force_page_granule) { - arm_vm_page_granular_prot(start, size, 1, AP_RONA, 1, forceCoarse); + arm_vm_page_granular_prot(start, size, 1, AP_RONA, 1, force_page_granule); } static inline void -arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, int forceCoarse) +arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, int force_page_granule) { - arm_vm_page_granular_prot(start, size, 0, AP_RONA, 0, forceCoarse); + arm_vm_page_granular_prot(start, size, 0, AP_RONA, 0, force_page_granule); } static inline void -arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, int forceCoarse) +arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, int force_page_granule) { - arm_vm_page_granular_prot(start, size, 1, AP_RWNA, 1, forceCoarse); + arm_vm_page_granular_prot(start, size, 1, AP_RWNA, 1, force_page_granule); } static inline void -arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, int forceCoarse) +arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, int force_page_granule) { - arm_vm_page_granular_prot(start, size, 0, AP_RWNA, 0, forceCoarse); + arm_vm_page_granular_prot(start, size, 0, AP_RWNA, 0, force_page_granule); } void @@ -276,6 +303,10 @@ arm_vm_prot_init(boot_args * args) /* If we aren't protecting const, just map DATA as a single blob. 
*/ arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, FALSE); } + arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, TRUE); + arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, TRUE); + arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, TRUE); + arm_vm_page_granular_RNX((vm_offset_t)&fiqstack_high_guard, PAGE_MAX_SIZE, TRUE); arm_vm_page_granular_ROX(segKLDB, segSizeKLD, force_coarse_physmap); arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, force_coarse_physmap); @@ -283,7 +314,8 @@ arm_vm_prot_init(boot_args * args) arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, TRUE); // Refined in OSKext::readPrelinkedExtensions arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, end_kern - (segPRELINKTEXTB + segSizePRELINKTEXT), force_coarse_physmap); // PreLinkInfoDictionary - arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, force_coarse_physmap); // Device Tree, RAM Disk (if present), bootArgs + arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, force_coarse_physmap); // Device Tree, RAM Disk (if present), bootArgs, trust caches + arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, FALSE); // tighter trust cache protection arm_vm_page_granular_RWNX(phystokv(args->topOfKernelData), ARM_PGBYTES * 8, FALSE); // boot_tte, cpu_tte /* @@ -319,6 +351,8 @@ arm_vm_prot_init(boot_args * args) void arm_vm_prot_finalize(boot_args * args) { + cpu_stack_alloc(&BootCpuData); + ml_static_mfree(segBOOTDATAB, segSizeBOOTDATA); /* * Naively we could have: * arm_vm_page_granular_ROX(segTEXTB, segSizeTEXT, FALSE); @@ -335,6 +369,13 @@ arm_vm_prot_finalize(boot_args * args) flush_mmu_tlb(); } +/* used in the chosen/memory-map node, populated by iBoot. 
*/ +typedef struct MemoryMapFileInfo { + vm_offset_t paddr; + size_t length; +} MemoryMapFileInfo; + + void arm_vm_init(uint64_t memory_size, boot_args * args) { @@ -391,9 +432,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args) } while (tte < tte_limit) { - *tte = ARM_TTE_TYPE_FAULT; - tte++; - } + *tte = ARM_TTE_TYPE_FAULT; + tte++; + } /* Skip 6 pages (four L1 + two L2 entries) */ avail_start = cpu_ttep + ARM_PGBYTES * 6; @@ -404,12 +445,33 @@ arm_vm_init(uint64_t memory_size, boot_args * args) * from MACH-O headers for the currently running 32 bit kernel. */ segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT", &segSizeTEXT); + segLOWESTTEXT = segTEXTB; segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA); segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK); segKLDB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD); segLASTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LAST", &segSizeLAST); segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &segSizePRELINKTEXT); segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_INFO", &segSizePRELINKINFO); + segBOOTDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA); + + segEXTRADATA = 0; + segSizeEXTRADATA = 0; + + DTEntry memory_map; + MemoryMapFileInfo *trustCacheRange; + unsigned int trustCacheRangeSize; + int err; + + err = DTLookupEntry(NULL, "chosen/memory-map", &memory_map); + assert(err == kSuccess); + + err = DTGetProperty(memory_map, "TrustCache", (void**)&trustCacheRange, &trustCacheRangeSize); + if (err == kSuccess) { + assert(trustCacheRangeSize == sizeof(MemoryMapFileInfo)); + + segEXTRADATA = phystokv(trustCacheRange->paddr); + segSizeEXTRADATA = trustCacheRange->length; + } etext = (vm_offset_t) segTEXTB + segSizeTEXT; sdata = (vm_offset_t) 
segDATAB; @@ -492,7 +554,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) sane_size = mem_size - (avail_start - gPhysBase); max_mem = mem_size; - vm_kernel_slide = gVirtBase-0x80000000; + vm_kernel_slide = gVirtBase-VM_KERNEL_LINK_ADDRESS; vm_kernel_stext = segTEXTB; vm_kernel_etext = segTEXTB + segSizeTEXT; vm_kernel_base = gVirtBase; diff --git a/osfmk/arm/atomic.h b/osfmk/arm/atomic.h index 3da426b3d..8f83828ed 100644 --- a/osfmk/arm/atomic.h +++ b/osfmk/arm/atomic.h @@ -29,6 +29,7 @@ #ifndef _ARM_ATOMIC_H_ #define _ARM_ATOMIC_H_ +#include #include // Parameter for __builtin_arm_dmb @@ -213,6 +214,7 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, #endif // ATOMIC_PRIVATE #if __arm__ +#undef os_atomic_rmw_loop #define os_atomic_rmw_loop(p, ov, nv, m, ...) ({ \ boolean_t _result = FALSE; uint32_t _err = 0; \ typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \ @@ -234,7 +236,14 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, } \ _result; \ }) + +#undef os_atomic_rmw_loop_give_up +#define os_atomic_rmw_loop_give_up(expr) \ + ({ __builtin_arm_clrex(); expr; __builtin_trap(); }) + #else + +#undef os_atomic_rmw_loop #define os_atomic_rmw_loop(p, ov, nv, m, ...) 
({ \ boolean_t _result = FALSE; \ typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \ @@ -253,9 +262,25 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, } while (__builtin_expect(!_result, 0)); \ _result; \ }) -#endif +#undef os_atomic_rmw_loop_give_up #define os_atomic_rmw_loop_give_up(expr) \ ({ __builtin_arm_clrex(); expr; __builtin_trap(); }) +#endif + +#undef os_atomic_force_dependency_on +#if defined(__arm64__) +#define os_atomic_force_dependency_on(p, e) ({ \ + unsigned long _v; \ + __asm__("and %x[_v], %x[_e], xzr" : [_v] "=r" (_v) : [_e] "r" (e)); \ + (typeof(*(p)) *)((char *)(p) + _v); \ + }) +#else +#define os_atomic_force_dependency_on(p, e) ({ \ + unsigned long _v; \ + __asm__("and %[_v], %[_e], #0" : [_v] "=r" (_v) : [_e] "r" (e)); \ + (typeof(*(p)) *)((char *)(p) + _v); \ + }) +#endif // defined(__arm64__) #endif // _ARM_ATOMIC_H_ diff --git a/osfmk/arm/caches.c b/osfmk/arm/caches.c index 5f37e202d..91d489d08 100644 --- a/osfmk/arm/caches.c +++ b/osfmk/arm/caches.c @@ -66,13 +66,13 @@ flush_dcache( cpu_data_t *cpu_data_ptr = getCpuDatap(); if (phys) { - unsigned int paddr; - unsigned int vaddr; + pmap_paddr_t paddr; + vm_offset_t vaddr; - paddr = CAST_DOWN(unsigned int, addr); + paddr = CAST_DOWN(pmap_paddr_t, addr); if (!isphysmem(paddr)) return; - vaddr = (unsigned int)phystokv(paddr); + vaddr = phystokv(paddr); FlushPoC_DcacheRegion( (vm_offset_t) vaddr, length); if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) @@ -111,19 +111,19 @@ clean_dcache( cpu_data_t *cpu_data_ptr = getCpuDatap(); if (phys) { - unsigned int paddr; - unsigned int vaddr; + pmap_paddr_t paddr; + vm_offset_t vaddr; - paddr = CAST_DOWN(unsigned int, addr); + paddr = CAST_DOWN(pmap_paddr_t, addr); if (!isphysmem(paddr)) return; - vaddr = (unsigned int)phystokv(paddr); + vaddr = phystokv(paddr); CleanPoC_DcacheRegion( (vm_offset_t) vaddr, length); if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL) 
((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch) ( - cpu_data_ptr->cpu_id, CacheCleanRegion, paddr, length); + cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, length); return; } @@ -175,8 +175,8 @@ dcache_incoherent_io_flush64( unsigned int remaining, unsigned int *res) { - unsigned int vaddr; - unsigned int paddr = CAST_DOWN(unsigned int, pa); + vm_offset_t vaddr; + pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa); cpu_data_t *cpu_data_ptr = getCpuDatap(); if ((cache_info()->c_bulksize_op !=0) && (remaining >= (cache_info()->c_bulksize_op))) { @@ -190,7 +190,7 @@ dcache_incoherent_io_flush64( *res = BWOpDone; } else { if (isphysmem(paddr)) { - vaddr = (unsigned int)phystokv(pa); + vaddr = phystokv(pa); { FlushPoC_DcacheRegion( (vm_offset_t) vaddr, size); @@ -209,8 +209,8 @@ dcache_incoherent_io_flush64( if (count > size) count = size; - wimg_bits = pmap_cache_attributes((paddr >> PAGE_SHIFT)); - index = pmap_map_cpu_windows_copy((paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits); + wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT)); + index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits); vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK); CleanPoC_DcacheRegion( (vm_offset_t) vaddr, count); @@ -235,12 +235,12 @@ dcache_incoherent_io_store64( unsigned int remaining, unsigned int *res) { - unsigned int vaddr; - unsigned int paddr = CAST_DOWN(unsigned int, pa); + vm_offset_t vaddr; + pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa); cpu_data_t *cpu_data_ptr = getCpuDatap(); if (isphysmem(paddr)) { - unsigned int wimg_bits = pmap_cache_attributes(paddr >> PAGE_SHIFT); + unsigned int wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT)); if ((wimg_bits == VM_WIMG_IO) || (wimg_bits == VM_WIMG_WCOMB)) { return; } @@ -259,7 +259,7 @@ dcache_incoherent_io_store64( *res = BWOpDone; } else { if (isphysmem(paddr)) { - vaddr = (unsigned 
int)phystokv(pa); + vaddr = phystokv(pa); { CleanPoC_DcacheRegion( (vm_offset_t) vaddr, size); @@ -278,8 +278,8 @@ dcache_incoherent_io_store64( if (count > size) count = size; - wimg_bits = pmap_cache_attributes((paddr >> PAGE_SHIFT)); - index = pmap_map_cpu_windows_copy((paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits); + wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT)); + index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits); vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK); CleanPoC_DcacheRegion( (vm_offset_t) vaddr, count); @@ -384,6 +384,7 @@ platform_cache_shutdown( void platform_cache_disable(void) { +#if (__ARM_ARCH__ < 8) uint32_t sctlr_value = 0; /* Disable dcache allocation. */ @@ -395,7 +396,7 @@ platform_cache_disable(void) __asm__ volatile("mcr p15, 0, %0, c1, c0, 0\n" "isb" :: "r"(sctlr_value)); - +#endif /* (__ARM_ARCH__ < 8) */ } void @@ -414,15 +415,16 @@ platform_cache_idle_enter( if (up_style_idle_exit && (real_ncpus == 1)) CleanPoU_Dcache(); else { - cpu_data_t *cpu_data_ptr = getCpuDatap(); - FlushPoU_Dcache(); +#if (__ARM_ARCH__ < 8) + cpu_data_t *cpu_data_ptr = getCpuDatap(); cpu_data_ptr->cpu_CLW_active = 0; __asm__ volatile("dmb ish"); cpu_data_ptr->cpu_CLWFlush_req = 0; cpu_data_ptr->cpu_CLWClean_req = 0; CleanPoC_DcacheRegion((vm_offset_t) cpu_data_ptr, sizeof(cpu_data_t)); +#endif /* (__ARM_ARCH__ < 8) */ } #else CleanPoU_Dcache(); diff --git a/osfmk/arm/commpage/commpage.c b/osfmk/arm/commpage/commpage.c index 27cfadba9..74aa72f31 100644 --- a/osfmk/arm/commpage/commpage.c +++ b/osfmk/arm/commpage/commpage.c @@ -36,6 +36,7 @@ * File: arm/commpage/commpage.c * Purpose: Set up and export a RO/RW page */ +#include #include #include #include @@ -50,8 +51,6 @@ #include #include #include -#include -#include #include @@ -62,11 +61,13 @@ static void commpage_init_cpu_capabilities( void ); static int commpage_cpus( void ); 
-vm_address_t commPagePtr=0; -vm_address_t sharedpage_rw_addr = 0; -uint32_t _cpu_capabilities = 0; +SECURITY_READ_ONLY_LATE(vm_address_t) commPagePtr=0; +SECURITY_READ_ONLY_LATE(vm_address_t) sharedpage_rw_addr = 0; +SECURITY_READ_ONLY_LATE(uint32_t) _cpu_capabilities = 0; -extern int gARMv81Atomics; /* For sysctl access from BSD side */ +/* For sysctl access from BSD side */ +extern int gARMv81Atomics; +extern int gARMv8Crc32; void commpage_populate( @@ -231,6 +232,12 @@ commpage_cpus( void ) return cpus; } +vm_address_t +_get_commpage_priv_address(void) +{ + return sharedpage_rw_addr; +} + /* * Initialize _cpu_capabilities vector */ @@ -273,6 +280,8 @@ commpage_init_cpu_capabilities( void ) bits |= kHasNeon; if (mvfp_info->neon_hpfp) bits |= kHasNeonHPFP; + if (mvfp_info->neon_fp16) + bits |= kHasNeonFP16; #endif #if defined(__arm64__) bits |= kHasFMA; @@ -290,10 +299,15 @@ commpage_init_cpu_capabilities( void ) bits |= kHasARMv8Crypto; #endif #ifdef __arm64__ - if ((__builtin_arm_rsr64("ID_AA64ISAR0_EL1") & ID_AA64ISAR0_EL1_ATOMIC_MASK) == ID_AA64ISAR0_EL1_ATOMIC_8_1) { + uint64_t isar0 = __builtin_arm_rsr64("ID_AA64ISAR0_EL1"); + if ((isar0 & ID_AA64ISAR0_EL1_ATOMIC_MASK) == ID_AA64ISAR0_EL1_ATOMIC_8_1) { bits |= kHasARMv81Atomics; gARMv81Atomics = 1; } + if ((isar0 & ID_AA64ISAR0_EL1_CRC32_MASK) == ID_AA64ISAR0_EL1_CRC32_EN) { + bits |= kHasARMv8Crc32; + gARMv8Crc32 = 1; + } #endif _cpu_capabilities = bits; @@ -425,26 +439,41 @@ commpage_update_boottime(uint64_t value) } } + /* - * set the commpage's remote time params for - * userspace call to mach_bridge_remote_time() + * After this counter has incremented, all running CPUs are guaranteed to + * have quiesced, i.e. executed serially dependent memory barriers. + * This is only tracked for CPUs running in userspace, therefore only useful + * outside the kernel. 
+ * + * Note that you can't know which side of those barriers your read was from, + * so you have to observe 2 increments in order to ensure that you saw a + * serially dependent barrier chain across all running CPUs. */ - void - commpage_set_remotetime_params(double rate, uint64_t base_local_ts, uint64_t base_remote_ts) - { - if (commPagePtr) { -#ifdef __arm64__ - struct bt_params *paramsp = (struct bt_params *)(_COMM_PAGE_REMOTETIME_PARAMS + _COMM_PAGE_RW_OFFSET); - paramsp->base_local_ts = 0; - __asm__ volatile("dmb ish" ::: "memory"); - paramsp->rate = rate; - paramsp->base_remote_ts = base_remote_ts; - __asm__ volatile("dmb ish" ::: "memory"); - paramsp->base_local_ts = base_local_ts; //This will act as a generation count +uint64_t +commpage_increment_cpu_quiescent_counter(void) +{ + if (!commPagePtr) + return 0; + + uint64_t old_gen; + + _Atomic uint64_t *sched_gen = (_Atomic uint64_t *)(_COMM_PAGE_CPU_QUIESCENT_COUNTER + + _COMM_PAGE_RW_OFFSET); + /* + * On 32bit architectures, double-wide atomic load or stores are a CAS, + * so the atomic increment is the most efficient way to increment the + * counter. + * + * On 64bit architectures however, because the update is synchronized by + * the cpu mask, relaxed loads and stores is more efficient. 
+ */ +#if __LP64__ + old_gen = atomic_load_explicit(sched_gen, memory_order_relaxed); + atomic_store_explicit(sched_gen, old_gen + 1, memory_order_relaxed); #else - (void)rate; - (void)base_local_ts; - (void)base_remote_ts; -#endif /* __arm64__ */ - } + old_gen = atomic_fetch_add_explicit(sched_gen, 1, memory_order_relaxed); +#endif + return old_gen; } + diff --git a/osfmk/arm/commpage/commpage.h b/osfmk/arm/commpage/commpage.h index 711be4cda..d7f349c29 100644 --- a/osfmk/arm/commpage/commpage.h +++ b/osfmk/arm/commpage/commpage.h @@ -46,5 +46,6 @@ extern void commpage_update_mach_continuous_time(uint64_t sleeptime); extern void commpage_update_multiuser_config(uint32_t); extern void commpage_update_boottime(uint64_t boottime_usec); extern void commpage_set_remotetime_params(double rate, uint64_t base_local_ts, uint64_t base_remote_ts); +extern uint64_t commpage_increment_cpu_quiescent_counter(void); #endif /* _ARM_COMMPAGE_H */ diff --git a/osfmk/arm/conf.c b/osfmk/arm/conf.c deleted file mode 100644 index a9da87124..000000000 --- a/osfmk/arm/conf.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * @APPLE_FREE_COPYRIGHT@ - */ -/* - * Mach Operating System Copyright (c) 1991,1990,1989 Carnegie Mellon - * University All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright notice - * and this permission notice appear in all copies of the software, - * derivative works or modified versions, and any portions thereof, and that - * both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. - * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES - * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science Carnegie Mellon University Pittsburgh PA - * 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon the - * rights to redistribute these changes. - */ -/* - * */ - -#include -#include -#include - -/* - * Clock device subsystem configuration. The clock_list[] - * table contains the clock structures for all clocks in - * the system. - */ - -extern const struct clock_ops sysclk_ops, calend_ops; - -/* - * List of clock devices. 
- */ -SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = { - - /* SYSTEM_CLOCK */ - {&sysclk_ops, 0, 0}, - - /* CALENDAR_CLOCK */ - {&calend_ops, 0, 0}, -}; -int clock_count = sizeof(clock_list) / sizeof(clock_list[0]); diff --git a/osfmk/arm/cpu.c b/osfmk/arm/cpu.c index 46cfcddb7..49b5833cc 100644 --- a/osfmk/arm/cpu.c +++ b/osfmk/arm/cpu.c @@ -157,7 +157,7 @@ cpu_idle(void) platform_cache_idle_exit(); ClearIdlePop(TRUE); - cpu_idle_exit(); + cpu_idle_exit(FALSE); } /* @@ -165,7 +165,7 @@ cpu_idle(void) * Function: */ void -cpu_idle_exit(void) +cpu_idle_exit(boolean_t from_reset __unused) { uint64_t new_idle_timeout_ticks = 0x0ULL; cpu_data_t *cpu_data_ptr = getCpuDatap(); @@ -267,55 +267,35 @@ cpu_init(void) } -cpu_data_t * -cpu_data_alloc(boolean_t is_boot_cpu) +void +cpu_stack_alloc(cpu_data_t *cpu_data_ptr) { - cpu_data_t *cpu_data_ptr = NULL; - - if (is_boot_cpu) - cpu_data_ptr = &BootCpuData; - else { - void *irq_stack = NULL; - void *fiq_stack = NULL; - - if ((kmem_alloc(kernel_map, (vm_offset_t *)&cpu_data_ptr, sizeof(cpu_data_t), VM_KERN_MEMORY_CPU)) != KERN_SUCCESS) - goto cpu_data_alloc_error; - - bzero((void *)cpu_data_ptr, sizeof(cpu_data_t)); - - if ((irq_stack = kalloc(INTSTACK_SIZE)) == 0) - goto cpu_data_alloc_error; -#if __BIGGEST_ALIGNMENT__ - /* force 16-byte alignment */ - if ((uint32_t)irq_stack & 0x0F) - irq_stack = (void *)((uint32_t)irq_stack + (0x10 - ((uint32_t)irq_stack & 0x0F))); -#endif - cpu_data_ptr->intstack_top = (vm_offset_t)irq_stack + INTSTACK_SIZE ; - cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top; - - if ((fiq_stack = kalloc(PAGE_SIZE)) == 0) - goto cpu_data_alloc_error; -#if __BIGGEST_ALIGNMENT__ - /* force 16-byte alignment */ - if ((uint32_t)fiq_stack & 0x0F) - fiq_stack = (void *)((uint32_t)fiq_stack + (0x10 - ((uint32_t)fiq_stack & 0x0F))); -#endif - cpu_data_ptr->fiqstack_top = (vm_offset_t)fiq_stack + PAGE_SIZE ; - cpu_data_ptr->fiqstackptr = cpu_data_ptr->fiqstack_top; - } - - cpu_data_ptr->cpu_processor = 
cpu_processor_alloc(is_boot_cpu); - if (cpu_data_ptr->cpu_processor == (struct processor *)NULL) - goto cpu_data_alloc_error; - - return cpu_data_ptr; - -cpu_data_alloc_error: - panic("cpu_data_alloc() failed\n"); - return (cpu_data_t *)NULL; + vm_offset_t irq_stack = 0; + vm_offset_t fiq_stack = 0; + + kern_return_t kr = kernel_memory_allocate(kernel_map, &irq_stack, + INTSTACK_SIZE + (2 * PAGE_SIZE), + PAGE_MASK, + KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT, + VM_KERN_MEMORY_STACK); + if (kr != KERN_SUCCESS) + panic("Unable to allocate cpu interrupt stack\n"); + + cpu_data_ptr->intstack_top = irq_stack + PAGE_SIZE + INTSTACK_SIZE; + cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top; + + kr = kernel_memory_allocate(kernel_map, &fiq_stack, + FIQSTACK_SIZE + (2 * PAGE_SIZE), + PAGE_MASK, + KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT, + VM_KERN_MEMORY_STACK); + if (kr != KERN_SUCCESS) + panic("Unable to allocate cpu exception stack\n"); + + cpu_data_ptr->fiqstack_top = fiq_stack + PAGE_SIZE + FIQSTACK_SIZE; + cpu_data_ptr->fiqstackptr = cpu_data_ptr->fiqstack_top; } - void cpu_data_free(cpu_data_t *cpu_data_ptr) { @@ -324,7 +304,7 @@ cpu_data_free(cpu_data_t *cpu_data_ptr) cpu_processor_free( cpu_data_ptr->cpu_processor); kfree( (void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE); - kfree( (void *)(cpu_data_ptr->fiqstack_top - PAGE_SIZE), PAGE_SIZE); + kfree( (void *)(cpu_data_ptr->fiqstack_top - FIQSTACK_SIZE), FIQSTACK_SIZE); kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t)); } @@ -579,7 +559,7 @@ cpu_machine_idle_init(boolean_t from_boot) ((unsigned int)&(ResetHandlerData.cpu_data_entries) - (unsigned int)&ExceptionLowVectorsBase)), 4); - CleanPoC_DcacheRegion((vm_offset_t) phystokv((char *) (gPhysBase)), PAGE_SIZE); + CleanPoC_DcacheRegion((vm_offset_t) phystokv(gPhysBase), PAGE_SIZE); resume_idle_cpu_paddr = (unsigned int)ml_static_vtop((vm_offset_t)&resume_idle_cpu); diff --git 
a/osfmk/arm/cpu_capabilities.h b/osfmk/arm/cpu_capabilities.h index b686c0ed1..32044d7d0 100644 --- a/osfmk/arm/cpu_capabilities.h +++ b/osfmk/arm/cpu_capabilities.h @@ -32,10 +32,8 @@ #ifndef __ASSEMBLER__ #include -#ifdef KERNEL_PRIVATE #include #endif -#endif /* * This is the authoritative way to determine from user mode what @@ -47,6 +45,7 @@ /* * Bit definitions for _cpu_capabilities: */ +#define kHasNeonFP16 0x00000008 // ARM v8.2 NEON FP16 supported #define kCache32 0x00000010 // cache line size is 32 bytes #define kCache64 0x00000020 // cache line size is 64 bytes #define kCache128 0x00000040 // cache line size is 128 bytes @@ -60,6 +59,7 @@ #define kNumCPUs 0x00FF0000 // number of CPUs (see _NumCPUs() below) #define kHasARMv8Crypto 0x01000000 // Optional ARMv8 Crypto extensions #define kHasARMv81Atomics 0x02000000 // ARMv8.1 Atomic instructions supported +#define kHasARMv8Crc32 0x04000000 // Optional ARMv8 crc32 instructions (required in ARMv8.1) #define kNumCPUsShift 16 // see _NumCPUs() below @@ -91,6 +91,8 @@ typedef struct { volatile uint32_t TimeBase_shift; } commpage_timeofday_data_t; +extern vm_address_t _get_commpage_priv_address(void); + #endif /* __ASSEMBLER__ */ @@ -98,26 +100,22 @@ typedef struct { * The shared kernel/user "comm page(s)": */ -#if defined(__arm64__) +#if defined(__LP64__) #define _COMM_PAGE64_BASE_ADDRESS (0x0000000FFFFFC000ULL) /* In TTBR0 */ #define _COMM_HIGH_PAGE64_BASE_ADDRESS (0xFFFFFFF0001FC000ULL) /* Just below the kernel, safely in TTBR1; only used for testing */ -#define _COMM_PRIV_PAGE64_BASE_ADDRESS (_COMM_HIGH_PAGE64_BASE_ADDRESS - (PAGE_SIZE)) /* Privileged RO in kernel mode */ #define _COMM_PAGE64_AREA_LENGTH (_COMM_PAGE32_AREA_LENGTH) #define _COMM_PAGE64_AREA_USED (-1) -// macro to change a user comm page address to one that is accessible from privileged mode -// we can no longer access user memory in privileged mode once PAN is enabled -#define _COMM_PAGE_PRIV(_addr_) ((_addr_) - (_COMM_PAGE_START_ADDRESS) + 
(_COMM_PRIV_PAGE64_BASE_ADDRESS)) +#define _COMM_PAGE_PRIV(_addr_) ((_addr_) - (_COMM_PAGE_START_ADDRESS) + _get_commpage_priv_address()) #ifdef KERNEL_PRIVATE -extern vm_address_t sharedpage_rw_addr; #define _COMM_PAGE_RW_OFFSET (0) #define _COMM_PAGE_AREA_LENGTH (PAGE_SIZE) -#define _COMM_PAGE_BASE_ADDRESS (sharedpage_rw_addr) -#define _COMM_PAGE_START_ADDRESS (sharedpage_rw_addr) +#define _COMM_PAGE_BASE_ADDRESS (_get_commpage_priv_address()) +#define _COMM_PAGE_START_ADDRESS (_get_commpage_priv_address()) #else /* KERNEL_PRIVATE */ #define _COMM_PAGE_AREA_LENGTH (4096) @@ -125,7 +123,7 @@ extern vm_address_t sharedpage_rw_addr; #define _COMM_PAGE_START_ADDRESS _COMM_PAGE64_BASE_ADDRESS #endif /* KERNEL_PRIVATE */ -#elif defined(__arm__) +#else #define _COMM_PAGE64_BASE_ADDRESS (-1) #define _COMM_PAGE64_AREA_LENGTH (-1) @@ -137,8 +135,7 @@ extern vm_address_t sharedpage_rw_addr; #define _COMM_PAGE_PRIV(_addr_) (_addr_) #ifdef KERNEL_PRIVATE -extern vm_address_t sharedpage_rw_addr; -#define _COMM_PAGE_RW_OFFSET (sharedpage_rw_addr-_COMM_PAGE_BASE_ADDRESS) +#define _COMM_PAGE_RW_OFFSET (_get_commpage_priv_address()-_COMM_PAGE_BASE_ADDRESS) #define _COMM_PAGE_AREA_LENGTH (PAGE_SIZE) #else #define _COMM_PAGE_AREA_LENGTH (4096) @@ -147,8 +144,6 @@ extern vm_address_t sharedpage_rw_addr; #define _COMM_PAGE_BASE_ADDRESS _COMM_PAGE32_BASE_ADDRESS #define _COMM_PAGE_START_ADDRESS _COMM_PAGE32_BASE_ADDRESS -#else -#error Unknown architecture. #endif #define _COMM_PAGE32_BASE_ADDRESS (0xFFFF4000) /* Must be outside of normal map bounds */ @@ -208,6 +203,9 @@ extern vm_address_t sharedpage_rw_addr; #define _COMM_PAGE_NEWTIMEOFDAY_DATA (_COMM_PAGE_START_ADDRESS+0x120) // used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40. 
+// aligning to 128 bytes for cacheline/fabric size +#define _COMM_PAGE_CPU_QUIESCENT_COUNTER (_COMM_PAGE_START_ADDRESS+0x180) // uint64_t, but reserve the whole 128 (0x80) bytes + #define _COMM_PAGE_END (_COMM_PAGE_START_ADDRESS+0x1000) // end of common page #endif /* _ARM_CPU_CAPABILITIES_H */ diff --git a/osfmk/arm/cpu_common.c b/osfmk/arm/cpu_common.c index 2b9b0a261..d976ce5c1 100644 --- a/osfmk/arm/cpu_common.c +++ b/osfmk/arm/cpu_common.c @@ -515,6 +515,33 @@ processor_to_cpu_datap(processor_t processor) return target_cpu_datap; } +cpu_data_t * +cpu_data_alloc(boolean_t is_boot_cpu) +{ + cpu_data_t *cpu_data_ptr = NULL; + + if (is_boot_cpu) + cpu_data_ptr = &BootCpuData; + else { + if ((kmem_alloc(kernel_map, (vm_offset_t *)&cpu_data_ptr, sizeof(cpu_data_t), VM_KERN_MEMORY_CPU)) != KERN_SUCCESS) + goto cpu_data_alloc_error; + + bzero((void *)cpu_data_ptr, sizeof(cpu_data_t)); + + cpu_stack_alloc(cpu_data_ptr); + } + + cpu_data_ptr->cpu_processor = cpu_processor_alloc(is_boot_cpu); + if (cpu_data_ptr->cpu_processor == (struct processor *)NULL) + goto cpu_data_alloc_error; + + return cpu_data_ptr; + +cpu_data_alloc_error: + panic("cpu_data_alloc() failed\n"); + return (cpu_data_t *)NULL; +} + ast_t * ast_pending(void) { diff --git a/osfmk/arm/cpu_data.h b/osfmk/arm/cpu_data.h index f35121e35..3b8c88854 100644 --- a/osfmk/arm/cpu_data.h +++ b/osfmk/arm/cpu_data.h @@ -46,18 +46,45 @@ #include #include - #define current_thread() current_thread_fast() -static inline thread_t current_thread_fast(void) +static inline __pure2 thread_t current_thread_fast(void) +{ +#if defined(__arm64__) + return (thread_t)(__builtin_arm_rsr64("TPIDR_EL1")); +#else + return (thread_t)(__builtin_arm_mrc(15, 0, 13, 0, 4)); // TPIDRPRW +#endif +} + +/* + * The "volatile" flavor of current_thread() is intended for use by + * scheduler code which may need to update the thread pointer in the + * course of a context switch. 
Any call to current_thread() made + * prior to the thread pointer update should be safe to optimize away + * as it should be consistent with that thread's state to the extent + * the compiler can reason about it. Likewise, the context switch + * path will eventually result in an arbitrary branch to the new + * thread's pc, about which the compiler won't be able to reason. + * Thus any compile-time optimization of current_thread() calls made + * within the new thread should be safely encapsulated in its + * register/stack state. The volatile form therefore exists to cover + * the window between the thread pointer update and the branch to + * the new pc. + */ +static inline thread_t current_thread_volatile(void) { - thread_t result; + /* The compiler treats rsr64 as const, which can allow + it to eliminate redundant calls, which we don't want here. + Thus we use volatile asm. The mrc used for arm32 should be + treated as volatile however. */ #if defined(__arm64__) - __asm__ volatile("mrs %0, TPIDR_EL1" : "=r" (result)); + thread_t result; + __asm__ volatile("mrs %0, TPIDR_EL1" : "=r" (result)); + return result; #else - result = (thread_t)__builtin_arm_mrc(15, 0, 13, 0, 4); // TPIDRPRW + return (thread_t)(__builtin_arm_mrc(15, 0, 13, 0, 4)); // TPIDRPRW #endif - return result; } #if defined(__arm64__) diff --git a/osfmk/arm/cpu_data_internal.h b/osfmk/arm/cpu_data_internal.h index e660c1e36..29acbc1e8 100644 --- a/osfmk/arm/cpu_data_internal.h +++ b/osfmk/arm/cpu_data_internal.h @@ -119,12 +119,13 @@ typedef struct cpu_data unsigned short cpu_flags; vm_offset_t istackptr; vm_offset_t intstack_top; - vm_offset_t fiqstackptr; - vm_offset_t fiqstack_top; #if __arm64__ vm_offset_t excepstackptr; vm_offset_t excepstack_top; boolean_t cluster_master; +#else + vm_offset_t fiqstackptr; + vm_offset_t fiqstack_top; #endif boolean_t interrupts_enabled; thread_t cpu_active_thread; @@ -259,7 +260,6 @@ typedef struct cpu_data #if MONOTONIC struct mt_cpu cpu_monotonic; #endif /* 
MONOTONIC */ - struct prngContext *cpu_prng; cluster_type_t cpu_cluster_type; uint32_t cpu_cluster_id; uint32_t cpu_l2_id; @@ -302,9 +302,10 @@ extern unsigned int LowExceptionVectorBase; extern cpu_data_t *cpu_datap(int cpu); extern cpu_data_t *cpu_data_alloc(boolean_t is_boot); -extern void cpu_data_init(cpu_data_t *cpu_data_ptr); -extern void cpu_data_free(cpu_data_t *cpu_data_ptr); -extern kern_return_t cpu_data_register(cpu_data_t *cpu_data_ptr); +extern void cpu_stack_alloc(cpu_data_t*); +extern void cpu_data_init(cpu_data_t *cpu_data_ptr); +extern void cpu_data_free(cpu_data_t *cpu_data_ptr); +extern kern_return_t cpu_data_register(cpu_data_t *cpu_data_ptr); extern cpu_data_t *processor_to_cpu_datap( processor_t processor); #if __arm64__ diff --git a/osfmk/arm/cpu_internal.h b/osfmk/arm/cpu_internal.h index c7c846d75..34b4bce72 100644 --- a/osfmk/arm/cpu_internal.h +++ b/osfmk/arm/cpu_internal.h @@ -69,5 +69,9 @@ extern void cpu_signal_cancel( extern unsigned int real_ncpus; +#if defined(CONFIG_XNUPOST) && __arm64__ +extern void arm64_ipi_test(void); +#endif /* defined(CONFIG_XNUPOST) && __arm64__ */ + #endif /* _ARM_CPU_INTERNAL_H_ */ diff --git a/osfmk/arm/cpuid.c b/osfmk/arm/cpuid.c index 2782475e2..22435e76b 100644 --- a/osfmk/arm/cpuid.c +++ b/osfmk/arm/cpuid.c @@ -172,6 +172,10 @@ cpuid_get_cpufamily(void) case CPU_PART_HURRICANE_MYST: cpufamily = CPUFAMILY_ARM_HURRICANE; break; + case CPU_PART_MONSOON: + case CPU_PART_MISTRAL: + cpufamily = CPUFAMILY_ARM_MONSOON_MISTRAL; + break; default: cpufamily = CPUFAMILY_UNKNOWN; break; diff --git a/osfmk/arm/cpuid.h b/osfmk/arm/cpuid.h index 07778404b..bf642b6c5 100644 --- a/osfmk/arm/cpuid.h +++ b/osfmk/arm/cpuid.h @@ -132,6 +132,12 @@ typedef union { /* H9G (ARMv8 architecture) */ #define CPU_PART_HURRICANE_MYST 0x7 +/* H10 p-Core (ARMv8 architecture) */ +#define CPU_PART_MONSOON 0x8 + +/* H10 e-Core (ARMv8 architecture) */ +#define CPU_PART_MISTRAL 0x9 + /* Cache type identification */ @@ -198,6 +204,7 @@ 
typedef union { typedef struct { uint32_t neon; uint32_t neon_hpfp; + uint32_t neon_fp16; } arm_mvfp_info_t; #ifdef __cplusplus diff --git a/osfmk/arm/cswitch.s b/osfmk/arm/cswitch.s index 7c3812dd0..7851e0ed3 100644 --- a/osfmk/arm/cswitch.s +++ b/osfmk/arm/cswitch.s @@ -97,10 +97,12 @@ LEXT(machine_load_context) bx lr // Return /* - * void Call_continuation( void (*continuation)(void), - * void *param, - * wait_result_t wresult, - * vm_offset_t stack_ptr) + * typedef void (*thread_continue_t)(void *param, wait_result_t) + * + * void Call_continuation( thread_continue_t continuation, + * void *param, + * wait_result_t wresult, + * bool enable interrupts) */ .text .align 5 @@ -110,10 +112,21 @@ LEXT(Call_continuation) mrc p15, 0, r9, c13, c0, 4 // Read TPIDRPRW ldr sp, [r9, TH_KSTACKPTR] // Set stack pointer mov r7, #0 // Clear frame pointer - mov r6,r0 // Load continuation - mov r0,r1 // Set first parameter - mov r1,r2 // Set wait result arg - blx r6 // Branch to continuation + + mov r4,r0 // Load continuation + mov r5,r1 // continuation parameter + mov r6,r2 // Set wait result arg + + teq r3, #0 + beq 1f + mov r0, #1 + bl _ml_set_interrupts_enabled +1: + + mov r0,r5 // Set first parameter + mov r1,r6 // Set wait result arg + blx r4 // Branch to continuation + mrc p15, 0, r0, c13, c0, 4 // Read TPIDRPRW LOAD_ADDR_PC(thread_terminate) b . // Not reach diff --git a/osfmk/arm/data.s b/osfmk/arm/data.s index b82b50339..1ffa7e5d4 100644 --- a/osfmk/arm/data.s +++ b/osfmk/arm/data.s @@ -37,37 +37,61 @@ #error Unknown architecture. #endif + .section __BOOTDATA, __data // Aligned data - .section __DATA, __data // Aligned data + .align 14 -#if __arm64__ - /* - * Exception stack; this is above the interrupt stack so we don't squash the interrupt - * stack on an exception. 
- */ - .global EXT(excepstack) -LEXT(excepstack) - .space (4096) - .globl EXT(excepstack_top) -LEXT(excepstack_top) -#endif + .globl EXT(intstack_low_guard) +LEXT(intstack_low_guard) + .space (PAGE_MAX_SIZE_NUM) /* IRQ stack */ .globl EXT(intstack) // Boot processor IRQ stack LEXT(intstack) - .space (4*4096) + .space (INTSTACK_SIZE_NUM) .globl EXT(intstack_top) LEXT(intstack_top) + .globl EXT(intstack_high_guard) +LEXT(intstack_high_guard) + .space (PAGE_MAX_SIZE_NUM) - .align 12 // Page aligned Section +/* Low guard for fiq/exception stack is shared w/ interrupt stack high guard */ + +#ifndef __arm64__ .globl EXT(fiqstack) // Boot processor FIQ stack LEXT(fiqstack) - .space (4096) // One page size + .space (FIQSTACK_SIZE_NUM) .globl EXT(fiqstack_top) // Boot processor FIQ stack top LEXT(fiqstack_top) + .globl EXT(fiqstack_high_guard) +LEXT(fiqstack_high_guard) + .space (PAGE_MAX_SIZE_NUM) + +#else + + .global EXT(excepstack) +LEXT(excepstack) + .space (EXCEPSTACK_SIZE_NUM) + .globl EXT(excepstack_top) +LEXT(excepstack_top) + + .globl EXT(excepstack_high_guard) +LEXT(excepstack_high_guard) + .space (PAGE_MAX_SIZE_NUM) + +#endif + +// Must align to 16K here, due to + .global EXT(kd_early_buffer) + .align 14 +LEXT(kd_early_buffer) // space for kdebug's early event buffer + .space 16*1024,0 + + .section __DATA, __data // Aligned data + .globl EXT(CpuDataEntries) .align 12 // Page aligned LEXT(CpuDataEntries) // Cpu Data Entry Array @@ -90,12 +114,6 @@ LEXT(vfptrash_data) .fill 64, 4, 0xca55e77e #endif -// Must align to 16K here, due to - .global EXT(kd_early_buffer) - .align 14 -LEXT(kd_early_buffer) // space for kdebug's early event buffer - .space 16*1024,0 - #if __arm64__ .section __DATA, __const @@ -103,7 +121,7 @@ LEXT(kd_early_buffer) // space for kdebug's early event buffer /* reserve space for read only page tables */ .align 14 LEXT(ropagetable_begin) - .space 16*16*1024,0 + .space 14*16*1024,0 #else LEXT(ropagetable_begin) #endif /* 
defined(KERNEL_INTEGRITY_KTRR)*/ diff --git a/osfmk/arm/dbgwrap.h b/osfmk/arm/dbgwrap.h index 940346719..a91fd5dc4 100644 --- a/osfmk/arm/dbgwrap.h +++ b/osfmk/arm/dbgwrap.h @@ -51,6 +51,24 @@ typedef enum { DBGWRAP_WARN_CPU_OFFLINE } dbgwrap_status_t; +static inline const char* +ml_dbgwrap_strerror(dbgwrap_status_t status) { + switch (status) { + + case DBGWRAP_ERR_SELF_HALT: return "CPU attempted to halt itself"; + case DBGWRAP_ERR_UNSUPPORTED: return "halt not supported for this configuration"; + case DBGWRAP_ERR_INPROGRESS: return "halt in progress on another CPU"; + case DBGWRAP_ERR_INSTR_ERROR: return "instruction-stuffing failure"; + case DBGWRAP_ERR_INSTR_TIMEOUT: return "instruction-stuffing timeout"; + case DBGWRAP_ERR_HALT_TIMEOUT: return "halt ack timeout, CPU likely wedged"; + case DBGWRAP_SUCCESS: return "halt succeeded"; + case DBGWRAP_WARN_ALREADY_HALTED: return "CPU already halted"; + case DBGWRAP_WARN_CPU_OFFLINE: return "CPU offline"; + default: return "unrecognized status"; + + } +} + boolean_t ml_dbgwrap_cpu_is_halted(int cpu_index); dbgwrap_status_t ml_dbgwrap_wait_cpu_halted(int cpu_index, uint64_t timeout_ns); diff --git a/osfmk/arm/genassym.c b/osfmk/arm/genassym.c index 4f3c1b8ba..36e59b9c7 100644 --- a/osfmk/arm/genassym.c +++ b/osfmk/arm/genassym.c @@ -316,8 +316,10 @@ main( DECLARE("CPU_DATA_PADDR", offsetof(struct cpu_data_entry, cpu_data_paddr)); - DECLARE("INTSTACK_SIZE", INTSTACK_SIZE); + DECLARE("FIQSTACK_SIZE", FIQSTACK_SIZE); + + DECLARE("PAGE_MAX_SIZE", PAGE_MAX_SIZE); /* values from kern/timer.h */ DECLARE("TIMER_LOW", diff --git a/osfmk/arm/locks.h b/osfmk/arm/locks.h index 3a58a7fcc..16acddb90 100644 --- a/osfmk/arm/locks.h +++ b/osfmk/arm/locks.h @@ -255,8 +255,6 @@ typedef struct { #define LOCK_PANIC_TIMEOUT 0xc00000 // 12.5 m ticks = 250ms with 24MHz OSC -#define LOCK_TRY_DISABLE_INT 1 // Disable interrupts for a quick acquire attempt - #define PLATFORM_LCK_ILOCK LCK_ILOCK @@ -276,6 +274,7 @@ typedef struct { #define 
LCK_MTX_THREAD_MASK (~(uintptr_t)(LCK_ILOCK | ARM_LCK_WAITERS)) #define disable_preemption_for_thread(t) ((volatile thread_t)t)->machine.preemption_count++ +#define preemption_disabled_for_thread(t) (((volatile thread_t)t)->machine.preemption_count > 0) __unused static void disable_interrupts_noread(void) diff --git a/osfmk/arm/locks_arm.c b/osfmk/arm/locks_arm.c index 941ec38fc..b43f665db 100644 --- a/osfmk/arm/locks_arm.c +++ b/osfmk/arm/locks_arm.c @@ -232,7 +232,6 @@ static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck); static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state); static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state); static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state); -void lck_rw_clear_promotions_x86(thread_t thread); static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait); /* @@ -358,8 +357,8 @@ static unsigned int hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout); #endif -unsigned int -hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout) +static inline unsigned int +hw_lock_bit_to_internal(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout) { unsigned int success = 0; uint32_t mask = (1 << bit); @@ -367,7 +366,6 @@ hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout) uint32_t state; #endif - _disable_preemption(); #if __SMP__ if (__improbable(!atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE))) success = hw_lock_bit_to_contended(lock, mask, timeout); @@ -390,6 +388,13 @@ hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout) return success; } +unsigned int +hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout) +{ + _disable_preemption(); + return hw_lock_bit_to_internal(lock, bit, timeout); +} + #if __SMP__ static unsigned int NOINLINE hw_lock_bit_to_contended(hw_lock_bit_t *lock, 
uint32_t mask, uint32_t timeout) @@ -440,17 +445,30 @@ hw_lock_bit(hw_lock_bit_t *lock, unsigned int bit) #endif } +void +hw_lock_bit_nopreempt(hw_lock_bit_t *lock, unsigned int bit) +{ + if (__improbable(get_preemption_level() == 0)) + panic("Attempt to take no-preempt bitlock %p in preemptible context", lock); + if (hw_lock_bit_to_internal(lock, bit, LOCK_PANIC_TIMEOUT)) + return; +#if __SMP__ + panic("hw_lock_bit_nopreempt(): timed out (%p)", lock); +#else + panic("hw_lock_bit_nopreempt(): interlock held (%p)", lock); +#endif +} + unsigned int hw_lock_bit_try(hw_lock_bit_t *lock, unsigned int bit) { - long intmask; uint32_t mask = (1 << bit); #if !__SMP__ uint32_t state; #endif boolean_t success = FALSE; - intmask = disable_interrupts(); + _disable_preemption(); #if __SMP__ // TODO: consider weak (non-looping) atomic test-and-set success = atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE); @@ -461,9 +479,8 @@ hw_lock_bit_try(hw_lock_bit_t *lock, unsigned int bit) success = TRUE; } #endif // __SMP__ - if (success) - disable_preemption(); - restore_interrupts(intmask); + if (!success) + _enable_preemption(); #if CONFIG_DTRACE if (success) @@ -473,14 +490,8 @@ hw_lock_bit_try(hw_lock_bit_t *lock, unsigned int bit) return success; } -/* - * Routine: hw_unlock_bit - * - * Release spin-lock. The second parameter is the bit number to test and set. - * Decrement the preemption level. - */ -void -hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit) +static inline void +hw_unlock_bit_internal(hw_lock_bit_t *lock, unsigned int bit) { uint32_t mask = (1 << bit); #if !__SMP__ @@ -497,9 +508,28 @@ hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit) #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit); #endif - enable_preemption(); } +/* + * Routine: hw_unlock_bit + * + * Release spin-lock. The second parameter is the bit number to test and set. + * Decrement the preemption level. 
+ */ +void +hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit) +{ + hw_unlock_bit_internal(lock, bit); + _enable_preemption(); +} + +void +hw_unlock_bit_nopreempt(hw_lock_bit_t *lock, unsigned int bit) +{ + if (__improbable(get_preemption_level() == 0)) + panic("Attempt to release no-preempt bitlock %p in preemptible context", lock); + hw_unlock_bit_internal(lock, bit); +} /* * Routine: lck_spin_alloc_init @@ -570,6 +600,19 @@ lck_spin_lock(lck_spin_t *lock) hw_lock_lock(&lock->hwlock); } +/* + * Routine: lck_spin_lock_nopreempt + */ +void +lck_spin_lock_nopreempt(lck_spin_t *lock) +{ +#if DEVELOPMENT || DEBUG + if (lock->type != LCK_SPIN_TYPE) + panic("Invalid spinlock %p", lock); +#endif // DEVELOPMENT || DEBUG + hw_lock_lock_nopreempt(&lock->hwlock); +} + /* * Routine: lck_spin_try_lock */ @@ -579,6 +622,15 @@ lck_spin_try_lock(lck_spin_t *lock) return hw_lock_try(&lock->hwlock); } +/* + * Routine: lck_spin_try_lock_nopreempt + */ +int +lck_spin_try_lock_nopreempt(lck_spin_t *lock) +{ + return hw_lock_try_nopreempt(&lock->hwlock); +} + /* * Routine: lck_spin_unlock */ @@ -594,6 +646,21 @@ lck_spin_unlock(lck_spin_t *lock) hw_lock_unlock(&lock->hwlock); } +/* + * Routine: lck_spin_unlock_nopreempt + */ +void +lck_spin_unlock_nopreempt(lck_spin_t *lock) +{ +#if DEVELOPMENT || DEBUG + if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) + panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data); + if (lock->type != LCK_SPIN_TYPE) + panic("Invalid spinlock type %p", lock); +#endif // DEVELOPMENT || DEBUG + hw_lock_unlock_nopreempt(&lock->hwlock); +} + /* * Routine: lck_spin_destroy */ @@ -1373,7 +1440,7 @@ lck_rw_lock_shared_to_exclusive_failure( if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags checked without lock, but will be rechecked while clearing */ - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, 
unslide_for_kdebug(lck)); } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, @@ -1457,7 +1524,8 @@ lck_rw_lock_shared_to_exclusive_success( ordered_store_rw(lock, word.data); thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade); - res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT); + res = assert_wait(LCK_RW_WRITER_EVENT(lock), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lock, istate); if (res == THREAD_WAITING) { @@ -1796,7 +1864,8 @@ lck_rw_lock_exclusive_gen( ordered_store_rw(lock, word.data); thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); - res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT); + res = assert_wait(LCK_RW_WRITER_EVENT(lock), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lock, istate); if (res == THREAD_WAITING) { @@ -1866,7 +1935,8 @@ lck_rw_lock_exclusive_gen( ordered_store_rw(lock, word.data); thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); - res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT); + res = assert_wait(LCK_RW_WRITER_EVENT(lock), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lock, istate); if (res == THREAD_WAITING) { @@ -2034,7 +2104,7 @@ lck_rw_done_gen( #endif if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags checked without lock, but will be rechecked while clearing */ - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, unslide_for_kdebug(lck)); } #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 
0 : 1); @@ -2116,7 +2186,8 @@ lck_rw_lock_shared_gen( ordered_store_rw(lck, word.data); thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead); - res = assert_wait(LCK_RW_READER_EVENT(lck), THREAD_UNINT); + res = assert_wait(LCK_RW_READER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -2414,10 +2485,10 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) if (interlocked) goto interlock_held; + /* TODO: short-duration spin for on-core contention */ + + /* Loop waiting until I see that the mutex is unowned */ for ( ; ; ) { - if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread), - memory_order_acquire_smp, FALSE)) - return; interlock_lock(lock); interlock_held: state = ordered_load_mtx(lock); @@ -2426,7 +2497,10 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked) break; ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait lck_mtx_lock_wait(lock, holding_thread); + /* returns interlock unlocked */ } + + /* Hooray, I'm the new owner! 
*/ waiters = lck_mtx_lock_acquire(lock); state = LCK_MTX_THREAD_TO_STATE(thread); if (waiters != 0) @@ -2661,14 +2735,14 @@ lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held) state |= LCK_ILOCK; ordered_store_mtx(lock, state); #endif + if (state & ARM_LCK_WAITERS) { + lck_mtx_unlock_wakeup(lock, thread); + state = ordered_load_mtx(lock); + } else { + assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri); + } } - if (state & ARM_LCK_WAITERS) { - lck_mtx_unlock_wakeup(lock, thread); - state = ordered_load_mtx(lock); - } else { - assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri); - } - state &= ARM_LCK_WAITERS; // Retain waiters bit + state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */ #if __SMP__ state |= LCK_ILOCK; ordered_store_mtx(lock, state); diff --git a/osfmk/arm/locore.s b/osfmk/arm/locore.s index 233166b55..ce41150c1 100644 --- a/osfmk/arm/locore.s +++ b/osfmk/arm/locore.s @@ -1260,6 +1260,7 @@ fleh_irq_user: ldr r2, [r2] movs r2, r2 beq 1f + mov r1, #0 // (not a PMI record) bl EXT(telemetry_mark_curthread) // ...if so, mark the current thread... mrc p15, 0, r9, c13, c0, 4 // ...and restore the thread pointer from TPIDRPRW 1: @@ -1313,6 +1314,7 @@ fleh_irq_kernel: ldr r2, [r2] movs r2, r2 beq 1f + mov r1, #0 // (not a PMI record) bl EXT(telemetry_mark_curthread) // ...if so, mark the current thread... mrc p15, 0, r9, c13, c0, 4 // ...and restore the thread pointer from TPIDRPRW 1: @@ -1470,6 +1472,7 @@ fleh_decirq_user: ldr r2, [r2] movs r2, r2 beq 1f + mov r1, #0 // (not a PMI record) bl EXT(telemetry_mark_curthread) // ...if so, mark the current thread... mrc p15, 0, r9, c13, c0, 4 // ...and restore the thread pointer from TPIDRPRW 1: @@ -1523,6 +1526,7 @@ fleh_decirq_kernel: ldr r2, [r2] movs r2, r2 beq 1f + mov r1, #0 // (not a pmi record) bl EXT(telemetry_mark_curthread) // ...if so, mark the current thread... 
mrc p15, 0, r9, c13, c0, 4 // ...and restore the thread pointer from TPIDRPRW 1: @@ -1777,8 +1781,9 @@ LEXT(fleh_dec) #if CONFIG_TELEMETRY LOAD_ADDR(r4, telemetry_needs_record) // Check if a telemetry record was requested... ldr r4, [r4] - movs r4, r4 + movs r4, r4 beq 6f + mov r1, #0 // (not a PMI record) bl EXT(telemetry_mark_curthread) // ...if so, mark the current thread... mrc p15, 0, r9, c13, c0, 4 // ...and restore the thread pointer from TPIDRPRW 6: diff --git a/osfmk/arm/loose_ends.c b/osfmk/arm/loose_ends.c index 46aeec6da..35999a7f7 100644 --- a/osfmk/arm/loose_ends.c +++ b/osfmk/arm/loose_ends.c @@ -495,9 +495,9 @@ copypv(addr64_t source, addr64_t sink, unsigned int size, int which) panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ if (which & cppvPsrc) - from = (void *)phystokv(from); + from = (void *)phystokv((pmap_paddr_t)from); if (which & cppvPsnk) - to = (void *)phystokv(to); + to = (void *)phystokv((pmap_paddr_t)to); if ((which & (cppvPsrc | cppvKmap)) == 0) /* Source is virtual in * current map */ @@ -549,17 +549,17 @@ copy_validate(const user_addr_t user_addr, { uintptr_t kernel_addr_last = kernel_addr + nbytes; - if (kernel_addr < VM_MIN_KERNEL_ADDRESS || + if (__improbable(kernel_addr < VM_MIN_KERNEL_ADDRESS || kernel_addr > VM_MAX_KERNEL_ADDRESS || kernel_addr_last < kernel_addr || - kernel_addr_last > VM_MAX_KERNEL_ADDRESS) + kernel_addr_last > VM_MAX_KERNEL_ADDRESS)) panic("%s(%p, %p, %u) - kaddr not in kernel", __func__, (void *)user_addr, (void *)kernel_addr, nbytes); user_addr_t user_addr_last = user_addr + nbytes; - if (user_addr_last < user_addr || - user_addr_last > VM_MIN_KERNEL_ADDRESS) + if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) || + (user_addr < vm_map_min(current_thread()->map)))) return (EFAULT); if (__improbable(nbytes > copysize_limit_panic)) diff --git a/osfmk/arm/lowmem_vectors.c b/osfmk/arm/lowmem_vectors.c index 
f710eb65f..eb6bcdf4f 100644 --- a/osfmk/arm/lowmem_vectors.c +++ b/osfmk/arm/lowmem_vectors.c @@ -62,8 +62,8 @@ lowglo lowGlo __attribute__ ((aligned(PAGE_MAX_SIZE))) = { .lgManualPktAddr = (uint32_t)&manual_pkt, #endif .lgPmapMemQ = (uint32_t)&(pmap_object_store.memq), - .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, phys_page), - .lgPmapMemChainOffset = offsetof(struct vm_page, listq), + .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, vmp_phys_page), + .lgPmapMemChainOffset = offsetof(struct vm_page, vmp_listq), .lgPmapMemPagesize = (uint32_t)sizeof(struct vm_page), .lgPmapMemStartAddr = -1, diff --git a/osfmk/arm/machine_cpu.h b/osfmk/arm/machine_cpu.h index 64d795e70..d6584de3f 100644 --- a/osfmk/arm/machine_cpu.h +++ b/osfmk/arm/machine_cpu.h @@ -44,7 +44,7 @@ extern void cpu_signal_handler_internal(boolean_t disable_signal); extern void cpu_doshutdown(void (*doshutdown)(processor_t), processor_t processor); extern void cpu_idle(void); -extern void cpu_idle_exit(void) __attribute__((noreturn)); +extern void cpu_idle_exit(boolean_t from_reset) __attribute__((noreturn)); extern void cpu_idle_tickle(void); extern void cpu_machine_idle_init(boolean_t from_boot); diff --git a/osfmk/arm/machine_cpuid.c b/osfmk/arm/machine_cpuid.c index 3943f47d4..ac54a0be5 100644 --- a/osfmk/arm/machine_cpuid.c +++ b/osfmk/arm/machine_cpuid.c @@ -147,7 +147,7 @@ machine_do_mvfpid() #else cpuid_mvfp_info.neon = 1; cpuid_mvfp_info.neon_hpfp = 1; -#endif +#endif /* __arm__ */ } diff --git a/osfmk/arm/machine_routines.c b/osfmk/arm/machine_routines.c index 83e607540..0a6777ea5 100644 --- a/osfmk/arm/machine_routines.c +++ b/osfmk/arm/machine_routines.c @@ -435,6 +435,12 @@ void ml_init_timebase( } } +void +fiq_context_bootstrap(boolean_t enable_fiq) +{ + fiq_context_init(enable_fiq); +} + void ml_parse_cpu_topology(void) { @@ -593,7 +599,7 @@ ml_processor_register( #endif if (!is_boot_cpu) - prng_cpu_init(this_cpu_datap->cpu_number); + 
early_random_cpu_init(this_cpu_datap->cpu_number); return KERN_SUCCESS; @@ -639,23 +645,6 @@ cause_ast_check( } } - -/* - * Routine: ml_at_interrupt_context - * Function: Check if running at interrupt context - */ -boolean_t -ml_at_interrupt_context(void) -{ - boolean_t at_interrupt_context = FALSE; - - disable_preemption(); - at_interrupt_context = (getCpuDatap()->cpu_int_state != NULL); - enable_preemption(); - - return at_interrupt_context; -} - extern uint32_t cpu_idle_count; void ml_get_power_state(boolean_t *icp, boolean_t *pidlep) { @@ -722,6 +711,19 @@ ml_static_vtop( return ((vm_address_t)(vaddr) - gVirtBase + gPhysBase); } +vm_offset_t +ml_static_slide( + vm_offset_t vaddr) +{ + return VM_KERNEL_SLIDE(vaddr); +} + +vm_offset_t +ml_static_unslide( + vm_offset_t vaddr) +{ + return VM_KERNEL_UNSLIDE(vaddr); +} kern_return_t ml_static_protect( @@ -963,20 +965,6 @@ machine_choose_processor(__unused processor_set_t pset, processor_t processor) return (processor); } -vm_offset_t -ml_stack_remaining(void) -{ - uintptr_t local = (uintptr_t) &local; - vm_offset_t intstack_top_ptr; - - intstack_top_ptr = getCpuDatap()->intstack_top; - if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) { - return (local - (getCpuDatap()->intstack_top - INTSTACK_SIZE)); - } else { - return (local - current_thread()->kernel_stack); - } -} - boolean_t machine_timeout_suspended(void) { return FALSE; } @@ -1023,7 +1011,7 @@ ml_delay_should_spin(uint64_t interval) boolean_t ml_thread_is64bit(thread_t thread) { - return (thread_is_64bit(thread)); + return (thread_is_64bit_addr(thread)); } void ml_timer_evaluate(void) { @@ -1151,8 +1139,3 @@ arm_user_protect_end(thread_t thread, uintptr_t ttbr0, boolean_t disable_interru } } #endif // __ARM_USER_PROTECT__ - -void ml_task_set_rop_pid(__unused task_t task, __unused task_t parent_task, __unused boolean_t inherit) -{ - return; -} diff --git a/osfmk/arm/machine_routines.h b/osfmk/arm/machine_routines.h index 
cee6477b1..4a7061b65 100644 --- a/osfmk/arm/machine_routines.h +++ b/osfmk/arm/machine_routines.h @@ -131,6 +131,11 @@ typedef void (*platform_error_handler_t)(void *refcon, vm_offset_t fault_addr); typedef enum { EXCB_CLASS_ILLEGAL_INSTR_SET, +#ifdef CONFIG_XNUPOST + EXCB_CLASS_TEST1, + EXCB_CLASS_TEST2, + EXCB_CLASS_TEST3, +#endif EXCB_CLASS_MAX // this must be last } ex_cb_class_t; @@ -140,6 +145,9 @@ typedef enum { EXCB_ACTION_RERUN, // re-run the faulting instruction EXCB_ACTION_NONE, // continue normal exception handling +#ifdef CONFIG_XNUPOST + EXCB_ACTION_TEST_FAIL, +#endif } ex_cb_action_t; @@ -289,6 +297,12 @@ vm_offset_t ml_static_ptovirt( vm_offset_t); +vm_offset_t ml_static_slide( + vm_offset_t vaddr); + +vm_offset_t ml_static_unslide( + vm_offset_t vaddr); + /* Offset required to obtain absolute time value from tick counter */ uint64_t ml_get_abstime_offset(void); @@ -527,10 +541,15 @@ vm_offset_t ml_stack_remaining(void); uint32_t get_fpscr(void); void set_fpscr(uint32_t); +#ifdef __arm64__ +unsigned long update_mdscr(unsigned long clear, unsigned long set); +#endif /* __arm64__ */ + extern void init_vfp(void); extern boolean_t get_vfp_enabled(void); extern void arm_debug_set_cp14(arm_debug_state_t *debug_state); extern void fiq_context_init(boolean_t enable_fiq); +extern void fiq_context_bootstrap(boolean_t enable_fiq); extern void reenable_async_aborts(void); extern void cpu_idle_wfi(boolean_t wfi_fast); @@ -849,6 +868,8 @@ extern void sched_perfcontrol_register_callbacks(sched_perfcontrol_callbacks_t c extern void sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores); extern void sched_perfcontrol_thread_group_recommend(void *data, cluster_type_t recommendation); +extern void sched_override_recommended_cores_for_sleep(void); +extern void sched_restore_recommended_cores_after_sleep(void); /* * Update the deadline after which sched_perfcontrol_deadline_passed will be called. 
@@ -890,7 +911,6 @@ void ml_get_power_state(boolean_t *, boolean_t *); boolean_t user_cont_hwclock_allowed(void); boolean_t user_timebase_allowed(void); boolean_t ml_thread_is64bit(thread_t thread); -void ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit); #ifdef __arm64__ void ml_set_align_checking(void); diff --git a/osfmk/arm/machine_routines_asm.s b/osfmk/arm/machine_routines_asm.s index d0b0a6b96..d175af88d 100644 --- a/osfmk/arm/machine_routines_asm.s +++ b/osfmk/arm/machine_routines_asm.s @@ -123,8 +123,8 @@ LEXT(timer_grab) bx lr .align 2 - .globl EXT(timer_update) -LEXT(timer_update) + .globl EXT(timer_advance_internal_32) +LEXT(timer_advance_internal_32) str r1, [r0, TIMER_HIGHCHK] #if __ARM_SMP__ dmb ish // dmb ish @@ -188,39 +188,106 @@ LEXT(OSSynchronizeIO) dsb bx lr +.macro SYNC_TLB_FLUSH + dsb ish + isb +.endmacro + /* - * void flush_mmu_tlb(void) + * void sync_tlb_flush * - * Flush all TLBs + * Synchronize one or more prior TLB flush operations */ .text .align 2 - .globl EXT(flush_mmu_tlb) -LEXT(flush_mmu_tlb) + .globl EXT(sync_tlb_flush) +LEXT(sync_tlb_flush) + SYNC_TLB_FLUSH + bx lr + +.macro FLUSH_MMU_TLB mov r0, #0 #if __ARM_SMP__ mcr p15, 0, r0, c8, c3, 0 // Invalidate Inner Shareable entire TLBs #else mcr p15, 0, r0, c8, c7, 0 // Invalidate entire TLB #endif - dsb ish - isb - bx lr +.endmacro + +/* + * void flush_mmu_tlb_async(void) + * + * Flush all TLBs, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_async) +LEXT(flush_mmu_tlb_async) + FLUSH_MMU_TLB + bx lr + +/* + * void flush_mmu_tlb(void) + * + * Flush all TLBs + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb) +LEXT(flush_mmu_tlb) + FLUSH_MMU_TLB + SYNC_TLB_FLUSH + bx lr + +.macro FLUSH_CORE_TLB + mov r0, #0 + mcr p15, 0, r0, c8, c7, 0 // Invalidate entire TLB +.endmacro + +/* + * + * void flush_core_tlb_async(void) + * + * Flush local core's TLB, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_core_tlb_async) 
+LEXT(flush_core_tlb_async) + FLUSH_CORE_TLB + bx lr /* * void flush_core_tlb(void) * - * Flush core TLB + * Flush local core's TLB */ .text .align 2 .globl EXT(flush_core_tlb) LEXT(flush_core_tlb) - mov r0, #0 - mcr p15, 0, r0, c8, c7, 0 // Invalidate entire TLB - dsb ish - isb - bx lr + FLUSH_CORE_TLB + SYNC_TLB_FLUSH + bx lr + +.macro FLUSH_MMU_TLB_ENTRY +#if __ARM_SMP__ + mcr p15, 0, r0, c8, c3, 1 // Invalidate TLB Inner Shareableentry +#else + mcr p15, 0, r0, c8, c7, 1 // Invalidate TLB entry +#endif +.endmacro +/* + * void flush_mmu_tlb_entry_async(uint32_t) + * + * Flush TLB entry, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_entry_async) +LEXT(flush_mmu_tlb_entry_async) + FLUSH_MMU_TLB_ENTRY + bx lr /* * void flush_mmu_tlb_entry(uint32_t) @@ -231,40 +298,70 @@ LEXT(flush_core_tlb) .align 2 .globl EXT(flush_mmu_tlb_entry) LEXT(flush_mmu_tlb_entry) + FLUSH_MMU_TLB_ENTRY + SYNC_TLB_FLUSH + bx lr + +.macro FLUSH_MMU_TLB_ENTRIES +1: #if __ARM_SMP__ - mcr p15, 0, r0, c8, c3, 1 // Invalidate TLB Inner Shareableentry + mcr p15, 0, r0, c8, c3, 1 // Invalidate TLB Inner Shareable entry #else mcr p15, 0, r0, c8, c7, 1 // Invalidate TLB entry #endif - dsb ish - isb - bx lr + add r0, r0, ARM_PGBYTES // Increment to the next page + cmp r0, r1 // Loop if current address < end address + blt 1b +.endmacro + +/* + * void flush_mmu_tlb_entries_async(uint32_t, uint32_t) + * + * Flush TLB entries for address range, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_entries_async) +LEXT(flush_mmu_tlb_entries_async) + FLUSH_MMU_TLB_ENTRIES + bx lr /* * void flush_mmu_tlb_entries(uint32_t, uint32_t) * - * Flush TLB entries + * Flush TLB entries for address range */ .text .align 2 .globl EXT(flush_mmu_tlb_entries) LEXT(flush_mmu_tlb_entries) -1: + FLUSH_MMU_TLB_ENTRIES + SYNC_TLB_FLUSH + bx lr + + +.macro FLUSH_MMU_TLB_MVA_ENTRIES #if __ARM_SMP__ - mcr p15, 0, r0, c8, c3, 1 // Invalidate TLB Inner Shareable entry + mcr p15, 
0, r0, c8, c3, 3 // Invalidate TLB Inner Shareable entries by mva #else - mcr p15, 0, r0, c8, c7, 1 // Invalidate TLB entry + mcr p15, 0, r0, c8, c7, 3 // Invalidate TLB Inner Shareable entries by mva #endif - add r0, r0, ARM_PGBYTES // Increment to the next page - cmp r0, r1 // Loop if current address < end address - blt 1b - dsb ish // Synchronize - isb - bx lr +.endmacro +/* + * void flush_mmu_tlb_mva_entries_async(uint32_t) + * + * Flush TLB entries for mva, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_mva_entries_async) +LEXT(flush_mmu_tlb_mva_entries_async) + FLUSH_MMU_TLB_MVA_ENTRIES + bx lr /* - * void flush_mmu_tlb_mva_entries(uint32_t) + * void flush_mmu_tlb_mva_entries_async(uint32_t) * * Flush TLB entries for mva */ @@ -272,46 +369,71 @@ LEXT(flush_mmu_tlb_entries) .align 2 .globl EXT(flush_mmu_tlb_mva_entries) LEXT(flush_mmu_tlb_mva_entries) + FLUSH_MMU_TLB_MVA_ENTRIES + SYNC_TLB_FLUSH + bx lr + +.macro FLUSH_MMU_TLB_ASID #if __ARM_SMP__ - mcr p15, 0, r0, c8, c3, 3 // Invalidate TLB Inner Shareable entries by mva + mcr p15, 0, r0, c8, c3, 2 // Invalidate TLB Inner Shareable entries by asid #else - mcr p15, 0, r0, c8, c7, 3 // Invalidate TLB Inner Shareable entries by mva + mcr p15, 0, r0, c8, c7, 2 // Invalidate TLB entries by asid #endif - dsb ish - isb - bx lr +.endmacro + +/* + * void flush_mmu_tlb_asid_async(uint32_t) + * + * Flush TLB entries for asid, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_asid_async) +LEXT(flush_mmu_tlb_asid_async) + FLUSH_MMU_TLB_ASID + bx lr /* * void flush_mmu_tlb_asid(uint32_t) * - * Flush TLB entriesfor requested asid + * Flush TLB entries for asid */ .text .align 2 .globl EXT(flush_mmu_tlb_asid) LEXT(flush_mmu_tlb_asid) -#if __ARM_SMP__ - mcr p15, 0, r0, c8, c3, 2 // Invalidate TLB Inner Shareable entries by asid -#else + FLUSH_MMU_TLB_ASID + SYNC_TLB_FLUSH + bx lr + +.macro FLUSH_CORE_TLB_ASID mcr p15, 0, r0, c8, c7, 2 // Invalidate TLB entries by asid 
-#endif - dsb ish - isb - bx lr +.endmacro + +/* + * void flush_core_tlb_asid_async(uint32_t) + * + * Flush local core TLB entries for asid, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_core_tlb_asid_async) +LEXT(flush_core_tlb_asid_async) + FLUSH_CORE_TLB_ASID + bx lr /* * void flush_core_tlb_asid(uint32_t) * - * Flush TLB entries for core for requested asid + * Flush local core TLB entries for asid */ .text .align 2 .globl EXT(flush_core_tlb_asid) LEXT(flush_core_tlb_asid) - mcr p15, 0, r0, c8, c7, 2 // Invalidate TLB entries by asid - dsb ish - isb - bx lr + FLUSH_CORE_TLB_ASID + SYNC_TLB_FLUSH + bx lr /* * Set MMU Translation Table Base @@ -499,22 +621,7 @@ LEXT(set_context_id) isb bx lr -#define COPYIO_HEADER(rUser, kLabel) \ - /* test for zero len */ ;\ - cmp r2, #0 ;\ - moveq r0, #0 ;\ - bxeq lr ;\ - /* test user_addr, user_addr+len to see if it's in kernel space */ ;\ - add r12, rUser, r2 ;\ - cmp r12, KERNELBASE ;\ - bhs kLabel ;\ - cmp r12, rUser ;\ - bcc kLabel - -#define COPYIO_VALIDATE(NAME, SIZE) \ - /* branch around for small sizes */ ;\ - cmp r2, #(SIZE) ;\ - bls L##NAME##_validate_done ;\ +#define COPYIO_VALIDATE(NAME) \ /* call NAME_validate to check the arguments */ ;\ push {r0, r1, r2, r7, lr} ;\ add r7, sp, #12 ;\ @@ -523,7 +630,6 @@ LEXT(set_context_id) addne sp, #12 ;\ popne {r7, pc} ;\ pop {r0, r1, r2, r7, lr} ;\ -L##NAME##_validate_done: #define COPYIO_SET_RECOVER() \ /* set recovery address */ ;\ @@ -533,6 +639,15 @@ L##NAME##_validate_done: ldr r4, [r12, TH_RECOVER] ;\ str r3, [r12, TH_RECOVER] +#define COPYIO_TRY_KERNEL() \ + /* if (current_thread()->map->pmap == kernel_pmap) copyio_kernel() */ ;\ + mrc p15, 0, r12, c13, c0, 4 // Read TPIDRPRW ;\ + ldr r3, [r12, ACT_MAP] ;\ + ldr r3, [r3, MAP_PMAP] ;\ + LOAD_ADDR(ip, kernel_pmap_store) ;\ + cmp r3, ip ;\ + beq copyio_kern_body + #if __ARM_USER_PROTECT__ #define COPYIO_MAP_USER() \ /* disable interrupts to prevent expansion to 2GB at L1 ;\ @@ -549,7 +664,7 @@ 
L##NAME##_validate_done: #define COPYIO_MAP_USER() #endif -#define COPYIO_HEADER_KERN() ;\ +#define COPYIO_HEADER() ;\ /* test for zero len */ ;\ cmp r2, #0 ;\ moveq r0, #0 ;\ @@ -615,22 +730,21 @@ L$0_noerror: .align 2 .globl EXT(copyinstr) LEXT(copyinstr) + cmp r2, #0 + moveq r0, #ENAMETOOLONG + moveq r12, #0 + streq r12, [r3] + bxeq lr + COPYIO_VALIDATE(copyin) stmfd sp!, { r4, r5, r6 } mov r6, r3 - add r3, r0, r2 // user_addr + max - cmp r3, KERNELBASE // Check KERNELBASE < user_addr + max - bhs copyinstr_param_error // Drop out if it is - cmp r3, r0 // Check we're copying from user space - bcc copyinstr_param_error // Drop out if we aren't adr r3, copyinstr_error // Get address for recover mrc p15, 0, r12, c13, c0, 4 // Read TPIDRPRW ldr r4, [r12, TH_RECOVER] ;\ str r3, [r12, TH_RECOVER] COPYIO_MAP_USER() mov r12, #0 // Number of bytes copied so far - cmp r2, #0 - beq copyinstr_too_long copyinstr_loop: ldrb r3, [r0], #1 // Load a byte from the source (user) strb r3, [r1], #1 // Store a byte to the destination (kernel) @@ -647,16 +761,15 @@ copyinstr_too_long: copyinstr_done: // // When we get here, we have finished copying the string. We came here from -// either the "beq copyinstr_done" above, in which case r4 == 0 (which is also +// either the "beq copyinstr_done" above, in which case r3 == 0 (which is also // the function result for success), or falling through from copyinstr_too_long, -// in which case r4 == ENAMETOOLONG. +// in which case r3 == ENAMETOOLONG. 
// str r12, [r6] // Save the count for actual mov r0, r3 // Return error code from r3 copyinstr_exit: COPYIO_UNMAP_USER() str r4, [r12, TH_RECOVER] -copyinstr_exit2: ldmfd sp!, { r4, r5, r6 } bx lr @@ -665,11 +778,6 @@ copyinstr_error: mov r0, #EFAULT b copyinstr_exit -copyinstr_param_error: - /* set error, exit routine */ - mov r0, #EFAULT - b copyinstr_exit2 - /* * int copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) */ @@ -677,8 +785,9 @@ copyinstr_param_error: .align 2 .globl EXT(copyin) LEXT(copyin) - COPYIO_HEADER(r0,copyio_kernel) - COPYIO_VALIDATE(copyin,4096) + COPYIO_HEADER() + COPYIO_VALIDATE(copyin) + COPYIO_TRY_KERNEL() COPYIO_SET_RECOVER() COPYIO_MAP_USER() COPYIO_BODY copyin @@ -693,8 +802,9 @@ LEXT(copyin) .align 2 .globl EXT(copyout) LEXT(copyout) - COPYIO_HEADER(r1,copyio_kernel) - COPYIO_VALIDATE(copyout,4096) + COPYIO_HEADER() + COPYIO_VALIDATE(copyout) + COPYIO_TRY_KERNEL() COPYIO_SET_RECOVER() COPYIO_MAP_USER() COPYIO_BODY copyout @@ -717,7 +827,7 @@ LEXT(copyin_word) tst r0, r3 // Test alignment of user address bne L_copyin_invalid - COPYIO_HEADER(r0,L_copyin_word_fault) + COPYIO_VALIDATE(copyin) COPYIO_SET_RECOVER() COPYIO_MAP_USER() @@ -734,9 +844,6 @@ LEXT(copyin_word) L_copyin_invalid: mov r0, #EINVAL bx lr -L_copyin_word_fault: - mov r0, #EFAULT - bx lr copyio_error: @@ -753,8 +860,8 @@ copyio_error: .align 2 .globl EXT(copyin_kern) LEXT(copyin_kern) - COPYIO_HEADER_KERN() - b bypass_check + COPYIO_HEADER() + b copyio_kern_body /* * int copyout_kern(const char *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) @@ -763,23 +870,10 @@ LEXT(copyin_kern) .align 2 .globl EXT(copyout_kern) LEXT(copyout_kern) - COPYIO_HEADER_KERN() - b bypass_check - -copyio_kernel_error: - mov r0, #EFAULT - bx lr - -copyio_kernel: - /* if (current_thread()->map->pmap != kernel_pmap) return EFAULT */ - mrc p15, 0, r12, c13, c0, 4 // Read TPIDRPRW - ldr r3, [r12, ACT_MAP] - ldr r3, [r3, MAP_PMAP] - LOAD_ADDR(ip, kernel_pmap_store) - 
cmp r3, ip - bne copyio_kernel_error + COPYIO_HEADER() + b copyio_kern_body -bypass_check: +copyio_kern_body: stmfd sp!, { r5, r6 } COPYIO_BODY copyio_kernel ldmfd sp!, { r5, r6 } diff --git a/osfmk/arm/machine_routines_common.c b/osfmk/arm/machine_routines_common.c index 9afa6a74f..0c6da73f7 100644 --- a/osfmk/arm/machine_routines_common.c +++ b/osfmk/arm/machine_routines_common.c @@ -40,6 +40,7 @@ #include #include #include +#include #if MONOTONIC #include @@ -361,7 +362,7 @@ machine_thread_going_on_core(thread_t new_thread, on_core.energy_estimate_nj = 0; on_core.qos_class = proc_get_effective_thread_policy(new_thread, TASK_POLICY_QOS); on_core.urgency = urgency; - on_core.is_32_bit = thread_is_64bit(new_thread) ? FALSE : TRUE; + on_core.is_32_bit = thread_is_64bit_data(new_thread) ? FALSE : TRUE; on_core.is_kernel_thread = new_thread->task == kernel_task; on_core.scheduling_latency = sched_latency; on_core.start_time = timestamp; @@ -467,7 +468,7 @@ machine_perfcontrol_deadline_passed(uint64_t deadline) void ml_spin_debug_reset(thread_t thread) { - thread->machine.intmask_timestamp = mach_absolute_time(); + thread->machine.intmask_timestamp = mach_absolute_time(); } /* @@ -478,7 +479,7 @@ ml_spin_debug_reset(thread_t thread) void ml_spin_debug_clear(thread_t thread) { - thread->machine.intmask_timestamp = 0; + thread->machine.intmask_timestamp = 0; } /* @@ -495,28 +496,28 @@ ml_spin_debug_clear_self() void ml_check_interrupts_disabled_duration(thread_t thread) { - uint64_t start; - uint64_t now; + uint64_t start; + uint64_t now; - start = thread->machine.intmask_timestamp; - if (start != 0) { - now = mach_absolute_time(); + start = thread->machine.intmask_timestamp; + if (start != 0) { + now = mach_absolute_time(); - if ((now - start) > interrupt_masked_timeout) { - mach_timebase_info_data_t timebase; - clock_timebase_info(&timebase); + if ((now - start) > interrupt_masked_timeout * debug_cpu_performance_degradation_factor) { + mach_timebase_info_data_t 
timebase; + clock_timebase_info(&timebase); #ifndef KASAN - /* - * Disable the actual panic for KASAN due to the overhead of KASAN itself, leave the rest of the - * mechanism enabled so that KASAN can catch any bugs in the mechanism itself. - */ - panic("Interrupts held disabled for %llu nanoseconds", (((now - start) * timebase.numer)/timebase.denom)); + /* + * Disable the actual panic for KASAN due to the overhead of KASAN itself, leave the rest of the + * mechanism enabled so that KASAN can catch any bugs in the mechanism itself. + */ + panic("Interrupts held disabled for %llu nanoseconds", (((now - start) * timebase.numer)/timebase.denom)); #endif - } - } + } + } - return; + return; } #endif // INTERRUPT_MASKED_DEBUG @@ -524,84 +525,121 @@ ml_check_interrupts_disabled_duration(thread_t thread) boolean_t ml_set_interrupts_enabled(boolean_t enable) { - thread_t thread; - uint64_t state; + thread_t thread; + uint64_t state; #if __arm__ #define INTERRUPT_MASK PSR_IRQF - state = __builtin_arm_rsr("cpsr"); + state = __builtin_arm_rsr("cpsr"); #else #define INTERRUPT_MASK DAIF_IRQF - state = __builtin_arm_rsr("DAIF"); + state = __builtin_arm_rsr("DAIF"); #endif - if (enable) { + if (enable && (state & INTERRUPT_MASK)) { #if INTERRUPT_MASKED_DEBUG - if (interrupt_masked_debug && (state & INTERRUPT_MASK)) { - // Interrupts are currently masked, we will enable them (after finishing this check) - thread = current_thread(); - ml_check_interrupts_disabled_duration(thread); - thread->machine.intmask_timestamp = 0; - } + if (interrupt_masked_debug) { + // Interrupts are currently masked, we will enable them (after finishing this check) + thread = current_thread(); + ml_check_interrupts_disabled_duration(thread); + thread->machine.intmask_timestamp = 0; + } #endif // INTERRUPT_MASKED_DEBUG - if (get_preemption_level() == 0) { - thread = current_thread(); - while (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) { + if (get_preemption_level() == 0) { + thread = 
current_thread(); + while (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) { #if __ARM_USER_PROTECT__ - uintptr_t up = arm_user_protect_begin(thread); + uintptr_t up = arm_user_protect_begin(thread); #endif - ast_taken_kernel(); + ast_taken_kernel(); #if __ARM_USER_PROTECT__ - arm_user_protect_end(thread, up, FALSE); + arm_user_protect_end(thread, up, FALSE); #endif - } - } + } + } #if __arm__ - __asm__ volatile ("cpsie if" ::: "memory"); // Enable IRQ FIQ + __asm__ volatile ("cpsie if" ::: "memory"); // Enable IRQ FIQ #else - __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF)); + __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF)); #endif - } else { + } else if (!enable && ((state & INTERRUPT_MASK) == 0)) { #if __arm__ - __asm__ volatile ("cpsid if" ::: "memory"); // Mask IRQ FIQ + __asm__ volatile ("cpsid if" ::: "memory"); // Mask IRQ FIQ #else - __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF)); + __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF)); #endif #if INTERRUPT_MASKED_DEBUG - if (interrupt_masked_debug && ((state & INTERRUPT_MASK) == 0)) { - // Interrupts were enabled, we just masked them - current_thread()->machine.intmask_timestamp = mach_absolute_time(); - } + if (interrupt_masked_debug) { + // Interrupts were enabled, we just masked them + current_thread()->machine.intmask_timestamp = mach_absolute_time(); + } #endif - } - return ((state & INTERRUPT_MASK) == 0); + } + return ((state & INTERRUPT_MASK) == 0); +} + +/* + * Routine: ml_at_interrupt_context + * Function: Check if running at interrupt context + */ +boolean_t +ml_at_interrupt_context(void) +{ + /* Do not use a stack-based check here, as the top-level exception handler + * is free to use some other stack besides the per-CPU interrupt stack. + * Interrupts should always be disabled if we're at interrupt context. 
+ * Check that first, as we may be in a preemptible non-interrupt context, in + * which case we could be migrated to a different CPU between obtaining + * the per-cpu data pointer and loading cpu_int_state. We then might end + * up checking the interrupt state of a different CPU, resulting in a false + * positive. But if interrupts are disabled, we also know we cannot be + * preempted. */ + return (!ml_get_interrupts_enabled() && (getCpuDatap()->cpu_int_state != NULL)); +} + +vm_offset_t +ml_stack_remaining(void) +{ + uintptr_t local = (uintptr_t) &local; + vm_offset_t intstack_top_ptr; + + /* Since this is a stack-based check, we don't need to worry about + * preemption as we do in ml_at_interrupt_context(). If we are preemptible, + * then the sp should never be within any CPU's interrupt stack unless + * something has gone horribly wrong. */ + intstack_top_ptr = getCpuDatap()->intstack_top; + if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) { + return (local - (getCpuDatap()->intstack_top - INTSTACK_SIZE)); + } else { + return (local - current_thread()->kernel_stack); + } } static boolean_t ml_quiescing; void ml_set_is_quiescing(boolean_t quiescing) { - assert(FALSE == ml_get_interrupts_enabled()); - ml_quiescing = quiescing; + assert(FALSE == ml_get_interrupts_enabled()); + ml_quiescing = quiescing; } boolean_t ml_is_quiescing(void) { - assert(FALSE == ml_get_interrupts_enabled()); - return (ml_quiescing); + assert(FALSE == ml_get_interrupts_enabled()); + return (ml_quiescing); } uint64_t ml_get_booter_memory_size(void) { - enum { kRoundSize = 512*1024*1024ULL }; uint64_t size; + uint64_t roundsize = 512*1024*1024ULL; size = BootArgs->memSizeActual; - if (!size) - { + if (!size) { size = BootArgs->memSize; - size = (size + kRoundSize - 1) & ~(kRoundSize - 1); + if (size < (2 * roundsize)) roundsize >>= 1; + size = (size + roundsize - 1) & ~(roundsize - 1); size -= BootArgs->memSize; - } - return (size); + } + return (size); } 
uint64_t diff --git a/osfmk/arm/misc_protos.h b/osfmk/arm/misc_protos.h index 416bcb2a3..a36edabcb 100644 --- a/osfmk/arm/misc_protos.h +++ b/osfmk/arm/misc_protos.h @@ -56,7 +56,7 @@ extern void Load_context(thread_t); extern void Idle_load_context(void) __attribute__((noreturn)); extern thread_t Switch_context(thread_t, thread_continue_t, thread_t); extern thread_t Shutdown_context(void (*doshutdown)(processor_t), processor_t processor); -extern void Call_continuation(thread_continue_t, void *, wait_result_t, vm_offset_t); +extern void Call_continuation(thread_continue_t, void *, wait_result_t, boolean_t enable_interrupts); extern void DebuggerCall(unsigned int reason, void *ctx); extern void DebuggerXCall(void *ctx); @@ -82,8 +82,6 @@ extern boolean_t debug_state_is_valid32(arm_debug_state32_t *ds); extern boolean_t debug_state_is_valid64(arm_debug_state64_t *ds); extern int copyio_check_user_addr(user_addr_t user_addr, vm_size_t nbytes); -extern int _emulate_swp(user_addr_t addr, uint32_t newval, uint32_t *oldval); -extern int _emulate_swpb(user_addr_t addr, uint8_t newval, uint32_t *oldval); /* Top-Byte-Ignore */ extern boolean_t user_tbi; diff --git a/osfmk/arm/model_dep.c b/osfmk/arm/model_dep.c index 1ae4b3c55..49503dfeb 100644 --- a/osfmk/arm/model_dep.c +++ b/osfmk/arm/model_dep.c @@ -40,6 +40,7 @@ #include #include + #include #include #include @@ -57,6 +58,7 @@ #include #include #include +#include #include #include @@ -92,7 +94,7 @@ extern int kdp_stack_snapshot_bytes_traced(void); * Increment the PANICLOG_VERSION if you change the format of the panic * log in any way. 
*/ -#define PANICLOG_VERSION 9 +#define PANICLOG_VERSION 11 static struct kcdata_descriptor kc_panic_data; extern char firmware_version[]; @@ -123,6 +125,7 @@ decl_simple_lock_data(extern,clock_lock) extern struct timeval gIOLastSleepTime; extern struct timeval gIOLastWakeTime; extern boolean_t is_clock_configured; +extern boolean_t kernelcache_uuid_valid; extern uuid_t kernelcache_uuid; /* Definitions for frame pointers */ @@ -152,6 +155,8 @@ unsigned int DebugContextCount = 0; uint8_t PE_smc_stashed_x86_system_state = 0xFF; uint8_t PE_smc_stashed_x86_power_state = 0xFF; uint8_t PE_smc_stashed_x86_efi_boot_state = 0xFF; +uint8_t PE_smc_stashed_x86_shutdown_cause = 0xFF; +uint64_t PE_smc_stashed_x86_prev_power_transitions = UINT64_MAX; uint32_t PE_pcie_stashed_link_state = UINT32_MAX; #endif @@ -327,24 +332,21 @@ do_print_all_backtraces( if (last_hwaccess_thread) { paniclog_append_noflush("AppleHWAccess Thread: 0x%llx\n", last_hwaccess_thread); } -#if defined(XNU_TARGET_OS_BRIDGE) - paniclog_append_noflush("PCIeUp link state: "); - if (PE_pcie_stashed_link_state != UINT32_MAX) { - paniclog_append_noflush("0x%x\n", PE_pcie_stashed_link_state); - } else { - paniclog_append_noflush("not available\n"); - } -#endif + paniclog_append_noflush("Boot args: %s\n", PE_boot_args()); } paniclog_append_noflush("Memory ID: 0x%x\n", gPlatformMemoryID); paniclog_append_noflush("OS version: %.256s\n", ('\0' != osversion[0]) ? 
osversion : "Not set yet"); paniclog_append_noflush("Kernel version: %.512s\n", version); - paniclog_append_noflush("KernelCache UUID: "); - for (index = 0; index < sizeof(uuid_t); index++) { - paniclog_append_noflush("%02X", kernelcache_uuid[index]); + + if (kernelcache_uuid_valid) { + paniclog_append_noflush("KernelCache UUID: "); + for (index = 0; index < sizeof(uuid_t); index++) { + paniclog_append_noflush("%02X", kernelcache_uuid[index]); + } + paniclog_append_noflush("\n"); } - paniclog_append_noflush("\n"); + panic_display_kernel_uuid(); paniclog_append_noflush("iBoot version: %.128s\n", firmware_version); paniclog_append_noflush("secure boot?: %s\n", debug_enabled ? "NO": "YES"); @@ -367,6 +369,24 @@ do_print_all_backtraces( } else { paniclog_append_noflush("not available\n"); } + paniclog_append_noflush("x86 Shutdown Cause: "); + if (PE_smc_stashed_x86_shutdown_cause != 0xFF) { + paniclog_append_noflush("0x%x\n", PE_smc_stashed_x86_shutdown_cause); + } else { + paniclog_append_noflush("not available\n"); + } + paniclog_append_noflush("x86 Previous Power Transitions: "); + if (PE_smc_stashed_x86_prev_power_transitions != UINT64_MAX) { + paniclog_append_noflush("0x%llx\n", PE_smc_stashed_x86_prev_power_transitions); + } else { + paniclog_append_noflush("not available\n"); + } + paniclog_append_noflush("PCIeUp link state: "); + if (PE_pcie_stashed_link_state != UINT32_MAX) { + paniclog_append_noflush("0x%x\n", PE_pcie_stashed_link_state); + } else { + paniclog_append_noflush("not available\n"); + } #endif paniclog_append_noflush("Paniclog version: %d\n", logversion); @@ -380,6 +400,13 @@ do_print_all_backtraces( panic_display_ecc_errors(); #endif /* CONFIG_ECC_LOGGING */ +#if DEVELOPMENT || DEBUG + if (cs_debug_unsigned_exec_failures != 0 || cs_debug_unsigned_mmap_failures != 0) { + paniclog_append_noflush("Unsigned code exec failures: %u\n", cs_debug_unsigned_exec_failures); + paniclog_append_noflush("Unsigned code mmap failures: %u\n", 
cs_debug_unsigned_mmap_failures); + } +#endif + // Just print threads with high CPU usage for WDT timeouts if (strncmp(message, "WDT timeout", 11) == 0) { thread_t top_runnable[5] = {0}; @@ -596,7 +623,7 @@ void panic_print_symbol_name(vm_address_t search) void SavePanicInfo( - const char *message, __unused uint64_t panic_options) + const char *message, __unused void *panic_data, __unused uint64_t panic_options) { /* This should be initialized by the time we get here */ @@ -787,10 +814,12 @@ DebuggerXCallEnter( paniclog_append_noflush("Attempting to forcibly halt cpu %d\n", cpu); dbgwrap_status_t halt_status = ml_dbgwrap_halt_cpu(cpu, 0); if (halt_status < 0) - paniclog_append_noflush("Unable to halt cpu %d: %d\n", cpu, halt_status); + paniclog_append_noflush("cpu %d failed to halt with error %d: %s\n", cpu, halt_status, ml_dbgwrap_strerror(halt_status)); else { if (halt_status > 0) - paniclog_append_noflush("cpu %d halted with warning %d\n", cpu, halt_status); + paniclog_append_noflush("cpu %d halted with warning %d: %s\n", cpu, halt_status, ml_dbgwrap_strerror(halt_status)); + else + paniclog_append_noflush("cpu %d successfully halted\n", cpu); target_cpu_datap->halt_status = CPU_HALTED; } } else @@ -806,7 +835,7 @@ DebuggerXCallEnter( dbgwrap_status_t halt_status = ml_dbgwrap_halt_cpu_with_state(cpu, NSEC_PER_SEC, &target_cpu_datap->halt_state); if ((halt_status < 0) || (halt_status == DBGWRAP_WARN_CPU_OFFLINE)) - paniclog_append_noflush("Unable to obtain state for cpu %d: %d\n", cpu, halt_status); + paniclog_append_noflush("Unable to obtain state for cpu %d with status %d: %s\n", cpu, halt_status, ml_dbgwrap_strerror(halt_status)); else target_cpu_datap->halt_status = CPU_HALTED_WITH_STATE; } @@ -871,6 +900,7 @@ DebuggerXCall( if (save_context) { /* Save the interrupted context before acknowledging the signal */ *state = *regs; + } else if (regs) { /* zero old state so machine_trace_thread knows not to backtrace it */ set_saved_state_fp(state, 0); diff --git 
a/osfmk/arm/monotonic_arm.c b/osfmk/arm/monotonic_arm.c index a5b071631..5526c515b 100644 --- a/osfmk/arm/monotonic_arm.c +++ b/osfmk/arm/monotonic_arm.c @@ -32,7 +32,7 @@ bool mt_core_supported = false; void -mt_init(void) +mt_early_init(void) { } @@ -48,4 +48,10 @@ mt_cur_cpu(void) return &getCpuDatap()->cpu_monotonic; } -const struct monotonic_dev monotonic_devs[0]; +int +mt_microstackshot_start_arch(__unused uint64_t period) +{ + return 1; +} + +struct mt_device mt_devices[0]; diff --git a/osfmk/arm/pcb.c b/osfmk/arm/pcb.c index 2d06b559e..60510d8b2 100644 --- a/osfmk/arm/pcb.c +++ b/osfmk/arm/pcb.c @@ -285,13 +285,14 @@ void call_continuation( thread_continue_t continuation, void *parameter, - wait_result_t wresult) + wait_result_t wresult, + boolean_t enable_interrupts) { #define call_continuation_kprintf(x...) /* kprintf("call_continuation_kprintf: * " x) */ call_continuation_kprintf("thread = %x continuation = %x, stack = %x\n", current_thread(), continuation, current_thread()->machine.kstackptr); - Call_continuation(continuation, parameter, wresult, current_thread()->machine.kstackptr); + Call_continuation(continuation, parameter, wresult, enable_interrupts); } void arm_debug_set(arm_debug_state_t *debug_state) diff --git a/osfmk/arm/pmap.c b/osfmk/arm/pmap.c index a62fd171e..754b84930 100644 --- a/osfmk/arm/pmap.c +++ b/osfmk/arm/pmap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2016 Apple Inc. All rights reserved. + * Copyright (c) 2011-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,6 +45,9 @@ #include #include #include +#include + +#include #include #include @@ -55,6 +58,7 @@ #include #include +#include #include #include @@ -70,8 +74,6 @@ #include #include -#include - #if (__ARM_VMSA__ > 7) #include #include @@ -87,8 +89,15 @@ #include #include +#include + #if MACH_ASSERT +int vm_footprint_suspend_allowed = 1; + +extern int pmap_ledgers_panic; +extern int pmap_ledgers_panic_leeway; + int pmap_stats_assert = 1; #define PMAP_STATS_ASSERTF(cond, pmap, fmt, ...) \ MACRO_BEGIN \ @@ -100,13 +109,18 @@ int pmap_stats_assert = 1; #endif /* MACH_ASSERT */ #if DEVELOPMENT || DEBUG -#define PMAP_FOOTPRINT_SUSPENDED(pmap) ((pmap)->footprint_suspended) +#define PMAP_FOOTPRINT_SUSPENDED(pmap) \ + (current_thread()->pmap_footprint_suspended) #else /* DEVELOPMENT || DEBUG */ #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE) #endif /* DEVELOPMENT || DEBUG */ +#define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a) +#define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a) + + #if DEVELOPMENT || DEBUG int panic_on_unsigned_execute = 0; #endif /* DEVELOPMENT || DEBUG */ @@ -130,6 +144,7 @@ extern pmap_paddr_t avail_end; extern vm_offset_t virtual_space_start; /* Next available kernel VA */ extern vm_offset_t virtual_space_end; /* End of kernel address space */ +extern vm_offset_t static_memory_end; extern int hard_maxproc; @@ -168,10 +183,6 @@ decl_simple_lock_data(, pmaps_lock MARK_AS_PMAP_DATA) unsigned int pmap_stamp MARK_AS_PMAP_DATA; queue_head_t map_pmap_list MARK_AS_PMAP_DATA; -queue_head_t tt_pmap_list MARK_AS_PMAP_DATA; -unsigned int tt_pmap_count MARK_AS_PMAP_DATA; -unsigned int tt_pmap_max MARK_AS_PMAP_DATA; - decl_simple_lock_data(, pt_pages_lock MARK_AS_PMAP_DATA) queue_head_t pt_page_list MARK_AS_PMAP_DATA; /* pt page ptd entries list */ @@ -302,15 +313,28 @@ decl_simple_lock_data(,phys_backup_lock) #endif #endif -#define PT_DESC_REFCOUNT 0x4000U +#define PT_DESC_REFCOUNT 
0x4000U +#define PT_DESC_IOMMU_REFCOUNT 0x8000U typedef struct pt_desc { - queue_chain_t pt_page; + queue_chain_t pt_page; struct { - unsigned short refcnt; - unsigned short wiredcnt; + /* + * For non-leaf pagetables, should always be PT_DESC_REFCOUNT + * For leaf pagetables, should reflect the number of non-empty PTEs + * For IOMMU pages, should always be PT_DESC_IOMMU_REFCOUNT + */ + unsigned short refcnt; + /* + * For non-leaf pagetables, should be 0 + * For leaf pagetables, should reflect the number of wired entries + * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU operations are implicitly wired) + */ + unsigned short wiredcnt; } pt_cnt[PT_INDEX_MAX]; - struct pmap *pmap; + union { + struct pmap *pmap; + }; struct { vm_offset_t va; } pt_map[PT_INDEX_MAX]; @@ -348,20 +372,21 @@ typedef u_int16_t pp_attr_t; SECURITY_READ_ONLY_LATE(pp_attr_t*) pp_attr_table; +typedef struct pmap_io_range +{ + uint64_t addr; + uint32_t len; + uint32_t wimg; // treated as pp_attr_t +} __attribute__((packed)) pmap_io_range_t; -typedef uint8_t io_attr_t; - -#define IO_ATTR_WIMG_MASK 0x3F -#define IO_ATTR_WIMG(x) ((x) & IO_ATTR_WIMG_MASK) - -SECURITY_READ_ONLY_LATE(io_attr_t*) io_attr_table; +SECURITY_READ_ONLY_LATE(pmap_io_range_t*) io_attr_table; SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0; SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0; SECURITY_READ_ONLY_LATE(pmap_paddr_t) io_rgn_start = 0; SECURITY_READ_ONLY_LATE(pmap_paddr_t) io_rgn_end = 0; -SECURITY_READ_ONLY_LATE(uint32_t) io_rgn_granule = 0; +SECURITY_READ_ONLY_LATE(unsigned int) num_io_rgns = 0; SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? 
*/ @@ -381,67 +406,67 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #endif -#define pa_index(pa) \ +#define pa_index(pa) \ (atop((pa) - vm_first_phys)) -#define pai_to_pvh(pai) \ +#define pai_to_pvh(pai) \ (&pv_head_table[pai]) -#define pa_valid(x) \ +#define pa_valid(x) \ ((x) >= vm_first_phys && (x) < vm_last_phys) /* PTE Define Macros */ -#define pte_is_wired(pte) \ +#define pte_is_wired(pte) \ (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED) -#define pte_set_wired(ptep, wired) \ +#define pte_set_wired(ptep, wired) \ do { \ - SInt16 *ptd_wiredcnt_ptr; \ + SInt16 *ptd_wiredcnt_ptr; \ ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(ptep)->pt_cnt[ARM_PT_DESC_INDEX(ptep)].wiredcnt); \ - if (wired) { \ - *ptep |= ARM_PTE_WIRED; \ - OSAddAtomic16(1, ptd_wiredcnt_ptr); \ + if (wired) { \ + *ptep |= ARM_PTE_WIRED; \ + OSAddAtomic16(1, ptd_wiredcnt_ptr); \ } else { \ - *ptep &= ~ARM_PTE_WIRED; \ - OSAddAtomic16(-1, ptd_wiredcnt_ptr); \ + *ptep &= ~ARM_PTE_WIRED; \ + OSAddAtomic16(-1, ptd_wiredcnt_ptr); \ } \ } while(0) -#define pte_is_ffr(pte) \ +#define pte_is_ffr(pte) \ (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE) #define pte_set_ffr(pte, ffr) \ - do { \ - if (ffr) { \ - pte |= ARM_PTE_WRITEABLE; \ - } else { \ - pte &= ~ARM_PTE_WRITEABLE; \ - } \ + do { \ + if (ffr) { \ + pte |= ARM_PTE_WRITEABLE; \ + } else { \ + pte &= ~ARM_PTE_WRITEABLE; \ + } \ } while(0) /* PVE Define Macros */ -#define pve_next(pve) \ +#define pve_next(pve) \ ((pve)->pve_next) -#define pve_link_field(pve) \ +#define pve_link_field(pve) \ (&pve_next(pve)) -#define pve_link(pp, e) \ +#define pve_link(pp, e) \ ((pve_next(e) = pve_next(pp)), (pve_next(pp) = (e))) -#define pve_unlink(pp, e) \ +#define pve_unlink(pp, e) \ (pve_next(pp) = pve_next(e)) /* bits held in the ptep pointer field */ -#define pve_get_ptep(pve) \ +#define pve_get_ptep(pve) \ ((pve)->pve_ptep) -#define pve_set_ptep(pve, ptep_new) \ - do { \ - (pve)->pve_ptep = (ptep_new); \ +#define pve_set_ptep(pve, ptep_new) \ + do { \ 
+ (pve)->pve_ptep = (ptep_new); \ } while (0) /* PTEP Define Macros */ @@ -461,18 +486,11 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; */ #define ARM_TT_PT_ADDR_SHIFT (10U) -#define ARM_PT_DESC_INDEX(ptep) \ - (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT) - -#define ptep_get_ptd(ptep) \ - ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)(ptep) - gVirtBase + gPhysBase))))) & PVH_LIST_MASK)) - #define ptep_get_va(ptep) \ - ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index((((vm_offset_t)(ptep) & ~0xFFF) - gVirtBase + gPhysBase))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<pmap)) - + ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~0xFFF))))))))->pmap)) #else @@ -508,157 +526,164 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #define ARM_TT_PT_OTHER_MASK (0x3fffULL) #endif -#define ARM_PT_DESC_INDEX(ptep) \ +#define ARM_PT_DESC_INDEX(ptep) \ (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT) - -#define ptep_get_ptd(ptep) \ - ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)(ptep) - gVirtBase + gPhysBase))))) & PVH_LIST_MASK)) - #define ptep_get_va(ptep) \ - ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK) - gVirtBase + gPhysBase))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<pmap)) + ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK))))))))->pmap)) #endif +#define ARM_PT_DESC_INDEX(ptep) \ + (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT) + +#define ptep_get_ptd(ptep) \ + ((struct pt_desc 
*)(pvh_list(pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)(ptep))))))) + /* PVH Define Macros */ /* pvhead type */ -#define PVH_TYPE_NULL 0x0UL -#define PVH_TYPE_PVEP 0x1UL -#define PVH_TYPE_PTEP 0x2UL -#define PVH_TYPE_PTDP 0x3UL +#define PVH_TYPE_NULL 0x0UL +#define PVH_TYPE_PVEP 0x1UL +#define PVH_TYPE_PTEP 0x2UL +#define PVH_TYPE_PTDP 0x3UL -#define PVH_TYPE_MASK (0x3UL) -#define PVH_LIST_MASK (~PVH_TYPE_MASK) +#define PVH_TYPE_MASK (0x3UL) -#if (__ARM_VMSA__ == 7) -#define pvh_set_bits(h, b) \ - do { \ - while (!OSCompareAndSwap(*(vm_offset_t *)(h), *(vm_offset_t *)(h) | (b), (vm_offset_t *)(h))); \ - } while (0) +#ifdef __arm64__ -#define pvh_clear_bits(h, b) \ - do { \ - while (!OSCompareAndSwap(*(vm_offset_t *)(h), *(vm_offset_t *)(h) & ~(b), (vm_offset_t *)(h))); \ - } while (0) -#else -#define pvh_set_bits(h, b) \ - do { \ - while (!OSCompareAndSwap64(*(vm_offset_t *)(h), *(vm_offset_t *)(h) | ((int64_t)b), (vm_offset_t *)(h))); \ - } while (0) +#define PVH_FLAG_IOMMU 0x4UL +#define PVH_FLAG_IOMMU_TABLE (1ULL << 63) +#define PVH_FLAG_CPU (1ULL << 62) +#define PVH_LOCK_BIT 61 +#define PVH_FLAG_LOCK (1ULL << PVH_LOCK_BIT) +#define PVH_FLAG_EXEC (1ULL << 60) +#define PVH_FLAG_LOCKDOWN (1ULL << 59) +#define PVH_HIGH_FLAGS (PVH_FLAG_CPU | PVH_FLAG_LOCK | PVH_FLAG_EXEC | PVH_FLAG_LOCKDOWN) + +#else /* !__arm64__ */ + +#define PVH_LOCK_BIT 31 +#define PVH_FLAG_LOCK (1UL << PVH_LOCK_BIT) +#define PVH_HIGH_FLAGS PVH_FLAG_LOCK -#define pvh_clear_bits(h, b) \ - do { \ - while (!OSCompareAndSwap64(*(vm_offset_t *)(h), *(vm_offset_t *)(h) & ~((int64_t)b), (vm_offset_t *)(h))); \ - } while (0) #endif -#define pvh_test_type(h, b) \ +#define PVH_LIST_MASK (~PVH_TYPE_MASK) + +#define pvh_test_type(h, b) \ ((*(vm_offset_t *)(h) & (PVH_TYPE_MASK)) == (b)) #define pvh_ptep(h) \ - ((pt_entry_t *)(*(vm_offset_t *)(h) & PVH_LIST_MASK)) + ((pt_entry_t *)((*(vm_offset_t *)(h) & PVH_LIST_MASK) | PVH_HIGH_FLAGS)) #define pvh_list(h) \ - ((pv_entry_t *)(*(vm_offset_t *)(h) & 
PVH_LIST_MASK)) + ((pv_entry_t *)((*(vm_offset_t *)(h) & PVH_LIST_MASK) | PVH_HIGH_FLAGS)) -#define pvh_bits(h) \ - (*(vm_offset_t *)(h) & PVH_TYPE_MASK) +#define pvh_get_flags(h) \ + (*(vm_offset_t *)(h) & PVH_HIGH_FLAGS) -#if (__ARM_VMSA__ == 7) -#define pvh_update_head(h, e, t) \ - do { \ - while (!OSCompareAndSwap(*(vm_offset_t *)(h), (vm_offset_t)(e) | (t), (vm_offset_t *)(h))); \ +#define pvh_set_flags(h, f) \ + do { \ + __c11_atomic_store((_Atomic vm_offset_t *)(h), (*(vm_offset_t *)(h) & ~PVH_HIGH_FLAGS) | (f), \ + memory_order_relaxed); \ } while (0) -#else -#define pvh_update_head(h, e, t) \ - do { \ - while (!OSCompareAndSwap64(*(vm_offset_t *)(h), (vm_offset_t)(e) | (t), (vm_offset_t *)(h))); \ + +#define pvh_update_head(h, e, t) \ + do { \ + assert(*(vm_offset_t *)(h) & PVH_FLAG_LOCK); \ + __c11_atomic_store((_Atomic vm_offset_t *)(h), (vm_offset_t)(e) | (t) | PVH_FLAG_LOCK, \ + memory_order_relaxed); \ } while (0) -#endif -#define pvh_add(h, e) \ - do { \ - assert(!pvh_test_type((h), PVH_TYPE_PTEP)); \ - pve_next(e) = pvh_list(h); \ - pvh_update_head((h), (e), PVH_TYPE_PVEP); \ +#define pvh_update_head_unlocked(h, e, t) \ + do { \ + assert(!(*(vm_offset_t *)(h) & PVH_FLAG_LOCK)); \ + *(vm_offset_t *)(h) = ((vm_offset_t)(e) | (t)) & ~PVH_FLAG_LOCK; \ } while (0) -#define pvh_remove(h, p, e) \ - do { \ - assert(!PVE_NEXT_IS_ALTACCT(pve_next((e)))); \ - if ((p) == (h)) { \ - if (PVE_NEXT_PTR(pve_next((e))) == PV_ENTRY_NULL) { \ - pvh_update_head((h), PV_ENTRY_NULL, PVH_TYPE_NULL); \ - } else { \ - pvh_update_head((h), PVE_NEXT_PTR(pve_next((e))), PVH_TYPE_PVEP); \ - } \ - } else { \ - /* \ - * PMAP LEDGERS: \ - * preserve the "alternate accounting" bit \ - * when updating "p" (the previous entry's \ - * "pve_next"). 
\ - */ \ - boolean_t __is_altacct; \ - __is_altacct = PVE_NEXT_IS_ALTACCT(*(p)); \ - *(p) = PVE_NEXT_PTR(pve_next((e))); \ - if (__is_altacct) { \ - PVE_NEXT_SET_ALTACCT((p)); \ - } else { \ - PVE_NEXT_CLR_ALTACCT((p)); \ - } \ - } \ +#define pvh_add(h, e) \ + do { \ + assert(!pvh_test_type((h), PVH_TYPE_PTEP)); \ + pve_next(e) = pvh_list(h); \ + pvh_update_head((h), (e), PVH_TYPE_PVEP); \ + } while (0) + +#define pvh_remove(h, p, e) \ + do { \ + assert(!PVE_NEXT_IS_ALTACCT(pve_next((e)))); \ + if ((p) == (h)) { \ + if (PVE_NEXT_PTR(pve_next((e))) == PV_ENTRY_NULL) { \ + pvh_update_head((h), PV_ENTRY_NULL, PVH_TYPE_NULL); \ + } else { \ + pvh_update_head((h), PVE_NEXT_PTR(pve_next((e))), PVH_TYPE_PVEP); \ + } \ + } else { \ + /* \ + * PMAP LEDGERS: \ + * preserve the "alternate accounting" bit \ + * when updating "p" (the previous entry's \ + * "pve_next"). \ + */ \ + boolean_t __is_altacct; \ + __is_altacct = PVE_NEXT_IS_ALTACCT(*(p)); \ + *(p) = PVE_NEXT_PTR(pve_next((e))); \ + if (__is_altacct) { \ + PVE_NEXT_SET_ALTACCT((p)); \ + } else { \ + PVE_NEXT_CLR_ALTACCT((p)); \ + } \ + } \ } while (0) /* PPATTR Define Macros */ -#define ppattr_set_bits(h, b) \ - do { \ +#define ppattr_set_bits(h, b) \ + do { \ while (!OSCompareAndSwap16(*(pp_attr_t *)(h), *(pp_attr_t *)(h) | (b), (pp_attr_t *)(h))); \ } while (0) -#define ppattr_clear_bits(h, b) \ - do { \ +#define ppattr_clear_bits(h, b) \ + do { \ while (!OSCompareAndSwap16(*(pp_attr_t *)(h), *(pp_attr_t *)(h) & ~(b), (pp_attr_t *)(h))); \ } while (0) -#define ppattr_test_bits(h, b) \ +#define ppattr_test_bits(h, b) \ ((*(pp_attr_t *)(h) & (b)) == (b)) -#define pa_set_bits(x, b) \ - do { \ - if (pa_valid(x)) \ - ppattr_set_bits(&pp_attr_table[pa_index(x)], \ - (b)); \ +#define pa_set_bits(x, b) \ + do { \ + if (pa_valid(x)) \ + ppattr_set_bits(&pp_attr_table[pa_index(x)], \ + (b)); \ } while (0) -#define pa_test_bits(x, b) \ +#define pa_test_bits(x, b) \ (pa_valid(x) ? 
ppattr_test_bits(&pp_attr_table[pa_index(x)],\ (b)) : FALSE) -#define pa_clear_bits(x, b) \ - do { \ - if (pa_valid(x)) \ - ppattr_clear_bits(&pp_attr_table[pa_index(x)], \ - (b)); \ +#define pa_clear_bits(x, b) \ + do { \ + if (pa_valid(x)) \ + ppattr_clear_bits(&pp_attr_table[pa_index(x)], \ + (b)); \ } while (0) -#define pa_set_modify(x) \ +#define pa_set_modify(x) \ pa_set_bits(x, PP_ATTR_MODIFIED) -#define pa_clear_modify(x) \ +#define pa_clear_modify(x) \ pa_clear_bits(x, PP_ATTR_MODIFIED) -#define pa_set_reference(x) \ +#define pa_set_reference(x) \ pa_set_bits(x, PP_ATTR_REFERENCED) -#define pa_clear_reference(x) \ +#define pa_clear_reference(x) \ pa_clear_bits(x, PP_ATTR_REFERENCED) @@ -676,21 +701,21 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #define CLR_REUSABLE_PAGE(pai) \ ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_REUSABLE) -#define IS_ALTACCT_PAGE(pai, pve_p) \ - (((pve_p) == NULL) \ - ? ppattr_test_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT) \ +#define IS_ALTACCT_PAGE(pai, pve_p) \ + (((pve_p) == NULL) \ + ? 
ppattr_test_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT) \ : PVE_NEXT_IS_ALTACCT(pve_next((pve_p)))) -#define SET_ALTACCT_PAGE(pai, pve_p) \ - if ((pve_p) == NULL) { \ - ppattr_set_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT); \ - } else { \ - PVE_NEXT_SET_ALTACCT(&pve_next((pve_p))); \ +#define SET_ALTACCT_PAGE(pai, pve_p) \ + if ((pve_p) == NULL) { \ + ppattr_set_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT); \ + } else { \ + PVE_NEXT_SET_ALTACCT(&pve_next((pve_p))); \ } -#define CLR_ALTACCT_PAGE(pai, pve_p) \ - if ((pve_p) == NULL) { \ - ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT);\ - } else { \ - PVE_NEXT_CLR_ALTACCT(&pve_next((pve_p))); \ +#define CLR_ALTACCT_PAGE(pai, pve_p) \ + if ((pve_p) == NULL) { \ + ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT); \ + } else { \ + PVE_NEXT_CLR_ALTACCT(&pve_next((pve_p))); \ } #define IS_REFFAULT_PAGE(pai) \ @@ -707,171 +732,184 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap; #define CLR_MODFAULT_PAGE(pai) \ ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_MODFAULT) +#define tte_get_ptd(tte) \ + ((struct pt_desc *)(pvh_list(pai_to_pvh(pa_index((vm_offset_t)((tte) & ~PAGE_MASK)))))) + #if (__ARM_VMSA__ == 7) -#define tte_index(pmap, addr) \ +#define tte_index(pmap, addr) \ ttenum((addr)) -#define tte_get_ptd(tte) \ - ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)((tte) & ~PAGE_MASK)))))) & PVH_LIST_MASK)) - #else -#define tt0_index(pmap, addr) \ +#define tt0_index(pmap, addr) \ (((addr) & ARM_TT_L0_INDEX_MASK) >> ARM_TT_L0_SHIFT) -#define tt1_index(pmap, addr) \ +#define tt1_index(pmap, addr) \ (((addr) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT) -#define tt2_index(pmap, addr) \ +#define tt2_index(pmap, addr) \ (((addr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT) -#define tt3_index(pmap, addr) \ +#define tt3_index(pmap, addr) \ (((addr) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT) -#define tte_index(pmap, addr) \ +#define tte_index(pmap, addr) \ (((addr) & ARM_TT_L2_INDEX_MASK) >> 
ARM_TT_L2_SHIFT) -#define tte_get_ptd(tte) \ - ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)((tte) & ~PAGE_MASK)))))) & PVH_LIST_MASK)) - #endif /* * Lock on pmap system */ -#define PMAP_LOCK_INIT(pmap) { \ - simple_lock_init(&(pmap)->lock, 0); \ +#define PMAP_LOCK_INIT(pmap) { \ + simple_lock_init(&(pmap)->lock, 0); \ } -#define PMAP_LOCK(pmap) { \ - simple_lock(&(pmap)->lock); \ +#define PMAP_LOCK(pmap) { \ + pmap_simple_lock(&(pmap)->lock); \ } -#define PMAP_UNLOCK(pmap) { \ - simple_unlock(&(pmap)->lock); \ +#define PMAP_UNLOCK(pmap) { \ + pmap_simple_unlock(&(pmap)->lock); \ } #if MACH_ASSERT -#define PMAP_ASSERT_LOCKED(pmap) { \ - simple_lock_assert(&(pmap)->lock, LCK_ASSERT_OWNED); \ +#define PMAP_ASSERT_LOCKED(pmap) { \ + simple_lock_assert(&(pmap)->lock, LCK_ASSERT_OWNED); \ } #else #define PMAP_ASSERT_LOCKED(pmap) #endif -/* - * Each entry in the pv_head_table is locked by a bit in the - * pv lock array, which is stored in the region preceding pv_head_table. - * The lock bits are accessed by the physical address of the page they lock. 
- */ -#define LOCK_PVH(index) { \ - hw_lock_bit((hw_lock_bit_t *) \ - ((unsigned int*)pv_head_table)-1-(index>>5), \ - (index&0x1F)); \ - } +#if defined(__arm64__) +#define PVH_LOCK_WORD 1 /* Assumes little-endian */ +#else +#define PVH_LOCK_WORD 0 +#endif -#define UNLOCK_PVH(index) { \ - hw_unlock_bit((hw_lock_bit_t *) \ - ((unsigned int*)pv_head_table)-1-(index>>5), \ - (index&0x1F)); \ - } +#define ASSERT_PVH_LOCKED(index) \ + do { \ + assert((vm_offset_t)(pv_head_table[index]) & PVH_FLAG_LOCK); \ + } while (0) -#define ASSERT_PVH_LOCKED(index) { \ - assert(*(((unsigned int*)pv_head_table)-1-(index>>5)) & (1 << (index & 0x1F))); \ -} +#define LOCK_PVH(index) \ + do { \ + pmap_lock_bit((uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD, PVH_LOCK_BIT - (PVH_LOCK_WORD * 32)); \ + } while (0) + +#define UNLOCK_PVH(index) \ + do { \ + ASSERT_PVH_LOCKED(index); \ + pmap_unlock_bit((uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD, PVH_LOCK_BIT - (PVH_LOCK_WORD * 32)); \ + } while (0) #define PMAP_UPDATE_TLBS(pmap, s, e) { \ - flush_mmu_tlb_region_asid(s, (unsigned)(e - s), pmap); \ + flush_mmu_tlb_region_asid_async(s, (unsigned)(e - s), pmap); \ + sync_tlb_flush(); \ } #ifdef __ARM_L1_PTW__ -#define FLUSH_PTE_RANGE(spte, epte) \ - __asm__ volatile("dsb ish"); +#define FLUSH_PTE_RANGE(spte, epte) \ + __builtin_arm_dmb(DMB_ISH); -#define FLUSH_PTE(pte_p) \ - __asm__ volatile("dsb ish"); +#define FLUSH_PTE(pte_p) \ + __builtin_arm_dmb(DMB_ISH); -#else +#define FLUSH_PTE_STRONG(pte_p) \ + __builtin_arm_dsb(DSB_ISH); + +#define FLUSH_PTE_RANGE_STRONG(spte, epte) \ + __builtin_arm_dsb(DSB_ISH); -#define FLUSH_PTE_RANGE(spte, epte) \ +#else /* __ARM_L1_PTW */ + +#define FLUSH_PTE_RANGE(spte, epte) \ CleanPoU_DcacheRegion((vm_offset_t)spte, \ (vm_offset_t)epte - (vm_offset_t)spte); -#define FLUSH_PTE(pte_p) \ - CleanPoU_DcacheRegion((vm_offset_t)pte_p, sizeof(pt_entry_t)); -#endif +#define FLUSH_PTE(pte_p) \ + __unreachable_ok_push \ + if (TEST_PAGE_RATIO_4) \ + 
FLUSH_PTE_RANGE((pte_p), (pte_p) + 4); \ + else \ + FLUSH_PTE_RANGE((pte_p), (pte_p) + 1); \ + CleanPoU_DcacheRegion((vm_offset_t)pte_p, sizeof(pt_entry_t)); \ + __unreachable_ok_pop -#define WRITE_PTE(pte_p, pte_entry) \ - __unreachable_ok_push \ - if (TEST_PAGE_RATIO_4) { \ - do { \ - if (((unsigned)(pte_p)) & 0x1f) panic("WRITE_PTE\n"); \ - if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) { \ - *(pte_p) = (pte_entry); \ - *((pte_p)+1) = (pte_entry); \ - *((pte_p)+2) = (pte_entry); \ - *((pte_p)+3) = (pte_entry); \ - } else { \ - *(pte_p) = (pte_entry); \ - *((pte_p)+1) = (pte_entry) | 0x1000; \ - *((pte_p)+2) = (pte_entry) | 0x2000; \ - *((pte_p)+3) = (pte_entry) | 0x3000; \ - } \ - FLUSH_PTE_RANGE((pte_p),((pte_p)+4)); \ - } while(0); \ - } else { \ - do { \ - *(pte_p) = (pte_entry); \ - FLUSH_PTE(pte_p); \ - } while(0); \ - } \ - __unreachable_ok_pop +#define FLUSH_PTE_STRONG(pte_p) FLUSH_PTE(pte_p) -#define WRITE_PTE_FAST(pte_p, pte_entry) \ - __unreachable_ok_push \ - if (TEST_PAGE_RATIO_4) { \ - if (((unsigned)(pte_p)) & 0x1f) panic("WRITE_PTE\n"); \ - if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) { \ - *(pte_p) = (pte_entry); \ - *((pte_p)+1) = (pte_entry); \ - *((pte_p)+2) = (pte_entry); \ - *((pte_p)+3) = (pte_entry); \ - } else { \ - *(pte_p) = (pte_entry); \ - *((pte_p)+1) = (pte_entry) | 0x1000; \ - *((pte_p)+2) = (pte_entry) | 0x2000; \ - *((pte_p)+3) = (pte_entry) | 0x3000; \ - } \ - } else { \ - *(pte_p) = (pte_entry); \ - } \ - __unreachable_ok_pop +#define FLUSH_PTE_RANGE_STRONG(spte, epte) FLUSH_PTE_RANGE(spte, epte) +#endif /* !defined(__ARM_L1_PTW) */ + +#define WRITE_PTE_FAST(pte_p, pte_entry) \ + __unreachable_ok_push \ + if (TEST_PAGE_RATIO_4) { \ + if (((unsigned)(pte_p)) & 0x1f) \ + panic("WRITE_PTE\n"); \ + if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) { \ + *(pte_p) = (pte_entry); \ + *((pte_p)+1) = (pte_entry); \ + *((pte_p)+2) = (pte_entry); \ + *((pte_p)+3) = (pte_entry); \ + } else { \ 
+ *(pte_p) = (pte_entry); \ + *((pte_p)+1) = (pte_entry) | 0x1000; \ + *((pte_p)+2) = (pte_entry) | 0x2000; \ + *((pte_p)+3) = (pte_entry) | 0x3000; \ + } \ + } else { \ + *(pte_p) = (pte_entry); \ + } \ + __unreachable_ok_pop + +#define WRITE_PTE(pte_p, pte_entry) \ + WRITE_PTE_FAST(pte_p, pte_entry); \ + FLUSH_PTE(pte_p); + +#define WRITE_PTE_STRONG(pte_p, pte_entry) \ + WRITE_PTE_FAST(pte_p, pte_entry); \ + FLUSH_PTE_STRONG(pte_p); /* * Other useful macros. */ -#define current_pmap() \ +#define current_pmap() \ (vm_map_pmap(current_thread()->map)) -#define PMAP_IS_VALID(x) (TRUE) -#ifdef PMAP_TRACES -unsigned int pmap_trace = 0; +#define VALIDATE_USER_PMAP(x) +#define VALIDATE_PMAP(x) +#define VALIDATE_LEDGER(x) + + +#if DEVELOPMENT || DEBUG + +/* + * Trace levels are controlled by a bitmask in which each + * level can be enabled/disabled by the (1< 7) static inline tt_entry_t *pmap_tt1e( @@ -989,9 +1023,6 @@ static inline pt_entry_t *pmap_tt3e( static void pmap_unmap_sharedpage( pmap_t pmap); -static void pmap_sharedpage_flush_32_to_64( - void); - static boolean_t pmap_is_64bit(pmap_t); @@ -1025,6 +1056,13 @@ static void pmap_pages_free( pmap_paddr_t pa, unsigned size); +static void pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes); + +static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes); + + +static void pmap_trim_self(pmap_t pmap); +static void pmap_trim_subord(pmap_t subord); #define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \ static __return_type __function_name##_internal __function_args; @@ -1076,8 +1114,6 @@ PMAP_SUPPORT_PROTOTYPES( void, pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX); - - PMAP_SUPPORT_PROTOTYPES( kern_return_t, pmap_enter_options, (pmap_t pmap, @@ -1101,7 +1137,7 @@ pmap_find_phys, (pmap_t pmap, #if (__ARM_VMSA__ > 7) PMAP_SUPPORT_PROTOTYPES( -void, +kern_return_t, pmap_insert_sharedpage, (pmap_t pmap), PMAP_INSERT_SHAREDPAGE_INDEX); #endif @@ -1149,11 +1185,10 @@ 
pmap_query_page_info, (pmap_t pmap, int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX); PMAP_SUPPORT_PROTOTYPES( -boolean_t, +mach_vm_size_t, pmap_query_resident, (pmap_t pmap, vm_map_address_t start, vm_map_address_t end, - mach_vm_size_t *resident_bytes_p, mach_vm_size_t *compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX); PMAP_SUPPORT_PROTOTYPES( @@ -1189,7 +1224,6 @@ pmap_set_process, (pmap_t pmap, char *procname), PMAP_SET_PROCESS_INDEX); #endif - PMAP_SUPPORT_PROTOTYPES( void, pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX); @@ -1223,6 +1257,25 @@ PMAP_SUPPORT_PROTOTYPES( void, pmap_switch_user_ttb, (pmap_t pmap), PMAP_SWITCH_USER_TTB_INDEX); +PMAP_SUPPORT_PROTOTYPES( +void, +pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX); + + +PMAP_SUPPORT_PROTOTYPES( +void, +pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX); + +PMAP_SUPPORT_PROTOTYPES( +void, +pmap_trim, (pmap_t grand, + pmap_t subord, + addr64_t vstart, + addr64_t nstart, + uint64_t size), PMAP_TRIM_INDEX); + + + void pmap_footprint_suspend(vm_map_t map, @@ -1233,6 +1286,7 @@ PMAP_SUPPORT_PROTOTYPES( boolean_t suspend), PMAP_FOOTPRINT_SUSPEND_INDEX); + #if CONFIG_PGTRACE boolean_t pgtrace_enabled = 0; @@ -1294,12 +1348,11 @@ int pt_fake_zone_index = -1; /* index of pmap fake zone */ /* * Allocates and initializes a per-CPU data structure for the pmap. */ -static void +MARK_AS_PMAP_TEXT static void pmap_cpu_data_init_internal(unsigned int cpu_number) { - pmap_cpu_data_t * pmap_cpu_data = NULL; + pmap_cpu_data_t * pmap_cpu_data = pmap_get_cpu_data(); - pmap_cpu_data = pmap_get_cpu_data(); pmap_cpu_data->cpu_number = cpu_number; } @@ -1351,7 +1404,7 @@ pmap_pages_reclaim( * If no eligible page were found in the pt page list, panic. 
*/ - simple_lock(&pmap_pages_lock); + pmap_simple_lock(&pmap_pages_lock); pmap_pages_request_count++; pmap_pages_request_acum++; @@ -1362,30 +1415,28 @@ pmap_pages_reclaim( page_entry = pmap_pages_reclaim_list; pmap_pages_reclaim_list = pmap_pages_reclaim_list->next; - simple_unlock(&pmap_pages_lock); + pmap_simple_unlock(&pmap_pages_lock); return((pmap_paddr_t)ml_static_vtop((vm_offset_t)page_entry)); } - simple_unlock(&pmap_pages_lock); + pmap_simple_unlock(&pmap_pages_lock); - simple_lock(&pt_pages_lock); + pmap_simple_lock(&pt_pages_lock); ptdp = (pt_desc_t *)queue_first(&pt_page_list); found_page = FALSE; while (!queue_end(&pt_page_list, (queue_entry_t)ptdp)) { - if ((ptdp->pmap != kernel_pmap) - && (ptdp->pmap->nested == FALSE) - && (simple_lock_try(&ptdp->pmap->lock))) { + if ((ptdp->pmap->nested == FALSE) + && (pmap_simple_lock_try(&ptdp->pmap->lock))) { + assert(ptdp->pmap != kernel_pmap); unsigned refcnt_acc = 0; unsigned wiredcnt_acc = 0; for (i = 0 ; i < PT_INDEX_MAX ; i++) { - if (ptdp->pt_cnt[i].refcnt & PT_DESC_REFCOUNT) { - /* Do not attempt to free a page that contains an L2 table - * or is currently being operated on by pmap_enter(), - * which can drop the pmap lock. */ + if (ptdp->pt_cnt[i].refcnt == PT_DESC_REFCOUNT) { + /* Do not attempt to free a page that contains an L2 table */ refcnt_acc = 0; break; } @@ -1399,7 +1450,7 @@ pmap_pages_reclaim( * with it while we do that. 
*/ break; } - simple_unlock(&ptdp->pmap->lock); + pmap_simple_unlock(&ptdp->pmap->lock); } ptdp = (pt_desc_t *)queue_next((queue_t)ptdp); } @@ -1414,12 +1465,18 @@ pmap_pages_reclaim( tt_entry_t *tte_p; uint32_t rmv_spte=0; - simple_unlock(&pt_pages_lock); + pmap_simple_unlock(&pt_pages_lock); pmap = ptdp->pmap; PMAP_ASSERT_LOCKED(pmap); // pmap lock should be held from loop above for (i = 0 ; i < PT_INDEX_MAX ; i++) { va = ptdp->pt_map[i].va; + /* If the VA is bogus, this may represent an unallocated region + * or one which is in transition (already being freed or expanded). + * Don't try to remove mappings here. */ + if (va == (vm_offset_t)-1) + continue; + tte_p = pmap_tte(pmap, va); if ((tte_p != (tt_entry_t *) NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) { @@ -1451,28 +1508,29 @@ pmap_pages_reclaim( panic("pmap_pages_reclaim(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt); #if (__ARM_VMSA__ == 7) pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL); - flush_mmu_tlb_entry((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->asid & 0xff)); - flush_mmu_tlb_entry(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->asid & 0xff)); - flush_mmu_tlb_entry(((va & ~ARM_TT_L1_PT_OFFMASK) + 2*ARM_TT_L1_SIZE)| (pmap->asid & 0xff)); - flush_mmu_tlb_entry(((va & ~ARM_TT_L1_PT_OFFMASK) + 3*ARM_TT_L1_SIZE)| (pmap->asid & 0xff)); + flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->asid & 0xff)); + flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->asid & 0xff)); + flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2*ARM_TT_L1_SIZE)| (pmap->asid & 0xff)); + flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3*ARM_TT_L1_SIZE)| (pmap->asid & 0xff)); #else pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL); - flush_mmu_tlb_entry(tlbi_addr(va & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(va & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); #endif if 
(remove_count > 0) { #if (__ARM_VMSA__ == 7) - PMAP_UPDATE_TLBS(pmap, va, va+4*ARM_TT_L1_SIZE); + flush_mmu_tlb_region_asid_async(va, 4*ARM_TT_L1_SIZE, pmap); #else - PMAP_UPDATE_TLBS(pmap, va, va+ARM_TT_L2_SIZE); + flush_mmu_tlb_region_asid_async(va, ARM_TT_L2_SIZE, pmap); #endif } } } + sync_tlb_flush(); // Undo the lock we grabbed when we found ptdp above PMAP_UNLOCK(pmap); } - simple_lock(&pmap_pages_lock); + pmap_simple_lock(&pmap_pages_lock); } } @@ -1534,7 +1592,7 @@ pmap_pages_free( pmap_paddr_t pa, unsigned size) { - simple_lock(&pmap_pages_lock); + pmap_simple_lock(&pmap_pages_lock); if (pmap_pages_request_count != 0) { page_free_entry_t *page_entry; @@ -1543,12 +1601,12 @@ pmap_pages_free( page_entry = (page_free_entry_t *)phystokv(pa); page_entry->next = pmap_pages_reclaim_list; pmap_pages_reclaim_list = page_entry; - simple_unlock(&pmap_pages_lock); + pmap_simple_unlock(&pmap_pages_lock); return; } - simple_unlock(&pmap_pages_lock); + pmap_simple_unlock(&pmap_pages_lock); vm_page_t m; pmap_paddr_t pa_max; @@ -1610,7 +1668,7 @@ alloc_asid( { unsigned int asid_bitmap_index; - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); for (asid_bitmap_index = 0; asid_bitmap_index < (MAX_ASID / (sizeof(uint32_t) * NBBY)); asid_bitmap_index++) { unsigned int temp = ffs(asid_bitmap[asid_bitmap_index]); if (temp > 0) { @@ -1630,7 +1688,7 @@ alloc_asid( /* Grab the second ASID. */ asid_bitmap[asid_bitmap_index] &= ~(1 << temp2); #endif /* __ARM_KERNEL_PROTECT__ */ - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); /* * We should never vend out physical ASID 0 through this @@ -1646,7 +1704,7 @@ alloc_asid( return (asid_bitmap_index * sizeof(uint32_t) * NBBY + temp); } } - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); /* * ToDo: Add code to deal with pmap with no asid panic for now. Not * an issue with the small config process hard limit @@ -1662,7 +1720,7 @@ free_asid( /* Don't free up any alias of physical ASID 0. 
*/ assert((asid % ARM_MAX_ASID) != 0); - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); setbit(asid, (int *) asid_bitmap); #if __ARM_KERNEL_PROTECT__ @@ -1671,17 +1729,20 @@ free_asid( setbit(asid | 1, (int *) asid_bitmap); #endif /* __ARM_KERNEL_PROTECT__ */ - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); } -#define PV_LOW_WATER_MARK_DEFAULT 0x200 -#define PV_KERN_LOW_WATER_MARK_DEFAULT 0x200 -#define PV_ALLOC_CHUNK_INITIAL 0x200 -#define PV_KERN_ALLOC_CHUNK_INITIAL 0x200 +#ifndef PMAP_PV_LOAD_FACTOR +#define PMAP_PV_LOAD_FACTOR 1 +#endif + +#define PV_LOW_WATER_MARK_DEFAULT (0x200 * PMAP_PV_LOAD_FACTOR) +#define PV_KERN_LOW_WATER_MARK_DEFAULT (0x200 * PMAP_PV_LOAD_FACTOR) +#define PV_ALLOC_CHUNK_INITIAL (0x200 * PMAP_PV_LOAD_FACTOR) +#define PV_KERN_ALLOC_CHUNK_INITIAL (0x200 * PMAP_PV_LOAD_FACTOR) #define PV_ALLOC_INITIAL_TARGET (PV_ALLOC_CHUNK_INITIAL * 5) #define PV_KERN_ALLOC_INITIAL_TARGET (PV_KERN_ALLOC_CHUNK_INITIAL) - uint32_t pv_free_count MARK_AS_PMAP_DATA = 0; uint32_t pv_page_count MARK_AS_PMAP_DATA = 0; uint32_t pv_kern_free_count MARK_AS_PMAP_DATA = 0; @@ -1730,12 +1791,13 @@ pv_alloc( unsigned int pai, pv_entry_t **pvepp) { - PMAP_ASSERT_LOCKED(pmap); + if (pmap != NULL) + PMAP_ASSERT_LOCKED(pmap); ASSERT_PVH_LOCKED(pai); PV_ALLOC(pvepp); if (PV_ENTRY_NULL == *pvepp) { - if (kernel_pmap == pmap) { + if ((pmap == NULL) || (kernel_pmap == pmap)) { PV_KERN_ALLOC(pvepp); @@ -1749,7 +1811,8 @@ pv_alloc( kern_return_t ret; UNLOCK_PVH(pai); - PMAP_UNLOCK(pmap); + if (pmap != NULL) + PMAP_UNLOCK(pmap); ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT); @@ -1782,7 +1845,8 @@ pv_alloc( pv_e++; } PV_KERN_FREE_LIST(pv_eh, pv_et, pv_cnt); - PMAP_LOCK(pmap); + if (pmap != NULL) + PMAP_LOCK(pmap); LOCK_PVH(pai); return FALSE; } @@ -1852,11 +1916,18 @@ pv_list_free( PV_FREE_LIST(pvehp, pvetp, cnt); } - +static inline void +pv_water_mark_check(void) +{ + if ((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < 
pv_kern_low_water_mark)) { + if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) + thread_wakeup(&mapping_replenish_event); + } +} static inline void PV_ALLOC(pv_entry_t **pv_ep) { assert(*pv_ep == PV_ENTRY_NULL); - simple_lock(&pv_free_list_lock); + pmap_simple_lock(&pv_free_list_lock); /* * If the kernel reserved pool is low, let non-kernel mappings allocate * synchronously, possibly subject to a throttle. @@ -1867,25 +1938,20 @@ static inline void PV_ALLOC(pv_entry_t **pv_ep) { pv_free_count--; } - simple_unlock(&pv_free_list_lock); - - if ((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < pv_kern_low_water_mark)) { - if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) - thread_wakeup(&mapping_replenish_event); - } + pmap_simple_unlock(&pv_free_list_lock); } static inline void PV_FREE_LIST(pv_entry_t *pv_eh, pv_entry_t *pv_et, int pv_cnt) { - simple_lock(&pv_free_list_lock); + pmap_simple_lock(&pv_free_list_lock); pv_et->pve_next = (pv_entry_t *)pv_free_list; pv_free_list = pv_eh; pv_free_count += pv_cnt; - simple_unlock(&pv_free_list_lock); + pmap_simple_unlock(&pv_free_list_lock); } static inline void PV_KERN_ALLOC(pv_entry_t **pv_e) { assert(*pv_e == PV_ENTRY_NULL); - simple_lock(&pv_kern_free_list_lock); + pmap_simple_lock(&pv_kern_free_list_lock); if ((*pv_e = pv_kern_free_list) != 0) { pv_kern_free_list = (pv_entry_t *)(*pv_e)->pve_next; @@ -1894,21 +1960,15 @@ static inline void PV_KERN_ALLOC(pv_entry_t **pv_e) { pmap_kern_reserve_alloc_stat++; } - simple_unlock(&pv_kern_free_list_lock); - - if (pv_kern_free_count < pv_kern_low_water_mark) { - if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) { - thread_wakeup(&mapping_replenish_event); - } - } + pmap_simple_unlock(&pv_kern_free_list_lock); } static inline void PV_KERN_FREE_LIST(pv_entry_t *pv_eh, pv_entry_t *pv_et, int pv_cnt) { - simple_lock(&pv_kern_free_list_lock); + pmap_simple_lock(&pv_kern_free_list_lock); pv_et->pve_next = 
pv_kern_free_list; pv_kern_free_list = pv_eh; pv_kern_free_count += pv_cnt; - simple_unlock(&pv_kern_free_list_lock); + pmap_simple_unlock(&pv_kern_free_list_lock); } static inline void pmap_pv_throttle(__unused pmap_t p) { @@ -1929,7 +1989,7 @@ static inline void pmap_pv_throttle(__unused pmap_t p) { * Creates a target number of free pv_entry_t objects for the kernel free list * and the general free list. */ -static kern_return_t +MARK_AS_PMAP_TEXT static kern_return_t mapping_free_prime_internal(void) { unsigned j; @@ -2069,7 +2129,7 @@ void mapping_adjust(void) { /* * Fills the kernel and general PV free lists back up to their low watermarks. */ -static kern_return_t +MARK_AS_PMAP_TEXT static kern_return_t mapping_replenish_internal(void) { pv_entry_t *pv_e; @@ -2190,14 +2250,13 @@ ptd_bootstrap( } static pt_desc_t -*ptd_alloc( - pmap_t pmap) +*ptd_alloc_unlinked(void) { pt_desc_t *ptdp; unsigned i; if (!ptd_preboot) - simple_lock(&ptd_free_list_lock); + pmap_simple_lock(&ptd_free_list_lock); if (ptd_free_count == 0) { unsigned int ptd_cnt; @@ -2212,7 +2271,7 @@ static pt_desc_t pmap_paddr_t pa; kern_return_t ret; - simple_unlock(&ptd_free_list_lock); + pmap_simple_unlock(&ptd_free_list_lock); if (pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT) != KERN_SUCCESS) { ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_RECLAIM_NOWAIT); @@ -2220,7 +2279,7 @@ static pt_desc_t } ptdp = (pt_desc_t *)phystokv(pa); - simple_lock(&ptd_free_list_lock); + pmap_simple_lock(&ptd_free_list_lock); ptdp_next = ptdp; ptd_cnt = PAGE_SIZE/sizeof(pt_desc_t); } @@ -2242,52 +2301,60 @@ static pt_desc_t } if (!ptd_preboot) - simple_unlock(&ptd_free_list_lock); + pmap_simple_unlock(&ptd_free_list_lock); ptdp->pt_page.next = NULL; ptdp->pt_page.prev = NULL; - ptdp->pmap = pmap; + ptdp->pmap = NULL; for (i = 0 ; i < PT_INDEX_MAX ; i++) { - ptdp->pt_map[i].va = 0; + ptdp->pt_map[i].va = (vm_offset_t)-1; ptdp->pt_cnt[i].refcnt = 0; ptdp->pt_cnt[i].wiredcnt = 0; } - 
simple_lock(&pt_pages_lock); - queue_enter(&pt_page_list, ptdp, pt_desc_t *, pt_page); - simple_unlock(&pt_pages_lock); - - pmap_tt_ledger_credit(pmap, sizeof(*ptdp)); return(ptdp); } +static inline pt_desc_t* +ptd_alloc(pmap_t pmap) +{ + pt_desc_t *ptdp = ptd_alloc_unlinked(); + + ptdp->pmap = pmap; + if (pmap != kernel_pmap) { + /* We should never try to reclaim kernel pagetable pages in + * pmap_pages_reclaim(), so don't enter them into the list. */ + pmap_simple_lock(&pt_pages_lock); + queue_enter(&pt_page_list, ptdp, pt_desc_t *, pt_page); + pmap_simple_unlock(&pt_pages_lock); + } + + pmap_tt_ledger_credit(pmap, sizeof(*ptdp)); + return ptdp; +} + static void -ptd_deallocate( - pt_desc_t *ptdp) +ptd_deallocate(pt_desc_t *ptdp) { - unsigned i; pmap_t pmap = ptdp->pmap; if (ptd_preboot) { panic("ptd_deallocate(): early boot\n"); } - for (i = 0 ; i < PT_INDEX_MAX ; i++) { - if (ptdp->pt_cnt[i].refcnt != 0) - panic("ptd_deallocate(): ptdp=%p refcnt=0x%x \n", ptdp, ptdp->pt_cnt[i].refcnt); - } if (ptdp->pt_page.next != NULL) { - simple_lock(&pt_pages_lock); + pmap_simple_lock(&pt_pages_lock); queue_remove(&pt_page_list, ptdp, pt_desc_t *, pt_page); - simple_unlock(&pt_pages_lock); + pmap_simple_unlock(&pt_pages_lock); } - simple_lock(&ptd_free_list_lock); + pmap_simple_lock(&ptd_free_list_lock); (*(void **)ptdp) = (void *)ptd_free_list; ptd_free_list = (pt_desc_t *)ptdp; ptd_free_count++; - simple_unlock(&ptd_free_list_lock); - pmap_tt_ledger_debit(pmap, sizeof(*ptdp)); + pmap_simple_unlock(&ptd_free_list_lock); + if (pmap != NULL) + pmap_tt_ledger_debit(pmap, sizeof(*ptdp)); } static void @@ -2305,14 +2372,13 @@ ptd_init( assert(level == 2); ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(ARM_TT_L1_PT_OFFMASK); #else - if (level == 3) { - ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L2_OFFMASK ; - } else if (level == 2) - ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L1_OFFMASK ; + if (level == 3) 
+ ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L2_OFFMASK; + else if (level == 2) + ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L1_OFFMASK; #endif if (level < PMAP_TT_MAX_LEVEL) ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt = PT_DESC_REFCOUNT; - } @@ -2379,6 +2445,7 @@ static inline tt_entry_t * pmap_tt1e(pmap_t pmap, vm_map_address_t addr) { + /* Level 0 currently unused */ #if __ARM64_TWO_LEVEL_PMAP__ #pragma unused(pmap, addr) panic("pmap_tt1e called on a two level pmap"); @@ -2430,23 +2497,12 @@ pmap_tt3e( tt_entry_t *ttp; tt_entry_t tte; - /* Level 0 currently unused */ -#if __ARM64_TWO_LEVEL_PMAP__ ttp = pmap_tt2e(pmap, addr); - tte = *ttp; -#else - /* Get first-level (1GB) entry */ - ttp = pmap_tt1e(pmap, addr); - tte = *ttp; - #if MACH_ASSERT - if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) - panic("Attempt to demote L1 block (?!): pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte); - #endif - if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID)) + if (ttp == PT_ENTRY_NULL) return (PT_ENTRY_NULL); - tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, addr)]; -#endif + tte = *ttp; + #if MACH_ASSERT if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID)) panic("Attempt to demote L2 block: pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte); @@ -2563,7 +2619,7 @@ pmap_map_bd_with_options( panic("pmap_map_bd"); } assert(!ARM_PTE_IS_COMPRESSED(*ptep)); - WRITE_PTE(ptep, tmplate); + WRITE_PTE_STRONG(ptep, tmplate); pte_increment_pa(tmplate); vaddr += PAGE_SIZE; @@ -2612,7 +2668,7 @@ pmap_map_bd( panic("pmap_map_bd"); } assert(!ARM_PTE_IS_COMPRESSED(*ptep)); - WRITE_PTE(ptep, tmplate); + WRITE_PTE_STRONG(ptep, tmplate); pte_increment_pa(tmplate); vaddr += PAGE_SIZE; @@ -2693,7 +2749,7 @@ pmap_map_high_window_bd( #if __ARM_KERNEL_PROTECT__ pte 
|= ARM_PTE_NG; #endif /* __ARM_KERNEL_PROTECT__ */ - WRITE_PTE(ptep, pte); + WRITE_PTE_STRONG(ptep, pte); } PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len); #if KASAN @@ -2704,18 +2760,12 @@ pmap_map_high_window_bd( #define PMAP_ALIGN(addr, align) ((addr) + ((align) - 1) & ~((align) - 1)) -typedef struct pmap_io_range -{ - uint64_t addr; - uint32_t len; - uint32_t wimg; -} __attribute__((packed)) pmap_io_range_t; - -static unsigned int +static vm_size_t pmap_compute_io_rgns(void) { DTEntry entry; pmap_io_range_t *ranges; + uint64_t rgn_end; void *prop = NULL; int err; unsigned int prop_size; @@ -2723,23 +2773,22 @@ pmap_compute_io_rgns(void) err = DTLookupEntry(NULL, "/defaults", &entry); assert(err == kSuccess); - if (kSuccess != DTGetProperty(entry, "pmap-io-granule", &prop, &prop_size)) - return 0; - - io_rgn_granule = *((uint32_t*)prop); - if (kSuccess != DTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size)) return 0; - if ((io_rgn_granule == 0) || (io_rgn_granule & PAGE_MASK)) - panic("pmap I/O region granularity is not page-aligned!\n"); - ranges = prop; for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) { + if (ranges[i].addr & PAGE_MASK) + panic("pmap I/O region %u addr 0x%llx is not page-aligned", i, ranges[i].addr); + if (ranges[i].len & PAGE_MASK) + panic("pmap I/O region %u length 0x%x is not page-aligned", i, ranges[i].len); + if (os_add_overflow(ranges[i].addr, ranges[i].len, &rgn_end)) + panic("pmap I/O region %u addr 0x%llx length 0x%x wraps around", i, ranges[i].addr, ranges[i].len); if ((i == 0) || (ranges[i].addr < io_rgn_start)) io_rgn_start = ranges[i].addr; - if ((i == 0) || ((ranges[i].addr + ranges[i].len) > io_rgn_end)) - io_rgn_end = ranges[i].addr + ranges[i].len; + if ((i == 0) || (rgn_end > io_rgn_end)) + io_rgn_end = rgn_end; + ++num_io_rgns; } if (io_rgn_start & PAGE_MASK) @@ -2748,11 +2797,35 @@ pmap_compute_io_rgns(void) if (io_rgn_end & PAGE_MASK) panic("pmap I/O region end is not page-aligned!\n"); - if 
(((io_rgn_start < gPhysBase) && (io_rgn_end >= gPhysBase)) || - ((io_rgn_start < avail_end) && (io_rgn_end >= avail_end))) + if (((io_rgn_start <= gPhysBase) && (io_rgn_end > gPhysBase)) || + ((io_rgn_start < avail_end) && (io_rgn_end >= avail_end)) || + ((io_rgn_start > gPhysBase) && (io_rgn_end < avail_end))) panic("pmap I/O region overlaps physical memory!\n"); - return (unsigned int)((io_rgn_end - io_rgn_start) / io_rgn_granule); + return (num_io_rgns * sizeof(*ranges)); +} + +/* + * return < 0 for a < b + * 0 for a == b + * > 0 for a > b + */ +typedef int (*cmpfunc_t)(const void *a, const void *b); + +extern void +qsort(void *a, size_t n, size_t es, cmpfunc_t cmp); + +static int +cmp_io_rgns(const void *a, const void *b) +{ + const pmap_io_range_t *range_a = a; + const pmap_io_range_t *range_b = b; + if ((range_b->addr + range_b->len) <= range_a->addr) + return 1; + else if ((range_a->addr + range_a->len) <= range_b->addr) + return -1; + else + return 0; } static void @@ -2764,27 +2837,83 @@ pmap_load_io_rgns(void) int err; unsigned int prop_size; - if (io_rgn_granule == 0) + if (num_io_rgns == 0) return; - err = DTLookupEntry(NULL, "/defaults", &entry); - assert(err == kSuccess); + err = DTLookupEntry(NULL, "/defaults", &entry); + assert(err == kSuccess); err = DTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size); - assert(err == kSuccess); + assert(err == kSuccess); ranges = prop; - for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) { - if ((ranges[i].addr - io_rgn_start) % io_rgn_granule) - panic("pmap I/O region %d is not aligned to I/O granularity!\n", i); - if (ranges[i].len % io_rgn_granule) - panic("pmap I/O region %d size is not a multiple of I/O granularity!\n", i); - for (uint32_t offs = 0; offs < ranges[i].len; offs += io_rgn_granule) { - io_attr_table[(ranges[i].addr + offs - io_rgn_start) / io_rgn_granule] = - IO_ATTR_WIMG(ranges[i].wimg); - } + for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) + io_attr_table[i] 
= ranges[i]; + + qsort(io_attr_table, num_io_rgns, sizeof(*ranges), cmp_io_rgns); +} + +#if __arm64__ +/* + * pmap_get_arm64_prot + * + * return effective armv8 VMSA block protections including + * table AP/PXN/XN overrides of a pmap entry + * + */ + +uint64_t +pmap_get_arm64_prot( + pmap_t pmap, + vm_offset_t addr) +{ + uint64_t tte; + uint64_t tt_type, table_ap, table_xn, table_pxn; + uint64_t prot = 0; + + tte = *pmap_tt1e(pmap, addr); + + if (!(tte & ARM_TTE_VALID)) { + return 0; + } + + tt_type = tte & ARM_TTE_TYPE_MASK; + + if(tt_type == ARM_TTE_TYPE_BLOCK) { + return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID; + } + + table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3; + table_xn = tte & ARM_TTE_TABLE_XN; + table_pxn = tte & ARM_TTE_TABLE_PXN; + + prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? ARM_TTE_BLOCK_PNX : 0); + + tte = *pmap_tt2e(pmap, addr); + if (!(tte & ARM_TTE_VALID)) { + return 0; + } + + tt_type = tte & ARM_TTE_TYPE_MASK; + + if (tt_type == ARM_TTE_TYPE_BLOCK) { + return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID; + } + + table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3; + table_xn = tte & ARM_TTE_TABLE_XN; + table_pxn = tte & ARM_TTE_TABLE_PXN; + + prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? 
ARM_TTE_BLOCK_PNX : 0); + + tte = *pmap_tt3e(pmap, addr); + if (!(tte & ARM_TTE_VALID)) { + return 0; } + + return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID; } +#endif /* __arm64__ */ /* @@ -2815,18 +2944,16 @@ pmap_bootstrap( { pmap_paddr_t pmap_struct_start; vm_size_t pv_head_size; - vm_size_t pv_lock_table_size; vm_size_t ptd_root_table_size; vm_size_t pp_attr_table_size; vm_size_t io_attr_table_size; - unsigned int niorgns; unsigned int npages; unsigned int i; vm_map_offset_t maxoffset; -#ifdef PMAP_TRACES - if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) { +#if DEVELOPMENT || DEBUG + if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof (pmap_trace_mask))) { kprintf("Kernel traces for pmap operations enabled\n"); } #endif @@ -2843,7 +2970,6 @@ pmap_bootstrap( kernel_pmap->min = VM_MIN_KERNEL_AND_KEXT_ADDRESS; #endif kernel_pmap->max = VM_MAX_KERNEL_ADDRESS; - kernel_pmap->wired = 0; kernel_pmap->ref_count = 1; kernel_pmap->gc_status = 0; kernel_pmap->nx_enabled = TRUE; @@ -2866,20 +2992,18 @@ pmap_bootstrap( kernel_pmap->tte_index_max = (ARM_PGBYTES / sizeof(tt_entry_t)); #endif kernel_pmap->prev_tte = (tt_entry_t *) NULL; - kernel_pmap->cpu_ref = 0; PMAP_LOCK_INIT(kernel_pmap); #if (__ARM_VMSA__ == 7) simple_lock_init(&kernel_pmap->tt1_lock, 0); + kernel_pmap->cpu_ref = 0; #endif memset((void *) &kernel_pmap->stats, 0, sizeof(kernel_pmap->stats)); /* allocate space for and initialize the bookkeeping structures */ - niorgns = pmap_compute_io_rgns(); + io_attr_table_size = pmap_compute_io_rgns(); npages = (unsigned int)atop(mem_size); pp_attr_table_size = npages * sizeof(pp_attr_t); - io_attr_table_size = niorgns * sizeof(io_attr_t); - pv_lock_table_size = npages; pv_head_size = round_page(sizeof(pv_entry_t *) * npages); #if (__ARM_VMSA__ == 7) ptd_root_table_size = sizeof(pt_desc_t) * (1<<((mem_size>>30)+12)); @@ -2891,8 +3015,8 @@ pmap_bootstrap( pp_attr_table = 
(pp_attr_t *) phystokv(avail_start); avail_start = PMAP_ALIGN(avail_start + pp_attr_table_size, __alignof(pp_attr_t)); - io_attr_table = (io_attr_t *) phystokv(avail_start); - avail_start = PMAP_ALIGN(avail_start + io_attr_table_size + pv_lock_table_size, __alignof(pv_entry_t*)); + io_attr_table = (pmap_io_range_t *) phystokv(avail_start); + avail_start = PMAP_ALIGN(avail_start + io_attr_table_size, __alignof(pv_entry_t*)); pv_head_table = (pv_entry_t **) phystokv(avail_start); avail_start = PMAP_ALIGN(avail_start + pv_head_size, __alignof(pt_desc_t)); ptd_root_table = (pt_desc_t *)phystokv(avail_start); @@ -2911,9 +3035,6 @@ pmap_bootstrap( simple_lock_init(&pmaps_lock, 0); queue_init(&map_pmap_list); queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps); - queue_init(&tt_pmap_list); - tt_pmap_count = 0; - tt_pmap_max = 0; free_page_size_tt_list = TT_FREE_ENTRY_NULL; free_page_size_tt_count = 0; free_page_size_tt_max = 0; @@ -2957,6 +3078,7 @@ pmap_bootstrap( kernel_pmap->asid = 0; kernel_pmap->vasid = 0; + if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof (maxoffset))) { maxoffset = trunc_page(maxoffset); if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN)) @@ -2983,10 +3105,14 @@ pmap_bootstrap( simple_lock_init(&phys_backup_lock, 0); + #if MACH_ASSERT PE_parse_boot_argn("pmap_stats_assert", &pmap_stats_assert, sizeof (pmap_stats_assert)); + PE_parse_boot_argn("vm_footprint_suspend_allowed", + &vm_footprint_suspend_allowed, + sizeof (vm_footprint_suspend_allowed)); #endif /* MACH_ASSERT */ #if KASAN @@ -3154,7 +3280,7 @@ pmap_init( assert(hard_maxproc < MAX_ASID); #if CONFIG_PGTRACE - pmap_pgtrace_init(); + pmap_pgtrace_init(); #endif } @@ -3164,7 +3290,6 @@ pmap_verify_free( { pv_entry_t **pv_h; int pai; - boolean_t result = TRUE; pmap_paddr_t phys = ptoa(ppnum); assert(phys != vm_page_fictitious_addr); @@ -3175,9 +3300,7 @@ pmap_verify_free( pai = (int)pa_index(phys); pv_h = pai_to_pvh(pai); - result = (pvh_list(pv_h) == 
PV_ENTRY_NULL); - - return (result); + return (pvh_test_type(pv_h, PVH_TYPE_NULL)); } @@ -3197,6 +3320,33 @@ pmap_zone_init( } +void +pmap_ledger_alloc_init(size_t size) +{ + panic("%s: unsupported, " + "size=%lu", + __func__, size); +} + +ledger_t +pmap_ledger_alloc(void) +{ + ledger_t retval = NULL; + + panic("%s: unsupported", + __func__); + + return retval; +} + +void +pmap_ledger_free(ledger_t ledger) +{ + panic("%s: unsupported, " + "ledger=%p", + __func__, ledger); +} + /* * Create and return a physical map. * @@ -3209,7 +3359,7 @@ pmap_zone_init( * the map will be used in software only, and * is bounded by that size. */ -static pmap_t +MARK_AS_PMAP_TEXT static pmap_t pmap_create_internal( ledger_t ledger, vm_map_size_t size, @@ -3241,7 +3391,9 @@ pmap_create_internal( p->max = VM_MAX_ADDRESS; } - p->wired = 0; + p->nested_region_true_start = 0; + p->nested_region_true_end = ~0; + p->ref_count = 1; p->gc_status = 0; p->stamp = hw_atomic_add(&pmap_stamp, 1); @@ -3251,12 +3403,13 @@ pmap_create_internal( p->nested_pmap = PMAP_NULL; - ledger_reference(ledger); + p->ledger = ledger; PMAP_LOCK_INIT(p); #if (__ARM_VMSA__ == 7) simple_lock_init(&p->tt1_lock, 0); + p->cpu_ref = 0; #endif memset((void *) &p->stats, 0, sizeof(p->stats)); @@ -3264,6 +3417,7 @@ pmap_create_internal( p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, 0); p->ttep = ml_static_vtop((vm_offset_t)p->tte); + PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep); #if (__ARM_VMSA__ == 7) p->tte_index_max = NTTES; @@ -3271,17 +3425,13 @@ pmap_create_internal( p->tte_index_max = (PMAP_ROOT_ALLOC_SIZE / sizeof(tt_entry_t)); #endif p->prev_tte = (tt_entry_t *) NULL; - p->cpu_ref = 0; /* nullify the translation table */ for (i = 0; i < p->tte_index_max; i++) p->tte[i] = ARM_TTE_TYPE_FAULT; -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) (p->tte), PMAP_ROOT_ALLOC_SIZE); -#else - __asm__ volatile("dsb ish"); -#endif + 
FLUSH_PTE_RANGE(p->tte, p->tte + p->tte_index_max); + /* assign a asid */ p->vasid = alloc_asid(); p->asid = p->vasid % ARM_MAX_ASID; @@ -3295,19 +3445,23 @@ pmap_create_internal( p->nested_region_asid_bitmap = NULL; p->nested_region_asid_bitmap_size = 0x0UL; + p->nested_has_no_bounds_ref = false; + p->nested_no_bounds_refcnt = 0; + p->nested_bounds_set = false; + + #if MACH_ASSERT p->pmap_stats_assert = TRUE; p->pmap_pid = 0; strlcpy(p->pmap_procname, "", sizeof (p->pmap_procname)); #endif /* MACH_ASSERT */ #if DEVELOPMENT || DEBUG - p->footprint_suspended = FALSE; p->footprint_was_suspended = FALSE; #endif /* DEVELOPMENT || DEBUG */ - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); queue_enter(&map_pmap_list, p, pmap_t, pmaps); - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); return (p); } @@ -3320,18 +3474,23 @@ pmap_create( { pmap_t pmap; - PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, is_64bit); + PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, is_64bit); + + ledger_reference(ledger); pmap = pmap_create_internal(ledger, size, is_64bit); - PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, - VM_KERNEL_ADDRHIDE(pmap)); + if (pmap == PMAP_NULL) { + ledger_dereference(ledger); + } + + PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); return pmap; } #if MACH_ASSERT -static void +MARK_AS_PMAP_TEXT static void pmap_set_process_internal( __unused pmap_t pmap, __unused int pid, @@ -3342,15 +3501,21 @@ pmap_set_process_internal( return; } + VALIDATE_PMAP(pmap); + pmap->pmap_pid = pid; strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname)); - if (!strncmp(procname, "corecaptured", sizeof (pmap->pmap_procname))) { + if (pmap_ledgers_panic_leeway) { /* * XXX FBDP - * "corecaptured" somehow triggers some issues that make - * the pmap stats and ledgers to go off track, causing + * Some processes somehow trigger some issues that make + * the pmap stats 
and ledgers go off track, causing * some assertion failures and ledger panics. - * Turn that off if the terminating process is "corecaptured". + * Turn off the sanity checks if we allow some ledger leeway + * because of that. We'll still do a final check in + * pmap_check_ledgers() for discrepancies larger than the + * allowed leeway after the address space has been fully + * cleaned up. */ pmap->pmap_stats_assert = FALSE; ledger_disable_panic_on_negative(pmap->ledger, @@ -3470,6 +3635,34 @@ struct { int purgeable_nonvolatile_compressed_under; ledger_amount_t purgeable_nonvolatile_compressed_under_total; ledger_amount_t purgeable_nonvolatile_compressed_under_max; + + int network_volatile_over; + ledger_amount_t network_volatile_over_total; + ledger_amount_t network_volatile_over_max; + int network_volatile_under; + ledger_amount_t network_volatile_under_total; + ledger_amount_t network_volatile_under_max; + + int network_nonvolatile_over; + ledger_amount_t network_nonvolatile_over_total; + ledger_amount_t network_nonvolatile_over_max; + int network_nonvolatile_under; + ledger_amount_t network_nonvolatile_under_total; + ledger_amount_t network_nonvolatile_under_max; + + int network_volatile_compressed_over; + ledger_amount_t network_volatile_compressed_over_total; + ledger_amount_t network_volatile_compressed_over_max; + int network_volatile_compressed_under; + ledger_amount_t network_volatile_compressed_under_total; + ledger_amount_t network_volatile_compressed_under_max; + + int network_nonvolatile_compressed_over; + ledger_amount_t network_nonvolatile_compressed_over_total; + ledger_amount_t network_nonvolatile_compressed_over_max; + int network_nonvolatile_compressed_under; + ledger_amount_t network_nonvolatile_compressed_under_total; + ledger_amount_t network_nonvolatile_compressed_under_max; } pmap_ledgers_drift; #endif /* MACH_ASSERT */ @@ -3478,76 +3671,43 @@ struct { * Should only be called if the map contains * no valid mappings. 
*/ -static void +MARK_AS_PMAP_TEXT static void pmap_destroy_internal( pmap_t pmap) { -#if (__ARM_VMSA__ == 7) - pt_entry_t *ttep; - unsigned int i; - pmap_t tmp_pmap, tt_pmap; - queue_head_t tmp_pmap_list; - - queue_init(&tmp_pmap_list); - simple_lock(&pmaps_lock); - tt_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&tt_pmap_list)); - while (!queue_end(&tt_pmap_list, (queue_entry_t)tt_pmap)) { - if (tt_pmap->cpu_ref == 0 ) { - tmp_pmap = tt_pmap; - tt_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&tmp_pmap->pmaps)); - queue_remove(&tt_pmap_list, tmp_pmap, pmap_t, pmaps); - tt_pmap_count--; - queue_enter(&tmp_pmap_list, tmp_pmap, pmap_t, pmaps); - } else { - tmp_pmap = tt_pmap; - tt_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&tmp_pmap->pmaps)); - } - } - simple_unlock(&pmaps_lock); - - tmp_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&tmp_pmap_list)); - while (!queue_end(&tmp_pmap_list, (queue_entry_t)tmp_pmap)) { - tt_pmap = tmp_pmap; - tmp_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&tt_pmap->pmaps)); - queue_remove(&tmp_pmap_list, tt_pmap, pmap_t, pmaps); - if (tt_pmap->tte) { - pmap_tt1_deallocate(pmap, tt_pmap->tte, tt_pmap->tte_index_max*sizeof(tt_entry_t), 0); - tt_pmap->tte = (tt_entry_t *) NULL; - tt_pmap->ttep = 0; - tt_pmap->tte_index_max = 0; - } - if (tt_pmap->prev_tte) { - pmap_tt1_deallocate(pmap, tt_pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0); - tt_pmap->prev_tte = (tt_entry_t *) NULL; - } - assert((tt_free_entry_t*)pmap->tt_entry_free == NULL); - free_asid(tt_pmap->vasid); - - pmap_check_ledgers(tt_pmap); - ledger_dereference(tt_pmap->ledger); - - zfree(pmap_zone, tt_pmap); - } - if (pmap == PMAP_NULL) return; - if (hw_atomic_sub(&pmap->ref_count, 1) != 0) + VALIDATE_PMAP(pmap); + + int32_t ref_count = __c11_atomic_fetch_sub(&pmap->ref_count, 1, memory_order_relaxed) - 1; + if (ref_count > 0) return; + else if (ref_count < 0) + panic("pmap %p: refcount underflow", pmap); + else if (pmap == kernel_pmap) + panic("pmap %p: attempt to destroy kernel 
pmap", pmap); - simple_lock(&pmaps_lock); +#if (__ARM_VMSA__ == 7) + pt_entry_t *ttep; + unsigned int i; + pmap_simple_lock(&pmaps_lock); while (pmap->gc_status & PMAP_GC_INFLIGHT) { pmap->gc_status |= PMAP_GC_WAIT; assert_wait((event_t) & pmap->gc_status, THREAD_UNINT); - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); (void) thread_block(THREAD_CONTINUE_NULL); - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); } - queue_remove(&map_pmap_list, pmap, pmap_t, pmaps); - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); + + if (pmap->cpu_ref != 0) + panic("pmap_destroy(%p): cpu_ref = %u", pmap, pmap->cpu_ref); + + pmap_trim_self(pmap); /* * Free the memory maps, then the @@ -3558,133 +3718,123 @@ pmap_destroy_internal( ttep = &pmap->tte[i]; if ((*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL); - flush_mmu_tlb_entry((i<asid & 0xff)); } } PMAP_UNLOCK(pmap); - if (pmap->cpu_ref == 0) { - if (pmap->tte) { - pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max*sizeof(tt_entry_t), 0); - pmap->tte = (tt_entry_t *) NULL; - pmap->ttep = 0; - pmap->tte_index_max = 0; - } - if (pmap->prev_tte) { - pmap_tt1_deallocate(pmap, pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0); - pmap->prev_tte = (tt_entry_t *) NULL; - } - assert((tt_free_entry_t*)pmap->tt_entry_free == NULL); + if (pmap->tte) { + pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max*sizeof(tt_entry_t), 0); + pmap->tte = (tt_entry_t *) NULL; + pmap->ttep = 0; + pmap->tte_index_max = 0; + } + if (pmap->prev_tte) { + pmap_tt1_deallocate(pmap, pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0); + pmap->prev_tte = (tt_entry_t *) NULL; + } + assert((tt_free_entry_t*)pmap->tt_entry_free == NULL); - /* return its asid to the pool */ - free_asid(pmap->vasid); - pmap_check_ledgers(pmap); + flush_mmu_tlb_asid(pmap->asid); + /* return its asid to the pool */ + free_asid(pmap->vasid); + pmap_check_ledgers(pmap); - ledger_dereference(pmap->ledger); - 
if (pmap->nested_region_asid_bitmap) - kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int)); - zfree(pmap_zone, pmap); - } else { - simple_lock(&pmaps_lock); - queue_enter(&tt_pmap_list, pmap, pmap_t, pmaps); - tt_pmap_count++; - if (tt_pmap_count > tt_pmap_max) - tt_pmap_max = tt_pmap_count; - simple_unlock(&pmaps_lock); - } -#else + + if (pmap->nested_region_asid_bitmap) + kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int)); + zfree(pmap_zone, pmap); +#else /* __ARM_VMSA__ == 7 */ pt_entry_t *ttep; pmap_paddr_t pa; vm_map_address_t c; - if (pmap == PMAP_NULL) { - return; - } - pmap_unmap_sharedpage(pmap); - if (hw_atomic_sub(&pmap->ref_count, 1) == 0) { + pmap_simple_lock(&pmaps_lock); + while (pmap->gc_status & PMAP_GC_INFLIGHT) { + pmap->gc_status |= PMAP_GC_WAIT; + assert_wait((event_t) & pmap->gc_status, THREAD_UNINT); + pmap_simple_unlock(&pmaps_lock); + (void) thread_block(THREAD_CONTINUE_NULL); + pmap_simple_lock(&pmaps_lock); + } + queue_remove(&map_pmap_list, pmap, pmap_t, pmaps); + pmap_simple_unlock(&pmaps_lock); - simple_lock(&pmaps_lock); - while (pmap->gc_status & PMAP_GC_INFLIGHT) { - pmap->gc_status |= PMAP_GC_WAIT; - assert_wait((event_t) & pmap->gc_status, THREAD_UNINT); - simple_unlock(&pmaps_lock); - (void) thread_block(THREAD_CONTINUE_NULL); - simple_lock(&pmaps_lock); - } - queue_remove(&map_pmap_list, pmap, pmap_t, pmaps); - simple_unlock(&pmaps_lock); + pmap_trim_self(pmap); - /* - * Free the memory maps, then the - * pmap structure. - */ - for (c = pmap->min; c < pmap->max; c += ARM_TT_L2_SIZE) { - ttep = pmap_tt2e(pmap, c); - if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { - PMAP_LOCK(pmap); - pmap_tte_deallocate(pmap, ttep, PMAP_TT_L2_LEVEL); - PMAP_UNLOCK(pmap); - flush_mmu_tlb_entry(tlbi_addr(c) | tlbi_asid(pmap->asid)); - } + /* + * Free the memory maps, then the + * pmap structure. 
+ */ + for (c = pmap->min; c < pmap->max; c += ARM_TT_L2_SIZE) { + ttep = pmap_tt2e(pmap, c); + if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { + PMAP_LOCK(pmap); + pmap_tte_deallocate(pmap, ttep, PMAP_TT_L2_LEVEL); + PMAP_UNLOCK(pmap); } + } #if !__ARM64_TWO_LEVEL_PMAP__ - for (c = pmap->min; c < pmap->max; c += ARM_TT_L1_SIZE) { - ttep = pmap_tt1e(pmap, c); - if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { - PMAP_LOCK(pmap); - pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL); - PMAP_UNLOCK(pmap); - } + for (c = pmap->min; c < pmap->max; c += ARM_TT_L1_SIZE) { + ttep = pmap_tt1e(pmap, c); + if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { + PMAP_LOCK(pmap); + pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL); + PMAP_UNLOCK(pmap); } + } #endif - if (pmap->tte) { - pa = pmap->ttep; - pmap_tt1_deallocate(pmap, (tt_entry_t *)phystokv(pa), PMAP_ROOT_ALLOC_SIZE, 0); - } + if (pmap->tte) { + pa = pmap->ttep; + pmap_tt1_deallocate(pmap, (tt_entry_t *)phystokv(pa), PMAP_ROOT_ALLOC_SIZE, 0); + } - assert((tt_free_entry_t*)pmap->tt_entry_free == NULL); - flush_mmu_tlb_asid((uint64_t)(pmap->asid) << TLBI_ASID_SHIFT); - free_asid(pmap->vasid); + assert((tt_free_entry_t*)pmap->tt_entry_free == NULL); + flush_mmu_tlb_asid((uint64_t)(pmap->asid) << TLBI_ASID_SHIFT); + free_asid(pmap->vasid); - if (pmap->nested_region_asid_bitmap) { - kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int)); - } + if (pmap->nested_region_asid_bitmap) { + kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int)); + } - pmap_check_ledgers(pmap); - ledger_dereference(pmap->ledger); + pmap_check_ledgers(pmap); - zfree(pmap_zone, pmap); - } + zfree(pmap_zone, pmap); -#endif +#endif /* __ARM_VMSA__ == 7 */ } void pmap_destroy( pmap_t pmap) { - PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, - 
VM_KERNEL_ADDRHIDE(pmap)); + ledger_t ledger; + + PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); + + ledger = pmap->ledger; pmap_destroy_internal(pmap); - PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END); + ledger_dereference(ledger); + + PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END); } /* * Add a reference to the specified pmap. */ -static void +MARK_AS_PMAP_TEXT static void pmap_reference_internal( pmap_t pmap) { if (pmap != PMAP_NULL) { - (void) hw_atomic_add(&pmap->ref_count, 1); + VALIDATE_PMAP(pmap); + __c11_atomic_fetch_add(&pmap->ref_count, 1, memory_order_relaxed); } } @@ -3708,12 +3858,12 @@ pmap_tt1_allocate( vm_address_t va_end; kern_return_t ret; - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) { free_page_size_tt_count--; tt1 = (tt_entry_t *)free_page_size_tt_list; free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_tt_ledger_credit(pmap, size); return (tt_entry_t *)tt1; }; @@ -3721,7 +3871,7 @@ pmap_tt1_allocate( free_two_page_size_tt_count--; tt1 = (tt_entry_t *)free_two_page_size_tt_list; free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_tt_ledger_credit(pmap, size); return (tt_entry_t *)tt1; }; @@ -3729,12 +3879,12 @@ pmap_tt1_allocate( free_tt_count--; tt1 = (tt_entry_t *)free_tt_list; free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_tt_ledger_credit(pmap, size); return (tt_entry_t *)tt1; } - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); ret = pmap_pages_alloc(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? 
PMAP_PAGES_ALLOCATE_NOWAIT : 0)); @@ -3743,7 +3893,7 @@ pmap_tt1_allocate( if (size < PAGE_SIZE) { - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + size; va < va_end; va = va+size) { tt1_free = (tt_free_entry_t *)va; @@ -3754,7 +3904,7 @@ pmap_tt1_allocate( if (free_tt_count > free_tt_max) free_tt_max = free_tt_count; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); } /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size. @@ -3777,7 +3927,7 @@ pmap_tt1_deallocate( tt_entry = (tt_free_entry_t *)tt; if (not_in_kdp) - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); if (size < PAGE_SIZE) { free_tt_count++; @@ -3805,7 +3955,7 @@ pmap_tt1_deallocate( if ((option & PMAP_TT_DEALLOCATE_NOBLOCK) || (!not_in_kdp)) { if (not_in_kdp) - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_tt_ledger_debit(pmap, size); return; } @@ -3816,13 +3966,13 @@ pmap_tt1_deallocate( tt = (tt_entry_t *)free_page_size_tt_list; free_page_size_tt_list = ((tt_free_entry_t *)tt)->next; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE); OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); } while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) { @@ -3830,15 +3980,15 @@ pmap_tt1_deallocate( tt = (tt_entry_t *)free_two_page_size_tt_list; free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2*PAGE_SIZE); OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? 
&inuse_kernel_tteroot_count : &inuse_user_tteroot_count)); - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); } - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_tt_ledger_debit(pmap, size); } @@ -3888,7 +4038,7 @@ pmap_tt_allocate( PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE); ptdp = ptd_alloc(pmap); - *(pt_desc_t **)pai_to_pvh(pa_index(pa)) = ptdp; + pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP); __unreachable_ok_push if (TEST_PAGE_RATIO_4) { @@ -3928,17 +4078,16 @@ pmap_tt_deallocate( ptdp = ptep_get_ptd((vm_offset_t)ttp); - if (level < PMAP_TT_MAX_LEVEL) { - - if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT) - ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0; - } + ptdp->pt_map[ARM_PT_DESC_INDEX(ttp)].va = (vm_offset_t)-1; - ptdp->pt_map[ARM_PT_DESC_INDEX(ttp)].va = 0; + if ((level < PMAP_TT_MAX_LEVEL) && (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT)) + ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0; if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt != 0) panic("pmap_tt_deallocate(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt); + ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0; + for (i = 0, pt_acc_cnt = 0 ; i < max_pt_index ; i++) pt_acc_cnt += ptdp->pt_cnt[i].refcnt; @@ -3999,28 +4148,17 @@ pmap_tt_deallocate( } static void -pmap_tte_deallocate( +pmap_tte_remove( pmap_t pmap, tt_entry_t *ttep, unsigned int level) { - pmap_paddr_t pa; - tt_entry_t tte; - - PMAP_ASSERT_LOCKED(pmap); - - tte = *ttep; + tt_entry_t tte = *ttep; if (tte == 0) { panic("pmap_tte_deallocate(): null tt_entry ttep==%p\n", ttep); } -#if MACH_ASSERT - if (tte_get_ptd(tte)->pmap != pmap) { - panic("pmap_tte_deallocate(): ptd=%p ptd->pmap=%p pmap=%p \n", - tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap); - } -#endif if (((level+1) == PMAP_TT_MAX_LEVEL) && (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt != 0)) { panic("pmap_tte_deallocate(): pmap=%p ttep=%p 
ptd=%p refcnt=0x%x \n", pmap, ttep, tte_get_ptd(tte), (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt)); @@ -4033,16 +4171,36 @@ pmap_tte_deallocate( for (i = 0; i<4; i++, ttep_4M++) *ttep_4M = (tt_entry_t) 0; + FLUSH_PTE_RANGE_STRONG(ttep_4M - 4, ttep_4M); } #else *ttep = (tt_entry_t) 0; + FLUSH_PTE_STRONG(ttep); #endif +} -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) ttep, sizeof(tt_entry_t)); -#else - __asm__ volatile("dsb ish"); +static void +pmap_tte_deallocate( + pmap_t pmap, + tt_entry_t *ttep, + unsigned int level) +{ + pmap_paddr_t pa; + tt_entry_t tte; + + PMAP_ASSERT_LOCKED(pmap); + + tte = *ttep; + +#if MACH_ASSERT + if (tte_get_ptd(tte)->pmap != pmap) { + panic("pmap_tte_deallocate(): ptd=%p ptd->pmap=%p pmap=%p \n", + tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap); + } #endif + + pmap_tte_remove(pmap, ttep, level); + if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { #if MACH_ASSERT { @@ -4096,9 +4254,136 @@ pmap_remove_range( PMAP_OPTIONS_REMOVE); } + +#ifdef PVH_FLAG_EXEC + +/* + * Update the access protection bits of the physical aperture mapping for a page. + * This is useful, for example, in guranteeing that a verified executable page + * has no writable mappings anywhere in the system, including the physical + * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB + * synchronization overhead in cases where the call to this function is + * guaranteed to be followed by other TLB operations. 
+ */ +static void +pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused) +{ +#if __ARM_PTE_PHYSMAP__ + ASSERT_PVH_LOCKED(pai); + vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai)); + pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva); + + pt_entry_t tmplate = *pte_p; + if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) + return; + tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap); +#if (__ARM_VMSA__ > 7) + if (tmplate & ARM_PTE_HINT_MASK) { + panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx", + __func__, pte_p, (void *)kva, tmplate); + } +#endif + WRITE_PTE_STRONG(pte_p, tmplate); + flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap); + if (!flush_tlb_async) + sync_tlb_flush(); +#endif +} + +#endif /* defined(PVH_FLAG_EXEC) */ + +static void +pmap_remove_pv( + pmap_t pmap, + pt_entry_t *cpte, + int pai, + int *num_internal, + int *num_alt_internal, + int *num_reusable, + int *num_external) +{ + pv_entry_t **pv_h, **pve_pp; + pv_entry_t *pve_p; + + ASSERT_PVH_LOCKED(pai); + pv_h = pai_to_pvh(pai); + vm_offset_t pvh_flags = pvh_get_flags(pv_h); + + + if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) { + if (__builtin_expect((cpte != pvh_ptep(pv_h)), 0)) + panic("%s: cpte=%p does not match pv_h=%p (%p), pai=0x%x\n", __func__, cpte, pv_h, pvh_ptep(pv_h), pai); + if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) { + assert(IS_INTERNAL_PAGE(pai)); + (*num_internal)++; + (*num_alt_internal)++; + CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL); + } else if (IS_INTERNAL_PAGE(pai)) { + if (IS_REUSABLE_PAGE(pai)) { + (*num_reusable)++; + } else { + (*num_internal)++; + } + } else { + (*num_external)++; + } + pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL); + } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) { + + pve_pp = pv_h; + pve_p = pvh_list(pv_h); + + while (pve_p != PV_ENTRY_NULL && + (pve_get_ptep(pve_p) != cpte)) { + pve_pp = pve_link_field(pve_p); + pve_p = PVE_NEXT_PTR(pve_next(pve_p)); + } + + 
if (__builtin_expect((pve_p == PV_ENTRY_NULL), 0)) + panic("%s: cpte=%p (pai=0x%x) not in pv_h=%p\n", __func__, cpte, pai, pv_h); + #if MACH_ASSERT -int num_reusable_mismatch = 0; -#endif /* MACH_ASSERT */ + if ((pmap != NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) { + pv_entry_t *check_pve_p = PVE_NEXT_PTR(pve_next(pve_p)); + while (check_pve_p != PV_ENTRY_NULL) { + if (pve_get_ptep(check_pve_p) == cpte) { + panic("%s: duplicate pve entry cpte=%p pmap=%p, pv_h=%p, pve_p=%p, pai=0x%x", + __func__, cpte, pmap, pv_h, pve_p, pai); + } + check_pve_p = PVE_NEXT_PTR(pve_next(check_pve_p)); + } + } +#endif + + if (IS_ALTACCT_PAGE(pai, pve_p)) { + assert(IS_INTERNAL_PAGE(pai)); + (*num_internal)++; + (*num_alt_internal)++; + CLR_ALTACCT_PAGE(pai, pve_p); + } else if (IS_INTERNAL_PAGE(pai)) { + if (IS_REUSABLE_PAGE(pai)) { + (*num_reusable)++; + } else { + (*num_internal)++; + } + } else { + (*num_external)++; + } + + pvh_remove(pv_h, pve_pp, pve_p); + pv_free(pve_p); + if (!pvh_test_type(pv_h, PVH_TYPE_NULL)) + pvh_set_flags(pv_h, pvh_flags); + } else { + panic("%s: unexpected PV head %p, cpte=%p pmap=%p pv_h=%p pai=0x%x", + __func__, *pv_h, cpte, pmap, pv_h, pai); + } + +#ifdef PVH_FLAG_EXEC + if ((pvh_flags & PVH_FLAG_EXEC) && pvh_test_type(pv_h, PVH_TYPE_NULL)) + pmap_set_ptov_ap(pai, AP_RWNA, FALSE); +#endif +} static int pmap_remove_range_options( @@ -4132,17 +4417,14 @@ pmap_remove_range_options( for (cpte = bpte; cpte < epte; cpte += PAGE_SIZE/ARM_PGBYTES, va += PAGE_SIZE) { - pv_entry_t **pv_h, **pve_pp; - pv_entry_t *pve_p; pt_entry_t spte; boolean_t managed=FALSE; spte = *cpte; #if CONFIG_PGTRACE - if (pgtrace_enabled) { - pmap_pgtrace_remove_clone(pmap, pte_to_pa(spte), va); - } + if (pgtrace_enabled) + pmap_pgtrace_remove_clone(pmap, pte_to_pa(spte), va); #endif while (!managed) { @@ -4238,77 +4520,8 @@ pmap_remove_range_options( * find and remove the mapping from the chain for this * physical address. 
*/ - ASSERT_PVH_LOCKED(pai); // Should have been locked when we found the managed PTE above - pv_h = pai_to_pvh(pai); - - if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) { - if (__builtin_expect((cpte != pvh_ptep(pv_h)), 0)) - panic("pmap_remove_range(): cpte=%p (0x%llx) does not match pv_h=%p (%p)\n", cpte, (uint64_t)spte, pv_h, pvh_ptep(pv_h)); - if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) { - assert(IS_INTERNAL_PAGE(pai)); - num_internal++; - num_alt_internal++; - CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL); - } else if (IS_INTERNAL_PAGE(pai)) { - if (IS_REUSABLE_PAGE(pai)) { - num_reusable++; - } else { - num_internal++; - } - } else { - num_external++; - } - pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL); - } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) { - - pve_pp = pv_h; - pve_p = pvh_list(pv_h); - - while (pve_p != PV_ENTRY_NULL && - (pve_get_ptep(pve_p) != cpte)) { - pve_pp = pve_link_field(pve_p); - pve_p = PVE_NEXT_PTR(pve_next(pve_p)); - } - - if (__builtin_expect((pve_p == PV_ENTRY_NULL), 0)) { - UNLOCK_PVH(pai); - panic("pmap_remove_range(): cpte=%p (0x%llx) not in pv_h=%p\n", cpte, (uint64_t)spte, pv_h); - } - -#if MACH_ASSERT - if (kern_feature_override(KF_PMAPV_OVRD) == FALSE) { - pv_entry_t *check_pve_p = PVE_NEXT_PTR(pve_next(pve_p)); - while (check_pve_p != PV_ENTRY_NULL) { - if (pve_get_ptep(check_pve_p) == cpte) { - panic("pmap_remove_range(): duplicate pve entry cpte=%p pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, va=0x%llx\n", - cpte, pmap, pv_h, pve_p, (uint64_t)spte, (uint64_t)va); - } - check_pve_p = PVE_NEXT_PTR(pve_next(check_pve_p)); - } - } -#endif - - if (IS_ALTACCT_PAGE(pai, pve_p)) { - assert(IS_INTERNAL_PAGE(pai)); - num_internal++; - num_alt_internal++; - CLR_ALTACCT_PAGE(pai, pve_p); - } else if (IS_INTERNAL_PAGE(pai)) { - if (IS_REUSABLE_PAGE(pai)) { - num_reusable++; - } else { - num_internal++; - } - } else { - num_external++; - } - pvh_remove(pv_h, pve_pp, pve_p) ; - pv_free(pve_p); - } else { - panic("pmap_remove_range(): unexpected PV head 
%p, cpte=%p pmap=%p pv_h=%p pte=0x%llx va=0x%llx\n", - *pv_h, cpte, pmap, pv_h, (uint64_t)spte, (uint64_t)va); - } + pmap_remove_pv(pmap, cpte, pai, &num_internal, &num_alt_internal, &num_reusable, &num_external); UNLOCK_PVH(pai); num_removed++; @@ -4325,11 +4538,10 @@ pmap_remove_range_options( #if MACH_ASSERT if (pmap->stats.internal < num_internal) { if ((! pmap_stats_assert || - ! pmap->pmap_stats_assert) || - (pmap->stats.internal + pmap->stats.reusable) == - (num_internal + num_reusable)) { - num_reusable_mismatch++; - printf("pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d\n", + ! pmap->pmap_stats_assert)) { + printf("%d[%s] pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d\n", + pmap->pmap_pid, + pmap->pmap_procname, pmap, (uint64_t) va, bpte, @@ -4346,11 +4558,10 @@ pmap_remove_range_options( num_pte_changed, pmap->stats.internal, pmap->stats.reusable); - /* slight mismatch: fix it... 
*/ - num_internal = pmap->stats.internal; - num_reusable = pmap->stats.reusable; } else { - panic("pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d", + panic("%d[%s] pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d", + pmap->pmap_pid, + pmap->pmap_procname, pmap, (uint64_t) va, bpte, @@ -4418,7 +4629,7 @@ pmap_remove_range_options( /* flush the ptable entries we have written */ if (num_pte_changed > 0) - FLUSH_PTE_RANGE(bpte, epte); + FLUSH_PTE_RANGE_STRONG(bpte, epte); return num_pte_changed; } @@ -4440,11 +4651,12 @@ pmap_remove( pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE); } -static int -pmap_remove_options_internal(pmap_t pmap, -vm_map_address_t start, -vm_map_address_t end, -int options) +MARK_AS_PMAP_TEXT static int +pmap_remove_options_internal( + pmap_t pmap, + vm_map_address_t start, + vm_map_address_t end, + int options) { int remove_count = 0; pt_entry_t *bpte, *epte; @@ -4452,6 +4664,10 @@ int options) tt_entry_t *tte_p; uint32_t rmv_spte=0; + if (__improbable(end < start)) + panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end); + + VALIDATE_PMAP(pmap); PMAP_LOCK(pmap); tte_p = pmap_tte(pmap, start); @@ -4485,7 +4701,6 @@ int options) done: PMAP_UNLOCK(pmap); - return remove_count; } @@ -4502,7 +4717,7 @@ pmap_remove_options( if (pmap == PMAP_NULL) return; - PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, + PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start), VM_KERNEL_ADDRHIDE(end)); @@ -4537,11 +4752,10 @@ pmap_remove_options( va = l; } - if (remove_count > 0) 
PMAP_UPDATE_TLBS(pmap, start, end); - PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END); + PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END); } @@ -4556,7 +4770,6 @@ pmap_remove_some_phys( /* Implement to support working set code */ } - void pmap_set_pmap( pmap_t pmap, @@ -4585,30 +4798,25 @@ pmap_flush_core_tlb_asid(pmap_t pmap) flush_core_tlb_asid(pmap->asid); #else flush_core_tlb_asid(((uint64_t) pmap->asid) << TLBI_ASID_SHIFT); -#if __ARM_KERNEL_PROTECT__ - flush_core_tlb_asid(((uint64_t) pmap->asid + 1) << TLBI_ASID_SHIFT); -#endif /* __ARM_KERNEL_PROTECT__ */ #endif } -static void +MARK_AS_PMAP_TEXT static void pmap_switch_internal( pmap_t pmap) { + VALIDATE_PMAP(pmap); pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data(); - uint32_t last_asid_high_bits, asid_high_bits; - pmap_t cur_pmap; - pmap_t cur_user_pmap; - boolean_t do_asid_flush = FALSE; + uint32_t last_asid_high_bits, asid_high_bits; + boolean_t do_asid_flush = FALSE; #if (__ARM_VMSA__ == 7) if (not_in_kdp) - simple_lock(&pmap->tt1_lock); + pmap_simple_lock(&pmap->tt1_lock); +#else + pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap; #endif - cur_pmap = current_pmap(); - cur_user_pmap = cpu_data_ptr->cpu_user_pmap; - /* Paranoia. 
*/ assert(pmap->asid < (sizeof(cpu_data_ptr->cpu_asid_high_bits) / sizeof(*cpu_data_ptr->cpu_asid_high_bits))); @@ -4627,50 +4835,23 @@ pmap_switch_internal( do_asid_flush = TRUE; } - if ((cur_user_pmap == cur_pmap) && (cur_pmap == pmap)) { - if (cpu_data_ptr->cpu_user_pmap_stamp == pmap->stamp) { - pmap_switch_user_ttb_internal(pmap); - -#if (__ARM_VMSA__ == 7) - if (not_in_kdp) - simple_unlock(&pmap->tt1_lock); -#endif - - if (do_asid_flush) { - pmap_flush_core_tlb_asid(pmap); - } - - return; - } else - cur_user_pmap = NULL; - } else if ((cur_user_pmap == pmap) && (cpu_data_ptr->cpu_user_pmap_stamp != pmap->stamp)) - cur_user_pmap = NULL; - pmap_switch_user_ttb_internal(pmap); - if (do_asid_flush) { +#if (__ARM_VMSA__ > 7) + /* If we're switching to a different nested pmap (i.e. shared region), we'll need + * to flush the userspace mappings for that region. Those mappings are global + * and will not be protected by the ASID. It should also be cheaper to flush the + * entire local TLB rather than to do a broadcast MMU flush by VA region. */ + if ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap)) + flush_core_tlb(); + else +#endif + if (do_asid_flush) pmap_flush_core_tlb_asid(pmap); - } #if (__ARM_VMSA__ == 7) if (not_in_kdp) - simple_unlock(&pmap->tt1_lock); -#else - if (pmap != kernel_pmap) { - - if (cur_user_pmap != PMAP_NULL) { - /* - * We have a low-address global mapping for the commpage - * for 32-bit processes; flush it if we switch to a 64-bot - * process. 
- */ - if (pmap_is_64bit(pmap) && !pmap_is_64bit(cur_user_pmap)) { - pmap_sharedpage_flush_32_to_64(); - } - - } else - flush_core_tlb(); - } + pmap_simple_unlock(&pmap->tt1_lock); #endif } @@ -4678,7 +4859,9 @@ void pmap_switch( pmap_t pmap) { + PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); pmap_switch_internal(pmap); + PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END); } void @@ -4696,7 +4879,7 @@ pmap_page_protect( * Lower the permission for all mappings to a given * page. */ -static void +MARK_AS_PMAP_TEXT static void pmap_page_protect_options_internal( ppnum_t ppnum, vm_prot_t prot, @@ -4704,13 +4887,18 @@ pmap_page_protect_options_internal( { pmap_paddr_t phys = ptoa(ppnum); pv_entry_t **pv_h; + pv_entry_t **pve_pp; pv_entry_t *pve_p; pv_entry_t *pveh_p; pv_entry_t *pvet_p; pt_entry_t *pte_p; + pv_entry_t *new_pve_p; + pt_entry_t *new_pte_p; + vm_offset_t pvh_flags; int pai; boolean_t remove; boolean_t set_NX; + boolean_t tlb_flush_needed = FALSE; unsigned int pvh_cnt = 0; assert(ppnum != vm_page_fictitious_addr); @@ -4738,11 +4926,16 @@ pmap_page_protect_options_internal( pai = (int)pa_index(phys); LOCK_PVH(pai); pv_h = pai_to_pvh(pai); + pvh_flags = pvh_get_flags(pv_h); + pte_p = PT_ENTRY_NULL; pve_p = PV_ENTRY_NULL; + pve_pp = pv_h; pveh_p = PV_ENTRY_NULL; pvet_p = PV_ENTRY_NULL; + new_pve_p = PV_ENTRY_NULL; + new_pte_p = PT_ENTRY_NULL; if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) { pte_p = pvh_ptep(pv_h); } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) { @@ -4759,6 +4952,29 @@ pmap_page_protect_options_internal( if (pve_p != PV_ENTRY_NULL) pte_p = pve_get_ptep(pve_p); +#ifdef PVH_FLAG_IOMMU + if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) { + if (remove) { + if (options & PMAP_OPTIONS_COMPRESSOR) { + panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu 0x%llx, pve_p=%p", + ppnum, (uint64_t)pte_p & ~PVH_FLAG_IOMMU, pve_p); + } + if (pve_p != PV_ENTRY_NULL) { + pv_entry_t *temp_pve_p = 
PVE_NEXT_PTR(pve_next(pve_p)); + pvh_remove(pv_h, pve_pp, pve_p); + pveh_p = pvh_list(pv_h); + pve_next(pve_p) = new_pve_p; + new_pve_p = pve_p; + pve_p = temp_pve_p; + continue; + } else { + new_pte_p = pte_p; + break; + } + } + goto protect_skip_pve; + } +#endif pmap = ptep_get_pmap(pte_p); va = ptep_get_va(pte_p); @@ -4833,7 +5049,7 @@ pmap_page_protect_options_internal( } if (*pte_p != tmplate) { - WRITE_PTE(pte_p, tmplate); + WRITE_PTE_STRONG(pte_p, tmplate); update = TRUE; } pvh_cnt++; @@ -4976,34 +5192,54 @@ pmap_page_protect_options_internal( if (*pte_p != ARM_PTE_TYPE_FAULT && !ARM_PTE_IS_COMPRESSED(*pte_p) && *pte_p != tmplate) { - WRITE_PTE(pte_p, tmplate); + WRITE_PTE_STRONG(pte_p, tmplate); update = TRUE; } } /* Invalidate TLBs for all CPUs using it */ - if (update) - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); + if (update) { + tlb_flush_needed = TRUE; + flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + } +#ifdef PVH_FLAG_IOMMU + protect_skip_pve: +#endif pte_p = PT_ENTRY_NULL; pvet_p = pve_p; if (pve_p != PV_ENTRY_NULL) { - pvet_p = pve_p; if (remove) { assert(pve_next(pve_p) == PVE_NEXT_PTR(pve_next(pve_p))); } + pve_pp = pve_link_field(pve_p); pve_p = PVE_NEXT_PTR(pve_next(pve_p)); } } +#ifdef PVH_FLAG_EXEC + if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) + pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed); +#endif + if (tlb_flush_needed) + sync_tlb_flush(); + /* if we removed a bunch of entries, take care of them now */ if (remove) { - pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL); + if (new_pve_p != PV_ENTRY_NULL) { + pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP); + pvh_set_flags(pv_h, pvh_flags); + } else if (new_pte_p != PT_ENTRY_NULL) { + pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP); + pvh_set_flags(pv_h, pvh_flags); + } else { + pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL); + } } UNLOCK_PVH(pai); - if (remove && (pveh_p != PV_ENTRY_NULL)) { + if (remove && (pvet_p != PV_ENTRY_NULL)) { pv_list_free(pveh_p, pvet_p, 
pvh_cnt); } } @@ -5030,11 +5266,11 @@ pmap_page_protect_options( return; /* nothing to do */ } - PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot); + PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot); pmap_page_protect_options_internal(ppnum, prot, options); - PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END); + PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END); } /* @@ -5062,7 +5298,7 @@ pmap_protect( pmap_protect_options(pmap, b, e, prot, 0, NULL); } -static void +MARK_AS_PMAP_TEXT static void pmap_protect_options_internal(pmap_t pmap, vm_map_address_t start, vm_map_address_t end, @@ -5083,6 +5319,9 @@ pmap_protect_options_internal(pmap_t pmap, boolean_t InvalidatePoU_Icache_Done = FALSE; #endif + if (__improbable(end < start)) + panic("%s called with bogus range: %p, %p", __func__, (void*)start, (void*)end); + #if DEVELOPMENT || DEBUG if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) { if ((prot & VM_PROT_ALL) == VM_PROT_NONE) { @@ -5127,6 +5366,7 @@ pmap_protect_options_internal(pmap_t pmap, set_NX = TRUE; } + VALIDATE_PMAP(pmap); PMAP_LOCK(pmap); tte_p = pmap_tte(pmap, start); @@ -5307,7 +5547,7 @@ pmap_protect_options_internal(pmap_t pmap, } } - FLUSH_PTE_RANGE(bpte_p, epte_p); + FLUSH_PTE_RANGE_STRONG(bpte_p, epte_p); PMAP_UPDATE_TLBS(pmap, start, end); } @@ -5354,7 +5594,7 @@ pmap_protect_options( } } - PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START, + PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b), VM_KERNEL_ADDRHIDE(e)); @@ -5371,7 +5611,7 @@ pmap_protect_options( beg = l; } - PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END); + PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END); } /* Map a (possibly) autogenned block */ @@ -5457,12 +5697,14 @@ static inline void pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte } if (*pte_p != ARM_PTE_TYPE_FAULT && !ARM_PTE_IS_COMPRESSED(*pte_p)) { - WRITE_PTE(pte_p, pte); + 
WRITE_PTE_STRONG(pte_p, pte); PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE); } else { WRITE_PTE(pte_p, pte); - __asm__ volatile("isb"); + __builtin_arm_isb(ISB_SY); } + + PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + PAGE_SIZE), pte); } static pt_entry_t @@ -5472,6 +5714,7 @@ wimg_to_pte(unsigned int wimg) switch (wimg & (VM_WIMG_MASK)) { case VM_WIMG_IO: + case VM_WIMG_RT: pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE); pte |= ARM_PTE_NX | ARM_PTE_PNX; break; @@ -5519,7 +5762,145 @@ wimg_to_pte(unsigned int wimg) return pte; } -static kern_return_t +static boolean_t +pmap_enter_pv( + pmap_t pmap, + pt_entry_t *pte_p, + int pai, + unsigned int options, + pv_entry_t **pve_p, + boolean_t *is_altacct) +{ + pv_entry_t **pv_h; + pv_h = pai_to_pvh(pai); + boolean_t first_cpu_mapping; + + ASSERT_PVH_LOCKED(pai); + + vm_offset_t pvh_flags = pvh_get_flags(pv_h); + + +#ifdef PVH_FLAG_CPU + /* An IOMMU mapping may already be present for a page that hasn't yet + * had a CPU mapping established, so we use PVH_FLAG_CPU to determine + * if this is the first CPU mapping. We base internal/reusable + * accounting on the options specified for the first CPU mapping. + * PVH_FLAG_CPU, and thus this accounting, will then persist as long + * as there are *any* mappings of the page. The accounting for a + * page should not need to change until the page is recycled by the + * VM layer, and we assert that there are no mappings when a page + * is recycled. 
An IOMMU mapping of a freed/recycled page is + * considered a security violation & potential DMA corruption path.*/ + first_cpu_mapping = ((pmap != NULL) && !(pvh_flags & PVH_FLAG_CPU)); + if (first_cpu_mapping) + pvh_flags |= PVH_FLAG_CPU; +#else + first_cpu_mapping = pvh_test_type(pv_h, PVH_TYPE_NULL); +#endif + + if (first_cpu_mapping) { + if (options & PMAP_OPTIONS_INTERNAL) { + SET_INTERNAL_PAGE(pai); + } else { + CLR_INTERNAL_PAGE(pai); + } + if ((options & PMAP_OPTIONS_INTERNAL) && + (options & PMAP_OPTIONS_REUSABLE)) { + SET_REUSABLE_PAGE(pai); + } else { + CLR_REUSABLE_PAGE(pai); + } + } + if (pvh_test_type(pv_h, PVH_TYPE_NULL)) { + pvh_update_head(pv_h, pte_p, PVH_TYPE_PTEP); + if (pmap != NULL && pmap != kernel_pmap && + ((options & PMAP_OPTIONS_ALT_ACCT) || + PMAP_FOOTPRINT_SUSPENDED(pmap)) && + IS_INTERNAL_PAGE(pai)) { + /* + * Make a note to ourselves that this mapping is using alternative + * accounting. We'll need this in order to know which ledger to + * debit when the mapping is removed. + * + * The altacct bit must be set while the pv head is locked. Defer + * the ledger accounting until after we've dropped the lock. 
+ */ + SET_ALTACCT_PAGE(pai, PV_ENTRY_NULL); + *is_altacct = TRUE; + } else { + CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL); + } + } else { + if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) { + pt_entry_t *pte1_p; + + /* + * convert pvh list from PVH_TYPE_PTEP to PVH_TYPE_PVEP + */ + pte1_p = pvh_ptep(pv_h); + pvh_set_flags(pv_h, pvh_flags); + if((*pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, pve_p))) + return FALSE; + + pve_set_ptep(*pve_p, pte1_p); + (*pve_p)->pve_next = PV_ENTRY_NULL; + + if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) { + /* + * transfer "altacct" from + * pp_attr to this pve + */ + CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL); + SET_ALTACCT_PAGE(pai, *pve_p); + } + pvh_update_head(pv_h, *pve_p, PVH_TYPE_PVEP); + *pve_p = PV_ENTRY_NULL; + } else if (!pvh_test_type(pv_h, PVH_TYPE_PVEP)) { + panic("%s: unexpected PV head %p, pte_p=%p pmap=%p pv_h=%p", + __func__, *pv_h, pte_p, pmap, pv_h); + } + /* + * Set up pv_entry for this new mapping and then + * add it to the list for this physical page. + */ + pvh_set_flags(pv_h, pvh_flags); + if((*pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, pve_p))) + return FALSE; + + pve_set_ptep(*pve_p, pte_p); + (*pve_p)->pve_next = PV_ENTRY_NULL; + + pvh_add(pv_h, *pve_p); + + if (pmap != NULL && pmap != kernel_pmap && + ((options & PMAP_OPTIONS_ALT_ACCT) || + PMAP_FOOTPRINT_SUSPENDED(pmap)) && + IS_INTERNAL_PAGE(pai)) { + /* + * Make a note to ourselves that this + * mapping is using alternative + * accounting. We'll need this in order + * to know which ledger to debit when + * the mapping is removed. + * + * The altacct bit must be set while + * the pv head is locked. Defer the + * ledger accounting until after we've + * dropped the lock. 
+ */ + SET_ALTACCT_PAGE(pai, *pve_p); + *is_altacct = TRUE; + } + + *pve_p = PV_ENTRY_NULL; + } + + pvh_set_flags(pv_h, pvh_flags); + + return TRUE; +} + +MARK_AS_PMAP_TEXT static kern_return_t pmap_enter_options_internal( pmap_t pmap, vm_map_address_t v, @@ -5541,6 +5922,9 @@ pmap_enter_options_internal( boolean_t wiredcnt_updated; unsigned int wimg_bits; boolean_t was_compressed, was_alt_compressed; + kern_return_t kr = KERN_SUCCESS; + + VALIDATE_PMAP(pmap); if ((v) & PAGE_MASK) { panic("pmap_enter_options() pmap %p v 0x%llx\n", @@ -5585,11 +5969,10 @@ pmap_enter_options_internal( /* Must unlock to expand the pmap. */ PMAP_UNLOCK(pmap); - kern_return_t kr=pmap_expand(pmap, v, options, PMAP_TT_MAX_LEVEL); + kr = pmap_expand(pmap, v, options, PMAP_TT_MAX_LEVEL); - if(kr) { + if (kr != KERN_SUCCESS) return kr; - } PMAP_LOCK(pmap); } @@ -5618,10 +6001,7 @@ pmap_enter_options_internal( was_compressed = TRUE; if (spte & ARM_PTE_COMPRESSED_ALT) { was_alt_compressed = TRUE; - pmap_ledger_debit( - pmap, - task_ledgers.alternate_accounting_compressed, - PAGE_SIZE); + pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE); } else { /* was part of the footprint */ pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE); @@ -5668,13 +6048,6 @@ pmap_enter_options_internal( } #endif - if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) - wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT)); - else - wimg_bits = pmap_cache_attributes(pn); - - pte |= wimg_to_pte(wimg_bits); - if (pmap == kernel_pmap) { #if __ARM_KERNEL_PROTECT__ pte |= ARM_PTE_NG; @@ -5783,40 +6156,42 @@ pmap_enter_options_internal( } if (pa_valid(pa)) { - pv_entry_t **pv_h; - int pai; - boolean_t is_altacct, is_internal; + int pai; + boolean_t is_altacct, is_internal; is_internal = FALSE; is_altacct = FALSE; pai = (int)pa_index(pa); - pv_h = pai_to_pvh(pai); LOCK_PVH(pai); + Pmap_enter_loop: + if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) + wimg_bits = (flags & 
(VM_WIMG_MASK | VM_WIMG_USE_DEFAULT)); + else + wimg_bits = pmap_cache_attributes(pn); + + /* We may be retrying this operation after dropping the PVH lock. + * Cache attributes for the physical page may have changed while the lock + * was dropped, so clear any cache attributes we may have previously set + * in the PTE template. */ + pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK); + pte |= wimg_to_pte(wimg_bits); + + if (pte == *pte_p) { /* * This pmap_enter operation has been completed by another thread * undo refcnt on pt and return */ - if (refcnt != NULL) { - assert(refcnt_updated); - if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) - panic("pmap_enter(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p); - } UNLOCK_PVH(pai); - goto Pmap_enter_return; + goto Pmap_enter_cleanup; } else if (pte_to_pa(*pte_p) == pa) { - if (refcnt != NULL) { - assert(refcnt_updated); - if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) - panic("pmap_enter(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p); - } pmap_enter_pte(pmap, pte_p, pte, v); UNLOCK_PVH(pai); - goto Pmap_enter_return; + goto Pmap_enter_cleanup; } else if (*pte_p != ARM_PTE_TYPE_FAULT) { /* * pte has been modified by another thread @@ -5825,96 +6200,8 @@ pmap_enter_options_internal( UNLOCK_PVH(pai); goto Pmap_enter_retry; } - if (pvh_test_type(pv_h, PVH_TYPE_NULL)) { - pvh_update_head(pv_h, pte_p, PVH_TYPE_PTEP); - /* 1st mapping: see what kind of page it is */ - if (options & PMAP_OPTIONS_INTERNAL) { - SET_INTERNAL_PAGE(pai); - } else { - CLR_INTERNAL_PAGE(pai); - } - if ((options & PMAP_OPTIONS_INTERNAL) && - (options & PMAP_OPTIONS_REUSABLE)) { - SET_REUSABLE_PAGE(pai); - } else { - CLR_REUSABLE_PAGE(pai); - } - if (pmap != kernel_pmap && - ((options & PMAP_OPTIONS_ALT_ACCT) || - PMAP_FOOTPRINT_SUSPENDED(pmap)) && - IS_INTERNAL_PAGE(pai)) { - /* - * Make a note to ourselves that this mapping is using alternative - * accounting. 
We'll need this in order to know which ledger to - * debit when the mapping is removed. - * - * The altacct bit must be set while the pv head is locked. Defer - * the ledger accounting until after we've dropped the lock. - */ - SET_ALTACCT_PAGE(pai, PV_ENTRY_NULL); - is_altacct = TRUE; - } else { - CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL); - } - } else { - if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) { - pt_entry_t *pte1_p; - - /* - * convert pvh list from PVH_TYPE_PTEP to PVH_TYPE_PVEP - */ - pte1_p = pvh_ptep(pv_h); - if((pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, &pve_p))) { - goto Pmap_enter_loop; - } - pve_set_ptep(pve_p, pte1_p); - pve_p->pve_next = PV_ENTRY_NULL; - - if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) { - /* - * transfer "altacct" from - * pp_attr to this pve - */ - CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL); - SET_ALTACCT_PAGE(pai, pve_p); - } - pvh_update_head(pv_h, pve_p, PVH_TYPE_PVEP); - pve_p = PV_ENTRY_NULL; - } - /* - * Set up pv_entry for this new mapping and then - * add it to the list for this physical page. - */ - if((pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, &pve_p))) { - goto Pmap_enter_loop; - } - pve_set_ptep(pve_p, pte_p); - pve_p->pve_next = PV_ENTRY_NULL; - - pvh_add(pv_h, pve_p); - - if (pmap != kernel_pmap && - ((options & PMAP_OPTIONS_ALT_ACCT) || - PMAP_FOOTPRINT_SUSPENDED(pmap)) && - IS_INTERNAL_PAGE(pai)) { - /* - * Make a note to ourselves that this - * mapping is using alternative - * accounting. We'll need this in order - * to know which ledger to debit when - * the mapping is removed. - * - * The altacct bit must be set while - * the pv head is locked. Defer the - * ledger accounting until after we've - * dropped the lock. 
- */ - SET_ALTACCT_PAGE(pai, pve_p); - is_altacct = TRUE; - } - - pve_p = PV_ENTRY_NULL; - } + if (!pmap_enter_pv(pmap, pte_p, pai, options, &pve_p, &is_altacct)) + goto Pmap_enter_loop; pmap_enter_pte(pmap, pte_p, pte, v); @@ -5977,9 +6264,31 @@ pmap_enter_options_internal( if (pmap->stats.resident_count > pmap->stats.resident_max) pmap->stats.resident_max = pmap->stats.resident_count; } else { + + if (prot & VM_PROT_EXECUTE) { + kr = KERN_FAILURE; + goto Pmap_enter_cleanup; + } + + wimg_bits = pmap_cache_attributes(pn); + if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) + wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT)); + + pte |= wimg_to_pte(wimg_bits); + pmap_enter_pte(pmap, pte_p, pte, v); } + goto Pmap_enter_return; + +Pmap_enter_cleanup: + + if (refcnt != NULL) { + assert(refcnt_updated); + if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) + panic("pmap_enter(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p); + } + Pmap_enter_return: #if CONFIG_PGTRACE @@ -5999,7 +6308,7 @@ pmap_enter_options_internal( PMAP_UNLOCK(pmap); - return KERN_SUCCESS; + return kr; } kern_return_t @@ -6016,12 +6325,13 @@ pmap_enter_options( { kern_return_t kr = KERN_FAILURE; - PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, + PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pn, prot); kr = pmap_enter_options_internal(pmap, v, pn, prot, fault_type, flags, wired, options); + pv_water_mark_check(); - PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr); + PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr); return kr; } @@ -6033,7 +6343,7 @@ pmap_enter_options( * In/out conditions: * The mapping must already exist in the pmap. 
*/ -static void +MARK_AS_PMAP_TEXT static void pmap_change_wiring_internal( pmap_t pmap, vm_map_address_t v, @@ -6049,6 +6359,7 @@ pmap_change_wiring_internal( if (pmap == kernel_pmap) { return; } + VALIDATE_USER_PMAP(pmap); PMAP_LOCK(pmap); pte_p = pmap_pte(pmap, v); @@ -6083,13 +6394,15 @@ pmap_change_wiring( pmap_change_wiring_internal(pmap, v, wired); } -static ppnum_t +MARK_AS_PMAP_TEXT static ppnum_t pmap_find_phys_internal( pmap_t pmap, addr64_t va) { ppnum_t ppn=0; + VALIDATE_PMAP(pmap); + if (pmap != kernel_pmap) { PMAP_LOCK(pmap); } @@ -6209,7 +6522,7 @@ pmap_vtophys( return ppn; } -static vm_offset_t +MARK_AS_PMAP_TEXT static vm_offset_t pmap_extract_internal( pmap_t pmap, vm_map_address_t va) @@ -6221,6 +6534,8 @@ pmap_extract_internal( return 0; } + VALIDATE_PMAP(pmap); + PMAP_LOCK(pmap); ppn = pmap_vtophys(pmap, va); @@ -6268,11 +6583,12 @@ pmap_init_pte_page( unsigned int ttlevel, boolean_t alloc_ptd) { - pt_desc_t *ptdp; + pt_desc_t *ptdp = NULL; + vm_offset_t *pvh; - ptdp = *(pt_desc_t **)pai_to_pvh(pa_index((((vm_offset_t)pte_p) - gVirtBase + gPhysBase))); + pvh = (vm_offset_t *)(pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)))); - if (ptdp == NULL) { + if (pvh_test_type(pvh, PVH_TYPE_NULL)) { if (alloc_ptd) { /* * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages @@ -6280,94 +6596,23 @@ pmap_init_pte_page( * bootstrap request, so we check for an existing PTD here. 
*/ ptdp = ptd_alloc(pmap); - *(pt_desc_t **)pai_to_pvh(pa_index((((vm_offset_t)pte_p) - gVirtBase + gPhysBase))) = ptdp; + pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP); } else { - panic("pmap_init_pte_page(): pte_p %p\n", pte_p); + panic("pmap_init_pte_page(): pte_p %p", pte_p); } + } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) { + ptdp = (pt_desc_t*)(pvh_list(pvh)); + } else { + panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p); } - pmap_init_pte_page_internal(pmap, pte_p, va, ttlevel, &ptdp); -} - -/* - * pmap_init_pte_page_internal - Initialize page table page and page table descriptor - */ -void -pmap_init_pte_page_internal( - pmap_t pmap, - pt_entry_t *pte_p, - vm_offset_t va, - unsigned int ttlevel, - pt_desc_t **ptdp) -{ bzero(pte_p, ARM_PGBYTES); // below barrier ensures the page zeroing is visible to PTW before // it is linked to the PTE of previous level - __asm__ volatile("DMB ST" : : : "memory"); - ptd_init(*ptdp, pmap, va, ttlevel, pte_p); -} - -/* - * pmap_init_pte_static_page - for static mappings to a known contiguous range of pa's - * Called from arm_vm_init(). - */ -void -pmap_init_pte_static_page( - __unused pmap_t pmap, - pt_entry_t * pte_p, - pmap_paddr_t pa) -{ -#if (__ARM_VMSA__ == 7) - unsigned int i; - pt_entry_t *pte_cur; - - for (i = 0, pte_cur = pte_p; - i < (ARM_PGBYTES / sizeof(*pte_p)); - i++, pa += PAGE_SIZE) { - if (pa >= avail_end) { - /* We don't want to map memory xnu does not own through this routine. 
*/ - break; - } - - *pte_cur = pa_to_pte(pa) - | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_SH | ARM_PTE_AP(AP_RONA) - | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT); - pte_cur++; - } -#else - unsigned int i; - pt_entry_t *pte_cur; - pt_entry_t template; - - template = ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_AP(AP_RONA) | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | ARM_PTE_NX; - - for (i = 0, pte_cur = pte_p; - i < (ARM_PGBYTES / sizeof(*pte_p)); - i++, pa += PAGE_SIZE) { - if (pa >= avail_end) { - /* We don't want to map memory xnu does not own through this routine. */ - break; - } - - /* TEST_PAGE_RATIO_4 may be pre-processor defined to 0 */ - __unreachable_ok_push - if (TEST_PAGE_RATIO_4) { - *pte_cur = pa_to_pte(pa) | template; - *(pte_cur+1) = pa_to_pte(pa+0x1000) | template; - *(pte_cur+2) = pa_to_pte(pa+0x2000) | template; - *(pte_cur+3) = pa_to_pte(pa+0x3000) | template; - pte_cur += 4; - } else { - *pte_cur = pa_to_pte(pa) | template; - pte_cur++; - } - __unreachable_ok_pop - } -#endif - bzero(pte_cur, ARM_PGBYTES - ((vm_offset_t)pte_cur - (vm_offset_t)pte_p)); + __builtin_arm_dmb(DMB_ISHST); + ptd_init(ptdp, pmap, va, ttlevel, pte_p); } - /* * Routine: pmap_expand * @@ -6405,7 +6650,7 @@ pmap_expand( break; } - simple_lock(&pmap->tt1_lock); + pmap_simple_lock(&pmap->tt1_lock); for (i = 0; i < pmap->tte_index_max; i++) tte_p[i] = pmap->tte[i]; for (i = NTTES; i < 2*NTTES; i++) @@ -6414,23 +6659,18 @@ pmap_expand( pmap->prev_tte = pmap->tte; pmap->tte = tte_p; pmap->ttep = ml_static_vtop((vm_offset_t)pmap->tte); -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) pmap->tte, 2*NTTES * sizeof(tt_entry_t)); -#else - __builtin_arm_dsb(DSB_ISH); -#endif + + FLUSH_PTE_RANGE(pmap->tte, pmap->tte + (2*NTTES)); + pmap->tte_index_max = 2*NTTES; pmap->stamp = hw_atomic_add(&pmap_stamp, 1); for (i = 0; i < NTTES; i++) pmap->prev_tte[i] = ARM_TTE_TYPE_FAULT; -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) pmap->prev_tte, NTTES * 
sizeof(tt_entry_t)); -#else - __builtin_arm_dsb(DSB_ISH); -#endif - simple_unlock(&pmap->tt1_lock); + FLUSH_PTE_RANGE(pmap->prev_tte, pmap->prev_tte + NTTES); + + pmap_simple_unlock(&pmap->tt1_lock); PMAP_UNLOCK(pmap); pmap_set_pmap(pmap, current_thread()); @@ -6460,9 +6700,9 @@ pmap_expand( if (pa) { tte_p = &pmap->tte[ttenum(v)]; *tte_p = pa_to_tte(pa) | (((v >> ARM_TT_L1_SHIFT) & 0x3) << 10) | ARM_TTE_TYPE_TABLE; -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) tte_p, sizeof(tt_entry_t)); -#endif + FLUSH_PTE(tte_p); + PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK), + VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } @@ -6497,12 +6737,13 @@ pmap_expand( tte_p = &pmap->tte[ttenum(v)]; for (i = 0, tte_next_p = tte_p; i<4; i++) { *tte_next_p = pa_to_tte(pa) | ARM_TTE_TYPE_TABLE; + PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + (i * ARM_TT_L1_SIZE)), + VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + ((i + 1) * ARM_TT_L1_SIZE)), *tte_p); tte_next_p++; pa = pa +0x400; } -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) tte_p, 4*sizeof(tt_entry_t)); -#endif + FLUSH_PTE_RANGE(tte_p, tte_p + 4); + pa = 0x0ULL; tt_p = (tt_entry_t *)NULL; } @@ -6547,6 +6788,8 @@ pmap_expand( pa = kvtophys((vm_offset_t)tt_p); tte_p = pmap_tt1e( pmap, v); *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; + PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK), + VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p); pa = 0x0ULL; tt_p = (tt_entry_t *)NULL; if ((pmap == kernel_pmap) && (VM_MIN_KERNEL_ADDRESS < 0x00000000FFFFFFFFULL)) @@ -6569,6 +6812,8 @@ pmap_expand( pa = kvtophys((vm_offset_t)tt_p); tte_p = pmap_tt2e( pmap, v); *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | 
ARM_TTE_VALID; + PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L2_OFFMASK), + VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L2_OFFMASK) + ARM_TT_L2_SIZE), *tte_p); pa = 0x0ULL; tt_p = (tt_entry_t *)NULL; } @@ -6632,38 +6877,39 @@ pmap_gc( pmap_gc_forced)) { pmap_gc_forced = FALSE; pmap_gc_allowed_by_time_throttle = FALSE; - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&map_pmap_list)); while (!queue_end(&map_pmap_list, (queue_entry_t)pmap)) { if (!(pmap->gc_status & PMAP_GC_INFLIGHT)) pmap->gc_status |= PMAP_GC_INFLIGHT; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); pmap_collect(pmap); - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); gc_wait = (pmap->gc_status & PMAP_GC_WAIT); pmap->gc_status &= ~(PMAP_GC_INFLIGHT|PMAP_GC_WAIT); pmap_next = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&pmap->pmaps)); if (gc_wait) { if (!queue_end(&map_pmap_list, (queue_entry_t)pmap_next)) pmap_next->gc_status |= PMAP_GC_INFLIGHT; - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); thread_wakeup((event_t) & pmap->gc_status); - simple_lock(&pmaps_lock); + pmap_simple_lock(&pmaps_lock); } pmap = pmap_next; } - simple_unlock(&pmaps_lock); + pmap_simple_unlock(&pmaps_lock); } } /* * Called by the VM to reclaim pages that we can reclaim quickly and cheaply. */ -void +uint64_t pmap_release_pages_fast(void) { + return 0; } /* @@ -6776,15 +7022,15 @@ mapping_set_ref( } /* - * Clear specified attribute bits. + * Clear specified attribute bits. * - * Try to force an arm_fast_fault() for all mappings of - * the page - to force attributes to be set again at fault time. - * If the forcing succeeds, clear the cached bits at the head. - * Otherwise, something must have been wired, so leave the cached - * attributes alone. + * Try to force an arm_fast_fault() for all mappings of + * the page - to force attributes to be set again at fault time. 
+ * If the forcing succeeds, clear the cached bits at the head. + * Otherwise, something must have been wired, so leave the cached + * attributes alone. */ -static void +MARK_AS_PMAP_TEXT static void phys_attribute_clear_internal( ppnum_t pn, unsigned int bits, @@ -6804,6 +7050,21 @@ phys_attribute_clear_internal( } assert(pn != vm_page_fictitious_addr); + + if (options & PMAP_OPTIONS_CLEAR_WRITE) { + assert(bits == PP_ATTR_MODIFIED); + + pmap_page_protect_options_internal(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), 0); + /* + * We short circuit this case; it should not need to + * invoke arm_force_fast_fault, so just clear the modified bit. + * pmap_page_protect has taken care of resetting + * the state so that we'll see the next write as a fault to + * the VM (i.e. we don't want a fast fault). + */ + pa_clear_bits(pa, bits); + return; + } if (bits & PP_ATTR_REFERENCED) allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE); if (bits & PP_ATTR_MODIFIED) @@ -6835,11 +7096,11 @@ phys_attribute_clear( * Do we really want this tracepoint? It will be extremely chatty. * Also, should we have a corresponding trace point for the set path? */ - PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits); + PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits); phys_attribute_clear_internal(pn, bits, options, arg); - PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END); + PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END); } /* @@ -6849,7 +7110,7 @@ phys_attribute_clear( * no per-mapping hardware support for referenced and * modify bits. 
*/ -static void +MARK_AS_PMAP_TEXT static void phys_attribute_set_internal( ppnum_t pn, unsigned int bits) @@ -7122,7 +7383,7 @@ pmap_lock_phys_page(ppnum_t pn) pai = (int)pa_index(phys); LOCK_PVH(pai); } else - simple_lock(&phys_backup_lock); + simple_lock(&phys_backup_lock); } @@ -7136,18 +7397,19 @@ pmap_unlock_phys_page(ppnum_t pn) pai = (int)pa_index(phys); UNLOCK_PVH(pai); } else - simple_unlock(&phys_backup_lock); + simple_unlock(&phys_backup_lock); } -static void +MARK_AS_PMAP_TEXT static void pmap_switch_user_ttb_internal( pmap_t pmap) { -#if (__ARM_VMSA__ == 7) + VALIDATE_PMAP(pmap); pmap_cpu_data_t *cpu_data_ptr; - cpu_data_ptr = pmap_get_cpu_data(); +#if (__ARM_VMSA__ == 7) + if ((cpu_data_ptr->cpu_user_pmap != PMAP_NULL) && (cpu_data_ptr->cpu_user_pmap != kernel_pmap)) { unsigned int c; @@ -7180,7 +7442,7 @@ pmap_switch_user_ttb_internal( if (pmap->tte_index_max == NTTES) { /* Setting TTBCR.N for TTBR0 TTBR1 boundary at 0x40000000 */ __asm__ volatile("mcr p15,0,%0,c2,c0,2" : : "r"(2)); - __asm__ volatile("isb"); + __builtin_arm_isb(ISB_SY); #if !__ARM_USER_PROTECT__ set_mmu_ttb(pmap->ttep); #endif @@ -7190,7 +7452,7 @@ pmap_switch_user_ttb_internal( #endif /* Setting TTBCR.N for TTBR0 TTBR1 boundary at 0x80000000 */ __asm__ volatile("mcr p15,0,%0,c2,c0,2" : : "r"(1)); - __asm__ volatile("isb"); + __builtin_arm_isb(ISB_SY); #if MACH_ASSERT && __ARM_USER_PROTECT__ if (pmap->ttep & 0x1000) { panic("Misaligned ttbr0 %08X\n", pmap->ttep); @@ -7201,16 +7463,14 @@ pmap_switch_user_ttb_internal( #if !__ARM_USER_PROTECT__ set_context_id(pmap->asid); #endif -#else - pmap_get_cpu_data()->cpu_user_pmap = pmap; - pmap_get_cpu_data()->cpu_user_pmap_stamp = pmap->stamp; +#else /* (__ARM_VMSA__ == 7) */ + + if (pmap != kernel_pmap) + cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap; -#if !__arm64__ - set_context_id(pmap->asid); /* Not required */ -#endif if (pmap == kernel_pmap) { - set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK); + pmap_clear_user_ttb_internal(); } else 
{ set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK)|(((uint64_t)pmap->asid) << TTBR_ASID_SHIFT)); } @@ -7221,52 +7481,26 @@ void pmap_switch_user_ttb( pmap_t pmap) { + PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid); pmap_switch_user_ttb_internal(pmap); + PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_END); } -/* - * Try to "intuit" whether we need to raise a VM_PROT_WRITE fault - * for the given address when a "swp" instruction raised the fault. - * We have to look at the existing pte for the address to see - * if it needs to get bumped, or just added. If just added, do it - * as a read-only mapping first (this could result in extra faults - - * but better that than extra copy-on-write evaluations). - */ - -#if (__ARM_VMSA__ == 7) -boolean_t -arm_swap_readable_type( - vm_map_address_t addr, - unsigned int spsr) +MARK_AS_PMAP_TEXT static void +pmap_clear_user_ttb_internal(void) { - int ap; - pt_entry_t spte; - pt_entry_t *ptep; - - ptep = pmap_pte(current_pmap(), addr); - if (ptep == PT_ENTRY_NULL) - return (FALSE); - - spte = *ptep; - if (spte == ARM_PTE_TYPE_FAULT || - ARM_PTE_IS_COMPRESSED(spte)) - return (FALSE); - - /* get the access permission bitmaps */ - /* (all subpages should be the same) */ - ap = (spte & ARM_PTE_APMASK); +#if (__ARM_VMSA__ > 7) + set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK); +#else + set_mmu_ttb(kernel_pmap->ttep); +#endif +} - if (spsr & 0xf) { /* Supervisor mode */ - panic("arm_swap_readable_type supv"); - return TRUE; - } else { /* User mode */ - if ((ap == ARM_PTE_AP(AP_RWRW)) || (ap == ARM_PTE_AP(AP_RORO))) - return (FALSE); - else - return (TRUE); - } +void +pmap_clear_user_ttb(void) +{ + pmap_clear_user_ttb_internal(); } -#endif /* * Routine: arm_force_fast_fault @@ -7276,7 +7510,7 @@ arm_swap_readable_type( * to the access modes allowed, so we can gather ref/modify * bits again. 
*/ -static boolean_t +MARK_AS_PMAP_TEXT static boolean_t arm_force_fast_fault_internal( ppnum_t ppnum, vm_prot_t allow_mode, @@ -7289,6 +7523,7 @@ arm_force_fast_fault_internal( boolean_t result; pv_entry_t **pv_h; boolean_t is_reusable, is_internal; + boolean_t tlb_flush_needed = FALSE; boolean_t ref_fault; boolean_t mod_fault; @@ -7318,10 +7553,10 @@ arm_force_fast_fault_internal( while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) { vm_map_address_t va; - pt_entry_t spte; - pt_entry_t tmplate; - pmap_t pmap; - boolean_t update_pte; + pt_entry_t spte; + pt_entry_t tmplate; + pmap_t pmap; + boolean_t update_pte; if (pve_p != PV_ENTRY_NULL) pte_p = pve_get_ptep(pve_p); @@ -7329,6 +7564,11 @@ arm_force_fast_fault_internal( if (pte_p == PT_ENTRY_NULL) { panic("pte_p is NULL: pve_p=%p ppnum=0x%x\n", pve_p, ppnum); } +#ifdef PVH_FLAG_IOMMU + if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) { + goto fff_skip_pve; + } +#endif if (*pte_p == ARM_PTE_EMPTY) { panic("pte is NULL: pte_p=%p ppnum=0x%x\n", pte_p, ppnum); } @@ -7377,11 +7617,12 @@ arm_force_fast_fault_internal( if (update_pte) { if (*pte_p != ARM_PTE_TYPE_FAULT && !ARM_PTE_IS_COMPRESSED(*pte_p)) { - WRITE_PTE(pte_p, tmplate); - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); + WRITE_PTE_STRONG(pte_p, tmplate); + flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + tlb_flush_needed = TRUE; } else { WRITE_PTE(pte_p, tmplate); - __asm__ volatile("isb"); + __builtin_arm_isb(ISB_SY); } } @@ -7402,14 +7643,10 @@ arm_force_fast_fault_internal( OSAddAtomic(+1, &pmap->stats.internal); PMAP_STATS_PEAK(pmap->stats.internal); PMAP_STATS_ASSERTF(pmap->stats.internal > 0, pmap, "stats.internal %d", pmap->stats.internal); - pmap_ledger_credit(pmap, - task_ledgers.internal, - machine_ptob(1)); + pmap_ledger_credit(pmap, task_ledgers.internal, machine_ptob(1)); assert(!IS_ALTACCT_PAGE(pai, pve_p)); assert(IS_INTERNAL_PAGE(pai)); - pmap_ledger_credit(pmap, - task_ledgers.phys_footprint, - machine_ptob(1)); + 
pmap_ledger_credit(pmap, task_ledgers.phys_footprint, machine_ptob(1)); /* * Avoid the cost of another trap to handle the fast @@ -7431,21 +7668,23 @@ arm_force_fast_fault_internal( /* one less "internal" */ PMAP_STATS_ASSERTF(pmap->stats.internal > 0, pmap, "stats.internal %d", pmap->stats.internal); OSAddAtomic(-1, &pmap->stats.internal); - pmap_ledger_debit(pmap, - task_ledgers.internal, - machine_ptob(1)); + pmap_ledger_debit(pmap, task_ledgers.internal, machine_ptob(1)); assert(!IS_ALTACCT_PAGE(pai, pve_p)); assert(IS_INTERNAL_PAGE(pai)); - pmap_ledger_debit(pmap, - task_ledgers.phys_footprint, - machine_ptob(1)); + pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(1)); } +#ifdef PVH_FLAG_IOMMU + fff_skip_pve: +#endif pte_p = PT_ENTRY_NULL; if (pve_p != PV_ENTRY_NULL) pve_p = PVE_NEXT_PTR(pve_next(pve_p)); } + if (tlb_flush_needed) + sync_tlb_flush(); + /* update global "reusable" status for this page */ if (is_internal) { if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && @@ -7503,6 +7742,7 @@ arm_clear_fast_fault( pt_entry_t *pte_p; int pai; boolean_t result; + boolean_t tlb_flush_needed = FALSE; pv_entry_t **pv_h; assert(ppnum != vm_page_fictitious_addr); @@ -7536,6 +7776,11 @@ arm_clear_fast_fault( if (pte_p == PT_ENTRY_NULL) { panic("pte_p is NULL: pve_p=%p ppnum=0x%x\n", pve_p, ppnum); } +#ifdef PVH_FLAG_IOMMU + if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) { + goto cff_skip_pve; + } +#endif if (*pte_p == ARM_PTE_EMPTY) { panic("pte is NULL: pte_p=%p ppnum=0x%x\n", pte_p, ppnum); } @@ -7572,19 +7817,25 @@ arm_clear_fast_fault( if (spte != tmplate) { if (spte != ARM_PTE_TYPE_FAULT) { - WRITE_PTE(pte_p, tmplate); - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); + WRITE_PTE_STRONG(pte_p, tmplate); + flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + tlb_flush_needed = TRUE; } else { WRITE_PTE(pte_p, tmplate); - __asm__ volatile("isb"); + __builtin_arm_isb(ISB_SY); } result = TRUE; } +#ifdef PVH_FLAG_IOMMU + cff_skip_pve: +#endif pte_p = 
PT_ENTRY_NULL; if (pve_p != PV_ENTRY_NULL) pve_p = PVE_NEXT_PTR(pve_next(pve_p)); } + if (tlb_flush_needed) + sync_tlb_flush(); return result; } @@ -7602,7 +7853,7 @@ arm_clear_fast_fault( * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly * disallows this type of access. */ -static kern_return_t +MARK_AS_PMAP_TEXT static kern_return_t arm_fast_fault_internal( pmap_t pmap, vm_map_address_t va, @@ -7615,6 +7866,8 @@ arm_fast_fault_internal( int pai; pmap_paddr_t pa; + VALIDATE_PMAP(pmap); + PMAP_LOCK(pmap); /* @@ -7629,12 +7882,15 @@ arm_fast_fault_internal( pa = pte_to_pa(spte); if ((spte == ARM_PTE_TYPE_FAULT) || - ARM_PTE_IS_COMPRESSED(spte) || - (!pa_valid(pa))) { - PMAP_UNLOCK(pmap); - return result; + ARM_PTE_IS_COMPRESSED(spte)) { + PMAP_UNLOCK(pmap); + return result; } + if (!pa_valid(pa)) { + PMAP_UNLOCK(pmap); + return result; + } pai = (int)pa_index(pa); LOCK_PVH(pai); } else { @@ -7689,7 +7945,7 @@ arm_fast_fault( if (va < pmap->min || va >= pmap->max) return result; - PMAP_TRACE(PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START, + PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type, from_user); @@ -7718,7 +7974,7 @@ arm_fast_fault( done: #endif - PMAP_TRACE(PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result); + PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result); return result; } @@ -7815,10 +8071,12 @@ pmap_map_globals( vm_offset_t pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index) { + if (__improbable(index >= CPUWINDOWS_MAX)) + panic("%s: invalid index %u", __func__, index); return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index))); } -static unsigned int +MARK_AS_PMAP_TEXT static unsigned int pmap_map_cpu_windows_copy_internal( ppnum_t pn, vm_prot_t prot, @@ -7855,11 +8113,12 @@ pmap_map_cpu_windows_copy_internal( pte |= ARM_PTE_AP(AP_RONA); } - WRITE_PTE(ptep, pte); + WRITE_PTE_FAST(ptep, pte); /* * Invalidate tlb. 
Cover nested cpu_copywindow_vaddr usage with the interrupted context * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate. */ + FLUSH_PTE_STRONG(ptep); PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE); return(i); @@ -7874,7 +8133,7 @@ pmap_map_cpu_windows_copy( return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits); } -static void +MARK_AS_PMAP_TEXT static void pmap_unmap_cpu_windows_copy_internal( unsigned int index) { @@ -7885,9 +8144,12 @@ pmap_unmap_cpu_windows_copy_internal( cpu_num = pmap_get_cpu_data()->cpu_number; cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index); - __asm__ volatile("dsb sy"); + /* Issue full-system DSB to ensure prior operations on the per-CPU window + * (which are likely to have been on I/O memory) are complete before + * tearing down the mapping. */ + __builtin_arm_dsb(DSB_SY); ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr); - WRITE_PTE(ptep, ARM_PTE_TYPE_FAULT); + WRITE_PTE_STRONG(ptep, ARM_PTE_TYPE_FAULT); PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE); } @@ -7899,12 +8161,15 @@ pmap_unmap_cpu_windows_copy( } /* - * Marked a pmap has nested + * Indicate that a pmap is intended to be used as a nested pmap + * within one or more larger address spaces. This must be set + * before pmap_nest() is called with this pmap as the 'subordinate'. */ -static void +MARK_AS_PMAP_TEXT static void pmap_set_nested_internal( pmap_t pmap) { + VALIDATE_PMAP(pmap); pmap->nested = TRUE; } @@ -7915,6 +8180,357 @@ pmap_set_nested( pmap_set_nested_internal(pmap); } +/* + * pmap_trim_range(pmap, start, end) + * + * pmap = pmap to operate on + * start = start of the range + * end = end of the range + * + * Attempts to deallocate TTEs for the given range in the nested range. 
+ */ +MARK_AS_PMAP_TEXT static void +pmap_trim_range( + pmap_t pmap, + addr64_t start, + addr64_t end) +{ + addr64_t cur; + addr64_t nested_region_start; + addr64_t nested_region_end; + addr64_t adjusted_start; + addr64_t adjusted_end; + addr64_t adjust_offmask; + tt_entry_t * tte_p; + pt_entry_t * pte_p; + + if (__improbable(end < start)) { + panic("%s: invalid address range, " + "pmap=%p, start=%p, end=%p", + __func__, + pmap, (void*)start, (void*)end); + } + + nested_region_start = pmap->nested ? pmap->nested_region_subord_addr : pmap->nested_region_subord_addr; + nested_region_end = nested_region_start + pmap->nested_region_size; + + if (__improbable((start < nested_region_start) || (end > nested_region_end))) { + panic("%s: range outside nested region %p-%p, " + "pmap=%p, start=%p, end=%p", + __func__, (void *)nested_region_start, (void *)nested_region_end, + pmap, (void*)start, (void*)end); + } + + /* Contract the range to TT page boundaries. */ +#if (__ARM_VMSA__ > 7) + adjust_offmask = ARM_TT_TWIG_OFFMASK; +#else /* (__ARM_VMSA__ > 7) */ + adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1); +#endif /* (__ARM_VMSA__ > 7) */ + + adjusted_start = ((start + adjust_offmask) & ~adjust_offmask); + adjusted_end = end & ~adjust_offmask; + + /* Iterate over the range, trying to remove TTEs. */ + for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += ARM_TT_TWIG_SIZE) { + bool modified = false; + + PMAP_LOCK(pmap); + + tte_p = pmap_tte(pmap, cur); + + if (tte_p == (tt_entry_t *) NULL) { + goto done; + } + + if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { + pte_p = (pt_entry_t *) ttetokv(*tte_p); + +#if (__ARM_VMSA__ == 7) + if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && + (pmap != kernel_pmap)) { + if (pmap->nested == TRUE) { + /* Deallocate for the nested map. */ + pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL); + } else { + /* Just remove for the parent map. 
*/ + pmap_tte_remove(pmap, tte_p, PMAP_TT_L1_LEVEL); + } + + flush_mmu_tlb_entry((cur & ~ARM_TT_L1_OFFMASK) | (pmap->asid & 0xff)); + modified = true; + } +#else + if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) && + (pmap != kernel_pmap)) { + if (pmap->nested == TRUE) { + /* Deallocate for the nested map. */ + pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL); + } else { + /* Just remove for the parent map. */ + pmap_tte_remove(pmap, tte_p, PMAP_TT_L2_LEVEL); + } + + flush_mmu_tlb_entry(tlbi_addr(cur & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + modified = true; + } +#endif + } + +done: + PMAP_UNLOCK(pmap); + + if (modified) { + PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE); + } + } + +#if (__ARM_VMSA__ > 7) + /* Remove empty L2 TTs. */ + adjusted_start = ((start + ARM_TT_L1_OFFMASK) & ~ARM_TT_L1_OFFMASK); + adjusted_end = end & ~ARM_TT_L1_OFFMASK; + + for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += ARM_TT_L1_SIZE) { + /* For each L1 entry in our range... */ + PMAP_LOCK(pmap); + + bool remove_tt1e = true; + tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur); + tt_entry_t * tt2e_start; + tt_entry_t * tt2e_end; + tt_entry_t * tt2e_p; + tt_entry_t tt1e; + + if (tt1e_p == NULL) { + PMAP_UNLOCK(pmap); + continue; + } + + tt1e = *tt1e_p; + + if (tt1e == ARM_TTE_TYPE_FAULT) { + PMAP_UNLOCK(pmap); + continue; + } + + tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0]; + tt2e_end = &tt2e_start[TTE_PGENTRIES]; + + for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) { + if (*tt2e_p != ARM_TTE_TYPE_FAULT) { + /* + * If any TTEs are populated, don't remove the + * L1 TT. 
+ */ + remove_tt1e = false; + } + } + + if (remove_tt1e) { + pmap_tte_deallocate(pmap, tt1e_p, PMAP_TT_L1_LEVEL); + PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE); + } + + PMAP_UNLOCK(pmap); + } +#endif /* (__ARM_VMSA__ > 7) */ +} + +/* + * pmap_trim_internal(grand, subord, vstart, nstart, size) + * + * grand = pmap subord is nested in + * subord = nested pmap + * vstart = start of the used range in grand + * nstart = start of the used range in nstart + * size = size of the used range + * + * Attempts to trim the shared region page tables down to only cover the given + * range in subord and grand. + */ +MARK_AS_PMAP_TEXT static void +pmap_trim_internal( + pmap_t grand, + pmap_t subord, + addr64_t vstart, + addr64_t nstart, + uint64_t size) +{ + addr64_t vend, nend; + addr64_t adjust_offmask; + + if (__improbable(os_add_overflow(vstart, size, &vend))) { + panic("%s: grand addr wraps around, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + + if (__improbable(os_add_overflow(nstart, size, &nend))) { + panic("%s: nested addr wraps around, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + + VALIDATE_PMAP(grand); + VALIDATE_PMAP(subord); + + PMAP_LOCK(subord); + + if (!subord->nested) { + panic("%s: subord is not nestable, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + + if (grand->nested) { + panic("%s: grand is nestable, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + + if (grand->nested_pmap != subord) { + panic("%s: grand->nested != subord, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + + if (size != 0) { + if ((vstart < grand->nested_region_grand_addr) || (vend 
> (grand->nested_region_grand_addr + grand->nested_region_size))) { + panic("%s: grand range not in nested region, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + + if ((nstart < grand->nested_region_grand_addr) || (nend > (grand->nested_region_grand_addr + grand->nested_region_size))) { + panic("%s: subord range not in nested region, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + } + + + if (!grand->nested_has_no_bounds_ref) { + assert(subord->nested_bounds_set); + + if (!grand->nested_bounds_set) { + /* Inherit the bounds from subord. */ + grand->nested_region_true_start = (subord->nested_region_true_start - grand->nested_region_subord_addr) + grand->nested_region_grand_addr; + grand->nested_region_true_end = (subord->nested_region_true_end - grand->nested_region_subord_addr) + grand->nested_region_grand_addr; + grand->nested_bounds_set = true; + } + + PMAP_UNLOCK(subord); + return; + } + + if ((!subord->nested_bounds_set) && size) { +#if (__ARM_VMSA__ > 7) + adjust_offmask = ARM_TT_TWIG_OFFMASK; +#else /* (__ARM_VMSA__ > 7) */ + adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1); +#endif /* (__ARM_VMSA__ > 7) */ + + subord->nested_region_true_start = nstart; + subord->nested_region_true_end = nend; + subord->nested_region_true_start &= ~adjust_offmask; + + if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) { + panic("%s: padded true end wraps around, " + "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx", + __func__, grand, subord, (void*)vstart, (void*)nstart, size); + } + + subord->nested_region_true_end &= ~adjust_offmask; + subord->nested_bounds_set = true; + } + + if (subord->nested_bounds_set) { + /* Inherit the bounds from subord. 
*/ + grand->nested_region_true_start = (subord->nested_region_true_start - grand->nested_region_subord_addr) + grand->nested_region_grand_addr; + grand->nested_region_true_end = (subord->nested_region_true_end - grand->nested_region_subord_addr) + grand->nested_region_grand_addr; + grand->nested_bounds_set = true; + + /* If we know the bounds, we can trim the pmap. */ + grand->nested_has_no_bounds_ref = false; + PMAP_UNLOCK(subord); + } else { + /* Don't trim if we don't know the bounds. */ + PMAP_UNLOCK(subord); + return; + } + + /* Trim grand to only cover the given range. */ + pmap_trim_range(grand, grand->nested_region_grand_addr, grand->nested_region_true_start); + pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_grand_addr + grand->nested_region_size)); + + /* Try to trim subord. */ + pmap_trim_subord(subord); +} + +MARK_AS_PMAP_TEXT static void pmap_trim_self(pmap_t pmap) +{ + if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) { + /* If we have a no bounds ref, we need to drop it. */ + PMAP_LOCK(pmap->nested_pmap); + pmap->nested_has_no_bounds_ref = false; + boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set; + vm_map_offset_t nested_region_true_start = (pmap->nested_pmap->nested_region_true_start - pmap->nested_region_subord_addr) + pmap->nested_region_grand_addr; + vm_map_offset_t nested_region_true_end = (pmap->nested_pmap->nested_region_true_end - pmap->nested_region_subord_addr) + pmap->nested_region_grand_addr; + PMAP_UNLOCK(pmap->nested_pmap); + + if (nested_bounds_set) { + pmap_trim_range(pmap, pmap->nested_region_grand_addr, nested_region_true_start); + pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_grand_addr + pmap->nested_region_size)); + } + /* + * Try trimming the nested pmap, in case we had the + * last reference. 
+ */ + pmap_trim_subord(pmap->nested_pmap); + } +} + +/* + * pmap_trim_subord(grand, subord) + * + * grand = pmap that we have nested subord in + * subord = nested pmap we are attempting to trim + * + * Trims subord if possible + */ +MARK_AS_PMAP_TEXT static void +pmap_trim_subord(pmap_t subord) +{ + bool contract_subord = false; + + PMAP_LOCK(subord); + + subord->nested_no_bounds_refcnt--; + + if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) { + /* If this was the last no bounds reference, trim subord. */ + contract_subord = true; + } + + PMAP_UNLOCK(subord); + + if (contract_subord) { + pmap_trim_range(subord, subord->nested_region_subord_addr, subord->nested_region_true_start); + pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_subord_addr + subord->nested_region_size); + } +} + +void +pmap_trim( + pmap_t grand, + pmap_t subord, + addr64_t vstart, + addr64_t nstart, + uint64_t size) +{ + pmap_trim_internal(grand, subord, vstart, nstart, size); +} + /* * kern_return_t pmap_nest(grand, subord, vstart, size) * @@ -7928,7 +8544,7 @@ pmap_set_nested( * */ -static kern_return_t +MARK_AS_PMAP_TEXT static kern_return_t pmap_nest_internal( pmap_t grand, pmap_t subord, @@ -7946,6 +8562,14 @@ pmap_nest_internal( unsigned int* nested_region_asid_bitmap; int expand_options = 0; + addr64_t vend, nend; + if (__improbable(os_add_overflow(vstart, size, &vend))) + panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size); + if (__improbable(os_add_overflow(nstart, size, &nend))) + panic("%s: %p nested addr wraps around: 0x%llx + 0x%llx", __func__, subord, nstart, size); + VALIDATE_PMAP(grand); + VALIDATE_PMAP(subord); + #if (__ARM_VMSA__ == 7) if (((size|vstart|nstart) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) { @@ -7953,10 +8577,13 @@ pmap_nest_internal( } #else if (((size|vstart|nstart) & (ARM_TT_L2_OFFMASK)) != 0x0ULL) { - panic("pmap_nest() pmap %p has a nested pmap 0x%llx, 0x%llx, 0x%llx\n", grand, 
vstart, nstart, size); + panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx\n", grand, vstart, nstart, size); } #endif + if (!subord->nested) + panic("%s: subordinate pmap %p is not nestable", __func__, subord); + if ((grand->nested_pmap != PMAP_NULL) && (grand->nested_pmap != subord)) { panic("pmap_nest() pmap %p has a nested pmap\n", grand); } @@ -7980,14 +8607,14 @@ pmap_nest_internal( kfree(nested_region_asid_bitmap, nested_region_asid_bitmap_size*sizeof(unsigned int)); } } - if ((subord->nested_region_subord_addr + subord->nested_region_size) < (nstart+size)) { + if ((subord->nested_region_subord_addr + subord->nested_region_size) < nend) { uint64_t new_size; unsigned int new_nested_region_asid_bitmap_size; unsigned int* new_nested_region_asid_bitmap; nested_region_asid_bitmap = NULL; nested_region_asid_bitmap_size = 0; - new_size = nstart + size - subord->nested_region_subord_addr; + new_size = nend - subord->nested_region_subord_addr; /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */ new_nested_region_asid_bitmap_size = (unsigned int)((new_size>>ARM_TT_TWIG_SHIFT)/(sizeof(unsigned int)*NBBY)) + 1; @@ -8014,6 +8641,16 @@ pmap_nest_internal( PMAP_LOCK(subord); if (grand->nested_pmap == PMAP_NULL) { grand->nested_pmap = subord; + + if (!subord->nested_bounds_set) { + /* + * We are nesting without the shared regions bounds + * being known. We'll have to trim the pmap later. 
+ */ + grand->nested_has_no_bounds_ref = true; + subord->nested_no_bounds_refcnt++; + } + grand->nested_region_grand_addr = vstart; grand->nested_region_subord_addr = nstart; grand->nested_region_size = (mach_vm_offset_t) size; @@ -8021,7 +8658,7 @@ pmap_nest_internal( if ((grand->nested_region_grand_addr > vstart)) { panic("pmap_nest() pmap %p : attempt to nest outside the nested region\n", grand); } - else if ((grand->nested_region_grand_addr + grand->nested_region_size) < (vstart+size)) { + else if ((grand->nested_region_grand_addr + grand->nested_region_size) < vend) { grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_grand_addr + size); } } @@ -8032,6 +8669,10 @@ pmap_nest_internal( num_tte = size >> ARM_TT_L1_SHIFT; for (i = 0; i < num_tte; i++) { + if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) { + goto expand_next; + } + stte_p = pmap_tte(subord, nvaddr); if ((stte_p == (tt_entry_t *)NULL) || (((*stte_p) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE)) { PMAP_UNLOCK(subord); @@ -8061,7 +8702,7 @@ pmap_nest_internal( } PMAP_LOCK(subord); - +expand_next: nvaddr += ARM_TT_L1_SIZE; vaddr += ARM_TT_L1_SIZE; } @@ -8071,6 +8712,10 @@ pmap_nest_internal( num_tte = (unsigned int)(size >> ARM_TT_L2_SHIFT); for (i = 0; i < num_tte; i++) { + if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) { + goto expand_next; + } + stte_p = pmap_tt2e(subord, nvaddr); if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) { PMAP_UNLOCK(subord); @@ -8083,6 +8728,7 @@ pmap_nest_internal( PMAP_LOCK(subord); } +expand_next: nvaddr += ARM_TT_L2_SIZE; } #endif @@ -8099,18 +8745,25 @@ pmap_nest_internal( #if (__ARM_VMSA__ == 7) for (i = 0; i < num_tte; i++) { + if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) { + goto nest_next; + } stte_p = pmap_tte(subord, nvaddr); gtte_p = pmap_tte(grand, vaddr); *gtte_p = *stte_p; 
+nest_next: nvaddr += ARM_TT_L1_SIZE; vaddr += ARM_TT_L1_SIZE; } #else for (i = 0; i < num_tte; i++) { + if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) { + goto nest_next; + } - stte_p = pmap_tt2e(subord, nstart); + stte_p = pmap_tt2e(subord, nvaddr); gtte_p = pmap_tt2e(grand, vaddr); if (gtte_p == PT_ENTRY_NULL) { PMAP_UNLOCK(grand); @@ -8124,17 +8777,18 @@ pmap_nest_internal( gtte_p = pmap_tt2e(grand, vaddr); } *gtte_p = *stte_p; + +nest_next: vaddr += ARM_TT_L2_SIZE; - nstart += ARM_TT_L2_SIZE; + nvaddr += ARM_TT_L2_SIZE; } #endif kr = KERN_SUCCESS; done: -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) pmap_tte(grand, vstart), num_tte * sizeof(tt_entry_t)); -#endif + stte_p = pmap_tte(grand, vstart); + FLUSH_PTE_RANGE_STRONG(stte_p, stte_p + num_tte); #if (__ARM_VMSA__ > 7) /* @@ -8142,7 +8796,7 @@ pmap_nest_internal( */ assert((size & 0xFFFFFFFF00000000ULL) == 0); #endif - PMAP_UPDATE_TLBS(grand, vstart, vstart + size); + PMAP_UPDATE_TLBS(grand, vstart, vend); PMAP_UNLOCK(grand); return kr; @@ -8157,13 +8811,13 @@ kern_return_t pmap_nest( { kern_return_t kr = KERN_FAILURE; - PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START, + PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), VM_KERNEL_ADDRHIDE(vstart)); kr = pmap_nest_internal(grand, subord, vstart, nstart, size); - PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr); + PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr); return kr; } @@ -8171,7 +8825,7 @@ kern_return_t pmap_nest( /* * kern_return_t pmap_unnest(grand, vaddr) * - * grand = the pmap that we will nest subord into + * grand = the pmap that will have the virtual range unnested * vaddr = start of range in pmap to be unnested * size = size of range in pmap to be unnested * @@ -8186,7 +8840,7 @@ pmap_unnest( return(pmap_unnest_options(grand, vaddr, size, 0)); } -static kern_return_t +MARK_AS_PMAP_TEXT static kern_return_t 
pmap_unnest_options_internal( pmap_t grand, addr64_t vaddr, @@ -8202,6 +8856,12 @@ pmap_unnest_options_internal( unsigned int num_tte; unsigned int i; + addr64_t vend; + if (__improbable(os_add_overflow(vaddr, size, &vend))) + panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size); + + VALIDATE_PMAP(grand); + #if (__ARM_VMSA__ == 7) if (((size|vaddr) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) { panic("pmap_unnest(): unaligned request\n"); @@ -8214,6 +8874,12 @@ pmap_unnest_options_internal( if ((option & PMAP_UNNEST_CLEAN) == 0) { + if (grand->nested_pmap == NULL) + panic("%s: %p has no nested pmap", __func__, grand); + + if ((vaddr < grand->nested_region_grand_addr) || (vend > (grand->nested_region_grand_addr + grand->nested_region_size))) + panic("%s: %p: unnest request to region not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend); + PMAP_LOCK(grand->nested_pmap); start = vaddr - grand->nested_region_grand_addr + grand->nested_region_subord_addr ; @@ -8221,19 +8887,24 @@ pmap_unnest_options_internal( max_index = (unsigned int)(start_index + (size >> ARM_TT_TWIG_SHIFT)); num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT); - if (size > grand->nested_region_size) { - panic("pmap_unnest() pmap %p %llu, %llu\n", grand, size, (uint64_t)grand->nested_region_size); - } - - for (current_index = start_index, addr = start; current_index < max_index; current_index++) { + for (current_index = start_index, addr = start; current_index < max_index; current_index++, addr += ARM_TT_TWIG_SIZE) { pt_entry_t *bpte, *epte, *cpte; + if (addr < grand->nested_pmap->nested_region_true_start) { + /* We haven't reached the interesting range. */ + continue; + } - if(!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap)) { + if (addr >= grand->nested_pmap->nested_region_true_end) { + /* We're done with the interesting range. 
*/ + break; + } + + bpte = pmap_pte(grand->nested_pmap, addr); + epte = bpte + (ARM_TT_LEAF_INDEX_MASK>>ARM_TT_LEAF_SHIFT); + if(!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap)) { setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap); - bpte = pmap_pte(grand->nested_pmap, addr); - epte = bpte + (ARM_TT_LEAF_INDEX_MASK>>ARM_TT_LEAF_SHIFT); for (cpte = bpte; cpte <= epte; cpte++) { pmap_paddr_t pa; @@ -8262,7 +8933,7 @@ pmap_unnest_options_internal( if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) { - WRITE_PTE(cpte, (spte | ARM_PTE_NG)); + WRITE_PTE_FAST(cpte, (spte | ARM_PTE_NG)); } if (managed) @@ -8274,14 +8945,12 @@ pmap_unnest_options_internal( } } - addr += ARM_TT_TWIG_SIZE; - -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) pmap_pte(grand->nested_pmap, start), num_tte * sizeof(tt_entry_t)); -#endif - PMAP_UPDATE_TLBS(grand->nested_pmap, start, start + size); + FLUSH_PTE_RANGE_STRONG(bpte, epte); + flush_mmu_tlb_region_asid_async(start, (unsigned)size, grand->nested_pmap); } + sync_tlb_flush(); + PMAP_UNLOCK(grand->nested_pmap); } @@ -8295,17 +8964,24 @@ pmap_unnest_options_internal( num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT); - for (i = 0; i < num_tte; i++) { + for (i = 0; i < num_tte; i++, addr += ARM_TT_TWIG_SIZE) { + if (addr < grand->nested_pmap->nested_region_true_start) { + /* We haven't reached the interesting range. */ + continue; + } + + if (addr >= grand->nested_pmap->nested_region_true_end) { + /* We're done with the interesting range. 
*/ + break; + } + tte_p = pmap_tte(grand, addr); *tte_p = ARM_TTE_TYPE_FAULT; - - addr += ARM_TT_TWIG_SIZE; } -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) pmap_tte(grand, start), num_tte * sizeof(tt_entry_t)); -#endif - PMAP_UPDATE_TLBS(grand, start, start + size); + tte_p = pmap_tte(grand, start); + FLUSH_PTE_RANGE_STRONG(tte_p, tte_p + num_tte); + PMAP_UPDATE_TLBS(grand, start, vend); PMAP_UNLOCK(grand); @@ -8321,12 +8997,12 @@ pmap_unnest_options( { kern_return_t kr = KERN_FAILURE; - PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START, + PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr)); kr = pmap_unnest_options_internal(grand, vaddr, size, option); - PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, kr); + PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, kr); return kr; } @@ -8394,7 +9070,7 @@ pt_fake_zone_info( #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256 static void -flush_mmu_tlb_region_asid( +flush_mmu_tlb_region_asid_async( vm_offset_t va, unsigned length, pmap_t pmap) @@ -8411,9 +9087,9 @@ flush_mmu_tlb_region_asid( if ((asid == 0) || (pmap->nested == TRUE)) flush_all = TRUE; if (flush_all) - flush_mmu_tlb(); + flush_mmu_tlb_async(); else - flush_mmu_tlb_asid(asid); + flush_mmu_tlb_asid_async(asid); return; } @@ -8423,14 +9099,14 @@ flush_mmu_tlb_region_asid( #else va = arm_trunc_page(va); while (va < end) { - flush_mmu_tlb_mva_entries(va); + flush_mmu_tlb_mva_entries_async(va); va += ARM_SMALL_PAGE_SIZE; } #endif return; } va = arm_trunc_page(va) | (asid & 0xff); - flush_mmu_tlb_entries(va, end); + flush_mmu_tlb_entries_async(va, end); #else vm_offset_t end = va + length; @@ -8444,17 +9120,17 @@ flush_mmu_tlb_region_asid( if ((asid == 0) || (pmap->nested == TRUE)) flush_all = TRUE; if (flush_all) - flush_mmu_tlb(); + flush_mmu_tlb_async(); else - flush_mmu_tlb_asid((uint64_t)asid << TLBI_ASID_SHIFT); + flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT); return; } va 
= tlbi_asid(asid) | tlbi_addr(va); end = tlbi_asid(asid) | tlbi_addr(end); if (pmap->nested == TRUE) { - flush_mmu_tlb_allentries(va, end); + flush_mmu_tlb_allentries_async(va, end); } else { - flush_mmu_tlb_entries(va, end); + flush_mmu_tlb_entries_async(va, end); } #endif @@ -8465,7 +9141,31 @@ flush_mmu_tlb_region( vm_offset_t va, unsigned length) { - flush_mmu_tlb_region_asid(va, length, kernel_pmap); + flush_mmu_tlb_region_asid_async(va, length, kernel_pmap); + sync_tlb_flush(); +} + +static unsigned int +pmap_find_io_attr(pmap_paddr_t paddr) +{ + pmap_io_range_t find_range = {.addr = paddr, .len = PAGE_SIZE}; + unsigned int begin = 0, end = num_io_rgns - 1; + assert(num_io_rgns > 0); + + for (;;) { + unsigned int middle = (begin + end) / 2; + int cmp = cmp_io_rgns(&find_range, &io_attr_table[middle]); + if (cmp == 0) + return io_attr_table[middle].wimg; + else if (begin == end) + break; + else if (cmp > 0) + begin = middle + 1; + else + end = middle; + }; + + return (VM_WIMG_IO); } unsigned int @@ -8479,14 +9179,8 @@ pmap_cache_attributes( paddr = ptoa(pn); - if ((paddr >= io_rgn_start) && (paddr < io_rgn_end)) { - unsigned int attr = IO_ATTR_WIMG(io_attr_table[(paddr - io_rgn_start) / io_rgn_granule]); - if (attr) - return attr; - else - return (VM_WIMG_IO); - } - + if ((paddr >= io_rgn_start) && (paddr < io_rgn_end)) + return pmap_find_io_attr(paddr); if (!pmap_initialized) { if ((paddr >= gPhysBase) && (paddr < gPhysBase+gPhysSize)) @@ -8509,7 +9203,7 @@ pmap_cache_attributes( return result; } -static boolean_t +MARK_AS_PMAP_TEXT static boolean_t pmap_batch_set_cache_attributes_internal( ppnum_t pn, unsigned int cacheattr, @@ -8528,7 +9222,9 @@ pmap_batch_set_cache_attributes_internal( cacheattr = VM_WIMG_DEFAULT; if ((doit == FALSE) && (*res == 0)) { + pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res)); *res = page_cnt; + pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res)); if (platform_cache_batch_wimg(cacheattr & (VM_WIMG_MASK), page_cnt< 7) + if 
(tmplate & ARM_PTE_HINT_MASK) { + panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx", + __FUNCTION__, pte_p, (void *)kva, tmplate); + } +#endif + WRITE_PTE_STRONG(pte_p, tmplate); + flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap); + tlb_flush_needed = TRUE; #endif pai = (unsigned int)pa_index(phys); @@ -8709,7 +9434,10 @@ pmap_update_cache_attributes_locked( if (pve_p != PV_ENTRY_NULL) pte_p = pve_get_ptep(pve_p); - +#ifdef PVH_FLAG_IOMMU + if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) + goto cache_skip_pve; +#endif pmap = ptep_get_pmap(pte_p); va = ptep_get_va(pte_p); @@ -8717,14 +9445,20 @@ pmap_update_cache_attributes_locked( tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK); tmplate |= wimg_to_pte(attributes); - WRITE_PTE(pte_p, tmplate); - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); + WRITE_PTE_STRONG(pte_p, tmplate); + flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap); + tlb_flush_needed = TRUE; +#ifdef PVH_FLAG_IOMMU + cache_skip_pve: +#endif pte_p = PT_ENTRY_NULL; if (pve_p != PV_ENTRY_NULL) pve_p = PVE_NEXT_PTR(pve_next(pve_p)); } + if (tlb_flush_needed) + sync_tlb_flush(); } #if (__ARM_VMSA__ == 7) @@ -8760,7 +9494,7 @@ pmap_update_tt3e( pte = *ptep; pte = tte_to_pa(pte) | template; - WRITE_PTE(ptep, pte); + WRITE_PTE_STRONG(ptep, pte); } /* Note absence of non-global bit */ @@ -8782,12 +9516,11 @@ pmap_create_sharedpage( memset((char *) phystokv(pa), 0, PAGE_SIZE); +#ifdef CONFIG_XNUPOST /* * The kernel pmap maintains a user accessible mapping of the commpage * to test PAN. 
*/ - kr = pmap_expand(kernel_pmap, _COMM_HIGH_PAGE64_BASE_ADDRESS, 0, PMAP_TT_L3_LEVEL); - assert(kr == KERN_SUCCESS); kr = pmap_enter(kernel_pmap, _COMM_HIGH_PAGE64_BASE_ADDRESS, (ppnum_t)atop(pa), VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE); assert(kr == KERN_SUCCESS); @@ -8797,14 +9530,10 @@ pmap_create_sharedpage( */ pmap_update_tt3e(kernel_pmap, _COMM_HIGH_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE | ARM_PTE_NG); - /* - * With PAN enabled kernel drivers can no longer use the previous mapping which is user readable - * They should use the following mapping instead - */ - kr = pmap_expand(kernel_pmap, _COMM_PRIV_PAGE64_BASE_ADDRESS, 0, PMAP_TT_L3_LEVEL); - assert(kr == KERN_SUCCESS); - kr = pmap_enter(kernel_pmap, _COMM_PRIV_PAGE64_BASE_ADDRESS, (ppnum_t)atop(pa), VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE); - assert(kr == KERN_SUCCESS); +#if KASAN + kasan_map_shadow(_COMM_HIGH_PAGE64_BASE_ADDRESS, PAGE_SIZE, true); +#endif +#endif /* CONFIG_XNUPOST */ /* * In order to avoid burning extra pages on mapping the shared page, we @@ -8833,8 +9562,7 @@ pmap_create_sharedpage( pmap_update_tt3e(sharedpage_pmap, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE); /* For manipulation in kernel, go straight to physical page */ - sharedpage_rw_addr = phystokv(pa); - return((vm_map_address_t)sharedpage_rw_addr); + return ((vm_map_address_t)phystokv(pa)); } /* @@ -8851,15 +9579,17 @@ static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= VM_MAX_ADDRESS #error Nested shared page mapping is unsupported on this config #endif -static void +MARK_AS_PMAP_TEXT static kern_return_t pmap_insert_sharedpage_internal( pmap_t pmap) { -#if (ARM_PGSHIFT == 14) && !__ARM64_TWO_LEVEL_PMAP__ - kern_return_t kr; -#endif + kern_return_t kr = KERN_SUCCESS; vm_offset_t sharedpage_vaddr; pt_entry_t *ttep, *src_ttep; + int options = 0; + + VALIDATE_PMAP(pmap); + #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE #error We assume a single page. 
#endif @@ -8883,6 +9613,8 @@ pmap_insert_sharedpage_internal( #if __ARM64_TWO_LEVEL_PMAP__ #error A two level page table with a page shift of 12 is not currently supported #endif + (void)options; + /* Just slam in the L1 entry. */ ttep = pmap_tt1e(pmap, sharedpage_vaddr); @@ -8901,10 +9633,12 @@ pmap_insert_sharedpage_internal( while (*pmap_tt1e(pmap, sharedpage_vaddr) == ARM_PTE_EMPTY) { PMAP_UNLOCK(pmap); - kr = pmap_expand(pmap, _COMM_PAGE32_BASE_ADDRESS, 0, PMAP_TT_L2_LEVEL); + kr = pmap_expand(pmap, sharedpage_vaddr, options, PMAP_TT_L2_LEVEL); if (kr != KERN_SUCCESS) { - panic("Failed to pmap_expand for 32-bit commpage, pmap=%p", pmap); + { + panic("Failed to pmap_expand for commpage, pmap=%p", pmap); + } } PMAP_LOCK(pmap); @@ -8921,26 +9655,21 @@ pmap_insert_sharedpage_internal( #endif *ttep = *src_ttep; -#ifndef __ARM_L1_PTW__ - CleanPoU_DcacheRegion((vm_offset_t) ttep, sizeof(tt_entry_t)); -#endif + FLUSH_PTE_STRONG(ttep); + /* TODO: Should we flush in the 64-bit case? */ - flush_mmu_tlb_region(sharedpage_vaddr, PAGE_SIZE); + flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, kernel_pmap); #if (ARM_PGSHIFT == 12) && !__ARM64_TWO_LEVEL_PMAP__ - flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid)); #elif (ARM_PGSHIFT == 14) - flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); #endif + sync_tlb_flush(); PMAP_UNLOCK(pmap); -} -static void -pmap_sharedpage_flush_32_to_64( - void) -{ - flush_mmu_tlb_region(_COMM_PAGE32_BASE_ADDRESS, PAGE_SIZE); + return kr; } static void @@ -8988,16 +9717,17 @@ pmap_unmap_sharedpage( #endif *ttep = ARM_TTE_EMPTY; - flush_mmu_tlb_region(sharedpage_vaddr, PAGE_SIZE); + flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, 
kernel_pmap); #if (ARM_PGSHIFT == 12) #if __ARM64_TWO_LEVEL_PMAP__ #error A two level page table with a page shift of 12 is not currently supported #endif - flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid)); #elif (ARM_PGSHIFT == 14) - flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); + flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid)); #endif + sync_tlb_flush(); } void @@ -9025,7 +9755,7 @@ pmap_valid_page( return pa_valid(ptoa(pn)); } -static boolean_t +MARK_AS_PMAP_TEXT static boolean_t pmap_is_empty_internal( pmap_t pmap, vm_map_offset_t va_start, @@ -9038,6 +9768,8 @@ pmap_is_empty_internal( return TRUE; } + VALIDATE_PMAP(pmap); + if ((pmap != kernel_pmap) && (not_in_kdp)) { PMAP_LOCK(pmap); } @@ -9127,14 +9859,19 @@ pmap_is_empty( } vm_map_offset_t pmap_max_offset( - boolean_t is64 __unused, + boolean_t is64, unsigned int option) +{ + return (is64) ? 
pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option); +} + +vm_map_offset_t pmap_max_64bit_offset( + __unused unsigned int option) { vm_map_offset_t max_offset_ret = 0; #if defined(__arm64__) - assert (is64); - vm_map_offset_t min_max_offset = SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000; // end of shared region + 512MB for various purposes + const vm_map_offset_t min_max_offset = SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000; // end of shared region + 512MB for various purposes if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) { max_offset_ret = arm64_pmap_max_offset_default; } else if (option == ARM_PMAP_MAX_OFFSET_MIN) { @@ -9145,9 +9882,9 @@ vm_map_offset_t pmap_max_offset( if (arm64_pmap_max_offset_default) { max_offset_ret = arm64_pmap_max_offset_default; } else if (max_mem > 0xC0000000) { - max_offset_ret = 0x0000000318000000ULL; // Max offset is 12.375GB for devices with > 3GB of memory + max_offset_ret = min_max_offset + 0x138000000; // Max offset is 13.375GB for devices with > 3GB of memory } else if (max_mem > 0x40000000) { - max_offset_ret = 0x0000000218000000ULL; // Max offset is 8.375GB for devices with > 1GB and <= 3GB of memory + max_offset_ret = min_max_offset + 0x38000000; // Max offset is 9.375GB for devices with > 1GB and <= 3GB of memory } else { max_offset_ret = min_max_offset; } @@ -9156,16 +9893,26 @@ vm_map_offset_t pmap_max_offset( // Allow the boot-arg to override jumbo size max_offset_ret = arm64_pmap_max_offset_default; } else { - max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is MACH_VM_MAX_ADDRESS for pmaps with special "jumbo" blessing + max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing } } else { - panic("pmap_max_offset illegal option 0x%x\n", option); + panic("pmap_max_64bit_offset illegal option 0x%x\n", option); } - assert(max_offset_ret >= min_max_offset); assert(max_offset_ret <= MACH_VM_MAX_ADDRESS); - return max_offset_ret; + 
assert(max_offset_ret >= min_max_offset); #else + panic("Can't run pmap_max_64bit_offset on non-64bit architectures\n"); +#endif + + return max_offset_ret; +} + +vm_map_offset_t pmap_max_32bit_offset( + unsigned int option) +{ + vm_map_offset_t max_offset_ret = 0; + if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) { max_offset_ret = arm_pmap_max_offset_default; } else if (option == ARM_PMAP_MAX_OFFSET_MIN) { @@ -9180,13 +9927,14 @@ vm_map_offset_t pmap_max_offset( } else { max_offset_ret = 0x66000000; } + } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) { + max_offset_ret = 0x80000000; } else { - panic("pmap_max_offset illegal option 0x%x\n", option); + panic("pmap_max_32bit_offset illegal option 0x%x\n", option); } - assert(max_offset_ret <= VM_MAX_ADDRESS); + assert(max_offset_ret <= MACH_VM_MAX_ADDRESS); return max_offset_ret; -#endif } #if CONFIG_DTRACE @@ -9227,12 +9975,26 @@ pmap_flush( return; } -static boolean_t + +static void __unused +pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused) +{ +} + +static void __unused +pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused) +{ +} + + + +#define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1) + +MARK_AS_PMAP_TEXT static mach_vm_size_t pmap_query_resident_internal( pmap_t pmap, vm_map_address_t start, vm_map_address_t end, - mach_vm_size_t *resident_bytes_p, mach_vm_size_t *compressed_bytes_p) { mach_vm_size_t resident_bytes = 0; @@ -9243,20 +10005,23 @@ pmap_query_resident_internal( tt_entry_t *tte_p; if (pmap == NULL) { - return FALSE; + return PMAP_RESIDENT_INVALID; } + VALIDATE_PMAP(pmap); + /* Ensure that this request is valid, and addresses exactly one TTE. 
*/ - assert(!(start % ARM_PGBYTES)); - assert(!(end % ARM_PGBYTES)); - assert(end >= start); - assert((end - start) <= (PTE_PGENTRIES * ARM_PGBYTES)); + if (__improbable((start % ARM_PGBYTES) || (end % ARM_PGBYTES))) + panic("%s: address range %p, %p not page-aligned", __func__, (void*)start, (void*)end); + + if (__improbable((end < start) || ((end - start) > (PTE_PGENTRIES * ARM_PGBYTES)))) + panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end); PMAP_LOCK(pmap); tte_p = pmap_tte(pmap, start); if (tte_p == (tt_entry_t *) NULL) { PMAP_UNLOCK(pmap); - return FALSE; + return PMAP_RESIDENT_INVALID; } if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) { @@ -9281,14 +10046,12 @@ pmap_query_resident_internal( PMAP_UNLOCK(pmap); if (compressed_bytes_p) { + pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p)); *compressed_bytes_p += compressed_bytes; + pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p)); } - if (resident_bytes_p) { - *resident_bytes_p += resident_bytes; - } - - return TRUE; + return resident_bytes; } mach_vm_size_t @@ -9298,7 +10061,7 @@ pmap_query_resident( vm_map_address_t end, mach_vm_size_t *compressed_bytes_p) { - mach_vm_size_t resident_bytes; + mach_vm_size_t total_resident_bytes; mach_vm_size_t compressed_bytes; vm_map_address_t va; @@ -9310,24 +10073,27 @@ pmap_query_resident( return 0; } - resident_bytes = 0; + total_resident_bytes = 0; compressed_bytes = 0; - PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START, + PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start), VM_KERNEL_ADDRHIDE(end)); va = start; while (va < end) { vm_map_address_t l; + mach_vm_size_t resident_bytes; l = ((va + ARM_TT_TWIG_SIZE) & ~ARM_TT_TWIG_OFFMASK); if (l > end) l = end; - if (!pmap_query_resident_internal(pmap, va, l, &resident_bytes, compressed_bytes_p)) { + resident_bytes = 
pmap_query_resident_internal(pmap, va, l, compressed_bytes_p); + if (resident_bytes == PMAP_RESIDENT_INVALID) break; - } + + total_resident_bytes += resident_bytes; va = l; } @@ -9336,14 +10102,13 @@ pmap_query_resident( *compressed_bytes_p = compressed_bytes; } - PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END, - resident_bytes); + PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END, + total_resident_bytes); - return resident_bytes; + return total_resident_bytes; } #if MACH_ASSERT -extern int pmap_ledgers_panic; static void pmap_check_ledgers( pmap_t pmap) @@ -9375,255 +10140,60 @@ pmap_check_ledgers( pmap_ledgers_drift.num_pmaps_checked++; - ledger_get_balance(pmap->ledger, - task_ledgers.phys_footprint, - &bal); - if (bal != 0) { -#if DEVELOPMENT || DEBUG -// if (!pmap->footprint_was_suspended) -#endif /* DEVELOPMENT || DEBUG */ - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"phys_footprint\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.phys_footprint_over++; - pmap_ledgers_drift.phys_footprint_over_total += bal; - if (bal > pmap_ledgers_drift.phys_footprint_over_max) { - pmap_ledgers_drift.phys_footprint_over_max = bal; - } - } else { - pmap_ledgers_drift.phys_footprint_under++; - pmap_ledgers_drift.phys_footprint_under_total += bal; - if (bal < pmap_ledgers_drift.phys_footprint_under_max) { - pmap_ledgers_drift.phys_footprint_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.internal, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"internal\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.internal_over++; - pmap_ledgers_drift.internal_over_total += bal; - if (bal > pmap_ledgers_drift.internal_over_max) { - pmap_ledgers_drift.internal_over_max = bal; - } - } else { - pmap_ledgers_drift.internal_under++; - pmap_ledgers_drift.internal_under_total += bal; - if (bal < pmap_ledgers_drift.internal_under_max) { 
- pmap_ledgers_drift.internal_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.internal_compressed, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"internal_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.internal_compressed_over++; - pmap_ledgers_drift.internal_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.internal_compressed_over_max) { - pmap_ledgers_drift.internal_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.internal_compressed_under++; - pmap_ledgers_drift.internal_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.internal_compressed_under_max) { - pmap_ledgers_drift.internal_compressed_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.iokit_mapped, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"iokit_mapped\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.iokit_mapped_over++; - pmap_ledgers_drift.iokit_mapped_over_total += bal; - if (bal > pmap_ledgers_drift.iokit_mapped_over_max) { - pmap_ledgers_drift.iokit_mapped_over_max = bal; - } - } else { - pmap_ledgers_drift.iokit_mapped_under++; - pmap_ledgers_drift.iokit_mapped_under_total += bal; - if (bal < pmap_ledgers_drift.iokit_mapped_under_max) { - pmap_ledgers_drift.iokit_mapped_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.alternate_accounting, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"alternate_accounting\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.alternate_accounting_over++; - pmap_ledgers_drift.alternate_accounting_over_total += bal; - if (bal > pmap_ledgers_drift.alternate_accounting_over_max) { - pmap_ledgers_drift.alternate_accounting_over_max = bal; - } - } else { - pmap_ledgers_drift.alternate_accounting_under++; - 
pmap_ledgers_drift.alternate_accounting_under_total += bal; - if (bal < pmap_ledgers_drift.alternate_accounting_under_max) { - pmap_ledgers_drift.alternate_accounting_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.alternate_accounting_compressed, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"alternate_accounting_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.alternate_accounting_compressed_over++; - pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.alternate_accounting_compressed_over_max) { - pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.alternate_accounting_compressed_under++; - pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) { - pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.page_table, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"page_table\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.page_table_over++; - pmap_ledgers_drift.page_table_over_total += bal; - if (bal > pmap_ledgers_drift.page_table_over_max) { - pmap_ledgers_drift.page_table_over_max = bal; - } - } else { - pmap_ledgers_drift.page_table_under++; - pmap_ledgers_drift.page_table_under_total += bal; - if (bal < pmap_ledgers_drift.page_table_under_max) { - pmap_ledgers_drift.page_table_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_volatile, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"purgeable_volatile\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_volatile_over++; - 
pmap_ledgers_drift.purgeable_volatile_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) { - pmap_ledgers_drift.purgeable_volatile_over_max = bal; - } - } else { - pmap_ledgers_drift.purgeable_volatile_under++; - pmap_ledgers_drift.purgeable_volatile_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) { - pmap_ledgers_drift.purgeable_volatile_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_nonvolatile, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"purgeable_nonvolatile\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_nonvolatile_over++; - pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) { - pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal; - } - } else { - pmap_ledgers_drift.purgeable_nonvolatile_under++; - pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) { - pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_volatile_compressed, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"purgeable_volatile_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_volatile_compressed_over++; - pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) { - pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.purgeable_volatile_compressed_under++; - pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) { - pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal; - } - } - } 
- ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"purgeable_nonvolatile_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++; - pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++; - pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal; - } - } - } +#define LEDGER_CHECK_BALANCE(__LEDGER) \ +MACRO_BEGIN \ + int panic_on_negative = TRUE; \ + ledger_get_balance(pmap->ledger, \ + task_ledgers.__LEDGER, \ + &bal); \ + ledger_get_panic_on_negative(pmap->ledger, \ + task_ledgers.__LEDGER, \ + &panic_on_negative); \ + if (bal != 0) { \ + if (panic_on_negative || \ + (pmap_ledgers_panic && \ + pmap_ledgers_panic_leeway > 0 && \ + (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \ + bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \ + do_panic = TRUE; \ + } \ + printf("LEDGER BALANCE proc %d (%s) " \ + "\"%s\" = %lld\n", \ + pid, procname, #__LEDGER, bal); \ + if (bal > 0) { \ + pmap_ledgers_drift.__LEDGER##_over++; \ + pmap_ledgers_drift.__LEDGER##_over_total += bal; \ + if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \ + pmap_ledgers_drift.__LEDGER##_over_max = bal; \ + } \ + } else if (bal < 0) { \ + pmap_ledgers_drift.__LEDGER##_under++; \ + pmap_ledgers_drift.__LEDGER##_under_total += bal; \ + if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \ + pmap_ledgers_drift.__LEDGER##_under_max = bal; \ + } \ + } \ + } \ +MACRO_END + + LEDGER_CHECK_BALANCE(phys_footprint); + 
LEDGER_CHECK_BALANCE(internal); + LEDGER_CHECK_BALANCE(internal_compressed); + LEDGER_CHECK_BALANCE(iokit_mapped); + LEDGER_CHECK_BALANCE(alternate_accounting); + LEDGER_CHECK_BALANCE(alternate_accounting_compressed); + LEDGER_CHECK_BALANCE(page_table); + LEDGER_CHECK_BALANCE(purgeable_volatile); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile); + LEDGER_CHECK_BALANCE(purgeable_volatile_compressed); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed); + LEDGER_CHECK_BALANCE(network_volatile); + LEDGER_CHECK_BALANCE(network_nonvolatile); + LEDGER_CHECK_BALANCE(network_volatile_compressed); + LEDGER_CHECK_BALANCE(network_nonvolatile_compressed); if (do_panic) { - if (pmap_ledgers_panic && - pmap->pmap_stats_assert) { + if (pmap_ledgers_panic) { panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", pmap, pid, procname); } else { @@ -10588,32 +11158,42 @@ pmap_enforces_execute_only( #endif } -void -pmap_set_jit_entitled( +MARK_AS_PMAP_TEXT void +pmap_set_jit_entitled_internal( __unused pmap_t pmap) { return; } -static kern_return_t +void +pmap_set_jit_entitled( + pmap_t pmap) +{ + pmap_set_jit_entitled_internal(pmap); +} + +MARK_AS_PMAP_TEXT static kern_return_t pmap_query_page_info_internal( pmap_t pmap, vm_map_offset_t va, int *disp_p) { - int disp; - pmap_paddr_t pa; - int pai; - pt_entry_t *pte; - pv_entry_t **pv_h, *pve_p; + pmap_paddr_t pa; + int disp; + int pai; + pt_entry_t *pte; + pv_entry_t **pv_h, *pve_p; if (pmap == PMAP_NULL || pmap == kernel_pmap) { + pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p)); *disp_p = 0; + pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p)); return KERN_INVALID_ARGUMENT; } disp = 0; + VALIDATE_PMAP(pmap); PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); @@ -10657,7 +11237,9 @@ pmap_query_page_info_internal( done: PMAP_UNLOCK(pmap); + pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p)); *disp_p = disp; + pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p)); return KERN_SUCCESS; } @@ -10670,7 
+11252,7 @@ pmap_query_page_info( return pmap_query_page_info_internal(pmap, va, disp_p); } -kern_return_t +MARK_AS_PMAP_TEXT kern_return_t pmap_return_internal(__unused boolean_t do_panic, __unused boolean_t do_recurse) { @@ -10683,23 +11265,26 @@ pmap_return(boolean_t do_panic, boolean_t do_recurse) return pmap_return_internal(do_panic, do_recurse); } -static void + + +MARK_AS_PMAP_TEXT static void pmap_footprint_suspend_internal( vm_map_t map, boolean_t suspend) { #if DEVELOPMENT || DEBUG if (suspend) { - map->pmap->footprint_suspended = TRUE; + current_thread()->pmap_footprint_suspended = TRUE; map->pmap->footprint_was_suspended = TRUE; } else { - map->pmap->footprint_suspended = FALSE; + current_thread()->pmap_footprint_suspended = FALSE; } #else /* DEVELOPMENT || DEBUG */ (void) map; (void) suspend; #endif /* DEVELOPMENT || DEBUG */ } + void pmap_footprint_suspend( vm_map_t map, @@ -10707,3 +11292,113 @@ pmap_footprint_suspend( { pmap_footprint_suspend_internal(map, suspend); } + +#if defined(__arm64__) && (DEVELOPMENT || DEBUG) + +struct page_table_level_info { + uint64_t size; + uint64_t offmask; + uint64_t shift; + uint64_t index_mask; + uint64_t valid_mask; + uint64_t type_mask; + uint64_t type_block; +}; + +struct page_table_dump_header { + uint64_t pa; + uint64_t num_entries; + uint64_t start_va; + uint64_t end_va; +}; + +struct page_table_level_info page_table_levels[] = + { { ARM_TT_L0_SIZE, ARM_TT_L0_OFFMASK, ARM_TT_L0_SHIFT, ARM_TT_L0_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK }, + { ARM_TT_L1_SIZE, ARM_TT_L1_OFFMASK, ARM_TT_L1_SHIFT, ARM_TT_L1_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK }, + { ARM_TT_L2_SIZE, ARM_TT_L2_OFFMASK, ARM_TT_L2_SHIFT, ARM_TT_L2_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK }, + { ARM_TT_L3_SIZE, ARM_TT_L3_OFFMASK, ARM_TT_L3_SHIFT, ARM_TT_L3_INDEX_MASK, ARM_PTE_TYPE_VALID, ARM_PTE_TYPE_MASK, ARM_TTE_TYPE_L3BLOCK } }; + +static size_t 
+pmap_dump_page_tables_recurse(const tt_entry_t *ttp, + unsigned int cur_level, + uint64_t start_va, + void *bufp, + void *buf_end) +{ + size_t bytes_used = 0; + uint64_t num_entries = ARM_PGBYTES / sizeof(*ttp); + uint64_t size = page_table_levels[cur_level].size; + uint64_t valid_mask = page_table_levels[cur_level].valid_mask; + uint64_t type_mask = page_table_levels[cur_level].type_mask; + uint64_t type_block = page_table_levels[cur_level].type_block; + + if (cur_level == arm64_root_pgtable_level) + num_entries = arm64_root_pgtable_num_ttes; + + uint64_t tt_size = num_entries * sizeof(tt_entry_t); + const tt_entry_t *tt_end = &ttp[num_entries]; + + if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) { + return 0; + } + + struct page_table_dump_header *header = (struct page_table_dump_header*)bufp; + header->pa = ml_static_vtop((vm_offset_t)ttp); + header->num_entries = num_entries; + header->start_va = start_va; + header->end_va = start_va + (num_entries * size); + + bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size); + bytes_used += (sizeof(*header) + tt_size); + uint64_t current_va = start_va; + + for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) { + tt_entry_t tte = *ttep; + + if (!(tte & valid_mask)) { + continue; + } + + if ((tte & type_mask) == type_block) { + continue; + } else { + if (cur_level >= PMAP_TT_MAX_LEVEL) { + panic("%s: corrupt entry %#llx at %p, " + "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p", + __FUNCTION__, tte, ttep, + ttp, cur_level, bufp, buf_end); + } + + const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK); + + size_t recurse_result = pmap_dump_page_tables_recurse(next_tt, cur_level + 1, current_va, (uint8_t*)bufp + bytes_used, buf_end); + + if (recurse_result == 0) { + return 0; + } + + bytes_used += recurse_result; + } + } + + return bytes_used; +} + +size_t +pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end) +{ + if 
(not_in_kdp) + panic("pmap_dump_page_tables must only be called from kernel debugger context"); + return pmap_dump_page_tables_recurse(pmap->tte, arm64_root_pgtable_level, pmap->min, bufp, buf_end); +} + +#else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */ + +size_t +pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused) +{ + return (size_t)-1; +} + +#endif /* !defined(__arm64__) */ + diff --git a/osfmk/arm/pmap.h b/osfmk/arm/pmap.h index 7653401cf..88d89086c 100644 --- a/osfmk/arm/pmap.h +++ b/osfmk/arm/pmap.h @@ -42,8 +42,12 @@ #ifndef ASSEMBLER +#include +#include #include #include +#include +#include #if __ARM_KERNEL_PROTECT__ /* @@ -61,9 +65,14 @@ #define NBBY 8 struct pmap_cpu_data { +#if defined(__arm64__) + pmap_t cpu_nested_pmap; +#else pmap_t cpu_user_pmap; - unsigned int cpu_number; unsigned int cpu_user_pmap_stamp; +#endif + unsigned int cpu_number; + /* * This supports overloading of ARM ASIDs by the pmap. The field needs @@ -87,6 +96,9 @@ typedef struct pmap_cpu_data pmap_cpu_data_t; #include #include + +#include + /* Base address for low globals. 
*/ #define LOW_GLOBAL_BASE_ADDRESS 0xfffffff000000000ULL @@ -102,14 +114,14 @@ typedef struct pmap_cpu_data pmap_cpu_data_t; #if defined(__arm64__) +#define BOOTSTRAP_TABLE_SIZE (ARM_PGBYTES * 8) + typedef uint64_t tt_entry_t; /* translation table entry type */ #define TT_ENTRY_NULL ((tt_entry_t *) 0) typedef uint64_t pt_entry_t; /* page table entry type */ #define PT_ENTRY_NULL ((pt_entry_t *) 0) -typedef uint64_t pmap_paddr_t; /* physical address (not ppnum_t) */ - #elif defined(__arm__) typedef uint32_t tt_entry_t; /* translation table entry type */ @@ -118,8 +130,6 @@ typedef uint32_t tt_entry_t; /* translation table entry type */ typedef uint32_t pt_entry_t; /* page table entry type */ #define TT_ENTRY_NULL ((tt_entry_t *) 0) -typedef uint32_t pmap_paddr_t; /* physical address (not ppnum_t) */ - #else #error unknown arch #endif @@ -164,22 +174,35 @@ typedef uint32_t pmap_paddr_t; /* physical address (not ppnum_t) */ #define NPTES (ARM_PGBYTES / sizeof(pt_entry_t)) #endif +extern void sync_tlb_flush(void); +extern void flush_mmu_tlb_async(void); extern void flush_mmu_tlb(void); +extern void flush_core_tlb_async(void); extern void flush_core_tlb(void); #if defined(__arm64__) +extern void flush_mmu_tlb_allentries_async(uint64_t, uint64_t); extern void flush_mmu_tlb_allentries(uint64_t, uint64_t); +extern void flush_mmu_tlb_entry_async(uint64_t); extern void flush_mmu_tlb_entry(uint64_t); +extern void flush_mmu_tlb_entries_async(uint64_t, uint64_t); extern void flush_mmu_tlb_entries(uint64_t, uint64_t); +extern void flush_mmu_tlb_asid_async(uint64_t); extern void flush_mmu_tlb_asid(uint64_t); +extern void flush_core_tlb_asid_async(uint64_t); extern void flush_core_tlb_asid(uint64_t); #define tlbi_addr(x) (((x) >> TLBI_ADDR_SHIFT) & TLBI_ADDR_MASK) #define tlbi_asid(x) (((uint64_t)x << TLBI_ASID_SHIFT) & TLBI_ASID_MASK) #else +extern void flush_mmu_tlb_entry_async(uint32_t); extern void flush_mmu_tlb_entry(uint32_t); +extern void 
flush_mmu_tlb_entries_async(uint32_t, uint32_t); extern void flush_mmu_tlb_entries(uint32_t, uint32_t); +extern void flush_mmu_tlb_mva_entries_async(uint32_t); extern void flush_mmu_tlb_mva_entries(uint32_t); +extern void flush_mmu_tlb_asid_async(uint32_t); extern void flush_mmu_tlb_asid(uint32_t); +extern void flush_core_tlb_asid_async(uint32_t); extern void flush_core_tlb_asid(uint32_t); #endif extern void flush_mmu_tlb_region(vm_offset_t va, unsigned length); @@ -192,6 +215,7 @@ extern void set_mmu_ttb(uint64_t); extern void set_mmu_ttb_alternate(uint64_t); extern uint64_t get_tcr(void); extern void set_tcr(uint64_t); +extern uint64_t pmap_get_arm64_prot(pmap_t, vm_offset_t); #else extern uint32_t get_mmu_control(void); extern void set_mmu_control(uint32_t); @@ -255,6 +279,18 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va); #define PMAP_GC_INFLIGHT 1 #define PMAP_GC_WAIT 2 +#if DEVELOPMENT || DEBUG +#define pmap_cs_log(msg, args...) printf("PMAP_CS: " msg "\n", args) +#define pmap_cs_log_h(msg, args...) { if(pmap_cs_log_hacks) printf("PMAP_CS: " msg "\n", args); } + +#define PMAP_CS_EXCEPTION_LIST_HACK 1 + +#else +#define pmap_cs_log(msg, args...) +#define pmap_cs_log_h(msg, args...) 
+#endif /* DEVELOPMENT || DEBUG */ + + /* * Convert translation/page table entry to kernel virtual address */ @@ -266,42 +302,49 @@ struct pmap { pmap_paddr_t ttep; /* translation table physical */ vm_map_address_t min; /* min address in pmap */ vm_map_address_t max; /* max address in pmap */ - unsigned int asid; /* address space id */ - unsigned int vasid; /* Virtual address space id */ - unsigned int stamp; /* creation stamp */ - unsigned int wired; /* wired bits */ - volatile uint32_t ref_count; /* pmap reference count */ - unsigned int cpu_ref; /* number of cpus using pmap */ - unsigned int gc_status; /* gc status */ - ledger_t ledger; /* ledger tracking phys mappings */ + ledger_t ledger; /* ledger tracking phys mappings */ decl_simple_lock_data(,lock) /* lock on map */ struct pmap_statistics stats; /* map statistics */ queue_chain_t pmaps; /* global list of pmaps */ tt_entry_t *tt_entry_free; /* free translation table entries */ tt_entry_t *prev_tte; /* previous translation table */ - unsigned int tte_index_max; /* max tte index in translation table entries */ - boolean_t nx_enabled; /* no execute */ - boolean_t nested; /* is nested */ - boolean_t is_64bit; /* is 64bit */ struct pmap *nested_pmap; /* nested pmap */ vm_map_address_t nested_region_grand_addr; vm_map_address_t nested_region_subord_addr; vm_map_offset_t nested_region_size; + vm_map_offset_t nested_region_true_start; + vm_map_offset_t nested_region_true_end; unsigned int *nested_region_asid_bitmap; - unsigned int nested_region_asid_bitmap_size; #if (__ARM_VMSA__ <= 7) decl_simple_lock_data(,tt1_lock) /* lock on tt1 */ + unsigned int cpu_ref; /* number of cpus using pmap */ #endif + + + unsigned int asid; /* address space id */ + unsigned int vasid; /* Virtual address space id */ + unsigned int stamp; /* creation stamp */ + _Atomic int32_t ref_count; /* pmap reference count */ + unsigned int gc_status; /* gc status */ + unsigned int nested_region_asid_bitmap_size; + unsigned int tte_index_max; /* 
max tte index in translation table entries */ + uint32_t nested_no_bounds_refcnt;/* number of pmaps that nested this pmap without bounds set */ + #if MACH_ASSERT - boolean_t pmap_stats_assert; int pmap_pid; char pmap_procname[17]; + bool pmap_stats_assert; #endif /* MACH_ASSERT */ #if DEVELOPMENT || DEBUG - boolean_t footprint_suspended; - boolean_t footprint_was_suspended; + bool footprint_suspended; + bool footprint_was_suspended; #endif /* DEVELOPMENT || DEBUG */ + bool nx_enabled; /* no execute */ + bool nested; /* is nested */ + bool is_64bit; /* is 64bit */ + bool nested_has_no_bounds_ref; /* nested a pmap when the bounds were not set */ + bool nested_bounds_set; /* The nesting bounds have been set */ }; /* typedef struct pmap *pmap_t; */ @@ -312,6 +355,7 @@ struct pmap { * WIMG control */ #define VM_MEM_INNER 0x10 +#define VM_MEM_RT 0x10 // intentionally alias VM_MEM_INNER; will be used with mutually exclusive caching policies #define VM_MEM_EARLY_ACK 0x20 #define VM_WIMG_DEFAULT (VM_MEM_COHERENT) @@ -320,8 +364,8 @@ struct pmap { #define VM_WIMG_IO (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED) #define VM_WIMG_POSTED (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED | VM_MEM_EARLY_ACK) #define VM_WIMG_WTHRU (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED) -#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) - +#define VM_WIMG_WCOMB (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) +#define VM_WIMG_RT (VM_WIMG_IO | VM_MEM_RT) #if VM_DEBUG extern int pmap_list_resident_pages( @@ -355,6 +399,7 @@ extern pmap_paddr_t invalid_ttep; /* physical invalid translation table addr */ * platform dependent Prototypes */ extern void pmap_switch_user_ttb(pmap_t pmap); +extern void pmap_clear_user_ttb(void); extern void pmap_bootstrap(vm_offset_t); extern vm_map_address_t pmap_ptov(pmap_t, ppnum_t); extern ppnum_t pmap_find_phys(pmap_t map, addr64_t va); @@ -362,7 +407,7 @@ extern void pmap_set_pmap(pmap_t pmap, thread_t thread); extern void 
pmap_collect(pmap_t pmap); extern void pmap_gc(void); #if defined(__arm64__) -extern vm_offset_t pmap_extract(pmap_t pmap, vm_map_offset_t va); +extern vm_offset_t pmap_extract(pmap_t pmap, vm_map_offset_t va); #endif /* @@ -407,6 +452,8 @@ extern vm_offset_t pmap_extract(pmap_t pmap, vm_map_offset_t va); copyout(from, to, cnt) extern pmap_paddr_t kvtophys(vm_offset_t va); +extern vm_map_address_t phystokv(pmap_paddr_t pa); +extern vm_map_address_t phystokv_range(pmap_paddr_t pa, vm_size_t *max_len); extern vm_map_address_t pmap_map(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot, unsigned int flags); extern vm_map_address_t pmap_map_high_window_bd( vm_offset_t pa, vm_size_t len, vm_prot_t prot); @@ -422,7 +469,6 @@ extern vm_map_address_t pmap_map_bd_with_options(vm_map_address_t va, vm_offset_ extern vm_map_address_t pmap_map_bd(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot); extern void pmap_init_pte_page(pmap_t, pt_entry_t *, vm_offset_t, unsigned int ttlevel, boolean_t alloc_ptd); -extern void pmap_init_pte_static_page(pmap_t, pt_entry_t *, pmap_paddr_t); extern boolean_t pmap_valid_address(pmap_paddr_t addr); extern void pmap_disable_NX(pmap_t pmap); @@ -452,6 +498,8 @@ extern boolean_t pmap_is_empty(pmap_t pmap, vm_map_offset_t start, vm_map_offset extern vm_map_offset_t pmap_max_offset(boolean_t is64, unsigned int option); +extern vm_map_offset_t pmap_max_64bit_offset(unsigned int option); +extern vm_map_offset_t pmap_max_32bit_offset(unsigned int option); boolean_t pmap_virtual_region(unsigned int region_select, vm_map_offset_t *startp, vm_map_size_t *size); @@ -488,28 +536,50 @@ boolean_t pmap_enforces_execute_only(pmap_t pmap); #define PMAP_SET_PROCESS_INDEX 27 #define PMAP_SWITCH_INDEX 28 #define PMAP_SWITCH_USER_TTB_INDEX 29 -#define PMAP_UNHINT_KV_ADDR_INDEX 30 +#define PMAP_CLEAR_USER_TTB_INDEX 30 #define PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX 31 #define PMAP_UNNEST_OPTIONS_INDEX 32 #define 
PMAP_FOOTPRINT_SUSPEND_INDEX 33 #define PMAP_CPU_DATA_INIT_INDEX 34 #define PMAP_RELEASE_PAGES_TO_KERNEL_INDEX 35 +#define PMAP_SET_JIT_ENTITLED_INDEX 36 + + +#define PMAP_TRIM_INDEX 64 +#define PMAP_LEDGER_ALLOC_INIT_INDEX 65 +#define PMAP_LEDGER_ALLOC_INDEX 66 +#define PMAP_LEDGER_FREE_INDEX 67 -#define MAX_PMAP_INDEX 36 +#define PMAP_COUNT 68 #define PMAP_INVALID_CPU_NUM (~0U) +struct pmap_cpu_data_array_entry { + pmap_cpu_data_t cpu_data; +} __attribute__((aligned(1 << L2_CLINE))); + /* Initialize the pmap per-CPU data for the current CPU. */ extern void pmap_cpu_data_init(void); /* Get the pmap per-CPU data for the current CPU. */ extern pmap_cpu_data_t * pmap_get_cpu_data(void); + #define MARK_AS_PMAP_TEXT #define MARK_AS_PMAP_DATA +#define MARK_AS_PMAP_RODATA + + extern kern_return_t pmap_return(boolean_t do_panic, boolean_t do_recurse); +#define pmap_force_dcache_clean(va, sz) CleanPoC_DcacheRegion_Force(va, sz) +#define pmap_simple_lock(l) simple_lock(l) +#define pmap_simple_unlock(l) simple_unlock(l) +#define pmap_simple_lock_try(l) simple_lock_try(l) +#define pmap_lock_bit(l, i) hw_lock_bit(l, i) +#define pmap_unlock_bit(l, i) hw_unlock_bit(l, i) + #endif /* #ifndef ASSEMBLER */ #if __ARM_KERNEL_PROTECT__ diff --git a/osfmk/arm/pmap_public.h b/osfmk/arm/pmap_public.h new file mode 100644 index 000000000..98393ccd5 --- /dev/null +++ b/osfmk/arm/pmap_public.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _ARM_PMAP_PUBLIC_H_ +#define _ARM_PMAP_PUBLIC_H_ + +#include +#include +#include +#include + +__BEGIN_DECLS + +#if defined(__arm64__) +typedef uint64_t pmap_paddr_t; /* physical address (not ppnum_t) */ +#else +typedef uint32_t pmap_paddr_t; /* physical address (not ppnum_t) */ +#endif + + +__END_DECLS + +#endif /* _ARM_PMAP_PUBLIC_H_ */ diff --git a/osfmk/arm/proc_reg.h b/osfmk/arm/proc_reg.h index 28a99e3f5..5d170e88d 100644 --- a/osfmk/arm/proc_reg.h +++ b/osfmk/arm/proc_reg.h @@ -64,17 +64,6 @@ #ifndef _ARM_PROC_REG_H_ #define _ARM_PROC_REG_H_ -#if __ARM_KERNEL_PROTECT__ -/* - * This feature is not currently implemented for 32-bit ARM CPU architectures. - * A discussion of this feature for 64-bit ARM CPU architectures can be found - * in the ARM64 version of this file. 
- */ -#if __arm__ -#error __ARM_KERNEL_PROTECT__ is not supported on ARM32 -#endif -#endif /* __ARM_KERNEL_PROTECT__ */ - #if defined (__arm64__) #include #elif defined (__arm__) @@ -172,10 +161,53 @@ #define __ARM_GLOBAL_SLEEP_BIT__ 1 #define __ARM_PAN_AVAILABLE__ 1 +#elif defined (APPLEMONSOON) +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_AMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_COHERENT_IO__ 1 +#define __ARM_IC_NOALIAS_ICACHE__ 1 +#define __ARM_L1_PTW__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM_ENABLE_SWAP__ 1 +#define __ARM_V8_CRYPTO_EXTENSIONS__ 1 +#define __ARM_16K_PG__ 1 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 +#define __ARM_KERNEL_PROTECT__ 1 +#define __ARM_GLOBAL_SLEEP_BIT__ 1 +#define __ARM_PAN_AVAILABLE__ 1 +#define __ARM_WKDM_ISA_AVAILABLE__ 1 +#define __PLATFORM_WKDM_ALIGNMENT_MASK__ (0x3FULL) +#define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64) +#define __ARM_CLUSTER_COUNT__ 2 + +#elif defined (BCM2837) +#define __ARM_ARCH__ 8 +#define __ARM_VMSA__ 8 +#define __ARM_SMP__ 1 +#define __ARM_VFP__ 4 +#define __ARM_COHERENT_CACHE__ 1 +#define __ARM_L1_PTW__ 1 +#define __ARM_DEBUG__ 7 +#define __ARM64_PMAP_SUBPAGE_L1__ 1 #else #error processor not supported #endif +#if __ARM_KERNEL_PROTECT__ +/* + * This feature is not currently implemented for 32-bit ARM CPU architectures. + * A discussion of this feature for 64-bit ARM CPU architectures can be found + * in the ARM64 version of this file. 
+ */ +#if __arm__ +#error __ARM_KERNEL_PROTECT__ is not supported on ARM32 +#endif +#endif /* __ARM_KERNEL_PROTECT__ */ + #if defined(ARM_BOARD_WFE_TIMEOUT_NS) #define __ARM_ENABLE_WFE_ 1 #else @@ -296,7 +328,7 @@ #define DFSR_WRITE 0x00000800 /* write data abort fault */ -#if defined (ARMA7) || defined (APPLE_ARM64_ARCH_FAMILY) +#if defined (ARMA7) || defined (APPLE_ARM64_ARCH_FAMILY) || defined (BCM2837) #define TEST_FSR_VMFAULT(status) \ (((status) == FSR_PFAULT) \ @@ -307,6 +339,10 @@ || ((status) == FSR_SACCESS) \ || ((status) == FSR_PACCESS)) +#define TEST_FSR_TRANSLATION_FAULT(status) \ + (((status) == FSR_SFAULT) \ + || ((status) == FSR_PFAULT)) + #else #error Incompatible CPU type configured @@ -466,6 +502,70 @@ #define L2_SWAY (L2_CSIZE - L2_NWAY) /* set size 1< #include +#include #include #include @@ -142,7 +143,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u /* Check to see if we've hit a userland probe */ if ((regs->cpsr & PSR_MODE_MASK) == PSR_USER_MODE) { if (regs->cpsr & PSR_TF) { - uint16_t instr; + uint16_t instr = 0; if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(uint16_t))) != KERN_SUCCESS) goto exit; @@ -153,7 +154,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u goto exit; } } else { - uint32_t instr; + uint32_t instr = 0; if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(uint32_t))) != KERN_SUCCESS) goto exit; @@ -169,13 +170,13 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u if (regs->cpsr & PSR_TF) { - unsigned short instr; + unsigned short instr = 0; if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(unsigned short))) != KERN_SUCCESS) goto exit; if (IS_THUMB32(instr)) { - unsigned int instr32; + unsigned int instr32; instr32 = (instr<<16); @@ -202,7 +203,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u } } } else { - uint32_t instr; + 
uint32_t instr = 0; if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(uint32_t))) != KERN_SUCCESS) goto exit; @@ -238,17 +239,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u (void) ml_set_interrupts_enabled(intr); goto exit; } - panic_context(exception, (void *)regs, "undefined kernel instruction\n" - "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" - "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" - "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" - "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" - "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - regs->r[0], regs->r[1], regs->r[2], regs->r[3], - regs->r[4], regs->r[5], regs->r[6], regs->r[7], - regs->r[8], regs->r[9], regs->r[10], regs->r[11], - regs->r[12], regs->sp, regs->lr, regs->pc, - regs->cpsr, regs->fsr, regs->far); + panic_with_thread_kernel_state("undefined kernel instruction", regs); (void) ml_set_interrupts_enabled(intr); @@ -306,8 +297,14 @@ sleh_abort(struct arm_saved_state * regs, int type) /* Done with asynchronous handling; re-enable here so that subsequent aborts are taken as early as possible. */ reenable_async_aborts(); - if (ml_at_interrupt_context()) - panic_with_thread_kernel_state("sleh_abort at interrupt context", regs); + if (ml_at_interrupt_context()) { +#if CONFIG_DTRACE + if (!(thread->options & TH_OPT_DTRACE)) +#endif /* CONFIG_DTRACE */ + { + panic_with_thread_kernel_state("sleh_abort at interrupt context", regs); + } + } fault_addr = vaddr = regs->far; @@ -339,7 +336,7 @@ sleh_abort(struct arm_saved_state * regs, int type) /* Cache operations report faults as write access, change these to read access */ /* Cache operations are invoked from arm mode for now */ if (!(regs->cpsr & PSR_TF)) { - unsigned int ins; + unsigned int ins = 0; if(COPYIN((user_addr_t)(regs->pc), (char *)&ins,(vm_size_t)(sizeof(unsigned int))) != KERN_SUCCESS) goto exit; @@ -355,7 +352,7 @@ sleh_abort(struct arm_saved_state * regs, int type) * a write fault. 
*/ if (!(regs->cpsr & PSR_TF)) { - unsigned int ins; + unsigned int ins = 0; if(COPYIN((user_addr_t)(regs->pc), (char *)&ins,(vm_size_t)(sizeof(unsigned int))) != KERN_SUCCESS) goto exit; @@ -387,18 +384,7 @@ sleh_abort(struct arm_saved_state * regs, int type) (void) ml_set_interrupts_enabled(intr); goto exit; } - panic_context(EXC_BAD_ACCESS, (void*)regs, "sleh_abort: prefetch abort in kernel mode: fault_addr=0x%x\n" - "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" - "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" - "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" - "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" - "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - fault_addr, - regs->r[0], regs->r[1], regs->r[2], regs->r[3], - regs->r[4], regs->r[5], regs->r[6], regs->r[7], - regs->r[8], regs->r[9], regs->r[10], regs->r[11], - regs->r[12], regs->sp, regs->lr, regs->pc, - regs->cpsr, regs->fsr, regs->far); + panic_with_thread_kernel_state("prefetch abort in kernel mode", regs); (void) ml_set_interrupts_enabled(intr); @@ -412,17 +398,7 @@ sleh_abort(struct arm_saved_state * regs, int type) goto exit; } else { intr = ml_set_interrupts_enabled(FALSE); - panic_context(EXC_BAD_ACCESS, (void *)regs, "Unexpected page fault under dtrace_probe" - "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" - "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" - "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" - "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" - "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - regs->r[0], regs->r[1], regs->r[2], regs->r[3], - regs->r[4], regs->r[5], regs->r[6], regs->r[7], - regs->r[8], regs->r[9], regs->r[10], regs->r[11], - regs->r[12], regs->sp, regs->lr, regs->pc, - regs->cpsr, regs->fsr, regs->far); + panic_with_thread_kernel_state("Unexpected page fault under dtrace_probe", regs); (void) ml_set_interrupts_enabled(intr); @@ -436,10 +412,12 @@ sleh_abort(struct arm_saved_state * regs, int type) else map = thread->map; - /* check to see if it is just a pmap 
ref/modify fault */ - result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE); - if (result == KERN_SUCCESS) - goto exit; + if (!TEST_FSR_TRANSLATION_FAULT(status)) { + /* check to see if it is just a pmap ref/modify fault */ + result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE); + if (result == KERN_SUCCESS) + goto exit; + } /* * We have to "fault" the page in. @@ -468,18 +446,7 @@ sleh_abort(struct arm_saved_state * regs, int type) } else { intr = ml_set_interrupts_enabled(FALSE); - panic_context(EXC_BAD_ACCESS, (void *)regs, "unaligned kernel data access: pc=0x%08x fault_addr=0x%x\n" - "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" - "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" - "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" - "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" - "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - regs->pc, fault_addr, - regs->r[0], regs->r[1], regs->r[2], regs->r[3], - regs->r[4], regs->r[5], regs->r[6], regs->r[7], - regs->r[8], regs->r[9], regs->r[10], regs->r[11], - regs->r[12], regs->sp, regs->lr, regs->pc, - regs->cpsr, regs->fsr, regs->far); + panic_with_thread_kernel_state("unaligned kernel data access", regs); (void) ml_set_interrupts_enabled(intr); @@ -489,7 +456,7 @@ sleh_abort(struct arm_saved_state * regs, int type) } intr = ml_set_interrupts_enabled(FALSE); - panic_context(EXC_BAD_ACCESS, (void *)regs, "kernel abort type %d: fault_type=0x%x, fault_addr=0x%x\n" + panic_plain("kernel abort type %d: fault_type=0x%x, fault_addr=0x%x\n" "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" @@ -519,17 +486,7 @@ sleh_abort(struct arm_saved_state * regs, int type) } else { intr = ml_set_interrupts_enabled(FALSE); - panic_context(EXC_BAD_ACCESS, (void *)regs, "copyin/out has no recovery point" - "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" - "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 
0x%08x\n" - "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" - "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" - "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - regs->r[0], regs->r[1], regs->r[2], regs->r[3], - regs->r[4], regs->r[5], regs->r[6], regs->r[7], - regs->r[8], regs->r[9], regs->r[10], regs->r[11], - regs->r[12], regs->sp, regs->lr, regs->pc, - regs->cpsr, regs->fsr, regs->far); + panic_with_thread_kernel_state("copyin/out has no recovery point", regs); (void) ml_set_interrupts_enabled(intr); } @@ -537,17 +494,7 @@ sleh_abort(struct arm_saved_state * regs, int type) } else { intr = ml_set_interrupts_enabled(FALSE); - panic_context(EXC_BAD_ACCESS, (void*)regs, "Unexpected UMW page fault under dtrace_probe" - "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" - "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" - "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" - "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" - "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - regs->r[0], regs->r[1], regs->r[2], regs->r[3], - regs->r[4], regs->r[5], regs->r[6], regs->r[7], - regs->r[8], regs->r[9], regs->r[10], regs->r[11], - regs->r[12], regs->sp, regs->lr, regs->pc, - regs->cpsr, regs->fsr, regs->far); + panic_with_thread_kernel_state("Unexpected UMW page fault under dtrace_probe", regs); (void) ml_set_interrupts_enabled(intr); @@ -556,16 +503,19 @@ sleh_abort(struct arm_saved_state * regs, int type) } #endif - /* check to see if it is just a pmap ref/modify fault */ - result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, TRUE); - if (result != KERN_SUCCESS) { - /* - * We have to "fault" the page in. 
- */ - result = vm_fault(map, fault_addr, fault_type, - FALSE /* change_wiring */, VM_KERN_MEMORY_NONE, - THREAD_ABORTSAFE, NULL, 0); + if (!TEST_FSR_TRANSLATION_FAULT(status)) { + /* check to see if it is just a pmap ref/modify fault */ + result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, TRUE); + if (result == KERN_SUCCESS) + goto exception_return; } + + /* + * We have to "fault" the page in. + */ + result = vm_fault(map, fault_addr, fault_type, + FALSE /* change_wiring */, VM_KERN_MEMORY_NONE, + THREAD_ABORTSAFE, NULL, 0); if (result == KERN_SUCCESS || result == KERN_ABORTED) { goto exception_return; } @@ -614,7 +564,7 @@ static kern_return_t sleh_alignment(struct arm_saved_state * regs) { unsigned int status; - unsigned int ins; + unsigned int ins = 0; unsigned int rd_index; unsigned int base_index; unsigned int paddr; @@ -650,7 +600,7 @@ sleh_alignment(struct arm_saved_state * regs) paddr = regs->far; if (regs->cpsr & PSR_TF) { - unsigned short ins16; + unsigned short ins16 = 0; /* Get aborted instruction */ #if __ARM_SMP__ || __ARM_USER_PROTECT__ @@ -859,9 +809,10 @@ void interrupt_trace_exit( void) { - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, - 0, 0, 0, 0, 0); +#if KPERF + kperf_interrupt(); +#endif /* KPERF */ + KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END); } #endif @@ -878,17 +829,17 @@ interrupt_stats(void) static void panic_with_thread_kernel_state(const char *msg, struct arm_saved_state *regs) { - panic_context(0, (void*)regs, "%s (saved state:%p)\n" - "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" - "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" - "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" - "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" - "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", - msg, regs, - regs->r[0], regs->r[1], regs->r[2], regs->r[3], - regs->r[4], regs->r[5], regs->r[6], regs->r[7], - regs->r[8], regs->r[9], regs->r[10], regs->r[11], - 
regs->r[12], regs->sp, regs->lr, regs->pc, - regs->cpsr, regs->fsr, regs->far); + panic_plain("%s (saved state:%p)\n" + "r0: 0x%08x r1: 0x%08x r2: 0x%08x r3: 0x%08x\n" + "r4: 0x%08x r5: 0x%08x r6: 0x%08x r7: 0x%08x\n" + "r8: 0x%08x r9: 0x%08x r10: 0x%08x r11: 0x%08x\n" + "r12: 0x%08x sp: 0x%08x lr: 0x%08x pc: 0x%08x\n" + "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n", + msg, regs, + regs->r[0], regs->r[1], regs->r[2], regs->r[3], + regs->r[4], regs->r[5], regs->r[6], regs->r[7], + regs->r[8], regs->r[9], regs->r[10], regs->r[11], + regs->r[12], regs->sp, regs->lr, regs->pc, + regs->cpsr, regs->fsr, regs->far); } diff --git a/osfmk/arm/trap.h b/osfmk/arm/trap.h index a4caeaa5a..b1d02a39c 100644 --- a/osfmk/arm/trap.h +++ b/osfmk/arm/trap.h @@ -239,7 +239,6 @@ || (((op) & THUMB_SIMD_VFP_MASK2) == THUMB_SIMD_VFP_CODE2 ) \ || (((op) & THUMB_SIMD_VFP_MASK3) == THUMB_SIMD_VFP_CODE3)) -extern boolean_t arm_swap_readable_type(vm_map_address_t, unsigned int /* spsr */); extern boolean_t arm_force_fast_fault(ppnum_t, vm_prot_t, int, void *); extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean_t); @@ -250,9 +249,7 @@ extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean (((((op)&ARM_CDX_MASK) == ARM_STC) || \ (((op)&ARM_STRH_MASK) == ARM_STRH) || \ (((op)&ARM_BDX_MASK) == ARM_STM) || \ - (((op)&ARM_SDX_MASK) == ARM_STR) || \ - ((((op)&ARM_SWP_MASK) == ARM_SWP) && \ - arm_swap_readable_type(vaddr,spsr))) ? \ + (((op)&ARM_SDX_MASK) == ARM_STR)) ? 
\ (VM_PROT_WRITE|VM_PROT_READ) : (VM_PROT_READ)) #define thumb_fault_type(op,spsr,vaddr) \ diff --git a/osfmk/arm64/Makefile b/osfmk/arm64/Makefile index ad75e8a11..4498b9ff0 100644 --- a/osfmk/arm64/Makefile +++ b/osfmk/arm64/Makefile @@ -6,6 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) + ARM_HEADER_FILES = \ lowglobals.h \ machine_cpuid.h \ @@ -22,7 +23,7 @@ INSTALL_KF_MD_LIST = $(ARM_HEADER_FILES) INSTALL_KF_MD_LCL_LIST = machine_kpc.h monotonic.h pgtrace.h $(ARM_HEADER_FILES) -EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h monotonic.h proc_reg.h pgtrace.h +EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h monotonic.h proc_reg.h pgtrace.h asm.h EXPORT_MD_DIR = arm64 diff --git a/osfmk/arm64/arm_vm_init.c b/osfmk/arm64/arm_vm_init.c index 9b06f9504..bfad29bf5 100644 --- a/osfmk/arm64/arm_vm_init.c +++ b/osfmk/arm64/arm_vm_init.c @@ -38,18 +38,18 @@ #include #include +#include #include #include #include #include #include +#include #include #include -#if __ARM_KERNEL_PROTECT__ -#include -#endif /* __ARM_KERNEL_PROTECT__ */ +#include #if __ARM_KERNEL_PROTECT__ /* @@ -69,6 +69,8 @@ static_assert((KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_ROOT_OFFMASK) > ARM_KERNEL static_assert((((~ARM_KERNEL_PROTECT_EXCEPTION_START) + 1) * 2ULL) <= (ARM_TT_ROOT_SIZE + ARM_TT_ROOT_INDEX_MASK)); #endif /* __ARM_KERNEL_PROTECT__ */ +#define ARM_DYNAMIC_TABLE_XN (ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN) + #if KASAN extern vm_offset_t shadow_pbase; extern vm_offset_t shadow_ptop; @@ -81,6 +83,9 @@ extern vm_offset_t physmap_vtop; */ extern void *last_kernel_symbol; +extern void arm64_replace_bootstack(cpu_data_t*); +extern void PE_slide_devicetree(vm_offset_t); + /* * KASLR parameters */ @@ -103,17 +108,67 @@ SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_einfo; SECURITY_READ_ONLY_LATE(vm_offset_t) vm_slinkedit; SECURITY_READ_ONLY_LATE(vm_offset_t) vm_elinkedit; +SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_builtinkmod_text; 
+SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_builtinkmod_text_end; + /* Used by */ SECURITY_READ_ONLY_LATE(unsigned long) gVirtBase; SECURITY_READ_ONLY_LATE(unsigned long) gPhysBase; SECURITY_READ_ONLY_LATE(unsigned long) gPhysSize; +SECURITY_READ_ONLY_LATE(unsigned long) gT0Sz = T0SZ_BOOT; +SECURITY_READ_ONLY_LATE(unsigned long) gT1Sz = T1SZ_BOOT; + +/* 23543331 - step 1 of kext / kernel __TEXT and __DATA colocation is to move + * all kexts before the kernel. This is only for arm64 devices and looks + * something like the following: + * -- vmaddr order -- + * 0xffffff8004004000 __PRELINK_TEXT + * 0xffffff8007004000 __TEXT (xnu) + * 0xffffff80075ec000 __DATA (xnu) + * 0xffffff80076dc000 __KLD (xnu) + * 0xffffff80076e0000 __LAST (xnu) + * 0xffffff80076e4000 __LINKEDIT (xnu) + * 0xffffff80076e4000 __PRELINK_DATA (not used yet) + * 0xffffff800782c000 __PRELINK_INFO + * 0xffffff80078e4000 -- End of kernelcache + */ - -/* - * NOTE: mem_size is bogus on large memory machines. - * We will pin it to 0x80000000 if there is more than 2 GB - * This is left only for compatibility and max_mem should be used. +/* 24921709 - make XNU ready for KTRR + * + * Two possible kernel cache layouts, depending on which kcgen is being used. + * VAs increasing downwards. 
+ * Old KCGEN: + * + * __PRELINK_TEXT + * __TEXT + * __DATA_CONST + * __TEXT_EXEC + * __KLD + * __LAST + * __DATA + * __PRELINK_DATA (expected empty) + * __LINKEDIT + * __PRELINK_INFO + * + * New kcgen: + * + * __PRELINK_TEXT <--- First KTRR (ReadOnly) segment + * __PLK_DATA_CONST + * __PLK_TEXT_EXEC + * __TEXT + * __DATA_CONST + * __TEXT_EXEC + * __KLD + * __LAST <--- Last KTRR (ReadOnly) segment + * __DATA + * __BOOTDATA (if present) + * __LINKEDIT + * __PRELINK_DATA (expected populated now) + * __PLK_LINKEDIT + * __PRELINK_INFO + * */ + vm_offset_t mem_size; /* Size of actual physical memory present * minus any performance buffer and possibly * limited by mem_limit in bytes */ @@ -129,6 +184,11 @@ addr64_t vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Highest kernel * virtual address known * to the VM system */ +SECURITY_READ_ONLY_LATE(vm_offset_t) segEXTRADATA; +SECURITY_READ_ONLY_LATE(unsigned long) segSizeEXTRADATA; + +SECURITY_READ_ONLY_LATE(vm_offset_t) segLOWESTTEXT; + SECURITY_READ_ONLY_LATE(static vm_offset_t) segTEXTB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXT; @@ -143,13 +203,19 @@ SECURITY_READ_ONLY_LATE(static vm_offset_t) segDATAB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATA; +SECURITY_READ_ONLY_LATE(vm_offset_t) segBOOTDATAB; +SECURITY_READ_ONLY_LATE(unsigned long) segSizeBOOTDATA; +extern vm_offset_t intstack_low_guard; +extern vm_offset_t intstack_high_guard; +extern vm_offset_t excepstack_high_guard; + SECURITY_READ_ONLY_LATE(static vm_offset_t) segLINKB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeLINK; SECURITY_READ_ONLY_LATE(static vm_offset_t) segKLDB; SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKLD; -SECURITY_READ_ONLY_LATE(static vm_offset_t) segLASTB; -SECURITY_READ_ONLY_LATE(static unsigned long) segSizeLAST; +SECURITY_READ_ONLY_LATE(vm_offset_t) segLASTB; +SECURITY_READ_ONLY_LATE(unsigned long) segSizeLAST; SECURITY_READ_ONLY_LATE(vm_offset_t) segPRELINKTEXTB; SECURITY_READ_ONLY_LATE(unsigned 
long) segSizePRELINKTEXT; @@ -199,14 +265,13 @@ SECURITY_READ_ONLY_LATE(vm_offset_t) first_avail; SECURITY_READ_ONLY_LATE(vm_offset_t) static_memory_end; SECURITY_READ_ONLY_LATE(pmap_paddr_t) avail_start; SECURITY_READ_ONLY_LATE(pmap_paddr_t) avail_end; +SECURITY_READ_ONLY_LATE(pmap_paddr_t) real_avail_end; #if __ARM_KERNEL_PROTECT__ extern void ExceptionVectorsBase; extern void ExceptionVectorsEnd; #endif /* __ARM_KERNEL_PROTECT__ */ -#define MEM_SIZE_MAX 0x100000000ULL - #if defined(KERNEL_INTEGRITY_KTRR) #if __ARM64_TWO_LEVEL_PMAP__ /* We could support this configuration, but it adds memory overhead. */ @@ -214,6 +279,57 @@ extern void ExceptionVectorsEnd; #endif #endif +typedef struct { + pmap_paddr_t pa; + vm_map_address_t va; + vm_size_t len; +} ptov_table_entry; + +#define PTOV_TABLE_SIZE 8 +SECURITY_READ_ONLY_LATE(static ptov_table_entry) ptov_table[PTOV_TABLE_SIZE]; +SECURITY_READ_ONLY_LATE(static boolean_t) kva_active = FALSE; + + +vm_map_address_t +phystokv(pmap_paddr_t pa) +{ + for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) { + if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len))) + return (pa - ptov_table[i].pa + ptov_table[i].va); + } + return (pa - gPhysBase + gVirtBase); +} + +vm_map_address_t +phystokv_range(pmap_paddr_t pa, vm_size_t *max_len) +{ + vm_size_t len; + for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) { + if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len))) { + len = ptov_table[i].len - (pa - ptov_table[i].pa); + if (*max_len > len) + *max_len = len; + return (pa - ptov_table[i].pa + ptov_table[i].va); + } + } + len = PAGE_SIZE - (pa & PAGE_MASK); + if (*max_len > len) + *max_len = len; + return (pa - gPhysBase + gVirtBase); +} + +vm_offset_t +ml_static_vtop(vm_offset_t va) +{ + for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) { + if ((va >= ptov_table[i].va) && (va < (ptov_table[i].va + 
ptov_table[i].len))) + return (va - ptov_table[i].va + ptov_table[i].pa); + } + if (((vm_address_t)(va) - gVirtBase) >= gPhysSize) + panic("ml_static_vtop(): illegal VA: %p\n", (void*)va); + return ((vm_address_t)(va) - gVirtBase + gPhysBase); +} + /* * This rounds the given address up to the nearest boundary for a PTE contiguous * hint. @@ -637,33 +753,30 @@ static void arm_replace_identity_map(boot_args * args) /* * The V=P page tables (at the time this comment was written) start - * after the last bit of kernel data, and consist of 1 to 2 pages. + * after the last bit of kernel data, and consist of 1 L1 page and 1 or + * more L2 pages. * Grab references to those pages, and allocate an L3 page. */ -#if !__ARM64_TWO_LEVEL_PMAP__ l1_ptp_phys = args->topOfKernelData; l1_ptp_virt = (tt_entry_t *)phystokv(l1_ptp_phys); - tte1 = &l1_ptp_virt[(((paddr) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)]; + tte1 = &l1_ptp_virt[L1_TABLE_INDEX(paddr)]; - l2_ptp_phys = l1_ptp_phys + ARM_PGBYTES; -#else - l2_ptp_phys = args->topOfKernelData; -#endif - l2_ptp_virt = (tt_entry_t *)phystokv(l2_ptp_phys); - tte2 = &l2_ptp_virt[(((paddr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)]; + l2_ptp_virt = L2_TABLE_VA(tte1); + l2_ptp_phys = (*tte1) & ARM_TTE_TABLE_MASK; + tte2 = &l2_ptp_virt[L2_TABLE_INDEX(paddr)]; l3_ptp_virt = (pt_entry_t *)alloc_ptpage(FALSE); l3_ptp_phys = kvtophys((vm_offset_t)l3_ptp_virt); - ptep = &l3_ptp_virt[(((paddr) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)]; + ptep = &l3_ptp_virt[L3_TABLE_INDEX(paddr)]; /* * Replace the large V=P mapping with a mapping that provides only the * mappings needed to turn on the MMU. 
*/ -#if !__ARM64_TWO_LEVEL_PMAP__ + bzero(l1_ptp_virt, ARM_PGBYTES); *tte1 = ARM_TTE_BOOT_TABLE | (l2_ptp_phys & ARM_TTE_TABLE_MASK); -#endif + bzero(l2_ptp_virt, ARM_PGBYTES); *tte2 = ARM_TTE_BOOT_TABLE | (l3_ptp_phys & ARM_TTE_TABLE_MASK); @@ -677,6 +790,23 @@ static void arm_replace_identity_map(boot_args * args) } #endif /* defined(KERNEL_INTEGRITY_KTRR)*/ +tt_entry_t *arm_kva_to_tte(vm_offset_t); + +tt_entry_t * +arm_kva_to_tte(vm_offset_t va) +{ +#if __ARM64_TWO_LEVEL_PMAP__ + tt_entry_t *tte2; + tte2 = cpu_tte + L2_TABLE_INDEX(va); +#else + tt_entry_t *tte1, *tte2; + tte1 = cpu_tte + L1_TABLE_INDEX(va); + tte2 = L2_TABLE_VA(tte1) + L2_TABLE_INDEX(va); +#endif + return tte2; +} + + /* * arm_vm_page_granular_helper updates protections at the L3 level. It will (if * neccessary) allocate a page for the L3 table and update the corresponding L2 @@ -684,18 +814,18 @@ static void arm_replace_identity_map(boot_args * args) * This expects to be invoked on a L2 entry or sub L2 entry granularity, so this should * not be invoked from a context that does not do L2 iteration separately (basically, * don't call this except from arm_vm_page_granular_prot). 
+ * + * bool force_page_granule: true: will force page level mappings for this entry + * false: will try to use block level mappings */ + static void -arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, - int pte_prot_APX, int pte_prot_XN, int forceCoarse, +arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, pmap_paddr_t pa_offset, + int pte_prot_APX, int pte_prot_XN, bool force_page_granule, pt_entry_t **deferred_pte, pt_entry_t *deferred_ptmp) { if (va & ARM_TT_L2_OFFMASK) { /* ragged edge hanging over a ARM_TT_L2_SIZE boundary */ -#if __ARM64_TWO_LEVEL_PMAP__ tt_entry_t *tte2; -#else - tt_entry_t *tte1, *tte2; -#endif tt_entry_t tmplate; pmap_paddr_t pa; pt_entry_t *ppte, *recursive_pte = NULL, ptmp, recursive_ptmp = 0; @@ -703,33 +833,40 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, unsigned i; va &= ~ARM_TT_L2_OFFMASK; - pa = va - gVirtBase + gPhysBase; + pa = va - gVirtBase + gPhysBase - pa_offset; -#if __ARM64_TWO_LEVEL_PMAP__ - tte2 = &cpu_tte[(((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)]; -#else - tte1 = &cpu_tte[(((va) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)]; - tte2 = &((tt_entry_t*) phystokv((*tte1) & ARM_TTE_TABLE_MASK))[(((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)]; -#endif + if (pa >= real_avail_end) + return; + tte2 = arm_kva_to_tte(va); + + assert(_end >= va); tmplate = *tte2; if (ARM_TTE_TYPE_TABLE == (tmplate & ARM_TTE_TYPE_MASK)) { /* pick up the existing page table. */ ppte = (pt_entry_t *)phystokv((tmplate & ARM_TTE_TABLE_MASK)); } else { - // TTE must be reincarnated COARSE. - ppte = (pt_entry_t*)alloc_ptpage(TRUE); + // TTE must be reincarnated with page level mappings. 
+ ppte = (pt_entry_t*)alloc_ptpage(pa_offset == 0); + bzero(ppte, ARM_PGBYTES); ppte_phys = kvtophys((vm_offset_t)ppte); - pmap_init_pte_static_page(kernel_pmap, ppte, pa); - - *tte2 = pa_to_tte(ppte_phys) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; + *tte2 = pa_to_tte(ppte_phys) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; } + vm_offset_t len = _end - va; + if ((pa + len) > real_avail_end) + _end -= (pa + len - real_avail_end); + assert((start - gVirtBase + gPhysBase - pa_offset) >= gPhysBase); + + /* Round up to the nearest PAGE_SIZE boundary when creating mappings: + * PAGE_SIZE may be a multiple of ARM_PGBYTES, and we don't want to leave + * a ragged non-PAGE_SIZE-aligned edge. */ + vm_offset_t rounded_end = round_page(_end); /* Apply the desired protections to the specified page range */ for (i = 0; i <= (ARM_TT_L3_INDEX_MASK>>ARM_TT_L3_SHIFT); i++) { - if ((start <= va) && (va < _end)) { + if ((start <= va) && (va < rounded_end)) { ptmp = pa | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_TYPE; ptmp = ptmp | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT); @@ -745,19 +882,35 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, /* * If we can, apply the contiguous hint to this range. The hint is - * applicable if we are not trying to create per-page mappings and - * if the current address falls within a hint-sized range that will + * applicable if the current address falls within a hint-sized range that will * be fully covered by this mapping request. 
*/ - if ((va >= round_up_pte_hint_address(start)) && (round_up_pte_hint_address(va + 1) < _end) && - !forceCoarse && use_contiguous_hint) { + if ((va >= round_up_pte_hint_address(start)) && (round_up_pte_hint_address(va + 1) <= _end) && + !force_page_granule && use_contiguous_hint) { + assert((va & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1)) == ((pa & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1)))); ptmp |= ARM_PTE_HINT; } - - if ((pt_entry_t*)(phystokv(pa)) == ppte) { + /* + * Do not change the contiguous bit on an active mapping. Even in a single-threaded + * environment, it's possible for prefetch to produce a TLB conflict by trying to pull in + * a hint-sized entry on top of one or more existing page-sized entries. It's also useful + * to make sure we're not trying to unhint a sub-range of a larger hinted range, which + * could produce a later TLB conflict. + */ + assert(!kva_active || (ppte[i] == ARM_PTE_TYPE_FAULT) || ((ppte[i] & ARM_PTE_HINT) == (ptmp & ARM_PTE_HINT))); + + /* + * If we reach an entry that maps the current pte page, delay updating it until the very end. + * Otherwise we might end up making the PTE page read-only, leading to a fault later on in + * this function if we manage to outrun the TLB. This can happen on KTRR-enabled devices when + * marking segDATACONST read-only. Mappings for this region may straddle a PT page boundary, + * so we must also defer assignment of the following PTE. We will assume that if the region + * were to require one or more full L3 pages, it would instead use L2 blocks where possible, + * therefore only requiring at most one L3 page at the beginning and one at the end. 
+ */ + if (kva_active && ((pt_entry_t*)(phystokv(pa)) == ppte)) { assert(recursive_pte == NULL); - /* This assert should be reenabled as part of rdar://problem/30149465 */ - assert(!forceCoarse); + assert(!force_page_granule); recursive_pte = &ppte[i]; recursive_ptmp = ptmp; } else if ((deferred_pte != NULL) && (&ppte[i] == &recursive_pte[1])) { @@ -783,15 +936,11 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, * changing them. If a particular chunk necessitates L3 entries (for reasons of * alignment or length, or an explicit request that the entry be fully expanded), we * hand off to arm_vm_page_granular_helper to deal with the L3 chunk of the logic. - * - * Note that counterintuitively a forceCoarse request is a request to expand the entries - * out to L3, i.e. to make *finer* grained mappings. That comes from historical arm32 - * nomenclature in which the 4K granule is "coarse" vs. the 1K "fine" granule (which we - * don't use). */ static void -arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, - int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, int forceCoarse) +arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa_offset, + int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, + bool force_page_granule) { pt_entry_t *deferred_pte = NULL, deferred_ptmp = 0; vm_offset_t _end = start + size; @@ -801,94 +950,110 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, return; if (align_start > _end) { - arm_vm_page_granular_helper(start, _end, start, pte_prot_APX, pte_prot_XN, forceCoarse, NULL, NULL); + arm_vm_page_granular_helper(start, _end, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL); return; } - arm_vm_page_granular_helper(start, align_start, start, pte_prot_APX, pte_prot_XN, forceCoarse, &deferred_pte, &deferred_ptmp); + arm_vm_page_granular_helper(start, align_start, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, 
&deferred_pte, &deferred_ptmp); - while ((_end - align_start) >= ARM_TT_L2_SIZE) { - if (forceCoarse) - arm_vm_page_granular_helper(align_start, align_start+ARM_TT_L2_SIZE, align_start + 1, - pte_prot_APX, pte_prot_XN, forceCoarse, NULL, NULL); + while ((_end - align_start) >= ARM_TT_L2_SIZE) { + if (force_page_granule) + arm_vm_page_granular_helper(align_start, align_start+ARM_TT_L2_SIZE, align_start + 1, pa_offset, + pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL); else { -#if __ARM64_TWO_LEVEL_PMAP__ + pmap_paddr_t pa = align_start - gVirtBase + gPhysBase - pa_offset; + assert((pa & ARM_TT_L2_OFFMASK) == 0); tt_entry_t *tte2; -#else - tt_entry_t *tte1, *tte2; -#endif tt_entry_t tmplate; -#if __ARM64_TWO_LEVEL_PMAP__ - tte2 = &cpu_tte[((align_start & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)]; -#else - tte1 = &cpu_tte[((align_start & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)]; - tte2 = &((tt_entry_t*) phystokv((*tte1) & ARM_TTE_TABLE_MASK))[((align_start & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)]; -#endif + tte2 = arm_kva_to_tte(align_start); - tmplate = *tte2; - - tmplate = (tmplate & ~ARM_TTE_BLOCK_APMASK) | ARM_TTE_BLOCK_AP(pte_prot_APX); - tmplate = tmplate | ARM_TTE_BLOCK_NX; + if ((pa >= gPhysBase) && (pa < real_avail_end)) { + tmplate = (pa & ARM_TTE_BLOCK_L2_MASK) | ARM_TTE_TYPE_BLOCK + | ARM_TTE_VALID | ARM_TTE_BLOCK_AF | ARM_TTE_BLOCK_NX + | ARM_TTE_BLOCK_AP(pte_prot_APX) | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY) + | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK); + #if __ARM_KERNEL_PROTECT__ - tmplate = tmplate | ARM_TTE_BLOCK_NG; + tmplate = tmplate | ARM_TTE_BLOCK_NG; #endif /* __ARM_KERNEL_PROTECT__ */ - if (tte_prot_XN) - tmplate = tmplate | ARM_TTE_BLOCK_PNX; + if (tte_prot_XN) + tmplate = tmplate | ARM_TTE_BLOCK_PNX; - *tte2 = tmplate; + *tte2 = tmplate; + } } align_start += ARM_TT_L2_SIZE; } if (align_start < _end) - arm_vm_page_granular_helper(align_start, _end, _end, pte_prot_APX, pte_prot_XN, forceCoarse, &deferred_pte, 
&deferred_ptmp); + arm_vm_page_granular_helper(align_start, _end, _end, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, &deferred_pte, &deferred_ptmp); if (deferred_pte != NULL) *deferred_pte = deferred_ptmp; } static inline void -arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, int forceCoarse) +arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, bool force_page_granule) { - arm_vm_page_granular_prot(start, size, 1, AP_RONA, 1, forceCoarse); + arm_vm_page_granular_prot(start, size, 0, 1, AP_RONA, 1, force_page_granule); } static inline void -arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, int forceCoarse) +arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, bool force_page_granule) { - arm_vm_page_granular_prot(start, size, 0, AP_RONA, 0, forceCoarse); + arm_vm_page_granular_prot(start, size, 0, 0, AP_RONA, 0, force_page_granule); } static inline void -arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, int forceCoarse) +arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, bool force_page_granule) { - arm_vm_page_granular_prot(start, size, 1, AP_RWNA, 1, forceCoarse); + arm_vm_page_granular_prot(start, size, 0, 1, AP_RWNA, 1, force_page_granule); } static inline void -arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, int forceCoarse) +arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, bool force_page_granule) { - arm_vm_page_granular_prot(start, size, 0, AP_RWNA, 0, forceCoarse); + arm_vm_page_granular_prot(start, size, 0, 0, AP_RWNA, 0, force_page_granule); } +/* used in the chosen/memory-map node, populated by iBoot. */ +typedef struct MemoryMapFileInfo { + vm_offset_t paddr; + size_t length; +} MemoryMapFileInfo; + + void arm_vm_prot_init(boot_args * args) { - /* - * Enforce W^X protections on sections that have been identified so far. 
This will be - * further refined for each KEXT's TEXT and DATA segments in readPrelinkedExtensions() - */ - bool use_small_page_mappings = FALSE; - /* - * First off, we'll create mappings for any physical memory preceeding the kernel TEXT. - * This is memory that we want to give to the VM; this will be accomplished through an - * ml_static_mfree call in arm_vm_prot_finalize. This allows the pmap/vm bootstrap - * routines to assume they will have a physically contiguous chunk of memory to deal - * with during bootstrap, while reclaiming this memory later. - */ - arm_vm_page_granular_RWNX(gVirtBase, segPRELINKTEXTB - gVirtBase, use_small_page_mappings); // Memory for the VM + segLOWESTTEXT = UINT64_MAX; + if (segSizePRELINKTEXT && (segPRELINKTEXTB < segLOWESTTEXT)) segLOWESTTEXT = segPRELINKTEXTB; + assert(segSizeTEXT); + if (segTEXTB < segLOWESTTEXT) segLOWESTTEXT = segTEXTB; + assert(segLOWESTTEXT < UINT64_MAX); + + segEXTRADATA = segLOWESTTEXT; + segSizeEXTRADATA = 0; + + DTEntry memory_map; + MemoryMapFileInfo *trustCacheRange; + unsigned int trustCacheRangeSize; + int err; + + err = DTLookupEntry(NULL, "chosen/memory-map", &memory_map); + assert(err == kSuccess); + + err = DTGetProperty(memory_map, "TrustCache", (void**)&trustCacheRange, &trustCacheRangeSize); + if (err == kSuccess) { + assert(trustCacheRangeSize == sizeof(MemoryMapFileInfo)); + + segEXTRADATA = phystokv(trustCacheRange->paddr); + segSizeEXTRADATA = trustCacheRange->length; + + arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, FALSE); + } /* Map coalesced kext TEXT segment RWNX for now */ arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, FALSE); // Refined in OSKext::readPrelinkedExtensions @@ -900,10 +1065,11 @@ arm_vm_prot_init(boot_args * args) arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, FALSE); // Refined in OSKext::readPrelinkedExtensions /* if new segments not present, set space between PRELINK_TEXT and xnu TEXT to RWNX - * otherwise we no longer 
expecting any space between the coalesced kext read only segments and xnu rosegments + * otherwise we no longer expect any space between the coalesced kext read only segments and xnu rosegments */ if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC) { - arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT), FALSE); + if (segSizePRELINKTEXT) + arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT), FALSE); } else { /* * If we have the new segments, we should still protect the gap between kext @@ -937,8 +1103,14 @@ arm_vm_prot_init(boot_args * args) /* DATA segment will remain RWNX */ arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, FALSE); + arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, TRUE); + arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, TRUE); + arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, TRUE); + arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, TRUE); + arm_vm_page_granular_ROX(segKLDB, segSizeKLD, FALSE); arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, FALSE); + arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, FALSE); // Coalesced kext LINKEDIT segment arm_vm_page_granular_ROX(segLASTB, segSizeLAST, FALSE); // __LAST may be empty, but we cannot assume this arm_vm_page_granular_RWNX(segPRELINKDATAB, segSizePRELINKDATA, FALSE); // Prelink __DATA for kexts (RW data) @@ -946,33 +1118,94 @@ arm_vm_prot_init(boot_args * args) if (segSizePLKLLVMCOV > 0) arm_vm_page_granular_RWNX(segPLKLLVMCOVB, segSizePLKLLVMCOV, FALSE); // LLVM code coverage data - arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, use_small_page_mappings); // Coalesced kext LINKEDIT segment - arm_vm_page_granular_RWNX(segPRELINKINFOB, segSizePRELINKINFO, FALSE); /* PreLinkInfoDictionary */ - arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - 
end_kern, use_small_page_mappings); /* Device Tree, RAM Disk (if present), bootArgs */ - /* - * This is offset by 4 pages to make room for the boot page tables; we could probably - * include them in the overall mapping, but we'll be paranoid for now. - */ - vm_offset_t extra = 0; -#if KASAN - /* add the KASAN stolen memory to the physmap */ - extra = shadow_ptop - shadow_pbase; + arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), BOOTSTRAP_TABLE_SIZE, FALSE); // Boot page tables; they should not be mutable. +} - /* record the extent of the physmap */ - physmap_vbase = phystokv(args->topOfKernelData) + ARM_PGBYTES * 4; - physmap_vtop = static_memory_end; -#endif - arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), ARM_PGBYTES * 4, FALSE); // Boot page tables; they should not be mutable. - arm_vm_page_granular_RWNX(phystokv(args->topOfKernelData) + ARM_PGBYTES * 4, - extra + static_memory_end - ((phystokv(args->topOfKernelData) + ARM_PGBYTES * 4)), use_small_page_mappings); // rest of physmem +/* + * return < 0 for a < b + * 0 for a == b + * > 0 for a > b + */ +typedef int (*cmpfunc_t)(const void *a, const void *b); + +extern void +qsort(void *a, size_t n, size_t es, cmpfunc_t cmp); + +static int +cmp_ptov_entries(const void *a, const void *b) +{ + const ptov_table_entry *entry_a = a; + const ptov_table_entry *entry_b = b; + // Sort in descending order of segment length + if (entry_a->len < entry_b->len) + return 1; + else if (entry_a->len > entry_b->len) + return -1; + else + return 0; +} + +SECURITY_READ_ONLY_LATE(static unsigned int) ptov_index = 0; + +#define ROUND_TWIG(addr) (((addr) + ARM_TT_TWIG_OFFMASK) & ~(ARM_TT_TWIG_OFFMASK)) + +static void +arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap_base, vm_map_address_t orig_va, vm_size_t len, int pte_prot_APX, boolean_t force_page_granule) +{ + pmap_paddr_t pa_offset; + + assert(ptov_index < PTOV_TABLE_SIZE); + assert((orig_va & ARM_PGMASK) == 0); + 
temp_ptov_table[ptov_index].pa = orig_va - gVirtBase + gPhysBase; + if (ptov_index == 0) + temp_ptov_table[ptov_index].va = physmap_base; + else + temp_ptov_table[ptov_index].va = temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len; + if (!force_page_granule) { + vm_map_address_t orig_offset = temp_ptov_table[ptov_index].pa & ARM_TT_TWIG_OFFMASK; + vm_map_address_t new_offset = temp_ptov_table[ptov_index].va & ARM_TT_TWIG_OFFMASK; + if (new_offset < orig_offset) + temp_ptov_table[ptov_index].va += (orig_offset - new_offset); + else if (new_offset > orig_offset) + temp_ptov_table[ptov_index].va = ROUND_TWIG(temp_ptov_table[ptov_index].va) + orig_offset; + } + assert((temp_ptov_table[ptov_index].va & ARM_PGMASK) == 0); + temp_ptov_table[ptov_index].len = round_page(len); + pa_offset = temp_ptov_table[ptov_index].va - orig_va; + arm_vm_page_granular_prot(temp_ptov_table[ptov_index].va, temp_ptov_table[ptov_index].len, pa_offset, 1, pte_prot_APX, 1, force_page_granule); + ++ptov_index; } + +static void +arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_address_t dynamic_memory_begin __unused) +{ + ptov_table_entry temp_ptov_table[PTOV_TABLE_SIZE]; + bzero(temp_ptov_table, sizeof(temp_ptov_table)); + + // Will be handed back to VM layer through ml_static_mfree() in arm_vm_prot_finalize() + arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segEXTRADATA - gVirtBase, AP_RWNA, FALSE); + + arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, FALSE); /* Device Tree, RAM Disk (if present), bootArgs */ + + arm_vm_physmap_slide(temp_ptov_table, physmap_base, (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE - gPhysBase + gVirtBase), + real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, FALSE); // rest of physmem + + assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= dynamic_memory_begin); + + // Sort in descending order of segment length. 
LUT traversal is linear, so largest (most likely used) + // segments should be placed earliest in the table to optimize lookup performance. + qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries); + + memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table)); +} + + void -arm_vm_prot_finalize(boot_args * args) +arm_vm_prot_finalize(boot_args * args __unused) { -#pragma unused(args) /* * At this point, we are far enough along in the boot process that it will be * safe to free up all of the memory preceeding the kernel. It may in fact @@ -989,9 +1222,13 @@ arm_vm_prot_finalize(boot_args * args) * should be immediately followed by XNU's TEXT segment */ - ml_static_mfree(gVirtBase, segPRELINKTEXTB - gVirtBase); + ml_static_mfree(phystokv(gPhysBase), segEXTRADATA - gVirtBase); - if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC) { + /* + * KTRR support means we will be mucking with these pages and trying to + * protect them; we cannot free the pages to the VM if we do this. + */ + if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC && segSizePRELINKTEXT) { /* If new segments not present, PRELINK_TEXT is not dynamically sized, free DRAM between it and xnu TEXT */ ml_static_mfree(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT)); } @@ -1009,10 +1246,15 @@ arm_vm_prot_finalize(boot_args * args) arm_vm_page_granular_RNX(segPLKDATACONSTB, segSizePLKDATACONST, FALSE); } + cpu_stack_alloc(&BootCpuData); + arm64_replace_bootstack(&BootCpuData); + ml_static_mfree(phystokv(segBOOTDATAB - gVirtBase + gPhysBase), segSizeBOOTDATA); + #if __ARM_KERNEL_PROTECT__ arm_vm_populate_kernel_el0_mappings(); #endif /* __ARM_KERNEL_PROTECT__ */ + #if defined(KERNEL_INTEGRITY_KTRR) /* * __LAST,__pinst should no longer be executable. 
@@ -1031,6 +1273,7 @@ arm_vm_prot_finalize(boot_args * args) #ifndef __ARM_L1_PTW__ FlushPoC_Dcache(); #endif + __builtin_arm_dsb(DSB_ISH); flush_mmu_tlb(); } @@ -1068,12 +1311,14 @@ set_tbi(void) #endif /* !__ARM_KERNEL_PROTECT__ */ } +#define ARM64_PHYSMAP_SLIDE_RANGE (1ULL << 30) // 1 GB +#define ARM64_PHYSMAP_SLIDE_MASK (ARM64_PHYSMAP_SLIDE_RANGE - 1) + void arm_vm_init(uint64_t memory_size, boot_args * args) { #if !__ARM64_TWO_LEVEL_PMAP__ vm_map_address_t va_l1, va_l1_end; - pmap_paddr_t pa_l1; tt_entry_t *cpu_l1_tte; #else /* @@ -1086,12 +1331,13 @@ arm_vm_init(uint64_t memory_size, boot_args * args) */ #endif vm_map_address_t va_l2, va_l2_end; - pmap_paddr_t pa_l2; tt_entry_t *cpu_l2_tte; pmap_paddr_t boot_ttep; tt_entry_t *boot_tte; uint64_t mem_segments; vm_offset_t ptpage_vaddr; + vm_map_address_t dynamic_memory_begin; + vm_map_address_t physmap_base; /* @@ -1103,19 +1349,47 @@ arm_vm_init(uint64_t memory_size, boot_args * args) mem_size = args->memSize; if ((memory_size != 0) && (mem_size > memory_size)) mem_size = memory_size; - if (mem_size > MEM_SIZE_MAX ) - mem_size = MEM_SIZE_MAX; - static_memory_end = gVirtBase + mem_size; + if (mem_size >= ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 4)) + panic("Unsupported memory configuration %lx\n", mem_size); + + physmap_base = phystokv(args->topOfKernelData) + BOOTSTRAP_TABLE_SIZE; + + // Slide the physical aperture to a random page-aligned location within the slide range + uint64_t physmap_slide = early_random() & ARM64_PHYSMAP_SLIDE_MASK & ~((uint64_t)PAGE_MASK); + assert(physmap_slide < ARM64_PHYSMAP_SLIDE_RANGE); + + physmap_base += physmap_slide; + + static_memory_end = physmap_base + mem_size + (PTOV_TABLE_SIZE * ARM_TT_TWIG_SIZE); // worst possible case for block alignment +#if KASAN + /* add the KASAN stolen memory to the physmap */ + dynamic_memory_begin = static_memory_end + (shadow_ptop - shadow_pbase); +#else + dynamic_memory_begin = static_memory_end; +#endif + if (dynamic_memory_begin > 
VM_MAX_KERNEL_ADDRESS) + panic("Unsupported memory configuration %lx\n", mem_size); boot_ttep = args->topOfKernelData; boot_tte = (tt_entry_t *) phystokv(boot_ttep); - /* - * Four pages: +#if DEVELOPMENT || DEBUG + /* Sanity check - assert that BOOTSTRAP_TABLE_SIZE is sufficiently-large to + * hold our bootstrap mappings for any possible slide */ + size_t bytes_mapped = dynamic_memory_begin - gVirtBase; + size_t l1_entries = 1 + ((bytes_mapped + ARM_TT_L1_SIZE - 1) / ARM_TT_L1_SIZE); + /* 1 L1 each for V=P and KVA, plus 1 page for each L2 */ + size_t pages_used = 2 * (l1_entries + 1); + if (pages_used > BOOTSTRAP_TABLE_SIZE) { + panic("BOOTSTRAP_TABLE_SIZE too small for memory config\n"); + } +#endif + + /* * TTBR0 L1, TTBR0 L2 - 1:1 bootstrap mapping. * TTBR1 L1, TTBR1 L2 - kernel mapping */ - avail_start = boot_ttep + 4*ARM_PGBYTES; + avail_start = boot_ttep + BOOTSTRAP_TABLE_SIZE; #if defined(KERNEL_INTEGRITY_KTRR) arm_replace_identity_map(args); @@ -1142,6 +1416,12 @@ arm_vm_init(uint64_t memory_size, boot_args * args) bzero(cpu_tte, ARM_PGBYTES); avail_end = gPhysBase + mem_size; +#if KASAN + real_avail_end = avail_end + (shadow_ptop - shadow_pbase); +#else + real_avail_end = avail_end; +#endif + /* * Initialize l1 and l2 page table pages : * map physical memory at the kernel base virtual address @@ -1150,62 +1430,25 @@ arm_vm_init(uint64_t memory_size, boot_args * args) * the so called physical aperture should be statically mapped */ #if !__ARM64_TWO_LEVEL_PMAP__ - pa_l1 = gPhysBase; va_l1 = gVirtBase; - va_l1_end = gVirtBase + mem_size; -#if KASAN - /* add the KASAN stolen memory to the physmap */ - va_l1_end = gVirtBase + (shadow_ptop - gPhysBase); -#endif + va_l1_end = dynamic_memory_begin; cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); while (va_l1 < va_l1_end) { - tt_entry_t *new_tte = (tt_entry_t *)alloc_ptpage(TRUE); - /* Allocate a page and setup L1 Table TTE in L1 */ - *cpu_l1_tte = (kvtophys((vm_offset_t)new_tte) & 
ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; - bzero((void *)new_tte, ARM_PGBYTES); - - va_l2 = va_l1; + if (*cpu_l1_tte == ARM_TTE_EMPTY) { + /* Allocate a page and setup L1 Table TTE in L1 */ + ptpage_vaddr = alloc_ptpage(TRUE); + *cpu_l1_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID; + bzero((void *)ptpage_vaddr, ARM_PGBYTES); + } - if (((va_l1 & ~ARM_TT_L1_OFFMASK)+ARM_TT_L1_SIZE) < va_l1) { + if ((va_l1 + ARM_TT_L1_SIZE) < va_l1) { /* If this is the last L1 entry, it must cover the last mapping. */ - va_l2_end = va_l1_end; - } else { - va_l2_end = MIN((va_l1 & ~ARM_TT_L1_OFFMASK)+ARM_TT_L1_SIZE, va_l1_end); + break; } - pa_l2 = pa_l1; - cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l1 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); -#else - va_l2 = gVirtBase; - va_l2_end = gVirtBase + mem_size; - pa_l2 = gPhysBase; - cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); - -#if KASAN - /* add the KASAN stolen memory to the physmap */ - va_l2_end = gVirtBase + (shadow_ptop - gPhysBase); -#endif - -#endif - - while (va_l2 < va_l2_end) { - /* Set up L2 Block TTE in L2 */ - *cpu_l2_tte = (pa_l2 & ARM_TTE_BLOCK_L2_MASK) | ARM_TTE_TYPE_BLOCK - | ARM_TTE_VALID | ARM_TTE_BLOCK_AF - | ARM_TTE_BLOCK_AP(AP_RWNA) | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY) - | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK); -#if __ARM_KERNEL_PROTECT__ - *cpu_l2_tte |= ARM_TTE_BLOCK_NG; -#endif /* __ARM_KERNEL_PROTECT__ */ - va_l2 += ARM_TT_L2_SIZE; - pa_l2 += ARM_TT_L2_SIZE; - cpu_l2_tte++; - } -#if !__ARM64_TWO_LEVEL_PMAP__ + va_l1 += ARM_TT_L1_SIZE; cpu_l1_tte++; - va_l1 = va_l2; - pa_l1 = pa_l2; } #endif @@ -1224,6 +1467,8 @@ arm_vm_init(uint64_t memory_size, boot_args * args) segDATACONSTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA_CONST", &segSizeDATACONST); segTEXTEXECB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT_EXEC", 
&segSizeTEXTEXEC); segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA); + + segBOOTDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA); segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK); segKLDB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD); segPRELINKDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_DATA", &segSizePRELINKDATA); @@ -1292,7 +1537,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) * KERNEL_DYNAMIC_ADDR - VM_MAX_KERNEL_ADDRESS */ #if !__ARM64_TWO_LEVEL_PMAP__ - va_l1 = (gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL; + va_l1 = dynamic_memory_begin; va_l1_end = VM_MAX_KERNEL_ADDRESS; cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT); @@ -1300,7 +1545,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) if (*cpu_l1_tte == ARM_TTE_EMPTY) { /* Allocate a page and setup L1 Table TTE in L1 */ ptpage_vaddr = alloc_ptpage(TRUE); - *cpu_l1_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN; + *cpu_l1_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN; bzero((void *)ptpage_vaddr, ARM_PGBYTES); } @@ -1315,31 +1560,37 @@ arm_vm_init(uint64_t memory_size, boot_args * args) #endif #if KASAN + /* record the extent of the physmap */ + physmap_vbase = physmap_base; + physmap_vtop = static_memory_end; kasan_init(); #endif + set_tbi(); set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK); + + arm_vm_physmap_init(args, physmap_base, dynamic_memory_begin); set_mmu_ttb_alternate(cpu_ttep & TTBR_BADDR_MASK); - set_tbi(); flush_mmu_tlb(); + kva_active = TRUE; + // global table pointers may need to be different due to physical aperture remapping + cpu_tte = (tt_entry_t*)(phystokv(cpu_ttep)); + invalid_tte = 
(tt_entry_t*)(phystokv(invalid_ttep)); - /* - * TODO: We're hardcoding the expected virtual TEXT base here; - * that gives us an ugly dependency on a linker argument in - * the make files. Clean this up, so we don't hardcode it - * twice; this is nothing but trouble. - */ sane_size = mem_size - (avail_start - gPhysBase); max_mem = mem_size; - vm_kernel_slid_base = segPRELINKTEXTB; + vm_kernel_slid_base = segLOWESTTEXT; vm_kernel_slid_top = vm_prelink_einfo; - vm_kernel_slide = segTEXTB-0xfffffff007004000; + vm_kernel_slide = segTEXTB-VM_KERNEL_LINK_ADDRESS; vm_kernel_stext = segTEXTB; assert(segDATACONSTB == segTEXTB + segSizeTEXT); - assert(segTEXTEXECB == segDATACONSTB + segSizeDATACONST); + assert(segTEXTEXECB == segDATACONSTB + segSizeDATACONST); vm_kernel_etext = segTEXTB + segSizeTEXT + segSizeDATACONST + segSizeTEXTEXEC; - pmap_bootstrap((gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL); + dynamic_memory_begin = ROUND_TWIG(dynamic_memory_begin); + pmap_bootstrap(dynamic_memory_begin); + + disable_preemption(); /* * Initialize l3 page table pages : @@ -1350,7 +1601,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) mem_segments = (mem_size + 0x0FFFFFFF) >> 28; #if !__ARM64_TWO_LEVEL_PMAP__ - va_l1 = (gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL; + va_l1 = dynamic_memory_begin; va_l1_end = va_l1 + ((2 + (mem_segments * 10)) << 20); va_l1_end += round_page(args->Video.v_height * args->Video.v_rowBytes); va_l1_end = (va_l1_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL; @@ -1370,7 +1621,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT); #else - va_l2 = (gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL; + va_l2 = dynamic_memory_begin; va_l2_end = va_l2 + ((2 + (mem_segments * 10)) << 20); va_l2_end += round_page(args->Video.v_height * 
args->Video.v_rowBytes); va_l2_end = (va_l2_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL; @@ -1387,7 +1638,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE); - *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN; + *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN; va_l2 += ARM_TT_L2_SIZE; cpu_l2_tte++; @@ -1437,7 +1688,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE); - *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN; + *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN; va_l2 += ARM_TT_L2_SIZE; cpu_l2_tte++; @@ -1465,7 +1716,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args) if (*cpu_l1_tte == ARM_TTE_EMPTY) { tt_entry_t *new_tte = (tt_entry_t*)alloc_ptpage(FALSE); bzero(new_tte, ARM_PGBYTES); - *cpu_l1_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN; + *cpu_l1_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN; } cpu_l1_tte++; @@ -1479,8 +1730,8 @@ arm_vm_init(uint64_t memory_size, boot_args * args) */ avail_start = (avail_start + PAGE_MASK) & ~PAGE_MASK; - first_avail = avail_start; patch_low_glo_static_region(args->topOfKernelData, avail_start - args->topOfKernelData); + enable_preemption(); } diff --git a/osfmk/arm64/asm.h b/osfmk/arm64/asm.h index f756f22ae..fb2c1ea8a 100644 --- a/osfmk/arm64/asm.h +++ b/osfmk/arm64/asm.h @@ -152,6 +152,20 @@ movk $0, #((($1) >> 00) & 0x000000000000FFFF), lsl #00 .endmacro +.macro ARM64_STACK_PROLOG +#if __has_feature(ptrauth_returns) + pacibsp +#endif +.endmacro + +.macro ARM64_STACK_EPILOG +#if 
__has_feature(ptrauth_returns) + retab +#else + ret +#endif +.endmacro + #define PUSH_FRAME \ stp fp, lr, [sp, #-16]! %% \ mov fp, sp %% diff --git a/osfmk/arm64/bcopy.s b/osfmk/arm64/bcopy.s index 01f33d61e..67266940d 100644 --- a/osfmk/arm64/bcopy.s +++ b/osfmk/arm64/bcopy.s @@ -90,6 +90,7 @@ _memmove: // can only be smaller than length if the buffers do not overlap, so we don't // need to worry about false positives due to the overflow (they happen, but // only in cases where copying in either order is correct). + ARM64_STACK_PROLOG PUSH_FRAME sub x3, x0, x1 cmp x3, x2 @@ -178,7 +179,7 @@ L_forwardCleanup: stp x12,x13,[x3, #32] stp x14,x15,[x3, #48] POP_FRAME - ret + ARM64_STACK_EPILOG /***************************************************************************** * forward small copy * @@ -204,7 +205,7 @@ L_forwardSmallCopy: subs x2, x2, #1 b.ne 1b 2: POP_FRAME - ret + ARM64_STACK_EPILOG /***************************************************************************** * Reverse copy engines * @@ -271,7 +272,7 @@ L_reverseCleanup: stp x12,x13,[x0, #16] // In the forward copy, we need to compute the stp x14,x15,[x0] // address of these stores, but here we already POP_FRAME // have a pointer to the start of the buffer. 
- ret + ARM64_STACK_EPILOG /***************************************************************************** * reverse small copy * @@ -289,8 +290,9 @@ L_reverseSmallCopy: subs x2, x2, #1 b.ne 1b 2: POP_FRAME - ret + ARM64_STACK_EPILOG + L_return: POP_FRAME - ret + ARM64_STACK_EPILOG diff --git a/osfmk/arm64/bzero.s b/osfmk/arm64/bzero.s index c2f084e47..a7abca2cb 100644 --- a/osfmk/arm64/bzero.s +++ b/osfmk/arm64/bzero.s @@ -50,6 +50,7 @@ .align 4 _bzero: ___bzero: + ARM64_STACK_PROLOG PUSH_FRAME mov x2, x1 eor x1, x1, x1 @@ -85,7 +86,7 @@ L_bzeroLarge: stp x1, x1, [x3, #32] stp x1, x1, [x3, #48] POP_FRAME - ret + ARM64_STACK_EPILOG /***************************************************************************** * memset entrypoint * @@ -98,6 +99,7 @@ L_bzeroLarge: */ _secure_memset: _memset: + ARM64_STACK_PROLOG PUSH_FRAME and x1, x1, #0xff orr x3, xzr,#0x0101010101010101 @@ -134,7 +136,7 @@ L_memsetLarge: stp x1, x1, [x3, #32] stp x1, x1, [x3, #48] POP_FRAME - ret + ARM64_STACK_EPILOG /***************************************************************************** * Small buffer store engine * @@ -150,4 +152,5 @@ L_memsetSmall: subs x2, x2, #1 b.ne 1b 2: POP_FRAME - ret + ARM64_STACK_EPILOG + diff --git a/osfmk/arm64/caches_asm.s b/osfmk/arm64/caches_asm.s index ac50fd037..a673abaf3 100644 --- a/osfmk/arm64/caches_asm.s +++ b/osfmk/arm64/caches_asm.s @@ -130,7 +130,19 @@ LEXT(CleanPoU_Dcache) #if defined(APPLE_ARM64_ARCH_FAMILY) /* "Fully Coherent." 
*/ #else /* !defined(APPLE_ARM64_ARCH_FAMILY) */ -#error CleanPoU_Dcache needs an implementation + mov x0, #0 + mov x9, #(1 << MMU_I7SET) + mov x10, #(1 << (MMU_NSET + MMU_I7SET)) + mov x11, #(1 << MMU_I7WAY) +L_cpud_dcacheway: +L_cpud_dcacheline: + dc csw, x0 // clean dcache line by way/set + add x0, x0, x9 // increment set index + tst x0, #(1 << (MMU_NSET + MMU_I7SET)) // look for overflow + b.eq L_cpud_dcacheline + bic x0, x0, x10 // clear set overflow + adds x0, x0, x11 // increment way + b.cc L_cpud_dcacheway // loop #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */ dsb sy ret @@ -170,6 +182,7 @@ L_cpudr_loop: .text .align 2 LEXT(CleanPoC_DcacheRegion_internal) + ARM64_STACK_PROLOG PUSH_FRAME mov x9, #((1< #include +#undef copyin +#undef copyout + extern int _bcopyin(const char *src, char *dst, vm_size_t len); extern int _bcopyinstr(const char *src, char *dst, vm_size_t max, vm_size_t *actual); extern int _bcopyout(const char *src, char *dst, vm_size_t len); @@ -41,6 +44,9 @@ extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len); extern pmap_t kernel_pmap; +/* On by default, optionally disabled by boot-arg */ +extern boolean_t copyio_zalloc_check; + typedef enum copyio_type { COPYIO_IN, COPYIO_IN_WORD, @@ -48,18 +54,6 @@ typedef enum copyio_type { COPYIO_OUT, } copyio_type_t; -int -copyio_check_user_addr(user_addr_t user_addr, vm_size_t nbytes) -{ - if (nbytes && (user_addr + nbytes <= user_addr)) - return EFAULT; - - if ((user_addr + nbytes) > vm_map_max(current_thread()->map)) - return EFAULT; - - return 0; -} - static inline void user_access_enable(void) { @@ -82,6 +76,8 @@ copyio(copyio_type_t copytype, const char *src, char *dst, { int result = 0; vm_size_t bytes_copied = 0; + vm_size_t kernel_buf_size = 0; + void * kernel_addr = NULL; /* Reject TBI addresses */ if (copytype == COPYIO_OUT) { @@ -92,8 +88,16 @@ copyio(copyio_type_t copytype, const char *src, char *dst, return EINVAL; } - if (!nbytes) { - return 0; + if 
(__probable(copyio_zalloc_check)) { + if (copytype == COPYIO_IN || copytype == COPYIO_INSTR || copytype == COPYIO_IN_WORD) { + kernel_addr = (void*)dst; + } else if (copytype == COPYIO_OUT) { + kernel_addr = (void*)(uintptr_t)src; + } + if (kernel_addr) + kernel_buf_size = zone_element_size(kernel_addr, NULL); + if (__improbable(kernel_buf_size && kernel_buf_size < nbytes)) + panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes); } #if KASAN @@ -153,27 +157,20 @@ copyout_kern(const char *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) } int -copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) +copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes) { int result; - if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS) { - if (current_thread()->map->pmap == kernel_pmap) - return copyin_kern(user_addr, kernel_addr, nbytes); - else - return EFAULT; - } - - if (nbytes >= 4096) { - result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes); - if (result) return result; - } - - result = copyio_check_user_addr(user_addr, nbytes); + if (nbytes == 0) + return 0; + result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes); if (result) return result; - return copyio(COPYIO_IN, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, NULL); + if (current_thread()->map->pmap == kernel_pmap) + return copyin_kern(user_addr, kernel_addr, nbytes); + else + return copyio(COPYIO_IN, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, NULL); } /* @@ -194,11 +191,7 @@ copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes if (user_addr & (nbytes - 1)) return EINVAL; - /* Address must be user */ - if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS) - return EFAULT; - - result = copyio_check_user_addr(user_addr, nbytes); + result = copyin_validate(user_addr, (uintptr_t)kernel_addr, 
nbytes); if (result) return result; @@ -210,18 +203,14 @@ copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_s { int result; - if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS) { - return EFAULT; - } + *lencopied = 0; + if (nbytes == 0) + return ENAMETOOLONG; - result = copyio_check_user_addr(user_addr, nbytes); + result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes); if (result) return result; - if (!nbytes) { - return ENAMETOOLONG; - } - return copyio(COPYIO_INSTR, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, lencopied); } @@ -230,23 +219,16 @@ copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes) { int result; - if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS) { - if (current_thread()->map->pmap == kernel_pmap) - return copyout_kern(kernel_addr, user_addr, nbytes); - else - return EFAULT; - } - - if (nbytes >= 4096) { - result = copyout_validate((uintptr_t)kernel_addr, user_addr, nbytes); - if (result) return result; - } - - result = copyio_check_user_addr(user_addr, nbytes); + if (nbytes == 0) + return 0; + result = copyout_validate((uintptr_t)kernel_addr, user_addr, nbytes); if (result) return result; - return copyio(COPYIO_OUT, kernel_addr, (char *)(uintptr_t)user_addr, nbytes, NULL); + if (current_thread()->map->pmap == kernel_pmap) + return copyout_kern(kernel_addr, user_addr, nbytes); + else + return copyio(COPYIO_OUT, kernel_addr, (char *)(uintptr_t)user_addr, nbytes, NULL); } @@ -262,10 +244,6 @@ const int copysize_limit_panic = (64 * 1024 * 1024); /* * Validate the arguments to copy{in,out} on this platform. - * - * Called when nbytes is "large" e.g. more than a page. Such sizes are - * infrequent, and very large sizes are likely indications of attempts - * to exploit kernel programming errors (bugs). 
*/ static int copy_validate(const user_addr_t user_addr, @@ -273,16 +251,17 @@ copy_validate(const user_addr_t user_addr, { uintptr_t kernel_addr_last = kernel_addr + nbytes; - if (kernel_addr < VM_MIN_KERNEL_ADDRESS || + if (__improbable(kernel_addr < VM_MIN_KERNEL_ADDRESS || kernel_addr > VM_MAX_KERNEL_ADDRESS || kernel_addr_last < kernel_addr || - kernel_addr_last > VM_MAX_KERNEL_ADDRESS) + kernel_addr_last > VM_MAX_KERNEL_ADDRESS)) panic("%s(%p, %p, %lu) - kaddr not in kernel", __func__, (void *)user_addr, (void *)kernel_addr, nbytes); user_addr_t user_addr_last = user_addr + nbytes; - if (user_addr_last < user_addr || user_addr_last > VM_MIN_KERNEL_ADDRESS) + if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) || + (user_addr < vm_map_min(current_thread()->map)))) return (EFAULT); if (__improbable(nbytes > copysize_limit_panic)) diff --git a/osfmk/arm64/cpu.c b/osfmk/arm64/cpu.c index d4712a612..4d1300d75 100644 --- a/osfmk/arm64/cpu.c +++ b/osfmk/arm64/cpu.c @@ -134,6 +134,55 @@ static vm_offset_t sleepTokenBuffer = (vm_offset_t)NULL; #endif static boolean_t coresight_debug_enabled = FALSE; +#if defined(CONFIG_XNUPOST) +void arm64_ipi_test_callback(void *); + +void arm64_ipi_test_callback(void *parm) { + volatile uint64_t *ipi_test_data = parm; + cpu_data_t *cpu_data; + + cpu_data = getCpuDatap(); + + *ipi_test_data = cpu_data->cpu_number; +} + +uint64_t arm64_ipi_test_data[MAX_CPUS]; + +void arm64_ipi_test() { + volatile uint64_t *ipi_test_data; + uint32_t timeout_ms = 100; + uint64_t then, now, delta; + int current_cpu_number = getCpuDatap()->cpu_number; + + /* + * probably the only way to have this on most systems is with the + * cpus=1 boot-arg, but nonetheless, if we only have 1 CPU active, + * IPI is not available + */ + if (real_ncpus == 1) { + return; + } + + for (unsigned int i = 0; i < MAX_CPUS; ++i) { + ipi_test_data = &arm64_ipi_test_data[i]; + *ipi_test_data = ~i; + kern_return_t error = 
cpu_xcall((int)i, (void *)arm64_ipi_test_callback, (void *)(uintptr_t)ipi_test_data); + if (error != KERN_SUCCESS) + panic("CPU %d was unable to IPI CPU %u: error %d", current_cpu_number, i, error); + + then = mach_absolute_time(); + + while (*ipi_test_data != i) { + now = mach_absolute_time(); + absolutetime_to_nanoseconds(now-then, &delta); + if ((delta / NSEC_PER_MSEC) > timeout_ms) { + panic("CPU %d tried to IPI CPU %d but didn't get correct response within %dms, respose: %llx", current_cpu_number, i, timeout_ms, *ipi_test_data); + } + } + } + +} +#endif /* defined(CONFIG_XNUPOST) */ static void configure_coresight_registers(cpu_data_t *cdp) @@ -316,7 +365,7 @@ cpu_idle(void) ClearIdlePop(TRUE); - cpu_idle_exit(); + cpu_idle_exit(FALSE); } /* @@ -324,7 +373,7 @@ cpu_idle(void) * Function: */ void -cpu_idle_exit(void) +cpu_idle_exit(boolean_t from_reset) { uint64_t new_idle_timeout_ticks = 0x0ULL; cpu_data_t *cpu_data_ptr = getCpuDatap(); @@ -332,7 +381,8 @@ cpu_idle_exit(void) assert(exception_stack_pointer() != 0); /* Back from WFI, unlock OSLAR and EDLAR. 
*/ - configure_coresight_registers(cpu_data_ptr); + if (from_reset) + configure_coresight_registers(cpu_data_ptr); #if KPC kpc_idle_exit(); @@ -420,51 +470,35 @@ cpu_init(void) #endif /* MONOTONIC */ } -cpu_data_t * -cpu_data_alloc(boolean_t is_boot_cpu) +void +cpu_stack_alloc(cpu_data_t *cpu_data_ptr) { - cpu_data_t *cpu_data_ptr = NULL; - - if (is_boot_cpu) - cpu_data_ptr = &BootCpuData; - else { - void *irq_stack = NULL; - void *exc_stack = NULL; - void *fiq_stack = NULL; - - if ((kmem_alloc(kernel_map, (vm_offset_t *)&cpu_data_ptr, sizeof(cpu_data_t), VM_KERN_MEMORY_CPU)) != KERN_SUCCESS) - goto cpu_data_alloc_error; - - bzero((void *)cpu_data_ptr, sizeof(cpu_data_t)); - - if ((irq_stack = kalloc(INTSTACK_SIZE)) == 0) - goto cpu_data_alloc_error; - cpu_data_ptr->intstack_top = (vm_offset_t)irq_stack + INTSTACK_SIZE ; - cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top; - - if ((exc_stack = kalloc(PAGE_SIZE)) == 0) - goto cpu_data_alloc_error; - cpu_data_ptr->excepstack_top = (vm_offset_t)exc_stack + PAGE_SIZE ; - cpu_data_ptr->excepstackptr = cpu_data_ptr->excepstack_top; - - if ((fiq_stack = kalloc(PAGE_SIZE)) == 0) - goto cpu_data_alloc_error; - cpu_data_ptr->fiqstack_top = (vm_offset_t)fiq_stack + PAGE_SIZE ; - cpu_data_ptr->fiqstackptr = cpu_data_ptr->fiqstack_top; - } - - cpu_data_ptr->cpu_processor = cpu_processor_alloc(is_boot_cpu); - if (cpu_data_ptr->cpu_processor == (struct processor *)NULL) - goto cpu_data_alloc_error; - - return cpu_data_ptr; - -cpu_data_alloc_error: - panic("cpu_data_alloc() failed\n"); - return (cpu_data_t *)NULL; + vm_offset_t irq_stack = 0; + vm_offset_t exc_stack = 0; + + kern_return_t kr = kernel_memory_allocate(kernel_map, &irq_stack, + INTSTACK_SIZE + (2 * PAGE_SIZE), + PAGE_MASK, + KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT, + VM_KERN_MEMORY_STACK); + if (kr != KERN_SUCCESS) + panic("Unable to allocate cpu interrupt stack\n"); + + cpu_data_ptr->intstack_top = irq_stack + PAGE_SIZE + INTSTACK_SIZE; + 
cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top; + + kr = kernel_memory_allocate(kernel_map, &exc_stack, + EXCEPSTACK_SIZE + (2 * PAGE_SIZE), + PAGE_MASK, + KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT, + VM_KERN_MEMORY_STACK); + if (kr != KERN_SUCCESS) + panic("Unable to allocate cpu exception stack\n"); + + cpu_data_ptr->excepstack_top = exc_stack + PAGE_SIZE + EXCEPSTACK_SIZE; + cpu_data_ptr->excepstackptr = cpu_data_ptr->excepstack_top; } - void cpu_data_free(cpu_data_t *cpu_data_ptr) { @@ -473,7 +507,7 @@ cpu_data_free(cpu_data_t *cpu_data_ptr) cpu_processor_free( cpu_data_ptr->cpu_processor); kfree( (void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE); - kfree( (void *)(cpu_data_ptr->fiqstack_top - PAGE_SIZE), PAGE_SIZE); + kfree( (void *)(cpu_data_ptr->excepstack_top - EXCEPSTACK_SIZE), EXCEPSTACK_SIZE); kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t)); } @@ -533,8 +567,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) pmap_cpu_data_t * pmap_cpu_data_ptr = &cpu_data_ptr->cpu_pmap_cpu_data; - pmap_cpu_data_ptr->cpu_user_pmap = (struct pmap *) NULL; - pmap_cpu_data_ptr->cpu_user_pmap_stamp = 0; + pmap_cpu_data_ptr->cpu_nested_pmap = (struct pmap *) NULL; pmap_cpu_data_ptr->cpu_number = PMAP_INVALID_CPU_NUM; for (i = 0; i < (sizeof(pmap_cpu_data_ptr->cpu_asid_high_bits) / sizeof(*pmap_cpu_data_ptr->cpu_asid_high_bits)); i++) { @@ -544,6 +577,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr) #if __ARM_KERNEL_PROTECT__ cpu_data_ptr->cpu_exc_vectors = (vm_offset_t)&exc_vectors_table; #endif /* __ARM_KERNEL_PROTECT__ */ + } kern_return_t @@ -563,6 +597,7 @@ cpu_data_register(cpu_data_t *cpu_data_ptr) } + kern_return_t cpu_start(int cpu) { @@ -578,7 +613,7 @@ cpu_start(int cpu) cpu_data_ptr->cpu_reset_handler = (vm_offset_t) start_cpu_paddr; - cpu_data_ptr->cpu_pmap_cpu_data.cpu_user_pmap = NULL; + cpu_data_ptr->cpu_pmap_cpu_data.cpu_nested_pmap = NULL; if (cpu_data_ptr->cpu_processor->next_thread != THREAD_NULL) 
first_thread = cpu_data_ptr->cpu_processor->next_thread; diff --git a/osfmk/arm64/cswitch.s b/osfmk/arm64/cswitch.s index e3a0cb317..7aa9614a1 100644 --- a/osfmk/arm64/cswitch.s +++ b/osfmk/arm64/cswitch.s @@ -78,6 +78,7 @@ * arg1 - Scratch register */ .macro load_general_registers + ldp x16, x17, [$0, SS64_X16] ldp x19, x20, [$0, SS64_X19] ldp x21, x22, [$0, SS64_X21] @@ -134,14 +135,16 @@ LEXT(machine_load_context) set_thread_registers x0, x1, x2 ldr x1, [x0, TH_KSTACKPTR] // Get top of kernel stack load_general_registers x1, x2 - mov x0, xzr // Clear argument to thread_continue + mov x0, #0 // Clear argument to thread_continue ret /* - * void Call_continuation( void (*continuation)(void), - * void *param, - * wait_result_t wresult, - * vm_offset_t stack_ptr) + * typedef void (*thread_continue_t)(void *param, wait_result_t) + * + * void Call_continuation( thread_continue_t continuation, + * void *param, + * wait_result_t wresult, + * bool enable interrupts) */ .text .align 5 @@ -153,12 +156,21 @@ LEXT(Call_continuation) /* ARM64_TODO arm loads the kstack top instead of arg4. What should we use? 
*/ ldr x5, [x4, TH_KSTACKPTR] // Get the top of the kernel stack mov sp, x5 // Set stack pointer + mov fp, #0 // Clear the frame pointer + + + mov x20, x0 //continuation + mov x21, x1 //continuation parameter + mov x22, x2 //wait result + + cbz x3, 1f + mov x0, #1 + bl _ml_set_interrupts_enabled +1: - mov fp, xzr // Clear the frame pointer - mov x4, x0 // Load the continuation - mov x0, x1 // Set the first parameter - mov x1, x2 // Set the wait result arg - blr x4 // Branch to the continuation + mov x0, x21 // Set the first parameter + mov x1, x22 // Set the wait result arg + blr x20 // Branch to the continuation mrs x0, TPIDR_EL1 // Get the current thread pointer b EXT(thread_terminate) // Kill the thread diff --git a/osfmk/arm64/genassym.c b/osfmk/arm64/genassym.c index 6e47758b1..c9755bde8 100644 --- a/osfmk/arm64/genassym.c +++ b/osfmk/arm64/genassym.c @@ -260,7 +260,6 @@ main( DECLARE("PGSHIFT", ARM_PGSHIFT); DECLARE("PGMASK", ARM_PGMASK); - DECLARE("VM_MIN_ADDRESS", VM_MIN_ADDRESS); DECLARE("VM_MAX_ADDRESS", VM_MAX_ADDRESS); DECLARE("VM_MIN_KERNEL_ADDRESS", VM_MIN_KERNEL_ADDRESS); @@ -292,10 +291,6 @@ main( offsetof(cpu_data_t, excepstackptr)); DECLARE("CPU_EXCEPSTACK_TOP", offsetof(cpu_data_t, excepstack_top)); - DECLARE("CPU_FIQSTACKPTR", - offsetof(cpu_data_t, fiqstackptr)); - DECLARE("CPU_FIQSTACK_TOP", - offsetof(cpu_data_t, fiqstack_top)); #if __ARM_KERNEL_PROTECT__ DECLARE("CPU_EXC_VECTORS", offsetof(cpu_data_t, cpu_exc_vectors)); @@ -356,6 +351,8 @@ main( offsetof(cpu_data_t, cpu_phys_id)); DECLARE("RTCLOCK_DATAP", offsetof(cpu_data_t, rtclock_datap)); + DECLARE("CLUSTER_MASTER", + offsetof(cpu_data_t, cluster_master)); DECLARE("RTCLOCKDataSize", sizeof(rtclock_data_t)); @@ -382,8 +379,10 @@ main( DECLARE("CPU_DATA_PADDR", offsetof(struct cpu_data_entry, cpu_data_paddr)); - DECLARE("INTSTACK_SIZE", INTSTACK_SIZE); + DECLARE("EXCEPSTACK_SIZE", EXCEPSTACK_SIZE); + + DECLARE("PAGE_MAX_SIZE", PAGE_MAX_SIZE); DECLARE("TIMER_TSTAMP", offsetof(struct timer, 
tstamp)); @@ -420,6 +419,8 @@ main( offsetof(struct boot_args, deviceTreeP)); DECLARE("BA_DEVICE_TREE_LENGTH", offsetof(struct boot_args, deviceTreeLength)); + DECLARE("BA_BOOT_FLAGS", + offsetof(struct boot_args, bootFlags)); DECLARE("ENTROPY_INDEX_PTR", offsetof(entropy_data_t, index_ptr)); @@ -430,5 +431,6 @@ main( DECLARE("SR_RESTORE_TCR_EL1", offsetof(struct sysreg_restore, tcr_el1)); + return (0); } diff --git a/osfmk/arm64/kpc.c b/osfmk/arm64/kpc.c index b1eae91fe..d69c2d270 100644 --- a/osfmk/arm64/kpc.c +++ b/osfmk/arm64/kpc.c @@ -41,6 +41,8 @@ #include #endif /* MONOTONIC */ +void kpc_pmi_handler(unsigned int ctr); + /* * PMCs 8 and 9 were added to Hurricane and to maintain the existing bit * positions of the other PMCs, their configuration bits start at position 32. @@ -230,8 +232,6 @@ static uint64_t kpc_running_cfg_pmc_mask = 0; static uint32_t kpc_running_classes = 0; static uint32_t kpc_configured = 0; -static int first_time = 1; - /* * The whitelist is disabled by default on development/debug kernel. This can * be changed via the kpc.disable_whitelist sysctl. The whitelist is enabled on @@ -305,6 +305,20 @@ static kpc_config_t whitelist[] = { 0xd3, /* FED_IC_MISS_DEM */ 0xd4, /* FED_ITLB_MISS */ +#elif defined(APPLEMONSOON) + 0x02, /* CORE_CYCLE */ + 0x8a, /* INST_A32 */ + 0x8b, /* INST_THUMB */ + 0x8c, /* INST_A64 */ + 0x8d, /* INST_BRANCH */ + 0xbf, /* SYNC_DC_LOAD_MISS */ + 0xc0, /* SYNC_DC_STORE_MISS */ + 0xc1, /* SYNC_DTLB_MISS */ + 0xc4, /* SYNC_ST_HIT_YNGR_LD */ + 0xcb, /* SYNC_BR_ANY_MISP */ + 0xd3, /* FED_IC_MISS_DEM */ + 0xd4, /* FED_ITLB_MISS */ + #else /* An unknown CPU gets a trivial { NO_EVENT } whitelist. 
*/ #endif @@ -984,43 +998,16 @@ kpc_set_reload_xcall(void *vmp_config) thread_wakeup((event_t) &kpc_reload_sync); } -void kpc_pmi_handler(cpu_id_t source); void -kpc_pmi_handler(cpu_id_t source __unused) +kpc_pmi_handler(unsigned int ctr) { - uint64_t PMSR, extra; - int ctr; - int enabled; + uint64_t extra = kpc_reload_counter(ctr); - enabled = ml_set_interrupts_enabled(FALSE); + FIXED_SHADOW(ctr) += (kpc_fixed_max() - FIXED_RELOAD(ctr) + 1 /* Wrap */) + extra; - /* The pmi must be delivered to the CPU that generated it */ - if (source != getCpuDatap()->interrupt_nub) { - panic("pmi from IOCPU %p delivered to IOCPU %p", source, getCpuDatap()->interrupt_nub); + if (FIXED_ACTIONID(ctr)) { + kpc_sample_kperf(FIXED_ACTIONID(ctr)); } - - /* Get the PMSR which has the overflow bits for all the counters */ - __asm__ volatile("mrs %0, S3_1_c15_c13_0" : "=r"(PMSR)); - - for (ctr = 0; ctr < (KPC_ARM64_FIXED_COUNT + KPC_ARM64_CONFIGURABLE_COUNT); ctr++) { - if ((1ull << ctr) & PMSR) { - if (ctr < 2) { -#if MONOTONIC - mt_cpu_pmi(getCpuDatap(), PMSR); -#endif /* MONOTONIC */ - } else { - extra = kpc_reload_counter(ctr); - - FIXED_SHADOW(ctr) - += (kpc_fixed_max() - FIXED_RELOAD(ctr) + 1 /* Wrap */) + extra; - - if (FIXED_ACTIONID(ctr)) - kpc_sample_kperf(FIXED_ACTIONID(ctr)); - } - } - } - - ml_set_interrupts_enabled(enabled); } uint32_t @@ -1032,20 +1019,7 @@ kpc_get_classes(void) int kpc_set_running_arch(struct kpc_running_remote *mp_config) { - int cpu; - - assert(mp_config); - - if (first_time) { - PE_cpu_perfmon_interrupt_install_handler(kpc_pmi_handler); - int max_cpu = ml_get_max_cpu_number(); - for (cpu = 0; cpu <= max_cpu; cpu++) { - cpu_data_t *target_cpu_datap = (cpu_data_t *)CpuDataEntries[cpu].cpu_data_vaddr; - if (target_cpu_datap != NULL) - PE_cpu_perfmon_interrupt_enable(target_cpu_datap->cpu_id, TRUE); - } - first_time = 0; - } + assert(mp_config != NULL); /* dispatch to all CPUs */ cpu_broadcast_xcall(&kpc_xcall_sync, TRUE, kpc_set_running_xcall, mp_config); 
diff --git a/osfmk/arm64/locore.s b/osfmk/arm64/locore.s index 376e90195..6a8d109f7 100644 --- a/osfmk/arm64/locore.s +++ b/osfmk/arm64/locore.s @@ -100,11 +100,12 @@ stp q28, q29, [x0, NS64_Q28] stp q30, q31, [x0, NS64_Q30] - mrs lr, ELR_EL1 // Get exception link register + mrs lr, ELR_EL1 // Get exception link register mrs x23, SPSR_EL1 // Load CPSR into var reg x23 mrs x24, FPSR mrs x25, FPCR + str lr, [x0, SS64_PC] // Save ELR to PCB str w23, [x0, SS64_CPSR] // Save CPSR to PCB str w24, [x0, NS64_FPSR] @@ -372,6 +373,8 @@ Lel1_sp0_serror_vector_long: .endmacro Lel1_sp1_synchronous_vector_long: + b check_exception_stack +Lel1_sp1_synchronous_valid_stack: #if defined(KERNEL_INTEGRITY_KTRR) b check_ktrr_sctlr_trap Lel1_sp1_synchronous_vector_continue: @@ -400,7 +403,7 @@ Lel1_sp1_serror_vector_long: b fleh_dispatch64 .macro EL0_64_VECTOR - mov x18, xzr // Zero x18 to avoid leaking data to user SS + mov x18, #0 // Zero x18 to avoid leaking data to user SS stp x0, x1, [sp, #-16]! // Save x0 and x1 to the exception stack mrs x0, TPIDR_EL1 // Load the thread register mrs x1, SP_EL0 // Load the user stack pointer @@ -412,8 +415,8 @@ Lel1_sp1_serror_vector_long: msr SPSel, #0 // Switch to SP0 stp x0, x1, [sp, SS64_X0] // Save x0, x1 to the user PCB stp fp, lr, [sp, SS64_FP] // Save fp and lr to the user PCB - mov fp, xzr // Clear the fp and lr for the - mov lr, xzr // debugger stack frame + mov fp, #0 // Clear the fp and lr for the + mov lr, #0 // debugger stack frame mov x0, sp // Copy the user PCB pointer to x0 .endmacro @@ -457,6 +460,30 @@ Lel0_serror_vector_64_long: b fleh_dispatch64 +/* + * check_exception_stack + * + * Verifies that stack pointer at SP1 is within exception stack + * If not, will simply hang as we have no more stack to fall back on. + */ + + .text + .align 2 +check_exception_stack: + mrs x18, TPIDR_EL1 // Get thread pointer + cbz x18, Lvalid_exception_stack // Thread context may not be set early in boot + ldr x18, [x18, ACT_CPUDATAP] + cbz x18, . 
// If thread context is set, cpu data should be too + ldr x18, [x18, CPU_EXCEPSTACK_TOP] + cmp sp, x18 + b.gt . // Hang if above exception stack top + sub x18, x18, EXCEPSTACK_SIZE_NUM // Find bottom of exception stack + cmp sp, x18 + b.lt . // Hang if below exception stack bottom +Lvalid_exception_stack: + mov x18, #0 + b Lel1_sp1_synchronous_valid_stack + /* * check_kernel_stack * @@ -492,17 +519,10 @@ Ltest_kstack: Ltest_istack: ldr x1, [x1, ACT_CPUDATAP] // Load the cpu data ptr ldr x2, [x1, CPU_INTSTACK_TOP] // Get top of istack - sub x3, x2, PGBYTES // Find bottom of istack + sub x3, x2, INTSTACK_SIZE_NUM // Find bottom of istack cmp x0, x2 // if (SP_EL0 >= istack top) - b.ge Ltest_fiqstack // jump to fiqstack test - cmp x0, x3 // if (SP_EL0 > istack bottom) - b.gt Lvalid_stack // stack pointer valid -Ltest_fiqstack: - ldr x2, [x1, CPU_FIQSTACK_TOP] // Get top of fiqstack - sub x3, x2, PGBYTES // Find bottom of fiqstack - cmp x0, x2 // if (SP_EL0 >= fiqstack top) b.ge Lcorrupt_stack // corrupt stack pointer - cmp x0, x3 // if (SP_EL0 > fiqstack bottom) + cmp x0, x3 // if (SP_EL0 > istack bottom) b.gt Lvalid_stack // stack pointer valid Lcorrupt_stack: INIT_SAVED_STATE_FLAVORS sp, w0, w1 @@ -570,32 +590,32 @@ fleh_dispatch64: cmp x23, #(PSR64_MODE_EL0) bne 1f - mov x2, xzr - mov x3, xzr - mov x4, xzr - mov x5, xzr - mov x6, xzr - mov x7, xzr - mov x8, xzr - mov x9, xzr - mov x10, xzr - mov x11, xzr - mov x12, xzr - mov x13, xzr - mov x14, xzr - mov x15, xzr - mov x16, xzr - mov x17, xzr - mov x18, xzr - mov x19, xzr - mov x20, xzr + mov x2, #0 + mov x3, #0 + mov x4, #0 + mov x5, #0 + mov x6, #0 + mov x7, #0 + mov x8, #0 + mov x9, #0 + mov x10, #0 + mov x11, #0 + mov x12, #0 + mov x13, #0 + mov x14, #0 + mov x15, #0 + mov x16, #0 + mov x17, #0 + mov x18, #0 + mov x19, #0 + mov x20, #0 /* x21, x22 cleared in common case below */ - mov x23, xzr - mov x24, xzr - mov x25, xzr - mov x26, xzr - mov x27, xzr - mov x28, xzr + mov x23, #0 + mov x24, #0 + mov x25, #0 + 
mov x26, #0 + mov x27, #0 + mov x28, #0 /* fp/lr already cleared by EL0_64_VECTOR */ 1: @@ -910,7 +930,6 @@ check_user_asts: // return_to_user, the latter will have to change. // - exception_return: msr DAIFSet, #DAIFSC_ALL // Disable exceptions mrs x3, TPIDR_EL1 // Load thread pointer @@ -1021,7 +1040,7 @@ Lexception_return_restore_registers: mrs x18, TTBR0_EL1 bic x18, x18, #(1 << TTBR_ASID_SHIFT) msr TTBR0_EL1, x18 - mov x18, xzr + mov x18, #0 /* We don't need an ISB here, as the eret is synchronizing. */ Lskip_ttbr1_switch: @@ -1044,7 +1063,7 @@ user_set_debug_state_and_return: POP_FRAME isb mrs x3, TPIDR_EL1 // Reload thread pointer - b exception_return // And continue + b exception_return // And continue .text .align 2 diff --git a/osfmk/arm64/loose_ends.c b/osfmk/arm64/loose_ends.c index 1eec53104..e211448c7 100644 --- a/osfmk/arm64/loose_ends.c +++ b/osfmk/arm64/loose_ends.c @@ -53,8 +53,13 @@ #define INT_SIZE (BYTE_SIZE * sizeof (int)) -void -bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes) +#define BCOPY_PHYS_SRC_IS_PHYS(flags) (((flags) & cppvPsrc) != 0) +#define BCOPY_PHYS_DST_IS_PHYS(flags) (((flags) & cppvPsnk) != 0) +#define BCOPY_PHYS_SRC_IS_USER(flags) (((flags) & (cppvPsrc | cppvKmap)) == 0) +#define BCOPY_PHYS_DST_IS_USER(flags) (((flags) & (cppvPsnk | cppvKmap)) == 0) + +static kern_return_t +bcopy_phys_internal(addr64_t src, addr64_t dst, vm_size_t bytes, int flags) { unsigned int src_index; unsigned int dst_index; @@ -62,49 +67,108 @@ bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes) vm_offset_t dst_offset; unsigned int wimg_bits_src, wimg_bits_dst; unsigned int cpu_num = 0; - ppnum_t pn_src = (ppnum_t)(src >> PAGE_SHIFT); - ppnum_t pn_dst = (ppnum_t)(dst >> PAGE_SHIFT); - -#ifdef __ARM_COHERENT_IO__ - if (pmap_valid_address(src) && - pmap_valid_address(dst) && - (mmu_kvtop_wpreflight(phystokv((pmap_paddr_t) dst)))) { - bcopy((char *)phystokv((pmap_paddr_t) src), (char *)phystokv((pmap_paddr_t) dst), bytes); - return; - } + 
ppnum_t pn_src; + ppnum_t pn_dst; + addr64_t end __assert_only; + kern_return_t res = KERN_SUCCESS; + + assert(!__improbable(os_add_overflow(src, bytes, &end))); + assert(!__improbable(os_add_overflow(dst, bytes, &end))); + + while ((bytes > 0) && (res == KERN_SUCCESS)) { + src_offset = src & PAGE_MASK; + dst_offset = dst & PAGE_MASK; + boolean_t use_copy_window_src = FALSE; + boolean_t use_copy_window_dst = FALSE; + vm_size_t count = bytes; + vm_size_t count2 = bytes; + if (BCOPY_PHYS_SRC_IS_PHYS(flags)) { + use_copy_window_src = !pmap_valid_address(src); + pn_src = (ppnum_t)(src >> PAGE_SHIFT); +#if !defined(__ARM_COHERENT_IO__) && !__ARM_PTE_PHYSMAP__ + count = PAGE_SIZE - src_offset; + wimg_bits_src = pmap_cache_attributes(pn_src); + if ((wimg_bits_src & VM_WIMG_MASK) != VM_WIMG_DEFAULT) + use_copy_window_src = TRUE; +#else + if (use_copy_window_src) { + wimg_bits_src = pmap_cache_attributes(pn_src); + count = PAGE_SIZE - src_offset; + } #endif + } + if (BCOPY_PHYS_DST_IS_PHYS(flags)) { + // write preflighting needed for things like dtrace which may write static read-only mappings + use_copy_window_dst = (!pmap_valid_address(dst) || !mmu_kvtop_wpreflight(phystokv((pmap_paddr_t)dst))); + pn_dst = (ppnum_t)(dst >> PAGE_SHIFT); +#if !defined(__ARM_COHERENT_IO__) && !__ARM_PTE_PHYSMAP__ + count2 = PAGE_SIZE - dst_offset; + wimg_bits_dst = pmap_cache_attributes(pn_dst); + if ((wimg_bits_dst & VM_WIMG_MASK) != VM_WIMG_DEFAULT) + use_copy_window_dst = TRUE; +#else + if (use_copy_window_dst) { + wimg_bits_dst = pmap_cache_attributes(pn_dst); + count2 = PAGE_SIZE - dst_offset; + } +#endif + } - wimg_bits_src = pmap_cache_attributes(pn_src); - wimg_bits_dst = pmap_cache_attributes(pn_dst); + char *tmp_src; + char *tmp_dst; -#ifndef __ARM_COHERENT_IO__ - if (((wimg_bits_src & VM_WIMG_MASK) == VM_WIMG_DEFAULT) && - ((wimg_bits_dst & VM_WIMG_MASK) == VM_WIMG_DEFAULT) && - (mmu_kvtop_wpreflight(phystokv((pmap_paddr_t) dst)))) { - /* Fast path - dst is writable and both 
source and destination have default attributes */ - bcopy((char *)phystokv((pmap_paddr_t) src), (char *)phystokv((pmap_paddr_t) dst), bytes); - return; - } -#endif + if (use_copy_window_src || use_copy_window_dst) { + mp_disable_preemption(); + cpu_num = cpu_number(); + } + + if (use_copy_window_src) { + src_index = pmap_map_cpu_windows_copy(pn_src, VM_PROT_READ, wimg_bits_src); + tmp_src = (char*)(pmap_cpu_windows_copy_addr(cpu_num, src_index) + src_offset); + } else if (BCOPY_PHYS_SRC_IS_PHYS(flags)) { + tmp_src = (char*)phystokv_range((pmap_paddr_t)src, &count); + } else { + tmp_src = (char*)src; + } + if (use_copy_window_dst) { + dst_index = pmap_map_cpu_windows_copy(pn_dst, VM_PROT_READ | VM_PROT_WRITE, wimg_bits_dst); + tmp_dst = (char*)(pmap_cpu_windows_copy_addr(cpu_num, dst_index) + dst_offset); + } else if (BCOPY_PHYS_DST_IS_PHYS(flags)) { + tmp_dst = (char*)phystokv_range((pmap_paddr_t)dst, &count2); + } else { + tmp_dst = (char*)dst; + } - src_offset = src & PAGE_MASK; - dst_offset = dst & PAGE_MASK; + if (count > count2) + count = count2; + if (count > bytes) + count = bytes; - if ((src_offset + bytes) > PAGE_SIZE || (dst_offset + bytes) > PAGE_SIZE) - panic("bcopy extends beyond copy windows"); + if (BCOPY_PHYS_SRC_IS_USER(flags)) + res = copyin((user_addr_t)src, tmp_dst, count); + else if (BCOPY_PHYS_DST_IS_USER(flags)) + res = copyout(tmp_src, (user_addr_t)dst, count); + else + bcopy(tmp_src, tmp_dst, count); - mp_disable_preemption(); - cpu_num = cpu_number(); - src_index = pmap_map_cpu_windows_copy(pn_src, VM_PROT_READ, wimg_bits_src); - dst_index = pmap_map_cpu_windows_copy(pn_dst, VM_PROT_READ|VM_PROT_WRITE, wimg_bits_dst); + if (use_copy_window_src) + pmap_unmap_cpu_windows_copy(src_index); + if (use_copy_window_dst) + pmap_unmap_cpu_windows_copy(dst_index); + if (use_copy_window_src || use_copy_window_dst) + mp_enable_preemption(); - bcopy((char *)(pmap_cpu_windows_copy_addr(cpu_num, src_index) + src_offset), - (char 
*)(pmap_cpu_windows_copy_addr(cpu_num, dst_index) + dst_offset), - bytes); + src += count; + dst += count; + bytes -= count; + } + return res; +} - pmap_unmap_cpu_windows_copy(src_index); - pmap_unmap_cpu_windows_copy(dst_index); - mp_enable_preemption(); +void +bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes) +{ + bcopy_phys_internal(src, dst, bytes, cppvPsrc | cppvPsnk); } void @@ -119,48 +183,53 @@ bzero_phys(addr64_t src, vm_size_t bytes) { unsigned int wimg_bits; unsigned int cpu_num = cpu_number(); - ppnum_t pn = (ppnum_t)(src >> PAGE_SHIFT); + ppnum_t pn; + addr64_t end __assert_only; -#ifdef __ARM_COHERENT_IO__ - if (pmap_valid_address(src)) { - bzero((char *)phystokv((pmap_paddr_t) src), bytes); - return; - } -#endif + assert(!__improbable(os_add_overflow(src, bytes, &end))); - wimg_bits = pmap_cache_attributes(pn); + vm_offset_t offset = src & PAGE_MASK; + while (bytes > 0) { + vm_size_t count = bytes; -#ifndef __ARM_COHERENT_IO__ - if ((wimg_bits & VM_WIMG_MASK) == VM_WIMG_DEFAULT) { - /* Fast path - default attributes */ - bzero((char *)phystokv((pmap_paddr_t) src), bytes); - return; - } + boolean_t use_copy_window = !pmap_valid_address(src); + pn = (ppnum_t)(src >> PAGE_SHIFT); +#if !defined(__ARM_COHERENT_IO__) && !__ARM_PTE_PHYSMAP__ + count = PAGE_SIZE - offset; + wimg_bits = pmap_cache_attributes(pn); + if ((wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) + use_copy_window = TRUE; +#else + if (use_copy_window) { + wimg_bits = pmap_cache_attributes(pn); + count = PAGE_SIZE - offset; + } #endif - - mp_disable_preemption(); - cpu_num = cpu_number(); - - while (bytes > 0) { - vm_offset_t offset = src & PAGE_MASK; - uint64_t count = PAGE_SIZE - offset; + char *buf; + unsigned int index; + if (use_copy_window) { + mp_disable_preemption(); + cpu_num = cpu_number(); + index = pmap_map_cpu_windows_copy(pn, VM_PROT_READ | VM_PROT_WRITE, wimg_bits); + buf = (char *)(pmap_cpu_windows_copy_addr(cpu_num, index) + offset); + } else { + buf = (char 
*)phystokv_range((pmap_paddr_t)src, &count); + } if (count > bytes) count = bytes; - pn = (ppnum_t)(src >> PAGE_SHIFT); - - unsigned int index = pmap_map_cpu_windows_copy(pn, VM_PROT_READ | VM_PROT_WRITE, wimg_bits); - - bzero((char *)(pmap_cpu_windows_copy_addr(cpu_num, index) + offset), count); + bzero(buf, count); - pmap_unmap_cpu_windows_copy(index); + if (use_copy_window) { + pmap_unmap_cpu_windows_copy(index); + mp_enable_preemption(); + } src += count; bytes -= count; + offset = 0; } - - mp_enable_preemption(); } /* @@ -174,13 +243,17 @@ ml_phys_read_data(pmap_paddr_t paddr, int size) unsigned int index; unsigned int wimg_bits; ppnum_t pn = (ppnum_t)(paddr >> PAGE_SHIFT); + ppnum_t pn_end = (ppnum_t)((paddr + size - 1) >> PAGE_SHIFT); unsigned long long result = 0; vm_offset_t copywindow_vaddr = 0; unsigned char s1; unsigned short s2; unsigned int s4; -#ifdef __ARM_COHERENT_IO__ + if (__improbable(pn_end != pn)) + panic("%s: paddr 0x%llx spans a page boundary", __func__, (uint64_t)paddr); + +#if defined(__ARM_COHERENT_IO__) || __ARM_PTE_PHYSMAP__ if (pmap_valid_address(paddr)) { switch (size) { case 1: @@ -301,9 +374,13 @@ ml_phys_write_data(pmap_paddr_t paddr, unsigned long long data, int size) unsigned int index; unsigned int wimg_bits; ppnum_t pn = (ppnum_t)(paddr >> PAGE_SHIFT); + ppnum_t pn_end = (ppnum_t)((paddr + size - 1) >> PAGE_SHIFT); vm_offset_t copywindow_vaddr = 0; -#ifdef __ARM_COHERENT_IO__ + if (__improbable(pn_end != pn)) + panic("%s: paddr 0x%llx spans a page boundary", __func__, (uint64_t)paddr); + +#if defined(__ARM_COHERENT_IO__) || __ARM_PTE_PHYSMAP__ if (pmap_valid_address(paddr)) { switch (size) { case 1: @@ -539,53 +616,21 @@ memcmp(const void *s1, const void *s2, size_t n) kern_return_t copypv(addr64_t source, addr64_t sink, unsigned int size, int which) { - kern_return_t retval = KERN_SUCCESS; - void *from, *to; -#ifndef __ARM_COHERENT_IO__ - unsigned int from_wimg_bits, to_wimg_bits; -#endif + if ((which & (cppvPsrc | cppvPsnk)) 
== 0) /* Make sure that only one is virtual */ + panic("%s: no more than 1 parameter may be virtual", __func__); - from = CAST_DOWN(void *, source); - to = CAST_DOWN(void *, sink); - - if ((which & (cppvPsrc | cppvPsnk)) == 0) /* Make sure that only - * one is virtual */ - panic("copypv: no more than 1 parameter may be virtual\n"); /* Not allowed */ - - if (which & cppvPsrc) - from = (void *)phystokv(from); - if (which & cppvPsnk) - to = (void *)phystokv(to); - - if ((which & (cppvPsrc | cppvKmap)) == 0) /* Source is virtual in - * current map */ - retval = copyin((user_addr_t) from, to, size); - else if ((which & (cppvPsnk | cppvKmap)) == 0) /* Sink is virtual in - * current map */ - retval = copyout(from, (user_addr_t) to, size); - else /* both addresses are physical or kernel map */ - bcopy(from, to, size); - -#ifndef __ARM_COHERENT_IO__ - if (which & cppvFsrc) { - flush_dcache64(source, size, ((which & cppvPsrc) == cppvPsrc)); - } else if (which & cppvPsrc) { - from_wimg_bits = pmap_cache_attributes(source >> PAGE_SHIFT); - if ((from_wimg_bits != VM_WIMG_COPYBACK) && (from_wimg_bits != VM_WIMG_WTHRU)) - flush_dcache64(source, size, TRUE); - } + kern_return_t res = bcopy_phys_internal(source, sink, size, which); - if (which & cppvFsnk) { - flush_dcache64(sink, size, ((which & cppvPsnk) == cppvPsnk)); - } else if (which & cppvPsnk) { - to_wimg_bits = pmap_cache_attributes(sink >> PAGE_SHIFT); - if (to_wimg_bits != VM_WIMG_COPYBACK) - flush_dcache64(sink, size, TRUE); - } +#ifndef __ARM_COHERENT_IO__ + if (which & cppvFsrc) + flush_dcache64(source, size, ((which & cppvPsrc) == cppvPsrc)); + + if (which & cppvFsnk) + flush_dcache64(sink, size, ((which & cppvPsnk) == cppvPsnk)); #endif - return retval; -} + return res; +} #if MACH_ASSERT diff --git a/osfmk/arm64/lowmem_vectors.c b/osfmk/arm64/lowmem_vectors.c index a04ef6d76..6cd326a30 100644 --- a/osfmk/arm64/lowmem_vectors.c +++ b/osfmk/arm64/lowmem_vectors.c @@ -66,21 +66,21 @@ lowglo lowGlo __attribute__ 
((aligned(PAGE_MAX_SIZE))) = { .lgManualPktAddr = (uint64_t) &manual_pkt, #endif .lgPmapMemQ = (uint64_t)&(pmap_object_store.memq), - .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, phys_page), - .lgPmapMemChainOffset = offsetof(struct vm_page, listq), + .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, vmp_phys_page), + .lgPmapMemChainOffset = offsetof(struct vm_page, vmp_listq), .lgPmapMemPagesize = (uint64_t)sizeof(struct vm_page), .lgPmapMemFromArrayMask = VM_PACKED_FROM_VM_PAGES_ARRAY, .lgPmapMemPackedShift = VM_PACKED_POINTER_SHIFT, .lgPmapMemPackedBaseAddr = VM_MIN_KERNEL_AND_KEXT_ADDRESS, .lgPmapMemStartAddr = -1, .lgPmapMemEndAddr = -1, - .lgPmapMemFirstppnum = -1 + .lgPmapMemFirstppnum = -1, + .lgPageShift = ARM_PGSHIFT }; void patch_low_glo(void) { lowGlo.lgStext = (uint64_t)vm_kernel_stext; - lowGlo.lgPageShift = PAGE_SHIFT; } void patch_low_glo_static_region(uint64_t address, uint64_t size) @@ -95,4 +95,5 @@ void patch_low_glo_vm_page_info(void * start_addr, void * end_addr, uint32_t fir lowGlo.lgPmapMemStartAddr = (uint64_t)start_addr; lowGlo.lgPmapMemEndAddr = (uint64_t)end_addr; lowGlo.lgPmapMemFirstppnum = first_ppnum; + lowGlo.lgPageShift = PAGE_SHIFT; } diff --git a/osfmk/arm64/lz4_decode_arm64.s b/osfmk/arm64/lz4_decode_arm64.s index 2c7353465..632776e95 100644 --- a/osfmk/arm64/lz4_decode_arm64.s +++ b/osfmk/arm64/lz4_decode_arm64.s @@ -26,6 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include +#include #if LZ4_ENABLE_ASSEMBLY_DECODE_ARM64 /* @@ -73,13 +74,14 @@ #define src_good x20 .macro establish_frame + ARM64_STACK_PROLOG stp fp, lr, [sp, #-16]! 
mov fp, sp .endm .macro clear_frame_and_return ldp fp, lr, [sp], #16 - ret lr + ARM64_STACK_EPILOG .endm // copy_1x16 SOURCE_ADDR DESTINATION_ADDR diff --git a/osfmk/arm64/lz4_encode_arm64.s b/osfmk/arm64/lz4_encode_arm64.s index bf94a8536..1c5a51e8c 100644 --- a/osfmk/arm64/lz4_encode_arm64.s +++ b/osfmk/arm64/lz4_encode_arm64.s @@ -28,6 +28,7 @@ #include #include +#include #if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64 @@ -54,6 +55,7 @@ _lz4_encode_2gb: // esteblish frame + ARM64_STACK_PROLOG stp fp, lr, [sp, #-16]! mov fp, sp @@ -391,7 +393,7 @@ L_done: // clear frame ldp fp, lr, [sp], #16 - ret lr + ARM64_STACK_EPILOG L_revert_x9_and_done: sub x9, x9, #1 diff --git a/osfmk/arm64/machine_kpc.h b/osfmk/arm64/machine_kpc.h index 1ba778dc7..7f1b0b640 100644 --- a/osfmk/arm64/machine_kpc.h +++ b/osfmk/arm64/machine_kpc.h @@ -33,14 +33,7 @@ typedef uint64_t kpc_config_t; #define KPC_ARM64_FIXED_COUNT (2) -#if NO_MONITOR -/* Addition of 2 counters to the SoC happens to coincide with removal of - * EL3 monitor. If this changes again in the future, consider moving - * counter config to per-SoC headers. 
*/ -#define KPC_ARM64_CONFIGURABLE_COUNT (8) -#else -#define KPC_ARM64_CONFIGURABLE_COUNT (6) -#endif +#define KPC_ARM64_CONFIGURABLE_COUNT (CORE_NCTRS - KPC_ARM64_FIXED_COUNT) #define KPC_ARM64_COUNTER_WIDTH (47) #define KPC_ARM64_COUNTER_MASK ((UINT64_C(1) << KPC_ARM64_COUNTER_WIDTH) - 1) diff --git a/osfmk/arm64/machine_routines.c b/osfmk/arm64/machine_routines.c index f6a5db38c..e9a22430d 100644 --- a/osfmk/arm64/machine_routines.c +++ b/osfmk/arm64/machine_routines.c @@ -56,6 +56,8 @@ #include #endif +#include + #if KPC #include #endif @@ -73,16 +75,23 @@ boolean_t is_clock_configured = FALSE; extern int mach_assert; extern volatile uint32_t debug_enabled; +extern vm_offset_t segEXTRADATA; +extern vm_offset_t segLOWESTTEXT; +extern vm_offset_t segLASTB; +extern unsigned long segSizeLAST; + void machine_conf(void); thread_t Idle_context(void); -static uint32_t cpu_phys_ids[MAX_CPUS] = {[0 ... MAX_CPUS - 1] = (uint32_t)-1}; -static unsigned int avail_cpus = 0; -static int boot_cpu = -1; -static int max_cpu_number = 0; -cluster_type_t boot_cluster = CLUSTER_TYPE_SMP; +SECURITY_READ_ONLY_LATE(static uint32_t) cpu_phys_ids[MAX_CPUS] = {[0 ... MAX_CPUS - 1] = (uint32_t)-1}; +SECURITY_READ_ONLY_LATE(static unsigned int) avail_cpus = 0; +SECURITY_READ_ONLY_LATE(static int) boot_cpu = -1; +SECURITY_READ_ONLY_LATE(static int) max_cpu_number = 0; +SECURITY_READ_ONLY_LATE(cluster_type_t) boot_cluster = CLUSTER_TYPE_SMP; + +SECURITY_READ_ONLY_LATE(static uint32_t) fiq_eventi = UINT32_MAX; lockdown_handler_t lockdown_handler; void *lockdown_this; @@ -183,19 +192,6 @@ pmap_paddr_t get_mmu_ttb(void) return value; } -MARK_AS_PMAP_TEXT -void set_mmu_ttb(pmap_paddr_t value) -{ -#if __ARM_KERNEL_PROTECT__ - /* All EL1-mode ASIDs are odd. 
*/ - value |= (1ULL << TTBR_ASID_SHIFT); -#endif /* __ARM_KERNEL_PROTECT__ */ - - __builtin_arm_dsb(DSB_ISH); - MSR("TTBR0_EL1", value); - __builtin_arm_isb(ISB_SY); -} - static uint32_t get_midr_el1(void) { uint64_t value; @@ -267,7 +263,15 @@ void rorgn_stash_range(void) } #endif - /* Get the AMC values, and stash them into rorgn_begin, rorgn_end. */ + /* Get the AMC values, and stash them into rorgn_begin, rorgn_end. + * gPhysBase is the base of DRAM managed by xnu. we need DRAM_BASE as + * the AMCC RO region begin/end registers are in units of 16KB page + * numbers from DRAM_BASE so we'll truncate gPhysBase at 512MB granule + * and assert the value is the canonical DRAM_BASE PA of 0x8_0000_0000 for arm64. + */ + + uint64_t dram_base = gPhysBase & ~0x1FFFFFFFULL; /* 512MB */ + assert(dram_base == 0x800000000ULL); #if defined(KERNEL_INTEGRITY_KTRR) uint64_t soc_base = 0; @@ -288,8 +292,8 @@ void rorgn_stash_range(void) #if defined(KERNEL_INTEGRITY_KTRR) assert(rRORGNENDADDR > rRORGNBASEADDR); - rorgn_begin = (rRORGNBASEADDR << ARM_PGSHIFT) + gPhysBase; - rorgn_end = (rRORGNENDADDR << ARM_PGSHIFT) + gPhysBase; + rorgn_begin = (rRORGNBASEADDR << AMCC_PGSHIFT) + dram_base; + rorgn_end = (rRORGNENDADDR << AMCC_PGSHIFT) + dram_base; #else #error KERNEL_INTEGRITY config error #endif /* defined (KERNEL_INTEGRITY_KTRR) */ @@ -358,7 +362,7 @@ static void assert_amcc_cache_disabled() { void rorgn_lockdown(void) { vm_offset_t ktrr_begin, ktrr_end; - unsigned long plt_segsz, last_segsz; + unsigned long last_segsz; #if DEVELOPMENT || DEBUG boolean_t ktrr_disable = FALSE; @@ -377,22 +381,24 @@ void rorgn_lockdown(void) assert_unlocked(); /* [x] - Use final method of determining all kernel text range or expect crashes */ - - ktrr_begin = (uint64_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &plt_segsz); + ktrr_begin = segEXTRADATA; assert(ktrr_begin && gVirtBase && gPhysBase); ktrr_begin = kvtophys(ktrr_begin); + ktrr_end = kvtophys(segLASTB); + last_segsz = 
segSizeLAST; +#if defined(KERNEL_INTEGRITY_KTRR) /* __LAST is not part of the MMU KTRR region (it is however part of the AMCC KTRR region) */ - ktrr_end = (uint64_t) getsegdatafromheader(&_mh_execute_header, "__LAST", &last_segsz); - ktrr_end = (kvtophys(ktrr_end) - 1) & ~PAGE_MASK; - + ktrr_end = (ktrr_end - 1) & ~AMCC_PGMASK; /* ensure that iboot and xnu agree on the ktrr range */ assert(rorgn_begin == ktrr_begin && rorgn_end == (ktrr_end + last_segsz)); /* assert that __LAST segment containing privileged insns is only a single page */ assert(last_segsz == PAGE_SIZE); +#endif + -#if DEBUG +#if DEBUG || DEVELOPMENT printf("KTRR Begin: %p End: %p, setting lockdown\n", (void *)ktrr_begin, (void *)ktrr_end); #endif @@ -401,7 +407,7 @@ void rorgn_lockdown(void) assert_amcc_cache_disabled(); CleanPoC_DcacheRegion_Force(phystokv(ktrr_begin), - (unsigned)((ktrr_end + last_segsz) - ktrr_begin + PAGE_MASK)); + (unsigned)((ktrr_end + last_segsz) - ktrr_begin + AMCC_PGMASK)); lock_amcc(); @@ -446,7 +452,7 @@ void machine_lockdown_preflight(void) #if CONFIG_KERNEL_INTEGRITY #if defined(KERNEL_INTEGRITY_KTRR) - rorgn_stash_range(); + rorgn_stash_range(); #endif #endif @@ -470,13 +476,13 @@ void machine_lockdown(void) #if defined(KERNEL_INTEGRITY_KTRR) - /* KTRR - * - * Lock physical KTRR region. KTRR region is read-only. Memory outside - * the region is not executable at EL1. - */ + /* KTRR + * + * Lock physical KTRR region. KTRR region is read-only. Memory outside + * the region is not executable at EL1. + */ - rorgn_lockdown(); + rorgn_lockdown(); #endif /* defined(KERNEL_INTEGRITY_KTRR)*/ @@ -833,6 +839,16 @@ ml_parse_cpu_topology(void) if (boot_cpu == -1) panic("unable to determine boot cpu!"); + + /* + * Set TPIDRRO_EL0 to indicate the correct cpu number, as we may + * not be booting from cpu 0. Userspace will consume the current + * CPU number through this register. 
For non-boot cores, this is + * done in start.s (start_cpu) using the cpu_number field of the + * per-cpu data object. + */ + assert(__builtin_arm_rsr64("TPIDRRO_EL0") == 0); + __builtin_arm_wsr64("TPIDRRO_EL0", (uint64_t)boot_cpu); } unsigned int @@ -875,6 +891,7 @@ void ml_lockdown_init() { assert(lockdown_handler_grp != NULL); lck_mtx_init(&lockdown_handler_lck, lockdown_handler_grp, NULL); + } kern_return_t @@ -1006,7 +1023,7 @@ ml_processor_register( #endif if (!is_boot_cpu) { - prng_cpu_init(this_cpu_datap->cpu_number); + early_random_cpu_init(this_cpu_datap->cpu_number); // now let next CPU register itself OSIncrementAtomic((SInt32*)&real_ncpus); } @@ -1056,20 +1073,6 @@ cause_ast_check( } } - -/* - * Routine: ml_at_interrupt_context - * Function: Check if running at interrupt context - */ -boolean_t -ml_at_interrupt_context(void) -{ - unsigned int local; - vm_offset_t intstack_top_ptr; - - intstack_top_ptr = getCpuDatap()->intstack_top; - return (((vm_offset_t)(&local) < intstack_top_ptr) && ((vm_offset_t)(&local) > (intstack_top_ptr - INTSTACK_SIZE))); -} extern uint32_t cpu_idle_count; void ml_get_power_state(boolean_t *icp, boolean_t *pidlep) { @@ -1128,14 +1131,21 @@ ml_static_ptovirt( } vm_offset_t -ml_static_vtop( - vm_offset_t vaddr) +ml_static_slide( + vm_offset_t vaddr) { - if (((vm_address_t)(vaddr) - gVirtBase) >= gPhysSize) - panic("ml_static_ptovirt(): illegal vaddr: %p\n", (void*)vaddr); - return ((vm_address_t)(vaddr) - gVirtBase + gPhysBase); + return phystokv(vaddr + vm_kernel_slide - gVirtBase + gPhysBase); } +vm_offset_t +ml_static_unslide( + vm_offset_t vaddr) +{ + return (ml_static_vtop(vaddr) - gPhysBase + gVirtBase - vm_kernel_slide) ; +} + +extern tt_entry_t *arm_kva_to_tte(vm_offset_t va); + kern_return_t ml_static_protect( vm_offset_t vaddr, /* kernel virtual address */ @@ -1181,21 +1191,12 @@ ml_static_protect( vaddr_cur += PAGE_SIZE) { ppn = pmap_find_phys(kernel_pmap, vaddr_cur); if (ppn != (vm_offset_t) NULL) { -#if 
__ARM64_TWO_LEVEL_PMAP__ tt_entry_t *tte2; -#else - tt_entry_t *tte1, *tte2; -#endif pt_entry_t *pte_p; pt_entry_t ptmp; -#if __ARM64_TWO_LEVEL_PMAP__ - tte2 = &kernel_pmap->tte[(((vaddr_cur) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)]; -#else - tte1 = &kernel_pmap->tte[(((vaddr_cur) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)]; - tte2 = &((tt_entry_t*) phystokv((*tte1) & ARM_TTE_TABLE_MASK))[(((vaddr_cur) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)]; -#endif + tte2 = arm_kva_to_tte(vaddr_cur); if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) { if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) && @@ -1452,18 +1453,6 @@ machine_choose_processor(__unused processor_set_t pset, processor_t processor) return (processor); } -vm_offset_t -ml_stack_remaining(void) -{ - uintptr_t local = (uintptr_t) &local; - - if (ml_at_interrupt_context()) { - return (local - (getCpuDatap()->intstack_top - INTSTACK_SIZE)); - } else { - return (local - current_thread()->kernel_stack); - } -} - #if KASAN vm_offset_t ml_stack_base(void); vm_size_t ml_stack_size(void); @@ -1471,19 +1460,27 @@ vm_size_t ml_stack_size(void); vm_offset_t ml_stack_base(void) { - if (ml_at_interrupt_context()) { - return getCpuDatap()->intstack_top - INTSTACK_SIZE; + uintptr_t local = (uintptr_t) &local; + vm_offset_t intstack_top_ptr; + + intstack_top_ptr = getCpuDatap()->intstack_top; + if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) { + return intstack_top_ptr - INTSTACK_SIZE; } else { - return current_thread()->kernel_stack; + return current_thread()->kernel_stack; } } vm_size_t ml_stack_size(void) { - if (ml_at_interrupt_context()) { - return INTSTACK_SIZE; + uintptr_t local = (uintptr_t) &local; + vm_offset_t intstack_top_ptr; + + intstack_top_ptr = getCpuDatap()->intstack_top; + if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) { + return INTSTACK_SIZE; } else { - return kernel_stack_size; + return kernel_stack_size; } } #endif @@ 
-1605,7 +1602,7 @@ dcache_flush_trap(vm_map_address_t start, vm_map_size_t size) vm_offset_t old_recover = thread->recover; /* Check bounds */ - if (task_has_64BitAddr(current_task())) { + if (task_has_64Bit_addr(current_task())) { if (end > MACH_VM_MAX_ADDRESS) { cache_trap_error(thread, end & ((1 << ARM64_CLINE_SHIFT) - 1)); } @@ -1622,17 +1619,12 @@ dcache_flush_trap(vm_map_address_t start, vm_map_size_t size) /* Set recovery function */ thread->recover = (vm_address_t)cache_trap_recover; -#if defined(APPLE_ARM64_ARCH_FAMILY) /* * We're coherent on Apple ARM64 CPUs, so this could be a nop. However, * if the region given us is bad, it would be good to catch it and * crash, ergo we still do the flush. */ - assert((size & 0xFFFFFFFF00000000ULL) == 0); FlushPoC_DcacheRegion(start, (uint32_t)size); -#else -#error "Make sure you don't need to xcall." -#endif /* Restore recovery function */ thread->recover = old_recover; @@ -1648,7 +1640,7 @@ icache_invalidate_trap(vm_map_address_t start, vm_map_size_t size) vm_offset_t old_recover = thread->recover; /* Check bounds */ - if (task_has_64BitAddr(current_task())) { + if (task_has_64Bit_addr(current_task())) { if (end > MACH_VM_MAX_ADDRESS) { cache_trap_error(thread, end & ((1 << ARM64_CLINE_SHIFT) - 1)); } @@ -1665,15 +1657,14 @@ icache_invalidate_trap(vm_map_address_t start, vm_map_size_t size) /* Set recovery function */ thread->recover = (vm_address_t)cache_trap_recover; -#if defined(APPLE_ARM64_ARCH_FAMILY) - /* Clean dcache to unification, except we're coherent on Apple ARM64 CPUs */ -#else -#error Make sure not cleaning is right for this platform! 
-#endif + CleanPoU_DcacheRegion(start, (uint32_t) size); /* Invalidate iCache to point of unification */ - assert((size & 0xFFFFFFFF00000000ULL) == 0); +#if __ARM_IC_NOALIAS_ICACHE__ InvalidatePoU_IcacheRegion(start, (uint32_t)size); +#else + InvalidatePoU_Icache(); +#endif /* Restore recovery function */ thread->recover = old_recover; @@ -1759,7 +1750,17 @@ _enable_virtual_timer(void) void fiq_context_init(boolean_t enable_fiq __unused) { -#if defined(APPLE_ARM64_ARCH_FAMILY) + _enable_timebase_event_stream(fiq_eventi); + + /* Interrupts still disabled. */ + assert(ml_get_interrupts_enabled() == FALSE); + _enable_virtual_timer(); +} + +void +fiq_context_bootstrap(boolean_t enable_fiq) +{ +#if defined(APPLE_ARM64_ARCH_FAMILY) || defined(BCM2837) /* Could fill in our own ops here, if we needed them */ uint64_t ticks_per_sec, ticks_per_event, events_per_sec; uint32_t bit_index; @@ -1775,9 +1776,8 @@ fiq_context_init(boolean_t enable_fiq __unused) bit_index = flsll(ticks_per_event) - 1; /* Highest bit set */ /* Round up to power of two */ - if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) { + if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) bit_index++; - } /* * The timer can only trigger on rising or falling edge, @@ -1788,89 +1788,11 @@ fiq_context_init(boolean_t enable_fiq __unused) if (bit_index != 0) bit_index--; - _enable_timebase_event_stream(bit_index); + fiq_eventi = bit_index; #else #error Need a board configuration. #endif - - /* Interrupts still disabled. 
*/ - assert(ml_get_interrupts_enabled() == FALSE); - _enable_virtual_timer(); -} - -/* - * ARM64_TODO: remove me (just a convenience while we don't have crashreporter) - */ -extern int copyinframe(vm_address_t, char *, boolean_t); -size_t _OSUserBacktrace(char *buffer, size_t bufsize); - -size_t _OSUserBacktrace(char *buffer, size_t bufsize) -{ - thread_t thread = current_thread(); - boolean_t is64bit = thread_is_64bit(thread); - size_t trace_size_bytes = 0, lr_size; - vm_address_t frame_addr; // Should really by mach_vm_offset_t... - - if (bufsize < 8) { - return 0; - } - - if (get_threadtask(thread) == kernel_task) { - panic("%s: Should never be called from a kernel thread.", __FUNCTION__); - } - - frame_addr = get_saved_state_fp(thread->machine.upcb); - if (is64bit) { - uint64_t frame[2]; - lr_size = sizeof(frame[1]); - - *((uint64_t*)buffer) = get_saved_state_pc(thread->machine.upcb); - trace_size_bytes = lr_size; - - while (trace_size_bytes + lr_size < bufsize) { - if (!(frame_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) { - break; - } - - if (0 != copyinframe(frame_addr, (char*)frame, TRUE)) { - break; - } - - *((uint64_t*)(buffer + trace_size_bytes)) = frame[1]; /* lr */ - frame_addr = frame[0]; - trace_size_bytes += lr_size; - - if (frame[0] == 0x0ULL) { - break; - } - } - } else { - uint32_t frame[2]; - lr_size = sizeof(frame[1]); - - *((uint32_t*)buffer) = (uint32_t)get_saved_state_pc(thread->machine.upcb); - trace_size_bytes = lr_size; - - while (trace_size_bytes + lr_size < bufsize) { - if (!(frame_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) { - break; - } - - if (0 != copyinframe(frame_addr, (char*)frame, FALSE)) { - break; - } - - *((uint32_t*)(buffer + trace_size_bytes)) = frame[1]; /* lr */ - frame_addr = frame[0]; - trace_size_bytes += lr_size; - - if (frame[0] == 0x0ULL) { - break; - } - } - } - - return trace_size_bytes; + fiq_context_init(enable_fiq); } boolean_t @@ -1890,7 +1812,7 @@ ml_delay_should_spin(uint64_t interval) } boolean_t 
ml_thread_is64bit(thread_t thread) { - return (thread_is_64bit(thread)); + return (thread_is_64bit_addr(thread)); } void ml_timer_evaluate(void) { diff --git a/osfmk/arm64/machine_routines_asm.s b/osfmk/arm64/machine_routines_asm.s index 042246006..8f51e2a22 100644 --- a/osfmk/arm64/machine_routines_asm.s +++ b/osfmk/arm64/machine_routines_asm.s @@ -35,6 +35,7 @@ #include "assym.s" + /* uint32_t get_fpscr(void): * Returns (FPSR | FPCR). */ @@ -82,6 +83,54 @@ LEXT(set_fpscr) #endif ret +/* + * void update_mdscr(unsigned long clear, unsigned long set) + * Clears and sets the specified bits in MDSCR_EL1. + * + * Setting breakpoints in EL1 is effectively a KTRR bypass. The ability to do so is + * controlled by MDSCR.KDE. The MSR to set MDSCR must be present to allow + * self-hosted user mode debug. Any checks before the MRS can be skipped with ROP, + * so we need to put the checks after the MRS where they can't be skipped. That + * still leaves a small window if a breakpoint is set on the instruction + * immediately after the MRS. To handle that, we also do a check and then set of + * the breakpoint control registers. This allows us to guarantee that a given + * core will never have both KDE set and a breakpoint targeting EL1. + * + * If KDE gets set, unset it and then panic + */ + .align 2 + .globl EXT(update_mdscr) +LEXT(update_mdscr) + mov x4, #0 + mrs x2, MDSCR_EL1 + bic x2, x2, x0 + orr x2, x2, x1 +1: + bic x2, x2, #0x2000 + msr MDSCR_EL1, x2 +#if defined(CONFIG_KERNEL_INTEGRITY) + /* + * verify KDE didn't get set (including via ROP) + * If set, clear it and then panic + */ + ands x3, x2, #0x2000 + orr x4, x4, x3 + bne 1b + cmp x4, xzr + b.ne Lupdate_mdscr_panic +#endif + ret + +Lupdate_mdscr_panic: + adrp x0, Lupdate_mdscr_panic_str@page + add x0, x0, Lupdate_mdscr_panic_str@pageoff + b EXT(panic) + b . 
+ +Lupdate_mdscr_panic_str: + .asciz "MDSCR.KDE was set" + + #if __ARM_KERNEL_PROTECT__ /* * __ARM_KERNEL_PROTECT__ adds two complications to TLB management: @@ -96,6 +145,40 @@ LEXT(set_fpscr) */ #endif /* __ARM_KERNEL_PROTECT__ */ +.macro SYNC_TLB_FLUSH + dsb ish + isb sy +.endmacro + + +/* + * void sync_tlb_flush(void) + * + * Synchronize one or more prior TLB flush operations + */ + .text + .align 2 + .globl EXT(sync_tlb_flush) +LEXT(sync_tlb_flush) + SYNC_TLB_FLUSH + ret + + +.macro FLUSH_MMU_TLB + tlbi vmalle1is +.endmacro +/* + * void flush_mmu_tlb_async(void) + * + * Flush all TLBs, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_async) +LEXT(flush_mmu_tlb_async) + FLUSH_MMU_TLB + ret + /* * void flush_mmu_tlb(void) * @@ -105,34 +188,40 @@ LEXT(set_fpscr) .align 2 .globl EXT(flush_mmu_tlb) LEXT(flush_mmu_tlb) - tlbi vmalle1is - dsb ish - isb sy + FLUSH_MMU_TLB + SYNC_TLB_FLUSH ret +.macro FLUSH_CORE_TLB + tlbi vmalle1 +.endmacro + /* - * void flush_core_tlb(void) + * void flush_core_tlb_async(void) * - * Flush core TLB + * Flush local core TLB, don't wait for completion */ .text .align 2 - .globl EXT(flush_core_tlb) -LEXT(flush_core_tlb) - tlbi vmalle1 - dsb ish - isb sy + .globl EXT(flush_core_tlb_async) +LEXT(flush_core_tlb_async) + FLUSH_CORE_TLB ret /* - * void flush_mmu_tlb_allentries(uint64_t, uint64_t) + * void flush_core_tlb(void) * - * Flush TLB entries + * Flush local core TLB */ .text .align 2 - .globl EXT(flush_mmu_tlb_allentries) -LEXT(flush_mmu_tlb_allentries) + .globl EXT(flush_core_tlb) +LEXT(flush_core_tlb) + FLUSH_CORE_TLB + SYNC_TLB_FLUSH + ret + +.macro FLUSH_MMU_TLB_ALLENTRIES #if __ARM_16K_PG__ and x0, x0, #~0x3 @@ -154,24 +243,37 @@ LEXT(flush_mmu_tlb_allentries) add x1, x1, #0x3 and x1, x1, #~0x3 #endif -Lflush_mmu_tlb_allentries_loop: +1: // Lflush_mmu_tlb_allentries_loop: tlbi vaae1is, x0 add x0, x0, #(ARM_PGBYTES / 4096) // Units are 4KB pages, as defined by the ISA cmp x0, x1 - b.lt 
Lflush_mmu_tlb_allentries_loop - dsb ish - isb sy - ret + b.lt 1b // Lflush_mmu_tlb_allentries_loop +.endmacro /* - * void flush_mmu_tlb_entry(uint64_t) + * void flush_mmu_tlb_allentries_async(uint64_t, uint64_t) * - * Flush TLB entry + * Flush TLB entries, don't wait for completion */ .text .align 2 - .globl EXT(flush_mmu_tlb_entry) -LEXT(flush_mmu_tlb_entry) + .globl EXT(flush_mmu_tlb_allentries_async) +LEXT(flush_mmu_tlb_allentries_async) + FLUSH_MMU_TLB_ALLENTRIES + ret + +/* + * void flush_mmu_tlb_allentries(uint64_t, uint64_t) + * + * Flush TLB entries + */ + .globl EXT(flush_mmu_tlb_allentries) +LEXT(flush_mmu_tlb_allentries) + FLUSH_MMU_TLB_ALLENTRIES + SYNC_TLB_FLUSH + ret + +.macro FLUSH_MMU_TLB_ENTRY #if __ARM_KERNEL_PROTECT__ /* * If we are flushing ASID 0, this is a kernel operation. With this @@ -179,33 +281,46 @@ LEXT(flush_mmu_tlb_entry) */ lsr x2, x0, #TLBI_ASID_SHIFT cmp x2, #0 - b.eq Lflush_mmu_tlb_entry_globally + b.eq 1f // Lflush_mmu_tlb_entry_globally bic x0, x0, #(1 << TLBI_ASID_SHIFT) tlbi vae1is, x0 orr x0, x0, #(1 << TLBI_ASID_SHIFT) #endif /* __ARM_KERNEL_PROTECT__ */ tlbi vae1is, x0 - dsb ish - isb sy - ret #if __ARM_KERNEL_PROTECT__ -Lflush_mmu_tlb_entry_globally: + b 2f // Lflush_mmu_tlb_entry_done +1: // Lflush_mmu_tlb_entry_globally: tlbi vaae1is, x0 - dsb ish - isb sy - ret +2: // Lflush_mmu_tlb_entry_done #endif /* __ARM_KERNEL_PROTECT__ */ +.endmacro +/* + * void flush_mmu_tlb_entry_async(uint64_t) + * + * Flush TLB entry, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_entry_async) +LEXT(flush_mmu_tlb_entry_async) + FLUSH_MMU_TLB_ENTRY + ret /* - * void flush_mmu_tlb_entries(uint64_t, uint64_t) + * void flush_mmu_tlb_entry(uint64_t) * - * Flush TLB entries + * Flush TLB entry */ .text .align 2 - .globl EXT(flush_mmu_tlb_entries) -LEXT(flush_mmu_tlb_entries) + .globl EXT(flush_mmu_tlb_entry) +LEXT(flush_mmu_tlb_entry) + FLUSH_MMU_TLB_ENTRY + SYNC_TLB_FLUSH + ret + +.macro FLUSH_MMU_TLB_ENTRIES #if 
__ARM_16K_PG__ and x0, x0, #~0x3 @@ -226,7 +341,7 @@ LEXT(flush_mmu_tlb_entries) */ add x1, x1, #0x3 and x1, x1, #~0x3 -#endif /* __ARM_KERNEL_PROTECT__ */ +#endif /* __ARM_16K_PG__ */ #if __ARM_KERNEL_PROTECT__ /* * If we are flushing ASID 0, this is a kernel operation. With this @@ -234,11 +349,11 @@ LEXT(flush_mmu_tlb_entries) */ lsr x2, x0, #TLBI_ASID_SHIFT cmp x2, #0 - b.eq Lflush_mmu_tlb_entries_globally_loop + b.eq 2f // Lflush_mmu_tlb_entries_globally_loop bic x0, x0, #(1 << TLBI_ASID_SHIFT) #endif /* __ARM_KERNEL_PROTECT__ */ -Lflush_mmu_tlb_entries_loop: +1: // Lflush_mmu_tlb_entries_loop tlbi vae1is, x0 #if __ARM_KERNEL_PROTECT__ orr x0, x0, #(1 << TLBI_ASID_SHIFT) @@ -247,30 +362,44 @@ Lflush_mmu_tlb_entries_loop: #endif /* __ARM_KERNEL_PROTECT__ */ add x0, x0, #(ARM_PGBYTES / 4096) // Units are pages cmp x0, x1 - b.lt Lflush_mmu_tlb_entries_loop - dsb ish - isb sy - ret + b.lt 1b // Lflush_mmu_tlb_entries_loop #if __ARM_KERNEL_PROTECT__ -Lflush_mmu_tlb_entries_globally_loop: + b 3f // Lflush_mmu_tlb_entries_done +2: // Lflush_mmu_tlb_entries_globally_loop: tlbi vaae1is, x0 add x0, x0, #(ARM_PGBYTES / 4096) // Units are pages cmp x0, x1 - b.lt Lflush_mmu_tlb_entries_globally_loop - dsb ish - isb sy - ret + b.lt 2b // Lflush_mmu_tlb_entries_globally_loop +3: // Lflush_mmu_tlb_entries_done #endif /* __ARM_KERNEL_PROTECT__ */ +.endmacro /* - * void flush_mmu_tlb_asid(uint64_t) + * void flush_mmu_tlb_entries_async(uint64_t, uint64_t) * - * Flush TLB entriesfor requested asid + * Flush TLB entries, don't wait for completion */ .text .align 2 - .globl EXT(flush_mmu_tlb_asid) -LEXT(flush_mmu_tlb_asid) + .globl EXT(flush_mmu_tlb_entries_async) +LEXT(flush_mmu_tlb_entries_async) + FLUSH_MMU_TLB_ENTRIES + ret + +/* + * void flush_mmu_tlb_entries(uint64_t, uint64_t) + * + * Flush TLB entries + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_entries) +LEXT(flush_mmu_tlb_entries) + FLUSH_MMU_TLB_ENTRIES + SYNC_TLB_FLUSH + ret + +.macro FLUSH_MMU_TLB_ASID #if 
__ARM_KERNEL_PROTECT__ /* * If we are flushing ASID 0, this is a kernel operation. With this @@ -278,33 +407,47 @@ LEXT(flush_mmu_tlb_asid) */ lsr x1, x0, #TLBI_ASID_SHIFT cmp x1, #0 - b.eq Lflush_mmu_tlb_globally + b.eq 1f // Lflush_mmu_tlb_globally bic x0, x0, #(1 << TLBI_ASID_SHIFT) tlbi aside1is, x0 orr x0, x0, #(1 << TLBI_ASID_SHIFT) #endif /* __ARM_KERNEL_PROTECT__ */ tlbi aside1is, x0 - dsb ish - isb sy - ret #if __ARM_KERNEL_PROTECT__ -Lflush_mmu_tlb_globally: + b 2f // Lflush_mmu_tlb_asid_done +1: // Lflush_mmu_tlb_globally: tlbi vmalle1is - dsb ish - isb sy - ret +2: // Lflush_mmu_tlb_asid_done: #endif /* __ARM_KERNEL_PROTECT__ */ +.endmacro /* - * void flush_core_tlb_asid(uint64_t) + * void flush_mmu_tlb_asid_async(uint64_t) * - * Flush TLB entries for core for requested asid + * Flush TLB entriesfor requested asid, don't wait for completion */ .text .align 2 - .globl EXT(flush_core_tlb_asid) -LEXT(flush_core_tlb_asid) + .globl EXT(flush_mmu_tlb_asid_async) +LEXT(flush_mmu_tlb_asid_async) + FLUSH_MMU_TLB_ASID + ret + +/* + * void flush_mmu_tlb_asid(uint64_t) + * + * Flush TLB entriesfor requested asid + */ + .text + .align 2 + .globl EXT(flush_mmu_tlb_asid) +LEXT(flush_mmu_tlb_asid) + FLUSH_MMU_TLB_ASID + SYNC_TLB_FLUSH + ret + +.macro FLUSH_CORE_TLB_ASID #if __ARM_KERNEL_PROTECT__ /* * If we are flushing ASID 0, this is a kernel operation. 
With this @@ -312,23 +455,44 @@ LEXT(flush_core_tlb_asid) */ lsr x1, x0, #TLBI_ASID_SHIFT cmp x1, #0 - b.eq Lflush_core_tlb_asid_globally + b.eq 1f // Lflush_core_tlb_asid_globally bic x0, x0, #(1 << TLBI_ASID_SHIFT) tlbi aside1, x0 orr x0, x0, #(1 << TLBI_ASID_SHIFT) #endif /* __ARM_KERNEL_PROTECT__ */ tlbi aside1, x0 - dsb ish - isb sy - ret #if __ARM_KERNEL_PROTECT__ -Lflush_core_tlb_asid_globally: + b 2f // Lflush_core_tlb_asid_done +1: // Lflush_core_tlb_asid_globally: tlbi vmalle1 - dsb ish - isb sy - ret +2: // Lflush_core_tlb_asid_done: #endif /* __ARM_KERNEL_PROTECT__ */ +.endmacro + +/* + * void flush_core_tlb_asid_async(uint64_t) + * + * Flush TLB entries for core for requested asid, don't wait for completion + */ + .text + .align 2 + .globl EXT(flush_core_tlb_asid_async) +LEXT(flush_core_tlb_asid_async) + FLUSH_CORE_TLB_ASID + ret +/* + * void flush_core_tlb_asid(uint64_t) + * + * Flush TLB entries for core for requested asid + */ + .text + .align 2 + .globl EXT(flush_core_tlb_asid) +LEXT(flush_core_tlb_asid) + FLUSH_CORE_TLB_ASID + SYNC_TLB_FLUSH + ret /* * Set MMU Translation Table Base Alternate @@ -348,6 +512,19 @@ LEXT(set_mmu_ttb_alternate) isb sy ret + .text + .align 2 + .globl EXT(set_mmu_ttb) +LEXT(set_mmu_ttb) +#if __ARM_KERNEL_PROTECT__ + /* All EL1-mode ASIDs are odd. 
*/ + orr x0, x0, #(1 << TTBR_ASID_SHIFT) +#endif /* __ARM_KERNEL_PROTECT__ */ + dsb ish + msr TTBR0_EL1, x0 + isb sy + ret + /* * set AUX control register */ @@ -447,7 +624,7 @@ LEXT(mmu_kvtop) and x0, x1, #0x0000ffffffffffff // Clear non-address bits ret L_mmu_kvtop_invalid: - mov x0, xzr // Return invalid + mov x0, #0 // Return invalid ret /* @@ -469,7 +646,7 @@ LEXT(mmu_uvtop) and x0, x1, #0x0000ffffffffffff // Clear non-address bits ret L_mmu_uvtop_invalid: - mov x0, xzr // Return invalid + mov x0, #0 // Return invalid ret /* @@ -489,7 +666,7 @@ LEXT(mmu_kvtop_wpreflight) and x0, x1, #0x0000ffffffffffff // Clear non-address bits ret L_mmu_kvtop_wpreflight_invalid: - mov x0, xzr // Return invalid + mov x0, #0 // Return invalid ret /* @@ -529,7 +706,7 @@ copyio_error: CLEAR_RECOVERY_HANDLER x10, x11 mov x0, #EFAULT // Return an EFAULT error POP_FRAME - ret + ARM64_STACK_EPILOG /* * int _bcopyin(const char *src, char *dst, vm_size_t len) @@ -538,6 +715,7 @@ copyio_error: .align 2 .globl EXT(_bcopyin) LEXT(_bcopyin) + ARM64_STACK_PROLOG PUSH_FRAME SET_RECOVERY_HANDLER x10, x11, x3, copyio_error /* If len is less than 16 bytes, just do a bytewise copy */ @@ -560,9 +738,9 @@ LEXT(_bcopyin) b.hi 2b 3: CLEAR_RECOVERY_HANDLER x10, x11 - mov x0, xzr + mov x0, #0 POP_FRAME - ret + ARM64_STACK_EPILOG /* * int _copyin_word(const char *src, uint64_t *dst, vm_size_t len) @@ -571,6 +749,7 @@ LEXT(_bcopyin) .align 2 .globl EXT(_copyin_word) LEXT(_copyin_word) + ARM64_STACK_PROLOG PUSH_FRAME SET_RECOVERY_HANDLER x10, x11, x3, copyio_error cmp x2, #4 @@ -586,11 +765,12 @@ L_copyin_word_8: ldr x8, [x0] L_copyin_word_store: str x8, [x1] - mov x0, xzr + mov x0, #0 CLEAR_RECOVERY_HANDLER x10, x11 L_copying_exit: POP_FRAME - ret + ARM64_STACK_EPILOG + /* @@ -600,6 +780,7 @@ L_copying_exit: .align 2 .globl EXT(_bcopyout) LEXT(_bcopyout) + ARM64_STACK_PROLOG PUSH_FRAME SET_RECOVERY_HANDLER x10, x11, x3, copyio_error /* If len is less than 16 bytes, just do a bytewise copy */ @@ -622,9 
+803,9 @@ LEXT(_bcopyout) b.hi 2b 3: CLEAR_RECOVERY_HANDLER x10, x11 - mov x0, xzr + mov x0, #0 POP_FRAME - ret + ARM64_STACK_EPILOG /* * int _bcopyinstr( @@ -637,12 +818,13 @@ LEXT(_bcopyout) .align 2 .globl EXT(_bcopyinstr) LEXT(_bcopyinstr) + ARM64_STACK_PROLOG PUSH_FRAME adr x4, Lcopyinstr_error // Get address for recover mrs x10, TPIDR_EL1 // Get thread pointer ldr x11, [x10, TH_RECOVER] // Save previous recover str x4, [x10, TH_RECOVER] // Store new recover - mov x4, xzr // x4 - total bytes copied + mov x4, #0 // x4 - total bytes copied Lcopyinstr_loop: ldrb w5, [x0], #1 // Load a byte from the user source strb w5, [x1], #1 // Store a byte to the kernel dest @@ -661,7 +843,7 @@ Lcopyinstr_error: Lcopyinstr_exit: str x11, [x10, TH_RECOVER] // Restore old recover POP_FRAME - ret + ARM64_STACK_EPILOG /* * int copyinframe(const vm_address_t frame_addr, char *kernel_addr, bool is64bit) @@ -684,6 +866,7 @@ Lcopyinstr_exit: .align 2 .globl EXT(copyinframe) LEXT(copyinframe) + ARM64_STACK_PROLOG PUSH_FRAME SET_RECOVERY_HANDLER x10, x11, x3, copyio_error cbnz w2, Lcopyinframe64 // Check frame size @@ -718,90 +901,8 @@ Lcopyinframe_valid: Lcopyinframe_done: CLEAR_RECOVERY_HANDLER x10, x11 POP_FRAME - ret - - -/* - * int _emulate_swp(user_addr_t addr, uint32_t newval, uint32_t *oldval) - * - * Securely emulates the swp instruction removed from armv8. - * Returns true on success. - * Returns false if the user address is not user accessible. 
- * - * x0 : address to swap - * x1 : new value to store - * x2 : address to save old value - * x3 : scratch reg - * x10 : thread pointer (set by SET_RECOVERY_HANDLER) - * x11 : old recovery handler (set by SET_RECOVERY_HANDLER) - * x12 : interrupt state - * x13 : return value - */ - .text - .align 2 - .globl EXT(_emulate_swp) -LEXT(_emulate_swp) - PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, swp_error - - // Perform swap -Lswp_try: - ldxr w3, [x0] // Load data at target address - stxr w4, w1, [x0] // Store new value to target address - cbnz w4, Lswp_try // Retry if store failed - str w3, [x2] // Save old value - mov x13, #1 // Set successful return value - -Lswp_exit: - mov x0, x13 // Set return value - CLEAR_RECOVERY_HANDLER x10, x11 - POP_FRAME - ret - -/* - * int _emulate_swpb(user_addr_t addr, uint32_t newval, uint32_t *oldval) - * - * Securely emulates the swpb instruction removed from armv8. - * Returns true on success. - * Returns false if the user address is not user accessible. 
- * - * x0 : address to swap - * x1 : new value to store - * x2 : address to save old value - * x3 : scratch reg - * x10 : thread pointer (set by SET_RECOVERY_HANDLER) - * x11 : old recovery handler (set by SET_RECOVERY_HANDLER) - * x12 : interrupt state - * x13 : return value - */ - .text - .align 2 - .globl EXT(_emulate_swpb) -LEXT(_emulate_swpb) - PUSH_FRAME - SET_RECOVERY_HANDLER x10, x11, x3, swp_error - - // Perform swap -Lswpb_try: - ldxrb w3, [x0] // Load data at target address - stxrb w4, w1, [x0] // Store new value to target address - cbnz w4, Lswp_try // Retry if store failed - str w3, [x2] // Save old value - mov x13, #1 // Set successful return value - -Lswpb_exit: - mov x0, x13 // Set return value - CLEAR_RECOVERY_HANDLER x10, x11 - POP_FRAME - ret + ARM64_STACK_EPILOG - .text - .align 2 -swp_error: - mov x0, xzr // Return false - CLEAR_RECOVERY_HANDLER x10, x11 - POP_FRAME - ret /* * uint32_t arm_debug_read_dscr(void) @@ -826,7 +927,6 @@ LEXT(arm_debug_read_dscr) LEXT(arm_debug_set_cp14) PANIC_UNIMPLEMENTED - #if defined(APPLE_ARM64_ARCH_FAMILY) /* * Note: still have to ISB before executing wfi! @@ -871,6 +971,28 @@ LEXT(arm64_prepare_for_sleep) orr x0, x0, #(ARM64_REG_CYC_OVRD_ok2pwrdn_force_down) msr ARM64_REG_CYC_OVRD, x0 +#if defined(APPLEMONSOON) + ARM64_IS_PCORE x0 + cbz x0, Lwfi_inst // skip if not p-core + + /* : Flush the GUPS prefetcher prior to + * wfi. A Skye HW bug can cause the GUPS prefetcher on p-cores + * to be left with valid entries that fail to drain if a + * subsequent wfi is issued. This can prevent the core from + * power-gating. For the idle case that is recoverable, but + * for the deep-sleep (S2R) case in which cores MUST power-gate, + * it can lead to a hang. This can be prevented by disabling + * and re-enabling GUPS, which forces the prefetch queue to + * drain. This should be done as close to wfi as possible, i.e. + * at the very end of arm64_prepare_for_sleep(). 
*/ + mrs x0, ARM64_REG_HID10 + orr x0, x0, #(ARM64_REG_HID10_DisHwpGups) + msr ARM64_REG_HID10, x0 + isb sy + and x0, x0, #(~(ARM64_REG_HID10_DisHwpGups)) + msr ARM64_REG_HID10, x0 + isb sy +#endif Lwfi_inst: dsb sy isb sy @@ -885,6 +1007,7 @@ Lwfi_inst: .align 2 .globl EXT(arm64_force_wfi_clock_gate) LEXT(arm64_force_wfi_clock_gate) + ARM64_STACK_PROLOG PUSH_FRAME mrs x0, ARM64_REG_CYC_OVRD @@ -892,7 +1015,7 @@ LEXT(arm64_force_wfi_clock_gate) msr ARM64_REG_CYC_OVRD, x0 POP_FRAME - ret + ARM64_STACK_EPILOG @@ -1030,7 +1153,64 @@ cpu_defeatures_set_ret: ret #endif +#else /* !defined(APPLE_ARM64_ARCH_FAMILY) */ + .text + .align 2 + .globl EXT(arm64_prepare_for_sleep) +LEXT(arm64_prepare_for_sleep) + PUSH_FRAME +Lwfi_inst: + dsb sy + isb sy + wfi + b Lwfi_inst + +/* + * Force WFI to use clock gating only + * Note: for non-Apple device, do nothing. + */ + .text + .align 2 + .globl EXT(arm64_force_wfi_clock_gate) +LEXT(arm64_force_wfi_clock_gate) + PUSH_FRAME + nop + POP_FRAME + +#endif /* defined(APPLE_ARM64_ARCH_FAMILY) */ + +/* + * void arm64_replace_bootstack(cpu_data_t *cpu_data) + * + * This must be called from a kernel thread context running on the boot CPU, + * after setting up new exception stacks in per-CPU data. That will guarantee + * that the stack(s) we're trying to replace aren't currently in use. For + * KTRR-protected devices, this must also be called prior to VM prot finalization + * and lockdown, as updating SP1 requires a sensitive instruction. 
+ */ + .text + .align 2 + .globl EXT(arm64_replace_bootstack) +LEXT(arm64_replace_bootstack) + ARM64_STACK_PROLOG + PUSH_FRAME + // Set the exception stack pointer + ldr x0, [x0, CPU_EXCEPSTACK_TOP] + mrs x4, DAIF // Load current DAIF; use x4 as pinst may trash x1-x3 + msr DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF | DAIFSC_ASYNCF) // Disable IRQ/FIQ/serror + // Set SP_EL1 to exception stack +#if defined(KERNEL_INTEGRITY_KTRR) + mov x1, lr + bl _pinst_spsel_1 + mov lr, x1 +#else + msr SPSel, #1 #endif + mov sp, x0 + msr SPSel, #0 + msr DAIF, x4 // Restore interrupt state + POP_FRAME + ARM64_STACK_EPILOG #ifdef MONITOR /* @@ -1050,4 +1230,5 @@ LEXT(monitor_call) ret #endif + /* vim: set sw=4 ts=4: */ diff --git a/osfmk/arm64/machine_task.c b/osfmk/arm64/machine_task.c index d9efa1cad..a07df7007 100644 --- a/osfmk/arm64/machine_task.c +++ b/osfmk/arm64/machine_task.c @@ -71,7 +71,7 @@ machine_task_set_state( case ARM_DEBUG_STATE: { arm_legacy_debug_state_t *tstate = (arm_legacy_debug_state_t *) state; - if (task_has_64BitAddr(task) || + if (task_has_64Bit_data(task) || (state_count != ARM_LEGACY_DEBUG_STATE_COUNT) || (!debug_legacy_state_is_valid(tstate))) { return KERN_INVALID_ARGUMENT; @@ -90,7 +90,7 @@ machine_task_set_state( case ARM_DEBUG_STATE32: { arm_debug_state32_t *tstate = (arm_debug_state32_t *) state; - if (task_has_64BitAddr(task) || + if (task_has_64Bit_data(task) || (state_count != ARM_DEBUG_STATE32_COUNT) || (!debug_state_is_valid32(tstate))) { return KERN_INVALID_ARGUMENT; @@ -110,7 +110,7 @@ machine_task_set_state( { arm_debug_state64_t *tstate = (arm_debug_state64_t *) state; - if ((!task_has_64BitAddr(task)) || + if ((!task_has_64Bit_data(task)) || (state_count != ARM_DEBUG_STATE64_COUNT) || (!debug_state_is_valid64(tstate))) { return KERN_INVALID_ARGUMENT; @@ -156,7 +156,7 @@ machine_task_get_state(task_t task, { arm_legacy_debug_state_t *tstate = (arm_legacy_debug_state_t *) state; - if (task_has_64BitAddr(task) || (*state_count != 
ARM_LEGACY_DEBUG_STATE_COUNT)) { + if (task_has_64Bit_data(task) || (*state_count != ARM_LEGACY_DEBUG_STATE_COUNT)) { return KERN_INVALID_ARGUMENT; } @@ -172,7 +172,7 @@ machine_task_get_state(task_t task, { arm_debug_state32_t *tstate = (arm_debug_state32_t *) state; - if (task_has_64BitAddr(task) || (*state_count != ARM_DEBUG_STATE32_COUNT)) { + if (task_has_64Bit_data(task) || (*state_count != ARM_DEBUG_STATE32_COUNT)) { return KERN_INVALID_ARGUMENT; } @@ -188,7 +188,7 @@ machine_task_get_state(task_t task, { arm_debug_state64_t *tstate = (arm_debug_state64_t *) state; - if ((!task_has_64BitAddr(task)) || (*state_count != ARM_DEBUG_STATE64_COUNT)) { + if ((!task_has_64Bit_data(task)) || (*state_count != ARM_DEBUG_STATE64_COUNT)) { return KERN_INVALID_ARGUMENT; } @@ -233,8 +233,8 @@ machine_thread_inherit_taskwide( int flavor; mach_msg_type_number_t count; - flavor = task_has_64BitAddr(parent_task) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32; - count = task_has_64BitAddr(parent_task) ? ARM_DEBUG_STATE64_COUNT : ARM_DEBUG_STATE32_COUNT; + flavor = task_has_64Bit_data(parent_task) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32; + count = task_has_64Bit_data(parent_task) ? 
ARM_DEBUG_STATE64_COUNT : ARM_DEBUG_STATE32_COUNT; return machine_thread_set_state(thread, flavor, parent_task->task_debug, count); } diff --git a/osfmk/arm64/monotonic.h b/osfmk/arm64/monotonic.h index 1cc446028..ec10b1981 100644 --- a/osfmk/arm64/monotonic.h +++ b/osfmk/arm64/monotonic.h @@ -47,12 +47,23 @@ #include +#define PMSR "s3_1_c15_c13_0" +#define PMSR_PMI(REG) ((REG) & ((1 << CORE_NCTRS) - 1)) -static inline void -mt_fiq(void) + +static inline bool +mt_pmi_pending(uint64_t * restrict pmsr, uint64_t * restrict upmsr) { + *pmsr = __builtin_arm_rsr64(PMSR); + bool pmi = PMSR_PMI(*pmsr); + +#pragma unused(upmsr) + + return pmi; } +void mt_fiq(void *cpu, uint64_t pmsr, uint64_t upmsr); + #endif /* MACH_KERNEL_PRIVATE */ #endif /* !defined(ARM64_MONOTONIC_H) */ diff --git a/osfmk/arm64/monotonic_arm64.c b/osfmk/arm64/monotonic_arm64.c index 4a1563ea7..321205db8 100644 --- a/osfmk/arm64/monotonic_arm64.c +++ b/osfmk/arm64/monotonic_arm64.c @@ -29,7 +29,7 @@ #include #include #include -#include /* static_assert, assert */ +#include #include /* panic */ #include #include /* CHAR_BIT */ @@ -39,12 +39,12 @@ #include #include #include +#include /* DTFindEntry */ #include #pragma mark core counters bool mt_core_supported = true; -void mt_fiq_internal(uint64_t upmsr); /* * PMC[0-1] are the 48-bit fixed counters -- PMC0 is cycles and PMC1 is @@ -64,6 +64,8 @@ void mt_fiq_internal(uint64_t upmsr); #define PMC8 "s3_2_c15_c9_0" #define PMC9 "s3_2_c15_c10_0" +#define CTR_MAX ((UINT64_C(1) << 47) - 1) + #define CYCLES 0 #define INSTRS 1 @@ -103,14 +105,13 @@ enum { PMCR0_INTGEN_FIQ = 4, }; #define PMCR0_INTGEN_SET(INT) ((uint64_t)(INT) << 8) -/* use AIC for backwards compatibility with kpc */ -#define PMCR0_INTGEN_INIT PMCR0_INTGEN_SET(PMCR0_INTGEN_AIC) +#define PMCR0_INTGEN_INIT PMCR0_INTGEN_SET(PMCR0_INTGEN_FIQ) /* set by hardware if a PMI was delivered */ #define PMCR0_PMAI (UINT64_C(1) << 11) #define PMCR0_PMI_EN(CTR) (UINT64_C(1) << (12 + CTR_POS(CTR))) -/* fixed 
counters are always counting XXX probably need to just set this to all true */ +/* fixed counters are always counting */ #define PMCR0_PMI_INIT (PMCR0_PMI_EN(CYCLES) | PMCR0_PMI_EN(INSTRS)) -/* disable counting on a PMI (except for AIC interrupts) */ +/* disable counting on a PMI */ #define PMCR0_DISCNT_EN (UINT64_C(1) << 20) /* block PMIs until ERET retires */ #define PMCR0_WFRFE_EN (UINT64_C(1) << 22) @@ -119,7 +120,6 @@ enum { /* user mode access to configuration registers */ #define PMCR0_USEREN_EN (UINT64_C(1) << 30) -/* XXX this needs to be synchronized with kpc... */ #define PMCR0_INIT (PMCR0_INTGEN_INIT | PMCR0_PMI_INIT | PMCR0_DISCNT_EN) /* @@ -153,14 +153,6 @@ core_init_execution_modes(void) __builtin_arm_wsr64(PMCR1, pmcr1); } -/* - * PMSR reports the overflow status of all counters. - */ - -#define PMSR "s3_1_c15_c13_0" - -#define PMSR_OVF(CTR) (UINT64_C(1) << (CTR)) - /* * PMCR2 controls watchpoint registers. * @@ -173,19 +165,15 @@ core_init_execution_modes(void) #define PMCR3 "s3_1_c15_c3_0" #define PMCR4 "s3_1_c15_c4_0" -/* - * PMCR_AFFINITY does ??? XXX. 
- */ - -#define PMCR_AFFINITY "s3_1_c15_c11_0" +#define PMSR_OVF(CTR) (1ULL << (CTR)) void -mt_init(void) +mt_early_init(void) { } static int -core_init(void) +core_init(__unused mt_device_t dev) { /* the dev node interface to the core counters is still unsupported */ return ENOTSUP; @@ -207,7 +195,7 @@ mt_core_snap(unsigned int ctr) return __builtin_arm_rsr64(PMC1); default: panic("monotonic: invalid core counter read: %u", ctr); - __builtin_trap(); + __builtin_unreachable(); } } @@ -223,7 +211,7 @@ mt_core_set_snap(unsigned int ctr, uint64_t count) break; default: panic("monotonic: invalid core counter %u write %llu", ctr, count); - __builtin_trap(); + __builtin_unreachable(); } } @@ -260,8 +248,19 @@ core_idle(__unused cpu_data_t *cpu) mt_update_fixed_counts(); } -static void -core_run(cpu_data_t *cpu) +#pragma mark uncore performance monitor + + +#pragma mark common hooks + +void +mt_cpu_idle(cpu_data_t *cpu) +{ + core_idle(cpu); +} + +void +mt_cpu_run(cpu_data_t *cpu) { uint64_t pmcr0; struct mt_cpu *mtc; @@ -283,47 +282,6 @@ core_run(cpu_data_t *cpu) __builtin_arm_wsr64(PMCR0, pmcr0); } -static void -core_up(__unused cpu_data_t *cpu) -{ - assert(ml_get_interrupts_enabled() == FALSE); - - core_init_execution_modes(); -} - -#pragma mark uncore counters - - -static void -uncore_sleep(void) -{ -} - -static void -uncore_wake(void) -{ -} - -static void -uncore_fiq(uint64_t upmsr) -{ -#pragma unused(upmsr) -} - -#pragma mark common hooks - -void -mt_cpu_idle(cpu_data_t *cpu) -{ - core_idle(cpu); -} - -void -mt_cpu_run(cpu_data_t *cpu) -{ - core_run(cpu); -} - void mt_cpu_down(cpu_data_t *cpu) { @@ -333,59 +291,114 @@ mt_cpu_down(cpu_data_t *cpu) void mt_cpu_up(cpu_data_t *cpu) { - core_up(cpu); mt_cpu_run(cpu); } void mt_sleep(void) { - uncore_sleep(); } void -mt_wake(void) +mt_wake_per_core(void) { - uncore_wake(); } -void +static void mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmsr) { - bool found_overflow = false; - assert(cpu != NULL); 
assert(ml_get_interrupts_enabled() == FALSE); (void)atomic_fetch_add_explicit(&mt_pmis, 1, memory_order_relaxed); - for (int i = 0; i < MT_CORE_NFIXED; i++) { + /* + * monotonic handles any fixed counter PMIs. + */ + for (unsigned int i = 0; i < MT_CORE_NFIXED; i++) { + if ((pmsr & PMSR_OVF(i)) == 0) { + continue; + } + + uint64_t count = mt_cpu_update_count(cpu, i); + cpu->cpu_monotonic.mtc_counts[i] += count; + mt_core_set_snap(i, mt_core_reset_values[i]); + cpu->cpu_monotonic.mtc_snaps[i] = mt_core_reset_values[i]; + + if (mt_microstackshots && mt_microstackshot_ctr == i) { + bool user_mode = false; + arm_saved_state_t *state = get_user_regs(current_thread()); + if (state) { + user_mode = PSR64_IS_USER(get_saved_state_cpsr(state)); + } + KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 1), + mt_microstackshot_ctr, user_mode); + mt_microstackshot_pmi_handler(user_mode, mt_microstackshot_ctx); + } + } + + /* + * KPC handles the configurable counter PMIs. + */ + for (unsigned int i = MT_CORE_NFIXED; i < CORE_NCTRS; i++) { if (pmsr & PMSR_OVF(i)) { - mt_cpu_update_count(cpu, i); - mt_core_set_snap(i, 0); - found_overflow = true; + extern void kpc_pmi_handler(unsigned int ctr); + kpc_pmi_handler(i); } } - assert(found_overflow); core_set_enabled(); } void -mt_fiq_internal(uint64_t upmsr) +mt_fiq(void *cpu, uint64_t pmsr, uint64_t upmsr) +{ + mt_cpu_pmi(cpu, pmsr); + +#pragma unused(upmsr) +} + +static uint32_t mt_xc_sync; + +static void +mt_microstackshot_start_remote(__unused void *arg) +{ + cpu_data_t *cpu = getCpuDatap(); + + __builtin_arm_wsr64(PMCR0, PMCR0_INIT); + + for (int i = 0; i < MT_CORE_NFIXED; i++) { + uint64_t count = mt_cpu_update_count(cpu, i); + cpu->cpu_monotonic.mtc_counts[i] += count; + mt_core_set_snap(i, mt_core_reset_values[i]); + cpu->cpu_monotonic.mtc_snaps[i] = mt_core_reset_values[i]; + } + + core_set_enabled(); + + if (hw_atomic_sub(&mt_xc_sync, 1) == 0) { + thread_wakeup((event_t)&mt_xc_sync); + } +} + +int 
+mt_microstackshot_start_arch(uint64_t period) { - uncore_fiq(upmsr); + mt_core_reset_values[mt_microstackshot_ctr] = CTR_MAX - period; + cpu_broadcast_xcall(&mt_xc_sync, TRUE, mt_microstackshot_start_remote, + mt_microstackshot_start_remote /* cannot pass NULL */); + return 0; } #pragma mark dev nodes -const struct monotonic_dev monotonic_devs[] = { +struct mt_device mt_devices[] = { [0] = { - .mtd_name = "monotonic/core", + .mtd_name = "core", .mtd_init = core_init, }, }; static_assert( - (sizeof(monotonic_devs) / sizeof(monotonic_devs[0])) == MT_NDEVS, - "MT_NDEVS macro should be same as the length of monotonic_devs"); + (sizeof(mt_devices) / sizeof(mt_devices[0])) == MT_NDEVS, + "MT_NDEVS macro should be same as the length of mt_devices"); diff --git a/osfmk/arm64/pcb.c b/osfmk/arm64/pcb.c index a6fa9154b..d8809b38f 100644 --- a/osfmk/arm64/pcb.c +++ b/osfmk/arm64/pcb.c @@ -61,6 +61,7 @@ #include + #define USER_SS_ZONE_ALLOC_SIZE (0x4000) extern int debug_task; @@ -160,7 +161,7 @@ machine_thread_create( thread->machine.upcb = &thread->machine.contextData->ss; thread->machine.uNeon = &thread->machine.contextData->ns; - if (task_has_64BitAddr(task)) { + if (task_has_64Bit_data(task)) { thread->machine.upcb->ash.flavor = ARM_SAVED_STATE64; thread->machine.upcb->ash.count = ARM_SAVED_STATE64_COUNT; thread->machine.uNeon->nsh.flavor = ARM_NEON_SAVED_STATE64; @@ -309,7 +310,7 @@ machine_stack_attach( savestate->lr = (uintptr_t)thread_continue; savestate->sp = thread->machine.kstackptr; savestate->cpsr = PSR64_KERNEL_DEFAULT; - machine_stack_attach_kprintf("thread = %x pc = %x, sp = %x\n", thread, savestate->lr, savestate->sp); + machine_stack_attach_kprintf("thread = %p pc = %llx, sp = %llx\n", thread, savestate->lr, savestate->sp); } @@ -357,51 +358,15 @@ machine_stack_handoff( */ void call_continuation( - thread_continue_t continuation, - void *parameter, - wait_result_t wresult) + thread_continue_t continuation, + void *parameter, + wait_result_t wresult, + 
boolean_t enable_interrupts) { #define call_continuation_kprintf(x...) /* kprintf("call_continuation_kprintf:" x) */ call_continuation_kprintf("thread = %p continuation = %p, stack = %p\n", current_thread(), continuation, current_thread()->machine.kstackptr); - Call_continuation(continuation, parameter, wresult, current_thread()->machine.kstackptr); -} - -/* Setting breakpoints in EL1 is effectively a KTRR bypass. The ability to do so is - * controlled by MDSCR.KDE. The MSR to set MDSCR must be present to allow - * self-hosted user mode debug. Any checks before the MRS can be skipped with ROP, - * so we need to put the checks after the MRS where they can't be skipped. That - * still leaves a small window if a breakpoint is set on the instruction - * immediately after the MRS. To handle that, we also do a check and then set of - * the breakpoint control registers. This allows us to guarantee that a given - * core will never have both KDE set and a breakpoint targeting EL1. - * - * If KDE gets set, unset it and then panic */ -static void -update_mdscr(uint64_t clear, uint64_t set) -{ - uint64_t result = 0; - uint64_t tmp1, tmp2; - __asm__ volatile( - "mrs %[reg], MDSCR_EL1\n" - "bic %[reg], %[reg], %[clear]\n" - "orr %[reg], %[reg], %[set]\n" - "1:\n" - "bic %[reg], %[reg], #0x2000\n" - "msr MDSCR_EL1, %[reg]\n" -#if defined(CONFIG_KERNEL_INTEGRITY) - /* verify KDE didn't get set (including via ROP) - * If set, clear it and then panic */ - "ands %[tmp], %[reg], #0x2000\n" - "orr %[res], %[res], %[tmp]\n" - "bne 1b\n" -#endif - : [res] "+r" (result), [tmp] "=r" (tmp1), [reg] "=r" (tmp2) - : [clear] "r" (clear), [set] "r" (set) : "x0"); -#if defined(CONFIG_KERNEL_INTEGRITY) - if (result) - panic("MDSCR.KDE was set: %llx %llx %llx", tmp1, tmp2, result); -#endif + Call_continuation(continuation, parameter, wresult, enable_interrupts); } #define SET_DBGBCRn(n, value, accum) \ @@ -794,7 +759,7 @@ void arm_debug_set(arm_debug_state_t *debug_state) break; } } else { - if 
(thread_is_64bit(current_thread())) + if (thread_is_64bit_data(current_thread())) arm_debug_set64(debug_state); else arm_debug_set32(debug_state); @@ -898,7 +863,7 @@ machine_thread_set_tsd_base( return KERN_INVALID_ARGUMENT; } - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { if (tsd_base > vm_map_max(thread->map)) tsd_base = 0ULL; } else { diff --git a/osfmk/arm64/platform_tests.c b/osfmk/arm64/platform_tests.c index 61b627419..96a59cac2 100644 --- a/osfmk/arm64/platform_tests.c +++ b/osfmk/arm64/platform_tests.c @@ -56,6 +56,7 @@ #define LOCK_PRIVATE 1 +#include #include #include #include @@ -78,6 +79,7 @@ #include #include #include +#include kern_return_t arm64_lock_test(void); kern_return_t arm64_munger_test(void); @@ -1055,6 +1057,7 @@ ex_cb_test() return KERN_SUCCESS; } + #if __ARM_PAN_AVAILABLE__ kern_return_t arm64_pan_test() @@ -1119,3 +1122,4 @@ arm64_munger_test() return 0; } + diff --git a/osfmk/arm64/proc_reg.h b/osfmk/arm64/proc_reg.h index a6971fa74..914cce974 100644 --- a/osfmk/arm64/proc_reg.h +++ b/osfmk/arm64/proc_reg.h @@ -201,14 +201,6 @@ #define SCTLR_RESERVED ((3 << 28) | (1 << 22) | (1 << 20) | (1 << 11)) -// 31 PACIA_ENABLED AddPACIA and AuthIA functions enabled -#define SCTLR_PACIA_ENABLED (1 << 31) -// 30 PACIB_ENABLED AddPACIB and AuthIB functions enabled -#define SCTLR_PACIB_ENABLED (1 << 30) -// 29:28 RES1 11 -// 27 PACDA_ENABLED AddPACDA and AuthDA functions enabled -#define SCTLR_PACDA_ENABLED (1 << 27) - // 26 UCI User Cache Instructions #define SCTLR_UCI_ENABLED (1 << 26) @@ -242,7 +234,8 @@ // 14 DZE User Data Cache Zero (DC ZVA) #define SCTLR_DZE_ENABLED (1 << 14) -// 13 RES0 0 +// 13 PACDB_ENABLED AddPACDB and AuthDB functions enabled +#define SCTLR_PACDB_ENABLED (1 << 13) // 12 I Instruction cache enable #define SCTLR_I_ENABLED (1 << 12) @@ -279,9 +272,7 @@ // 0 M MMU enable #define SCTLR_M_ENABLED (1 << 0) -#define SCTLR_PAC_DEFAULT 0 - -#define SCTLR_EL1_DEFAULT (SCTLR_PAC_DEFAULT | SCTLR_RESERVED | 
SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \ +#define SCTLR_EL1_DEFAULT (SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \ SCTLR_I_ENABLED | SCTLR_SED_DISABLED | SCTLR_CP15BEN_ENABLED | \ SCTLR_SA0_ENABLED | SCTLR_SA_ENABLED | SCTLR_C_ENABLED | SCTLR_M_ENABLED) @@ -520,7 +511,9 @@ */ #define MPIDR_AFF0_MASK 0xFF #define MPIDR_AFF1_MASK 0xFF00 +#define MPIDR_AFF1_SHIFT 8 #define MPIDR_AFF2_MASK 0xFF0000 +#define MPIDR_AFF2_SHIFT 16 /* * We currently use a 3 level page table (rather than the full 4 @@ -774,6 +767,15 @@ #endif /* __ARM64_PMAP_SUBPAGE_L1__ */ #endif +/* some sugar for getting pointers to page tables and entries */ + +#define L1_TABLE_INDEX(va) (((va) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT) +#define L2_TABLE_INDEX(va) (((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT) +#define L3_TABLE_INDEX(va) (((va) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT) + +#define L2_TABLE_VA(tte) ((tt_entry_t*) phystokv((*(tte)) & ARM_TTE_TABLE_MASK)) +#define L3_TABLE_VA(tte2) ((pt_entry_t*) phystokv((*(tte2)) & ARM_TTE_TABLE_MASK)) + /* * L2 Translation table * @@ -992,7 +994,7 @@ #define ARM_TTE_BLOCK_NS_MASK 0x0000000000000020ULL /* notSecure mapping mask */ #define ARM_TTE_BLOCK_PNX 0x0020000000000000ULL /* value for privilege no execute bit */ -#define ARM_TTE_BLOCK_PNXMASK 0x0020000000000000ULL /* privilege execute mask */ +#define ARM_TTE_BLOCK_PNXMASK 0x0020000000000000ULL /* privilege no execute mask */ #define ARM_TTE_BLOCK_NX 0x0040000000000000ULL /* value for no execute */ #define ARM_TTE_BLOCK_NXMASK 0x0040000000000000ULL /* no execute mask */ @@ -1149,18 +1151,20 @@ #define ARM_PTE_HINT_ENTRIES_SHIFT 7ULL /* shift to construct the number of entries */ #define ARM_PTE_HINT_ADDR_MASK 0x0000FFFFFFE00000ULL /* mask to extract the starting hint address */ #define ARM_PTE_HINT_ADDR_SHIFT 21 /* shift for the hint address */ +#define ARM_KVA_HINT_ADDR_MASK 0xFFFFFFFFFFE00000ULL /* mask to extract the starting 
hint address */ #else #define ARM_PTE_HINT_ENTRIES 16ULL /* number of entries the hint covers */ #define ARM_PTE_HINT_ENTRIES_SHIFT 4ULL /* shift to construct the number of entries */ #define ARM_PTE_HINT_ADDR_MASK 0x0000FFFFFFFF0000ULL /* mask to extract the starting hint address */ #define ARM_PTE_HINT_ADDR_SHIFT 16 /* shift for the hint address */ +#define ARM_KVA_HINT_ADDR_MASK 0xFFFFFFFFFFFF0000ULL /* mask to extract the starting hint address */ #endif -#define ARM_PTE_PNX 0x0020000000000000ULL /* value for no execute */ -#define ARM_PTE_PNXMASK 0x0020000000000000ULL /* no execute mask */ +#define ARM_PTE_PNX 0x0020000000000000ULL /* value for privilege no execute bit */ +#define ARM_PTE_PNXMASK 0x0020000000000000ULL /* privilege no execute mask */ -#define ARM_PTE_NX 0x0040000000000000ULL /* value for privilege no execute bit */ -#define ARM_PTE_NXMASK 0x0040000000000000ULL /* privilege execute mask */ +#define ARM_PTE_NX 0x0040000000000000ULL /* value for no execute bit */ +#define ARM_PTE_NXMASK 0x0040000000000000ULL /* no execute mask */ #define ARM_PTE_WIRED 0x0080000000000000ULL /* value for software wired bit */ #define ARM_PTE_WIRED_MASK 0x0080000000000000ULL /* software wired mask */ diff --git a/osfmk/arm64/sleh.c b/osfmk/arm64/sleh.c index cf022d32f..4ace02086 100644 --- a/osfmk/arm64/sleh.c +++ b/osfmk/arm64/sleh.c @@ -54,6 +54,7 @@ #include #include +#include #include #if CONFIG_TELEMETRY @@ -122,6 +123,7 @@ static void inspect_instruction_abort(uint32_t, fault_status_t *, vm_prot_t *); static void inspect_data_abort(uint32_t, fault_status_t *, vm_prot_t *); static int is_vm_fault(fault_status_t); +static int is_translation_fault(fault_status_t); static int is_alignment_fault(fault_status_t); typedef void(*abort_handler_t)(arm_saved_state_t *, uint32_t, vm_offset_t, fault_status_t, vm_prot_t, vm_offset_t); @@ -177,6 +179,12 @@ extern boolean_t pgtrace_enabled; #endif #if __ARM_PAN_AVAILABLE__ +#ifdef CONFIG_XNUPOST +extern vm_offset_t 
pan_test_addr; +extern vm_offset_t pan_ro_addr; +extern volatile int pan_exception_level; +extern volatile char pan_fault_value; +#endif #endif #if defined(APPLECYCLONE) @@ -207,6 +215,8 @@ extern boolean_t pgtrace_enabled; #endif +extern vm_offset_t static_memory_end; + static inline unsigned __ror(unsigned value, unsigned shift) { @@ -284,6 +294,7 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o (void *)llc_err_sts, (void *)llc_err_adr, (void *)llc_err_inf); #endif #else // !defined(APPLE_ARM64_ARCH_FAMILY) +#pragma unused (state, esr, far) panic_plain("Unhandled implementation specific error\n"); #endif } @@ -437,8 +448,8 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) assert(0); /* Unreachable */ case ESR_EC_IABORT_EL1: - panic("Kernel instruction fetch abort: pc=%p iss=0x%x far=%p. Note: the faulting frame may be missing in the backtrace.", - (void *)get_saved_state_pc(state), ESR_ISS(esr), (void*)far); + + panic_with_thread_kernel_state("Kernel instruction fetch abort", state); case ESR_EC_PC_ALIGN: handle_pc_align(state); @@ -609,13 +620,13 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far) static void handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2) { - exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr; + exception_type_t exception = EXC_BAD_INSTRUCTION; + mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; if (instrLen2) { - uint16_t instr16; + uint16_t instr16 = 0; COPYIN(get_saved_state_pc(state), (char *)&instr16, sizeof(instr16)); instr = instr16; @@ -886,6 +897,20 @@ is_vm_fault(fault_status_t status) } } +static int +is_translation_fault(fault_status_t status) +{ + switch (status) { + case FSC_TRANSLATION_FAULT_L0: + case FSC_TRANSLATION_FAULT_L1: + case 
FSC_TRANSLATION_FAULT_L2: + case FSC_TRANSLATION_FAULT_L3: + return TRUE; + default: + return FALSE; + } +} + #if __ARM_PAN_AVAILABLE__ static int is_permission_fault(fault_status_t status) @@ -940,9 +965,9 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr thread->iotier_override = THROTTLE_LEVEL_NONE; /* Reset IO tier override before handling abort from userspace */ if (is_vm_fault(fault_code)) { - kern_return_t result = KERN_FAILURE; - vm_map_t map = thread->map; - vm_offset_t vm_fault_addr = fault_addr; + kern_return_t result = KERN_FAILURE; + vm_map_t map = thread->map; + vm_offset_t vm_fault_addr = fault_addr; assert(map != kernel_map); @@ -981,7 +1006,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr /* check to see if it is just a pmap ref/modify fault */ - if (result != KERN_SUCCESS) { + if ((result != KERN_SUCCESS) && !is_translation_fault(fault_code)) { result = arm_fast_fault(map->pmap, trunc_page(vm_fault_addr), fault_type, TRUE); } if (result != KERN_SUCCESS) { @@ -1082,9 +1107,9 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad #endif if (is_vm_fault(fault_code)) { - kern_return_t result; - vm_map_t map; - int interruptible; + kern_return_t result = KERN_FAILURE; + vm_map_t map; + int interruptible; /* * Ensure no faults in the physical aperture. This could happen if @@ -1093,7 +1118,20 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad */ - if (fault_addr >= gVirtBase && fault_addr < (gVirtBase+gPhysSize)) { +#if __ARM_PAN_AVAILABLE__ && defined(CONFIG_XNUPOST) + if (is_permission_fault(fault_code) && !(get_saved_state_cpsr(state) & PSR64_PAN) && + (pan_ro_addr != 0) && (fault_addr == pan_ro_addr)) { + ++pan_exception_level; + // On an exception taken from a PAN-disabled context, verify + // that PAN is re-enabled for the exception handler and that + // accessing the test address produces a PAN fault. 
+ pan_fault_value = *(char *)pan_test_addr; + set_saved_state_pc(state, get_saved_state_pc(state) + 4); + return; + } +#endif + + if (fault_addr >= gVirtBase && fault_addr < static_memory_end) { panic_with_thread_kernel_state("Unexpected fault in kernel static region\n",state); } @@ -1117,9 +1155,12 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad #endif /* check to see if it is just a pmap ref/modify fault */ - result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE); - if (result == KERN_SUCCESS) return; + if (!is_translation_fault(fault_code)) { + result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE); + if (result == KERN_SUCCESS) return; + } + if (result != KERN_PROTECTION_FAILURE) { /* * We have to "fault" the page in. @@ -1141,6 +1182,22 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad #if __ARM_PAN_AVAILABLE__ if (is_pan_fault(state, esr, fault_addr, fault_code)) { +#ifdef CONFIG_XNUPOST + if ((pan_test_addr != 0) && (fault_addr == pan_test_addr)) + { + ++pan_exception_level; + // read the user-accessible value to make sure + // pan is enabled and produces a 2nd fault from + // the exception handler + if (pan_exception_level == 1) + pan_fault_value = *(char *)pan_test_addr; + // this fault address is used for PAN test + // disable PAN and rerun + set_saved_state_cpsr(state, + get_saved_state_cpsr(state) & (~PSR64_PAN)); + return; + } +#endif panic_with_thread_kernel_state("Privileged access never abort.", state); } #endif @@ -1232,10 +1289,10 @@ handle_mach_continuous_time_trap(arm_saved_state_t *state) static void handle_msr_trap(arm_saved_state_t *state, uint32_t iss) { - exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr; + exception_type_t exception = EXC_BAD_INSTRUCTION; + mach_exception_data_type_t codes[2] = 
{EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; (void)iss; @@ -1256,10 +1313,10 @@ handle_msr_trap(arm_saved_state_t *state, uint32_t iss) static void handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr) { - exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr; + exception_type_t exception = EXC_BAD_INSTRUCTION; + mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; if (is_saved_state64(state)) { panic("ESR (0x%x) for instruction trapped from U32, but saved state is 64-bit.", esr); @@ -1278,10 +1335,10 @@ handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr) static void handle_simd_trap(arm_saved_state_t *state, uint32_t esr) { - exception_type_t exception = EXC_BAD_INSTRUCTION; - mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; - mach_msg_type_number_t numcodes = 2; - uint32_t instr; + exception_type_t exception = EXC_BAD_INSTRUCTION; + mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED}; + mach_msg_type_number_t numcodes = 2; + uint32_t instr = 0; if (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) { panic("ESR (0x%x) for SIMD trap from userland, actually came from kernel?", esr); @@ -1301,6 +1358,9 @@ sleh_irq(arm_saved_state_t *state) uint32_t * old_entropy_data_ptr = NULL; uint32_t * new_entropy_data_ptr = NULL; cpu_data_t * cdp = getCpuDatap(); +#if DEVELOPMENT || DEBUG + int preemption_level = get_preemption_level(); +#endif sleh_interrupt_handler_prologue(state, DBG_INTR_TYPE_OTHER); @@ -1334,25 +1394,45 @@ sleh_irq(arm_saved_state_t *state) *old_entropy_data_ptr = (uint32_t)timestamp ^ __ror(old_entropy_data, 9); sleh_interrupt_handler_epilogue(); +#if DEVELOPMENT || DEBUG + if (preemption_level != get_preemption_level()) + panic("irq handler %p changed preemption level from %d to %d", 
cdp->interrupt_handler, preemption_level, get_preemption_level()); +#endif } void sleh_fiq(arm_saved_state_t *state) { unsigned int type = DBG_INTR_TYPE_UNKNOWN; +#if DEVELOPMENT || DEBUG + int preemption_level = get_preemption_level(); +#endif +#if MONOTONIC + uint64_t pmsr = 0, upmsr = 0; +#endif /* MONOTONIC */ + +#if MONOTONIC + if (mt_pmi_pending(&pmsr, &upmsr)) { + type = DBG_INTR_TYPE_PMI; + } else +#endif /* MONOTONIC */ if (ml_get_timer_pending()) { type = DBG_INTR_TYPE_TIMER; } sleh_interrupt_handler_prologue(state, type); +#if MONOTONIC + if (type == DBG_INTR_TYPE_PMI) { + mt_fiq(getCpuDatap(), pmsr, upmsr); + } else +#endif /* MONOTONIC */ { /* * We don't know that this is a timer, but we don't have insight into * the other interrupts that go down this path. */ - cpu_data_t *cdp = getCpuDatap(); cdp->cpu_decrementer = -1; /* Large */ @@ -1366,15 +1446,26 @@ sleh_fiq(arm_saved_state_t *state) } sleh_interrupt_handler_epilogue(); +#if DEVELOPMENT || DEBUG + if (preemption_level != get_preemption_level()) + panic("fiq type %u changed preemption level from %d to %d", type, preemption_level, get_preemption_level()); +#endif } void sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far) { arm_saved_state_t *state = &context->ss; +#if DEVELOPMENT || DEBUG + int preemption_level = get_preemption_level(); +#endif ASSERT_CONTEXT_SANITY(context); arm64_platform_error(state, esr, far); +#if DEVELOPMENT || DEBUG + if (preemption_level != get_preemption_level()) + panic("serror changed preemption level from %d to %d", preemption_level, get_preemption_level()); +#endif } void @@ -1434,7 +1525,7 @@ sleh_interrupt_handler_prologue(arm_saved_state_t *state, unsigned int type) #if CONFIG_TELEMETRY if (telemetry_needs_record) { - telemetry_mark_curthread((boolean_t)is_user); + telemetry_mark_curthread((boolean_t)is_user, FALSE); } #endif /* CONFIG_TELEMETRY */ } @@ -1442,6 +1533,9 @@ sleh_interrupt_handler_prologue(arm_saved_state_t *state, unsigned int type) 
static void sleh_interrupt_handler_epilogue(void) { +#if KPERF + kperf_interrupt(); +#endif /* KPERF */ KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END); } diff --git a/osfmk/arm64/start.s b/osfmk/arm64/start.s index 24a6dba7f..f709217cf 100644 --- a/osfmk/arm64/start.s +++ b/osfmk/arm64/start.s @@ -29,8 +29,6 @@ #include #include #include -#include -#include #include #include #include "assym.s" @@ -104,6 +102,7 @@ LEXT(LowResetVectorBase) // Unlock the core for debugging msr OSLAR_EL1, xzr + msr DAIFSet, #(DAIFSC_ALL) // Disable all interrupts #if !(defined(KERNEL_INTEGRITY_KTRR)) // Set low reset vector before attempting any loads @@ -123,18 +122,19 @@ LEXT(LowResetVectorBase) * If either values are zero, we're debugging kernel so skip programming KTRR. */ + // load stashed rorgn_begin adrp x17, EXT(rorgn_begin)@page add x17, x17, EXT(rorgn_begin)@pageoff ldr x17, [x17] // if rorgn_begin is zero, we're debugging. skip enabling ktrr - cbz x17, 1f + cbz x17, Lskip_ktrr // load stashed rorgn_end adrp x19, EXT(rorgn_end)@page add x19, x19, EXT(rorgn_end)@pageoff ldr x19, [x19] - cbz x19, 1f + cbz x19, Lskip_ktrr // program and lock down KTRR // subtract one page from rorgn_end to make pinst insns NX @@ -143,9 +143,8 @@ LEXT(LowResetVectorBase) msr ARM64_REG_KTRR_UPPER_EL1, x19 mov x17, #1 msr ARM64_REG_KTRR_LOCK_EL1, x17 - -1: -#endif /* defined(KERNEL_INTEGRITY_KTRR) */ +Lskip_ktrr: +#endif /* defined(KERNEL_INTEGRITY_KTRR)*/ // Process reset handlers adrp x19, EXT(ResetHandlerData)@page // Get address of the reset handler data @@ -304,13 +303,13 @@ start_cpu: // x20 set to BootArgs phys address // x21 set to cpu data phys address - msr DAIFSet, #(DAIFSC_ALL) // Disable all interrupts // Get the kernel memory parameters from the boot args ldr x22, [x20, BA_VIRT_BASE] // Get the kernel virt base ldr x23, [x20, BA_PHYS_BASE] // Get the kernel phys base ldr x24, [x20, BA_MEM_SIZE] // Get the physical memory size ldr x25, [x20, BA_TOP_OF_KERNEL_DATA] // 
Get the top of the kernel data + ldr x26, [x20, BA_BOOT_FLAGS] // Get the kernel boot flags // Set TPIDRRO_EL0 with the CPU number ldr x0, [x21, CPU_NUMBER_GS] @@ -395,6 +394,52 @@ start_cpu: b.ne 1b .endmacro +/* + * arg0 - virtual start address + * arg1 - physical start address + * arg2 - number of entries to map + * arg3 - L1 table address + * arg4 - free space pointer + * arg5 - scratch (entries mapped per loop) + * arg6 - scratch + * arg7 - scratch + * arg8 - scratch + * arg9 - scratch + */ +.macro create_bootstrap_mapping + /* calculate entries left in this page */ + and $5, $0, #(ARM_TT_L2_INDEX_MASK) + lsr $5, $5, #(ARM_TT_L2_SHIFT) + mov $6, #(TTE_PGENTRIES) + sub $5, $6, $5 + + /* allocate an L2 table */ +3: add $4, $4, PGBYTES + + /* create_l1_table_entry(virt_base, L1 table, L2 table, scratch1, scratch2, scratch3) */ + create_l1_table_entry $0, $3, $4, $6, $7, $8 + + /* determine how many entries to map this loop - the smaller of entries + * remaining in page and total entries left */ + cmp $2, $5 + csel $5, $2, $5, lt + + /* create_l2_block_entries(virt_base, phys_base, L2 table, num_ents, scratch1, scratch2, scratch3) */ + create_l2_block_entries $0, $1, $4, $5, $6, $7, $8, $9 + + /* subtract entries just mapped and bail out if we're done */ + subs $2, $2, $5 + beq 2f + + /* entries left to map - advance base pointers */ + add $0, $0, $5, lsl #(ARM_TT_L2_SHIFT) + add $1, $1, $5, lsl #(ARM_TT_L2_SHIFT) + + mov $5, #(TTE_PGENTRIES) /* subsequent loops map (up to) a whole L2 page */ + b 3b +2: +.endmacro + /* * _start_first_cpu * Cold boot init routine. 
Called from __start @@ -406,8 +451,9 @@ LEXT(start_first_cpu) // Unlock the core for debugging msr OSLAR_EL1, xzr + msr DAIFSet, #(DAIFSC_ALL) // Disable all interrupts mov x20, x0 - mov x21, xzr + mov x21, #0 // Set low reset vector before attempting any loads adrp x0, EXT(LowExceptionVectorBase)@page @@ -415,14 +461,16 @@ LEXT(start_first_cpu) MSR_VBAR_EL1_X0 - // Get the kernel memory parameters from the boot args ldr x22, [x20, BA_VIRT_BASE] // Get the kernel virt base ldr x23, [x20, BA_PHYS_BASE] // Get the kernel phys base ldr x24, [x20, BA_MEM_SIZE] // Get the physical memory size ldr x25, [x20, BA_TOP_OF_KERNEL_DATA] // Get the top of the kernel data + ldr x26, [x20, BA_BOOT_FLAGS] // Get the kernel boot flags - // Set CPU number to 0 + // Clear the register that will be used to store the userspace thread pointer and CPU number. + // We may not actually be booting from ordinal CPU 0, so this register will be updated + // in ml_parse_cpu_topology(), which happens later in bootstrap. msr TPIDRRO_EL0, x21 // Set up exception stack pointer @@ -480,21 +528,11 @@ LEXT(start_first_cpu) #else lsl x2, x2, #2 // Shift by 2 for num entries on 4 pages #endif - sub x2, x2, #1 // Subtract one to terminate on last entry Linvalidate_bootstrap: // do { str x0, [x1], #(1 << TTE_SHIFT) // Invalidate and advance subs x2, x2, #1 // entries-- b.ne Linvalidate_bootstrap // } while (entries != 0) - /* Load addresses for page table construction macros - * x0 - Physical base (used to identify V=P section to set up) - * x1 - V=P L1 table base - * x2 - V=P L2 table base - * x3 - KVA L1 table base - * x4 - KVA L2 table base - * x5 - Mem size in entries (up to 1GB) - */ - /* * In order to reclaim memory on targets where TZ0 (or some other entity) * must be located at the base of memory, iBoot may set the virtual and @@ -512,53 +550,55 @@ Linvalidate_bootstrap: // do { * mapping TZ0. 
*/ adrp x0, EXT(_mh_execute_header)@page // Use xnu's mach header as the start address - add x0, x0, EXT(_mh_execute_header)@pageoff -#if __ARM64_TWO_LEVEL_PMAP__ + add x0, x0, EXT(_mh_execute_header)@pageoff + /* - * We don't need the L1 entries in this case, so skip them. + * Adjust physical and virtual base addresses to account for physical + * memory preceding xnu Mach-O header + * x22 - Kernel virtual base + * x23 - Kernel physical base + * x24 - Physical memory size */ - mov x2, x25 // Load V=P L2 table address - add x4, x2, PGBYTES // Load KVA L2 table address -#else - mov x1, x25 // Load V=P L1 table address - add x2, x1, PGBYTES // Load V=P L2 table address - add x3, x2, PGBYTES // Load KVA L1 table address - add x4, x3, PGBYTES // Load KVA L2 table address -#endif + sub x18, x0, x23 + sub x24, x24, x18 + add x22, x22, x18 + add x23, x23, x18 + /* - * We must adjust the amount we wish to map in order to account for the - * memory preceeding xnu's mach header. + * x0 - V=P virtual cursor + * x4 - V=P physical cursor + * x14 - KVA virtual cursor + * x15 - KVA physical cursor */ - sub x5, x0, x23 // Map from the mach header up to the end of our memory - sub x5, x24, x5 - lsr x5, x5, #(ARM_TT_L2_SHIFT) - mov x6, #(TTE_PGENTRIES) // Load number of L2 entries per page - cmp x5, x6 // If memsize requires more than 1 page of entries - csel x5, x5, x6, lt // ... round down to a single page (first 1GB) - -#if !__ARM64_TWO_LEVEL_PMAP__ - /* Create entry for L2 table in V=P L1 table - * create_l1_table_entry(V=P, L1 table, L2 table, scratch1, scratch2, scratch3) - */ - create_l1_table_entry x0, x1, x2, x10, x11, x12 -#endif + mov x4, x0 + mov x14, x22 + mov x15, x23 - /* Create block entry in V=P L2 table - * create_l2_block_entries(V=P virt, V=P phys, L2 table, num_ents, scratch1, scratch2, scratch3) + /* + * Allocate L1 tables + * x1 - V=P L1 page + * x3 - KVA L1 page + * x2 - free mem pointer from which we allocate a variable number of L2 + * pages.
The maximum number of bootstrap page table pages is limited to + * BOOTSTRAP_TABLE_SIZE. For a 2G 4k page device, assuming the worst-case + * slide, we need 1xL1 and up to 3xL2 pages (1GB mapped per L1 entry), so + * 8 total pages for V=P and KVA. */ - create_l2_block_entries x0, x0, x2, x5, x10, x11, x12, x13 + mov x1, x25 + add x3, x1, PGBYTES + mov x2, x3 -#if !__ARM64_TWO_LEVEL_PMAP__ - /* Create entry for L2 table in KVA L1 table - * create_l1_table_entry(virt_base, L1 table, L2 table, scratch1, scratch2, scratch3) + /* + * Setup the V=P bootstrap mapping + * x5 - total number of L2 entries to allocate */ - create_l1_table_entry x22, x3, x4, x10, x11, x12 -#endif + lsr x5, x24, #(ARM_TT_L2_SHIFT) + /* create_bootstrap_mapping(vbase, pbase, num_ents, L1 table, freeptr) */ + create_bootstrap_mapping x0, x4, x5, x1, x2, x6, x10, x11, x12, x13 - /* Create block entries in KVA L2 table - * create_l2_block_entries(virt_base, phys_base, L2 table, num_ents, scratch1, scratch2, scratch3) - */ - create_l2_block_entries x22, x23, x4, x5, x10, x11, x12, x13 + /* Setup the KVA bootstrap mapping */ + lsr x5, x24, #(ARM_TT_L2_SHIFT) + create_bootstrap_mapping x14, x15, x5, x3, x2, x9, x10, x11, x12, x13 /* Ensure TTEs are visible */ dsb ish @@ -573,8 +613,7 @@ Linvalidate_bootstrap: // do { * x21 - zero on cold boot, PA of cpu data on warm reset * x22 - Kernel virtual base * x23 - Kernel physical base - * x24 - Physical memory size - * x25 - PA of the end of the kernl + * x25 - PA of the end of the kernel * lr - KVA of C init routine * sp - SP_EL0 selected * @@ -591,7 +630,7 @@ common_start: /* Set up translation table base registers. 
* TTBR0 - V=P table @ top of kernel - * TTBR1 - KVA table @ top of kernel + 2 pages + * TTBR1 - KVA table @ top of kernel + 1 page */ #if defined(KERNEL_INTEGRITY_KTRR) /* Note that for KTRR configurations, the V=P map will be modified by @@ -599,25 +638,10 @@ common_start: */ #endif and x0, x25, #(TTBR_BADDR_MASK) -#if __ARM_KERNEL_PROTECT__ - /* We start out with a kernel ASID. */ - orr x0, x0, #(1 << TTBR_ASID_SHIFT) -#endif /* __ARM_KERNEL_PROTECT__ */ - msr TTBR0_EL1, x0 -#if __ARM64_TWO_LEVEL_PMAP__ - /* - * If we're using a two level pmap, we'll only need a - * single page per bootstrap pmap. - */ - mov x12, #1 -#else - /* - * If we're using a three level pmap, we'll need two - * pages per bootstrap pmap. - */ - mov x12, #2 -#endif - add x0, x25, x12, lsl PGSHIFT + mov x19, lr + bl EXT(set_mmu_ttb) + mov lr, x19 + add x0, x25, PGBYTES and x0, x0, #(TTBR_BADDR_MASK) MSR_TTBR1_EL1_X0 @@ -637,9 +661,6 @@ common_start: orr x0, x0, x1 msr MAIR_EL1, x0 - // Disable interrupts - msr DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF) - #if defined(APPLEHURRICANE) // Increase Snoop reservation in EDB to reduce starvation risk @@ -651,6 +672,19 @@ common_start: #endif +#if defined(BCM2837) + // Setup timer interrupt routing; must be done before MMU is enabled + mrs x15, MPIDR_EL1 // Load MPIDR to get CPU number + and x15, x15, #0xFF // CPU number is in MPIDR Affinity Level 0 + mov x0, #0x4000 + lsl x0, x0, #16 + add x0, x0, #0x0040 // x0: 0x4000004X Core Timers interrupt control + add x0, x0, x15, lsl #2 + mov w1, #0xF0 // x1: 0xF0 Route to Core FIQs + str w1, [x0] + isb sy +#endif + #ifndef __ARM_IC_NOALIAS_ICACHE__ /* Invalidate the TLB and icache on systems that do not guarantee that the @@ -725,13 +759,13 @@ common_start: #if defined(APPLECYCLONE) || defined(APPLETYPHOON) // // Cyclone/Typhoon-Specific initialization - // For tunable summary, see + // For tunable summary, see Alcatraz/H6: Confirm Cyclone CPU tunables have been set // // // Disable LSP flush with context switch 
to work around bug in LSP // that can cause Cyclone to wedge when CONTEXTIDR is written. - // + // Innsbruck11A175: panic(cpu 0 caller 0xffffff800024e30c): "wait queue deadlock - wq=0xffffff805a7a63c0, cpu=0\n" // mrs x12, ARM64_REG_HID0 @@ -843,6 +877,83 @@ Lskip_isalive: #endif // APPLEHURRICANE +#if defined(APPLEMONSOON) + + /***** Tunables that apply to all skye cores, all chip revs *****/ + + // SW WAR/eval: WKdm write ack lost when bif_wke_colorWrAck_XXaH asserts concurrently for both colors + mrs x12, ARM64_REG_HID8 + orr x12, x12, #ARM64_REG_HID8_WkeForceStrictOrder + msr ARM64_REG_HID8, x12 + + // Skip if not E-core + ARM64_IS_PCORE x15 + cbnz x15, Lskip_skye_ecore_only + + /***** Tunables that only apply to skye e-cores, all chip revs *****/ + + // : Atomic launch eligibility is erroneously taken away when a store at SMB gets invalidated + mrs x12, ARM64_REG_EHID11 + and x12, x12, ~(ARM64_REG_EHID11_SmbDrainThresh_mask) + msr ARM64_REG_EHID11, x12 + +Lskip_skye_ecore_only: + + SKIP_IF_CPU_VERSION_GREATER_OR_EQUAL x12, MONSOON_CPU_VERSION_B0, Lskip_skye_a0_workarounds + + // Skip if not E-core + cbnz x15, Lskip_skye_a0_ecore_only + + /***** Tunables that only apply to skye e-cores, chip revs < B0 *****/ + + // Disable downstream fill bypass logic + // [Tunable] Skye - L2E fill bypass collision from both pipes to ecore + mrs x12, ARM64_REG_EHID5 + orr x12, x12, ARM64_REG_EHID5_DisFillByp + msr ARM64_REG_EHID5, x12 + + // Disable forwarding of return addresses to the NFP + // Skye: FED incorrectly taking illegal va exception + mrs x12, ARM64_REG_EHID0 + orr x12, x12, ARM64_REG_EHID0_nfpRetFwdDisb + msr ARM64_REG_EHID0, x12 + +Lskip_skye_a0_ecore_only: + + /***** Tunables that apply to all skye cores, chip revs < B0 *****/ + + // Disable clock divider gating + // [Tunable/Errata][cpu_1p_1e] [CPGV2] ACC power down issue when link FSM switches from GO_DN to CANCEL and at the same time upStreamDrain request is set. 
+ mrs x12, ARM64_REG_HID6 + orr x12, x12, ARM64_REG_HID6_DisClkDivGating + msr ARM64_REG_HID6, x12 + + // Disable clock dithering + // [Tunable] Skye A0: Linux: LLC PIO Errors + mrs x12, ARM64_REG_ACC_OVRD + orr x12, x12, ARM64_REG_ACC_OVRD_dsblClkDtr + msr ARM64_REG_ACC_OVRD, x12 + + mrs x12, ARM64_REG_ACC_EBLK_OVRD + orr x12, x12, ARM64_REG_ACC_OVRD_dsblClkDtr + msr ARM64_REG_ACC_EBLK_OVRD, x12 + +Lskip_skye_a0_workarounds: + + SKIP_IF_CPU_VERSION_LESS_THAN x12, MONSOON_CPU_VERSION_B0, Lskip_skye_post_a1_workarounds + + /***** Tunables that apply to all skye cores, chip revs >= B0 *****/ + + // : Disable refcount syncing between E and P + mrs x12, ARM64_REG_CYC_OVRD + and x12, x12, ~ARM64_REG_CYC_OVRD_dsblSnoopTime_mask + orr x12, x12, ARM64_REG_CYC_OVRD_dsblSnoopPTime + msr ARM64_REG_CYC_OVRD, x12 + +Lskip_skye_post_a1_workarounds: + +#endif /* defined(APPLEMONSOON) */ + // If x21 != 0, we're doing a warm reset, so we need to trampoline to the kernel pmap. cbnz x21, Ltrampoline @@ -913,29 +1024,12 @@ arm_init_tramp: * +---Kernel Base---+ */ - - adrp x0, EXT(invalid_ttep)@page - add x0, x0, EXT(invalid_ttep)@pageoff - ldr x0, [x0] -#if __ARM_KERNEL_PROTECT__ - /* We start out with a kernel ASID. */ - orr x0, x0, #(1 << TTBR_ASID_SHIFT) -#endif /* __ARM_KERNEL_PROTECT__ */ - - msr TTBR0_EL1, x0 - + mov x19, lr // Convert CPU data PA to VA and set as first argument - add x0, x21, x22 - sub x0, x0, x23 - mov x1, #0 + mov x0, x21 + bl EXT(phystokv) - // Make sure that the TLB flush happens after the registers are set! - isb sy - - // Synchronize system for TTBR updates - tlbi vmalle1 - dsb sy - isb sy + mov lr, x19 /* Return to arm_init() */ ret diff --git a/osfmk/arm64/status.c b/osfmk/arm64/status.c index cf2d66cd8..ff0429e36 100644 --- a/osfmk/arm64/status.c +++ b/osfmk/arm64/status.c @@ -231,22 +231,92 @@ handle_set_arm_thread_state( * what the client is expecting. 
*/ if (count < ARM_UNIFIED_THREAD_STATE_COUNT) { + if (!is_saved_state32(saved_state)) { + return (KERN_INVALID_ARGUMENT); + } return handle_set_arm32_thread_state(tstate, count, saved_state); } const arm_unified_thread_state_t *unified_state = (const arm_unified_thread_state_t *) tstate; #if __arm64__ if (is_thread_state64(unified_state)) { + if (!is_saved_state64(saved_state)) { + return (KERN_INVALID_ARGUMENT); + } (void)thread_state64_to_saved_state(const_thread_state64(unified_state), saved_state); } else #endif { + if (!is_saved_state32(saved_state)) { + return (KERN_INVALID_ARGUMENT); + } (void)thread_state32_to_saved_state(const_thread_state32(unified_state), saved_state); } return (KERN_SUCCESS); } +/* + * Translate thread state arguments to userspace representation + */ + +kern_return_t +machine_thread_state_convert_to_user( + thread_t thread, + thread_flavor_t flavor, + thread_state_t tstate, + mach_msg_type_number_t *count) +{ + // No conversion to userspace representation on this platform + (void)thread; (void)flavor; (void)tstate; (void)count; + return KERN_SUCCESS; +} + +/* + * Translate thread state arguments from userspace representation + */ + +kern_return_t +machine_thread_state_convert_from_user( + thread_t thread, + thread_flavor_t flavor, + thread_state_t tstate, + mach_msg_type_number_t count) +{ + // No conversion from userspace representation on this platform + (void)thread; (void)flavor; (void)tstate; (void)count; + return KERN_SUCCESS; +} + +/* + * Translate signal context data pointer to userspace representation + */ + +kern_return_t +machine_thread_siguctx_pointer_convert_to_user( + __assert_only thread_t thread, + user_addr_t *uctxp) +{ + // No conversion to userspace representation on this platform + (void)thread; (void)uctxp; + return KERN_SUCCESS; +} + +/* + * Translate array of function pointer syscall arguments from userspace representation + */ + +kern_return_t +machine_thread_function_pointers_convert_from_user( + __assert_only 
thread_t thread, + user_addr_t *fptrs, + uint32_t count) +{ + // No conversion from userspace representation on this platform + (void)thread; (void)fptrs; (void)count; + return KERN_SUCCESS; +} + /* * Routine: machine_thread_get_state * @@ -276,8 +346,8 @@ machine_thread_get_state( tstate[0] = ARM_THREAD_STATE; tstate[1] = ARM_VFP_STATE; - tstate[2] = thread_is_64bit(thread) ? ARM_EXCEPTION_STATE64 : ARM_EXCEPTION_STATE; - tstate[3] = thread_is_64bit(thread) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32; + tstate[2] = thread_is_64bit_data(thread) ? ARM_EXCEPTION_STATE64 : ARM_EXCEPTION_STATE; + tstate[3] = thread_is_64bit_data(thread) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32; *count = 4; break; @@ -289,7 +359,7 @@ machine_thread_get_state( } case ARM_THREAD_STATE32: { - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return KERN_INVALID_ARGUMENT; kern_return_t rn = handle_get_arm32_thread_state(tstate, count, thread->machine.upcb); @@ -299,7 +369,7 @@ machine_thread_get_state( #if __arm64__ case ARM_THREAD_STATE64: { - if (!thread_is_64bit(thread)) + if (!thread_is_64bit_data(thread)) return KERN_INVALID_ARGUMENT; kern_return_t rn = handle_get_arm64_thread_state(tstate, count, thread->machine.upcb); @@ -313,7 +383,7 @@ machine_thread_get_state( if (*count < ARM_EXCEPTION_STATE_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (struct arm_exception_state *) tstate; @@ -332,7 +402,7 @@ machine_thread_get_state( if (*count < ARM_EXCEPTION_STATE64_COUNT) return (KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thread)) + if (!thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (struct arm_exception_state64 *) tstate; @@ -352,7 +422,7 @@ machine_thread_get_state( if (*count < ARM_LEGACY_DEBUG_STATE_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = 
(arm_legacy_debug_state_t *) tstate; @@ -373,7 +443,7 @@ machine_thread_get_state( if (*count < ARM_DEBUG_STATE32_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_debug_state32_t *) tstate; @@ -395,7 +465,7 @@ machine_thread_get_state( if (*count < ARM_DEBUG_STATE64_COUNT) return (KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thread)) + if (!thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_debug_state64_t *) tstate; @@ -439,10 +509,10 @@ machine_thread_get_state( arm_neon_state_t *state; arm_neon_saved_state32_t *thread_state; - if (*count < ARM_NEON_STATE_COUNT) + if (*count < ARM_NEON_STATE_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_neon_state_t *)tstate; @@ -460,10 +530,10 @@ machine_thread_get_state( arm_neon_state64_t *state; arm_neon_saved_state64_t *thread_state; - if (*count < ARM_NEON_STATE64_COUNT) + if (*count < ARM_NEON_STATE64_COUNT) return (KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thread)) + if (!thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_neon_state64_t *)tstate; @@ -532,7 +602,7 @@ machine_thread_get_kern_state( void machine_thread_switch_addrmode(thread_t thread) { - if (task_has_64BitAddr(thread->task)) { + if (task_has_64Bit_data(thread->task)) { thread->machine.upcb->ash.flavor = ARM_SAVED_STATE64; thread->machine.upcb->ash.count = ARM_SAVED_STATE64_COUNT; thread->machine.uNeon->nsh.flavor = ARM_NEON_SAVED_STATE64; @@ -579,7 +649,7 @@ machine_thread_set_state( break; case ARM_THREAD_STATE32: - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); rn = handle_set_arm32_thread_state(tstate, count, thread->machine.upcb); @@ -588,7 +658,7 @@ machine_thread_set_state( #if __arm64__ case ARM_THREAD_STATE64: - if (!thread_is_64bit(thread)) + 
if (!thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); rn = handle_set_arm64_thread_state(tstate, count, thread->machine.upcb); @@ -599,7 +669,7 @@ machine_thread_set_state( if (count != ARM_EXCEPTION_STATE_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); break; @@ -608,7 +678,7 @@ machine_thread_set_state( if (count != ARM_EXCEPTION_STATE64_COUNT) return (KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thread)) + if (!thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); break; @@ -621,7 +691,7 @@ machine_thread_set_state( if (count != ARM_LEGACY_DEBUG_STATE_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_legacy_debug_state_t *) tstate; @@ -698,7 +768,7 @@ machine_thread_set_state( if (count != ARM_DEBUG_STATE32_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_debug_state32_t *) tstate; @@ -781,7 +851,7 @@ machine_thread_set_state( if (count != ARM_DEBUG_STATE64_COUNT) return (KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thread)) + if (!thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_debug_state64_t *) tstate; @@ -886,7 +956,7 @@ machine_thread_set_state( if (count != ARM_NEON_STATE_COUNT) return (KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thread)) + if (thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_neon_state_t *)tstate; @@ -908,7 +978,7 @@ machine_thread_set_state( if (count != ARM_NEON_STATE64_COUNT) return (KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thread)) + if (!thread_is_64bit_data(thread)) return (KERN_INVALID_ARGUMENT); state = (arm_neon_state64_t *)tstate; @@ -958,6 +1028,7 @@ machine_thread_state_initialize( thread->machine.DebugData = NULL; + return KERN_SUCCESS; } @@ -968,7 
+1039,8 @@ machine_thread_state_initialize( kern_return_t machine_thread_dup( thread_t self, - thread_t target) + thread_t target, + __unused boolean_t is_corpse) { struct arm_saved_state *self_saved_state; struct arm_saved_state *target_saved_state; @@ -1056,13 +1128,13 @@ find_debug_state64( */ kern_return_t thread_userstack( - thread_t thread, - int flavor, - thread_state_t tstate, - unsigned int count, - mach_vm_offset_t * user_stack, - int *customstack, - boolean_t is64bit + __unused thread_t thread, + int flavor, + thread_state_t tstate, + unsigned int count, + mach_vm_offset_t * user_stack, + int *customstack, + boolean_t is_64bit_data ) { register_t sp; @@ -1071,7 +1143,7 @@ thread_userstack( case ARM_THREAD_STATE: if (count == ARM_UNIFIED_THREAD_STATE_COUNT) { #if __arm64__ - if (thread_is_64bit(thread)) { + if (is_64bit_data) { sp = ((arm_unified_thread_state_t *)tstate)->ts_64.sp; } else #endif @@ -1086,7 +1158,7 @@ thread_userstack( case ARM_THREAD_STATE32: if (count != ARM_THREAD_STATE32_COUNT) return (KERN_INVALID_ARGUMENT); - if (is64bit) + if (is_64bit_data) return (KERN_INVALID_ARGUMENT); sp = ((arm_thread_state32_t *)tstate)->sp; @@ -1095,7 +1167,7 @@ thread_userstack( case ARM_THREAD_STATE64: if (count != ARM_THREAD_STATE64_COUNT) return (KERN_INVALID_ARGUMENT); - if (!is64bit) + if (!is_64bit_data) return (KERN_INVALID_ARGUMENT); sp = ((arm_thread_state32_t *)tstate)->sp; @@ -1312,7 +1384,7 @@ act_thread_csave(void) } #if __ARM_VFP__ - if (thread_is_64bit(thread)) { + if (thread_is_64bit_data(thread)) { val = ARM_NEON_STATE64_COUNT; kret = machine_thread_get_state(thread, ARM_NEON_STATE64, @@ -1353,7 +1425,7 @@ act_thread_catt(void *ctx) goto out; #if __ARM_VFP__ - if (thread_is_64bit(thread)) { + if (thread_is_64bit_data(thread)) { kret = machine_thread_set_state(thread, ARM_NEON_STATE64, (thread_state_t) & ic->ns, @@ -1390,7 +1462,7 @@ thread_set_wq_state32(thread_t thread, thread_state_t tstate) thread_t curth = current_thread(); spl_t s=0; - 
assert(!thread_is_64bit(thread)); + assert(!thread_is_64bit_data(thread)); saved_state = thread->machine.upcb; saved_state_32 = saved_state32(saved_state); @@ -1427,7 +1499,7 @@ thread_set_wq_state64(thread_t thread, thread_state_t tstate) thread_t curth = current_thread(); spl_t s=0; - assert(thread_is_64bit(thread)); + assert(thread_is_64bit_data(thread)); saved_state = thread->machine.upcb; saved_state_64 = saved_state64(saved_state); @@ -1444,7 +1516,7 @@ thread_set_wq_state64(thread_t thread, thread_state_t tstate) * like sp. */ thread_state64_to_saved_state(state, saved_state); - saved_state_64->cpsr = PSR64_USER64_DEFAULT; + set_saved_state_cpsr(saved_state, PSR64_USER64_DEFAULT); if (curth != thread) { thread_unlock(thread); diff --git a/osfmk/arm64/strncmp.s b/osfmk/arm64/strncmp.s index eee2de722..07c9a36d0 100644 --- a/osfmk/arm64/strncmp.s +++ b/osfmk/arm64/strncmp.s @@ -34,6 +34,8 @@ * of the first mismatched characters interpreted as uint8_t. */ +#include + .globl _strncmp /***************************************************************************** @@ -41,13 +43,14 @@ *****************************************************************************/ .macro EstablishFrame + ARM64_STACK_PROLOG stp fp, lr, [sp, #-16]! mov fp, sp .endm .macro ClearFrameAndReturn ldp fp, lr, [sp], #16 - ret + ARM64_STACK_EPILOG .endm #include "../mach/arm/vm_param.h" diff --git a/osfmk/arm64/strnlen.s b/osfmk/arm64/strnlen.s index 3e0080669..4ec162539 100644 --- a/osfmk/arm64/strnlen.s +++ b/osfmk/arm64/strnlen.s @@ -33,6 +33,8 @@ * is amller, without reading beyond the first maxlen characters of string. */ +#include + .globl _strlen .globl _strnlen @@ -41,13 +43,14 @@ *****************************************************************************/ .macro EstablishFrame + ARM64_STACK_PROLOG stp fp, lr, [sp, #-16]! 
mov fp, sp .endm .macro ClearFrameAndReturn ldp fp, lr, [sp], #16 - ret + ARM64_STACK_EPILOG .endm /***************************************************************************** @@ -116,7 +119,7 @@ _strnlen: ClearFrameAndReturn L_maxlenIsZero: - mov x0, xzr + mov x0, #0 ret // No stack frame, so don't clear it. L_foundNUL: diff --git a/osfmk/bank/bank.c b/osfmk/bank/bank.c index b7d5d11d0..4b183e9b5 100644 --- a/osfmk/bank/bank.c +++ b/osfmk/bank/bank.c @@ -74,7 +74,10 @@ static bank_account_t bank_account_alloc_init(bank_task_t bank_holder, bank_task static bank_task_t get_bank_task_context(task_t task, boolean_t initialize); static void bank_task_dealloc(bank_task_t bank_task, mach_voucher_attr_value_reference_t sync); static kern_return_t bank_account_dealloc_with_sync(bank_account_t bank_account, mach_voucher_attr_value_reference_t sync); -static void bank_rollup_chit_to_tasks(ledger_t bill, bank_task_t bank_holder, bank_task_t bank_merchant); +static void bank_rollup_chit_to_tasks(ledger_t bill, ledger_t bank_holder_ledger, ledger_t bank_merchant_ledger, + int bank_holder_pid, int bank_merchant_pid); +static ledger_t bank_get_bank_task_ledger_with_ref(bank_task_t bank_task); +static void bank_destroy_bank_task_ledger(bank_task_t bank_task); static void init_bank_ledgers(void); static boolean_t bank_task_is_propagate_entitled(task_t t); static struct thread_group *bank_get_bank_task_thread_group(bank_task_t bank_task __unused); @@ -729,7 +732,7 @@ bank_release( * Purpose: Allocate and initialize a bank task structure. * Returns: bank_task_t on Success. * BANK_TASK_NULL: on Failure. - * Notes: Leaves the task and creditcard blank and has only 1 ref, + * Notes: Leaves the task and ledger blank and has only 1 ref, needs to take 1 extra ref after the task field is initialized. 
*/ static bank_task_t @@ -745,7 +748,7 @@ bank_task_alloc_init(task_t task) new_bank_task->bt_voucher_ref = 0; new_bank_task->bt_refs = 1; new_bank_task->bt_made = 0; - new_bank_task->bt_creditcard = NULL; + new_bank_task->bt_ledger = LEDGER_NULL; new_bank_task->bt_hasentitlement = bank_task_is_propagate_entitled(task); queue_init(&new_bank_task->bt_accounts_to_pay); queue_init(&new_bank_task->bt_accounts_to_charge); @@ -813,7 +816,7 @@ bank_account_alloc_init( boolean_t entry_found = FALSE; ledger_t new_ledger = ledger_instantiate(bank_ledger_template, LEDGER_CREATE_INACTIVE_ENTRIES); - if (new_ledger == NULL) + if (new_ledger == LEDGER_NULL) return BANK_ACCOUNT_NULL; ledger_entry_setactive(new_ledger, bank_ledgers.cpu_time); @@ -919,7 +922,7 @@ get_bank_task_context return BANK_TASK_NULL; } /* We won the race. Take a ref on the ledger and initialize bank task. */ - bank_task->bt_creditcard = task->ledger; + bank_task->bt_ledger = task->ledger; #if DEVELOPMENT || DEBUG bank_task->bt_task = task; #endif @@ -954,7 +957,7 @@ bank_task_dealloc( assert(queue_empty(&bank_task->bt_accounts_to_pay)); assert(queue_empty(&bank_task->bt_accounts_to_charge)); - ledger_dereference(bank_task->bt_creditcard); + assert(!LEDGER_VALID(bank_task->bt_ledger)); lck_mtx_destroy(&bank_task->bt_acc_to_pay_lock, &bank_lock_grp); lck_mtx_destroy(&bank_task->bt_acc_to_charge_lock, &bank_lock_grp); @@ -983,12 +986,22 @@ bank_account_dealloc_with_sync( bank_task_t bank_merchant = bank_account->ba_merchant; bank_task_t bank_secureoriginator = bank_account->ba_secureoriginator; bank_task_t bank_proximateprocess = bank_account->ba_proximateprocess; + ledger_t bank_merchant_ledger = LEDGER_NULL; + + /* + * Grab a reference on the bank_merchant_ledger, since we would not be able + * to take bt_acc_to_pay_lock for bank_merchant later. 
+ */ + bank_merchant_ledger = bank_get_bank_task_ledger_with_ref(bank_merchant); /* Grab the acc to pay list lock and check the sync value */ lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock); if (bank_account->ba_made != sync) { lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock); + if (bank_merchant_ledger) { + ledger_dereference(bank_merchant_ledger); + } return KERN_FAILURE; } @@ -1001,8 +1014,10 @@ bank_account_dealloc_with_sync( /* Grab both the acc to pay and acc to charge locks */ lck_mtx_lock(&bank_merchant->bt_acc_to_charge_lock); - bank_rollup_chit_to_tasks(bank_account->ba_bill, bank_holder, bank_merchant); - + /* No need to take ledger reference for bank_holder ledger since bt_acc_to_pay_lock is locked */ + bank_rollup_chit_to_tasks(bank_account->ba_bill, bank_holder->bt_ledger, bank_merchant_ledger, + bank_holder->bt_pid, bank_merchant->bt_pid); + /* Remove the account entry from Accounts need to pay account link list. */ queue_remove(&bank_holder->bt_accounts_to_pay, bank_account, bank_account_t, ba_next_acc_to_pay); @@ -1012,6 +1027,9 @@ bank_account_dealloc_with_sync( lck_mtx_unlock(&bank_merchant->bt_acc_to_charge_lock); lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock); + if (bank_merchant_ledger) { + ledger_dereference(bank_merchant_ledger); + } ledger_dereference(bank_account->ba_bill); /* Drop the reference of bank holder and merchant */ @@ -1038,38 +1056,50 @@ bank_account_dealloc_with_sync( static void bank_rollup_chit_to_tasks( ledger_t bill, - bank_task_t bank_holder, - bank_task_t bank_merchant) + ledger_t bank_holder_ledger, + ledger_t bank_merchant_ledger, + int bank_holder_pid, + int bank_merchant_pid) { ledger_amount_t credit; ledger_amount_t debit; kern_return_t ret; - if (bank_holder == bank_merchant) + if (bank_holder_ledger == bank_merchant_ledger) return; ret = ledger_get_entries(bill, bank_ledgers.cpu_time, &credit, &debit); if (ret == KERN_SUCCESS) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (BANK_CODE(BANK_ACCOUNT_INFO, 
(BANK_SETTLE_CPU_TIME))) | DBG_FUNC_NONE, - bank_merchant->bt_pid, bank_holder->bt_pid, credit, debit, 0); - ledger_credit(bank_holder->bt_creditcard, task_ledgers.cpu_time_billed_to_me, credit); - ledger_debit(bank_holder->bt_creditcard, task_ledgers.cpu_time_billed_to_me, debit); + bank_merchant_pid, bank_holder_pid, credit, debit, 0); - ledger_credit(bank_merchant->bt_creditcard, task_ledgers.cpu_time_billed_to_others, credit); - ledger_debit(bank_merchant->bt_creditcard, task_ledgers.cpu_time_billed_to_others, debit); + if (bank_holder_ledger) { + ledger_credit(bank_holder_ledger, task_ledgers.cpu_time_billed_to_me, credit); + ledger_debit(bank_holder_ledger, task_ledgers.cpu_time_billed_to_me, debit); + } + + if (bank_merchant_ledger) { + ledger_credit(bank_merchant_ledger, task_ledgers.cpu_time_billed_to_others, credit); + ledger_debit(bank_merchant_ledger, task_ledgers.cpu_time_billed_to_others, debit); + } } ret = ledger_get_entries(bill, bank_ledgers.energy, &credit, &debit); if (ret == KERN_SUCCESS) { KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (BANK_CODE(BANK_ACCOUNT_INFO, (BANK_SETTLE_ENERGY))) | DBG_FUNC_NONE, - bank_merchant->bt_pid, bank_holder->bt_pid, credit, debit, 0); - ledger_credit(bank_holder->bt_creditcard, task_ledgers.energy_billed_to_me, credit); - ledger_debit(bank_holder->bt_creditcard, task_ledgers.energy_billed_to_me, debit); + bank_merchant_pid, bank_holder_pid, credit, debit, 0); - ledger_credit(bank_merchant->bt_creditcard, task_ledgers.energy_billed_to_others, credit); - ledger_debit(bank_merchant->bt_creditcard, task_ledgers.energy_billed_to_others, debit); + if (bank_holder_ledger) { + ledger_credit(bank_holder_ledger, task_ledgers.energy_billed_to_me, credit); + ledger_debit(bank_holder_ledger, task_ledgers.energy_billed_to_me, debit); + } + + if (bank_merchant_ledger) { + ledger_credit(bank_merchant_ledger, task_ledgers.energy_billed_to_others, credit); + ledger_debit(bank_merchant_ledger, task_ledgers.energy_billed_to_others, 
debit); + } } } @@ -1091,6 +1121,7 @@ bank_task_destroy(task_t task) task->bank_context = NULL; global_bank_task_unlock(); + bank_destroy_bank_task_ledger(bank_task); bank_task_dealloc(bank_task, 1); } @@ -1200,19 +1231,22 @@ bank_billed_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy) lck_mtx_lock(&bank_task->bt_acc_to_pay_lock); - kr = ledger_get_balance(bank_task->bt_creditcard, task_ledgers.cpu_time_billed_to_me, &temp); - if (kr == KERN_SUCCESS && temp >= 0) { - cpu_balance += temp; - } + /* bt_acc_to_pay_lock locked, no need to take ledger reference for bt_ledger */ + if (bank_task->bt_ledger != LEDGER_NULL) { + kr = ledger_get_balance(bank_task->bt_ledger, task_ledgers.cpu_time_billed_to_me, &temp); + if (kr == KERN_SUCCESS && temp >= 0) { + cpu_balance += temp; + } #if DEVELOPMENT || DEBUG - else { - printf("bank_bill_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp); - } + else { + printf("bank_bill_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp); + } #endif /* DEVELOPMENT || DEBUG */ - kr = ledger_get_balance(bank_task->bt_creditcard, task_ledgers.energy_billed_to_me, &temp); - if (kr == KERN_SUCCESS && temp >= 0) { - energy_balance += temp; + kr = ledger_get_balance(bank_task->bt_ledger, task_ledgers.energy_billed_to_me, &temp); + if (kr == KERN_SUCCESS && temp >= 0) { + energy_balance += temp; + } } queue_iterate(&bank_task->bt_accounts_to_pay, bank_account, bank_account_t, ba_next_acc_to_pay) { @@ -1297,27 +1331,33 @@ bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energ bank_account_t bank_account; int64_t temp = 0; kern_return_t kr; + ledger_t ledger = LEDGER_NULL; if (bank_task == BANK_TASK_NULL) { *cpu_time = 0; *energy = 0; return; } + /* Grab a ledger reference on bt_ledger for bank_task */ + ledger = bank_get_bank_task_ledger_with_ref(bank_task); + lck_mtx_lock(&bank_task->bt_acc_to_charge_lock); - kr = 
ledger_get_balance(bank_task->bt_creditcard, task_ledgers.cpu_time_billed_to_others, &temp); - if (kr == KERN_SUCCESS && temp >= 0) { - cpu_balance += temp; - } + if (ledger) { + kr = ledger_get_balance(ledger, task_ledgers.cpu_time_billed_to_others, &temp); + if (kr == KERN_SUCCESS && temp >= 0) { + cpu_balance += temp; + } #if DEVELOPMENT || DEBUG - else { - printf("bank_serviced_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp); - } + else { + printf("bank_serviced_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp); + } #endif /* DEVELOPMENT || DEBUG */ - kr = ledger_get_balance(bank_task->bt_creditcard, task_ledgers.energy_billed_to_others, &temp); - if (kr == KERN_SUCCESS && temp >= 0) { - energy_balance += temp; + kr = ledger_get_balance(ledger, task_ledgers.energy_billed_to_others, &temp); + if (kr == KERN_SUCCESS && temp >= 0) { + energy_balance += temp; + } } queue_iterate(&bank_task->bt_accounts_to_charge, bank_account, bank_account_t, ba_next_acc_to_charge) { @@ -1338,6 +1378,9 @@ bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energ } } lck_mtx_unlock(&bank_task->bt_acc_to_charge_lock); + if (ledger) { + ledger_dereference(ledger); + } *cpu_time = (uint64_t)cpu_balance; *energy = (uint64_t)energy_balance; return; @@ -1401,6 +1444,44 @@ bank_get_voucher_bank_account(ipc_voucher_t voucher) return BANK_ACCOUNT_NULL; } +/* + * Routine: bank_get_bank_task_ledger_with_ref + * Purpose: Get the bank ledger from the bank task and return a reference to it. + */ +static ledger_t +bank_get_bank_task_ledger_with_ref(bank_task_t bank_task) +{ + ledger_t ledger = LEDGER_NULL; + + lck_mtx_lock(&bank_task->bt_acc_to_pay_lock); + ledger = bank_task->bt_ledger; + if (ledger) { + ledger_reference(ledger); + } + lck_mtx_unlock(&bank_task->bt_acc_to_pay_lock); + + return ledger; +} + +/* + * Routine: bank_destroy_bank_task_ledger + * Purpose: Drop the bank task reference on the task ledger. 
+ */ +static void +bank_destroy_bank_task_ledger(bank_task_t bank_task) +{ + ledger_t ledger; + + /* Remove the ledger reference from the bank task */ + lck_mtx_lock(&bank_task->bt_acc_to_pay_lock); + assert(LEDGER_VALID(bank_task->bt_ledger)); + ledger = bank_task->bt_ledger; + bank_task->bt_ledger = LEDGER_NULL; + lck_mtx_unlock(&bank_task->bt_acc_to_pay_lock); + + ledger_dereference(ledger); +} + /* * Routine: bank_get_bank_account_ledger * Purpose: Get the bankledger from the bank account if ba_merchant different than ba_holder @@ -1408,7 +1489,7 @@ bank_get_voucher_bank_account(ipc_voucher_t voucher) static ledger_t bank_get_bank_account_ledger(bank_account_t bank_account) { - ledger_t bankledger = NULL; + ledger_t bankledger = LEDGER_NULL; if (bank_account != BANK_ACCOUNT_NULL && bank_account->ba_holder != bank_account->ba_merchant) @@ -1437,7 +1518,7 @@ bank_get_bank_task_thread_group(bank_task_t bank_task __unused) static struct thread_group * bank_get_bank_account_thread_group(bank_account_t bank_account __unused) { - thread_group_t banktg = NULL; + struct thread_group *banktg = NULL; return (banktg); @@ -1453,7 +1534,7 @@ kern_return_t bank_get_bank_ledger_and_thread_group( ipc_voucher_t voucher, ledger_t *bankledger, - thread_group_t *banktg) + struct thread_group **banktg) { bank_account_t bank_account; struct thread_group *thread_group = NULL; @@ -1488,7 +1569,7 @@ bank_swap_thread_bank_ledger(thread_t thread __unused, ledger_t new_ledger __unu int64_t effective_energy_consumed = 0; uint64_t thread_energy; - if (old_ledger == NULL && new_ledger == NULL) + if (old_ledger == LEDGER_NULL && new_ledger == LEDGER_NULL) return; assert((thread == current_thread() || thread->started == 0)); @@ -1534,7 +1615,7 @@ bank_swap_thread_bank_ledger(thread_t thread __unused, ledger_t new_ledger __unu thread_unlock(thread); splx(s); - if (old_ledger != NULL) { + if (old_ledger != LEDGER_NULL) { ledger_credit(old_ledger, bank_ledgers.cpu_time, 
effective_ledger_time_consumed); diff --git a/osfmk/bank/bank_internal.h b/osfmk/bank/bank_internal.h index eb8f5599c..e3b3480e2 100644 --- a/osfmk/bank/bank_internal.h +++ b/osfmk/bank/bank_internal.h @@ -66,7 +66,7 @@ typedef struct bank_element * bank_element_t; struct bank_task { struct bank_element bt_elem; /* Bank element */ struct proc_persona_info bt_proc_persona; /* Persona of the process */ - ledger_t bt_creditcard; /* Ledger of the customer task */ + ledger_t bt_ledger; /* Ledger of the customer task */ queue_head_t bt_accounts_to_pay; /* List of accounts worked for me and need to pay */ queue_head_t bt_accounts_to_charge; /* List of accounts I did work and need to charge */ decl_lck_mtx_data(, bt_acc_to_pay_lock) /* Lock to protect accounts to pay list */ @@ -176,7 +176,7 @@ extern void bank_billed_balance(bank_task_t bank_task, uint64_t *cpu_time, uint6 extern void bank_serviced_balance_safe(task_t task, uint64_t *cpu_time, uint64_t *energy); extern void bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy); extern kern_return_t bank_get_bank_ledger_and_thread_group(ipc_voucher_t voucher, - ledger_t *bankledger, thread_group_t *banktg); + ledger_t *bankledger, struct thread_group **banktg); extern void bank_swap_thread_bank_ledger(thread_t thread, ledger_t ledger); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/conf/Makefile.arm64 b/osfmk/conf/Makefile.arm64 index 78235b8be..2c3b7ec5c 100644 --- a/osfmk/conf/Makefile.arm64 +++ b/osfmk/conf/Makefile.arm64 @@ -8,6 +8,7 @@ CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32 HIB_FILES= lz4.o_CFLAGS_ADD += -fbuiltin -O3 + ###################################################################### #END Machine dependent Makefile fragment for arm64 ###################################################################### diff --git a/osfmk/conf/Makefile.template b/osfmk/conf/Makefile.template index d533b91a5..d296cb6a7 100644 --- a/osfmk/conf/Makefile.template +++ 
b/osfmk/conf/Makefile.template @@ -40,14 +40,12 @@ OBJS_NO_CAST_ALIGN = \ cpu_threads.o \ cpuid.o \ locks_i386.o \ + locks_i386_opt.o \ machine_task.o \ mp_desc.o \ pcb.o \ pcb_native.o \ kdp_x86_common.o \ - memory_object.o \ - vm_apple_protect.o \ - vm_map.o \ startup64.o \ affinity.o \ sched_grrr.o \ @@ -60,9 +58,6 @@ OBJS_NO_CAST_ALIGN = \ status.o \ machine_routines.o \ loose_ends.o \ - fips_sha1.o \ - prng_yarrow.o \ - sha1mod.o \ sleh.o \ ccdigest_final_64be.o \ ccdigest_init.o \ @@ -71,17 +66,8 @@ OBJS_NO_CAST_ALIGN = \ cchmac_init.o \ ccsha1.o \ ipc_object.o \ - ccmode_ctr_crypt.o \ - ccmode_factory_ctr_crypt.o \ - ccmode_ctr_init.o \ - ccmode_ctr_setctr.o \ ipc_kmsg.o \ - ipc_right.o \ - bsd_vm.o \ - vm_map_store.o \ - vm_map_store_ll.o \ - vm_map_store_rb.o \ - vm_debug.o + ipc_right.o # Objects that don't want -Wsign-compare warning (15294427) OBJS_NO_SIGN_COMPARE = \ diff --git a/osfmk/conf/Makefile.x86_64 b/osfmk/conf/Makefile.x86_64 index efbb892f6..57759351c 100644 --- a/osfmk/conf/Makefile.x86_64 +++ b/osfmk/conf/Makefile.x86_64 @@ -16,17 +16,21 @@ UNCONFIGURED_HIB_FILES= \ HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS)) # Unconfigured __HIB files must be Mach-O for "setsegname" -WKdmDecompress_new.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG) -WKdmData_new.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG) -hibernate_restore.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG) -hibernate_bootstrap.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG) -bcopy.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG) -bzero.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG) +WKdmDecompress_new.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG) +WKdmData_new.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG) +hibernate_restore.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG) +hibernate_bootstrap.o_CFLAGS_ADD += -fno-stack-protector 
-fno-stack-check $(CFLAGS_NOLTO_FLAG) +bcopy.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG) +bzero.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG) +fp_simd.o_SFLAGS_ADD += -mavx512f # To appear at the beginning of the __HIB segment, emit # as Mach-O so that the linker can enforce symbol order boot_pt.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG) +# fast path lock C leaf functions must be built without stack frames +locks_i386_opt.o_CFLAGS_ADD += -momit-leaf-frame-pointer -O2 + ###################################################################### #END Machine dependent Makefile fragment for x86_64 ###################################################################### diff --git a/osfmk/conf/files b/osfmk/conf/files index 53078380e..342d24d83 100644 --- a/osfmk/conf/files +++ b/osfmk/conf/files @@ -61,6 +61,8 @@ OPTIONS/config_dtrace optional config_dtrace OPTIONS/no_kextd optional no_kextd +OPTIONS/config_quiesce_counter optional config_quiesce_counter + # # gssd files # @@ -119,6 +121,7 @@ osfmk/kern/clock.c standard osfmk/kern/clock_oldops.c standard osfmk/kern/coalition.c optional config_coalitions osfmk/kern/counters.c standard +osfmk/kern/cpu_quiesce.c optional config_quiesce_counter osfmk/kern/debug.c standard osfmk/kern/energy_perf.c standard osfmk/kern/exception.c standard @@ -133,7 +136,7 @@ osfmk/kern/ipc_misc.c standard osfmk/kern/ipc_sync.c standard osfmk/kern/ipc_tt.c standard osfmk/kern/kalloc.c standard -osfmk/kern/kern_ecc.c optional config_ecc_logging +osfmk/kern/ecc_logging.c optional config_ecc_logging osfmk/kern/ktrace_background_notify.c standard osfmk/kern/ledger.c standard osfmk/kern/locks.c standard @@ -145,6 +148,7 @@ osfmk/kern/mk_timer.c standard osfmk/kern/page_decrypt.c standard osfmk/kern/printf.c standard osfmk/kern/priority.c standard +osfmk/kern/priority_queue.c standard osfmk/kern/processor.c standard osfmk/kern/processor_data.c standard osfmk/kern/sched_average.c standard @@ -166,6 
+170,8 @@ osfmk/kern/sysdiagnose.c optional config_sysdiagnose osfmk/kern/task.c standard osfmk/kern/task_policy.c standard osfmk/kern/task_swap.c standard +osfmk/kern/test_lock.c optional development +osfmk/kern/test_lock.c optional debug osfmk/kern/thread.c standard osfmk/kern/thread_act.c standard osfmk/kern/thread_call.c standard @@ -173,10 +179,13 @@ osfmk/kern/thread_group.c standard osfmk/kern/thread_policy.c standard osfmk/kern/timer.c standard osfmk/kern/timer_call.c standard +osfmk/kern/turnstile.c standard +osfmk/kern/ux_handler.c standard osfmk/kern/waitq.c standard osfmk/kern/work_interval.c standard osfmk/kern/xpr.c optional xpr_debug osfmk/kern/zalloc.c standard +osfmk/kern/zcache.c optional config_zcache osfmk/kern/gzalloc.c optional config_gzalloc osfmk/kern/bsd_kern.c optional mach_bsd osfmk/kern/hibernate.c optional hibernation @@ -200,6 +209,7 @@ osfmk/kern/copyout_shim.c optional copyout_shim ./mach/mach_vm_server.c standard ./mach/mach_voucher_server.c standard ./mach/mach_voucher_attr_control_server.c standard +./mach/memory_entry_server.c standard ./mach/memory_object_control_server.c standard ./mach/resource_notify_user.c standard ./mach/upl_server.c standard @@ -207,6 +217,14 @@ osfmk/kern/copyout_shim.c optional copyout_shim ./mach/task_access_user.c standard osfmk/corpses/corpse.c standard osfmk/kern/kern_cdata.c standard +osfmk/tests/kernel_tests.c optional config_xnupost +osfmk/tests/ktest.c optional config_xnupost +osfmk/tests/ktest_accessor.c optional config_xnupost +osfmk/tests/ktest_emit.c optional config_xnupost +osfmk/tests/ktest_global.c optional config_xnupost +osfmk/tests/pmap_tests.c optional config_xnupost +osfmk/tests/bitmap_test.c optional config_xnupost +osfmk/tests/test_thread_call.c optional config_xnupost ./mach/telemetry_notification_user.c optional config_telemetry osfmk/bank/bank.c standard osfmk/atm/atm.c optional config_atm @@ -248,6 +266,7 @@ osfmk/vm/vm_pageout.c standard osfmk/vm/vm_purgeable.c standard 
osfmk/vm/vm_resident.c standard osfmk/vm/vm_shared_region.c standard +osfmk/vm/vm_shared_region_pager.c standard osfmk/vm/vm_swapfile_pager.c standard osfmk/vm/vm_user.c standard osfmk/vm/vm32_user.c standard @@ -281,6 +300,7 @@ osfmk/kperf/meminfo.c optional kperf osfmk/kperf/kperf_timer.c optional kperf osfmk/kperf/kperf_kpc.c optional kperf osfmk/kperf/kdebug_trigger.c optional kperf +osfmk/kperf/lazy.c optional kperf osfmk/kern/kpc_thread.c optional kpc osfmk/kern/kpc_common.c optional kpc @@ -315,17 +335,4 @@ osfmk/corecrypto/ccsha2/src/ccsha256_K.c standard osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c standard osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c standard -osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c standard -osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c standard -osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c standard -osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c standard -osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c standard - -osfmk/prng/random.c standard -osfmk/prng/prng_yarrow.c standard -osfmk/prng/fips_sha1.c standard -osfmk/prng/YarrowCoreLib/port/smf.c standard -osfmk/prng/YarrowCoreLib/src/comp.c standard -osfmk/prng/YarrowCoreLib/src/prng.c standard -osfmk/prng/YarrowCoreLib/src/sha1mod.c standard -osfmk/prng/YarrowCoreLib/src/yarrowUtils.c standard +osfmk/prng/prng_random.c standard diff --git a/osfmk/conf/files.arm b/osfmk/conf/files.arm index 208781e47..11ca56627 100644 --- a/osfmk/conf/files.arm +++ b/osfmk/conf/files.arm @@ -50,7 +50,6 @@ osfmk/arm/strlcpy.c standard osfmk/arm/model_dep.c standard osfmk/arm/pcb.c standard -osfmk/arm/conf.c standard osfmk/arm/rtclock.c standard osfmk/arm/status.c standard osfmk/arm/status_shared.c standard @@ -70,7 +69,6 @@ osfmk/OPTIONS/hi_res_clock optional hi_res_clock # Kernel performance monitoring osfmk/kperf/arm/kperf_mp.c optional kperf -osfmk/kperf/arm/kperf_meminfo.c optional kperf osfmk/arm/kpc_arm.c optional kpc osfmk/arm/monotonic_arm.c optional monotonic 
diff --git a/osfmk/conf/files.arm64 b/osfmk/conf/files.arm64 index 61e470322..68611422f 100644 --- a/osfmk/conf/files.arm64 +++ b/osfmk/conf/files.arm64 @@ -55,7 +55,6 @@ osfmk/arm/strlcpy.c standard osfmk/arm/model_dep.c standard osfmk/arm64/pcb.c standard -osfmk/arm/conf.c standard osfmk/arm/rtclock.c standard osfmk/arm64/status.c standard osfmk/arm/status_shared.c standard @@ -74,7 +73,6 @@ osfmk/OPTIONS/hi_res_clock optional hi_res_clock # Kernel performance monitoring osfmk/kperf/arm/kperf_mp.c optional kperf -osfmk/kperf/arm/kperf_meminfo.c optional kperf osfmk/arm64/kpc.c optional kpc osfmk/arm64/monotonic_arm64.c optional monotonic diff --git a/osfmk/conf/files.x86_64 b/osfmk/conf/files.x86_64 index bf7e53cc6..a696fc0ac 100644 --- a/osfmk/conf/files.x86_64 +++ b/osfmk/conf/files.x86_64 @@ -47,7 +47,8 @@ osfmk/i386/ktss.c standard osfmk/i386/ldt.c standard osfmk/x86_64/loose_ends.c standard osfmk/x86_64/copyio.c standard -osfmk/i386/locks_i386.c standard +osfmk/i386/locks_i386.c standard +osfmk/i386/locks_i386_opt.c standard osfmk/x86_64/locore.s standard osfmk/x86_64/lowmem_vectors.c standard osfmk/x86_64/cswitch.s standard @@ -73,7 +74,6 @@ osfmk/i386/commpage/commpage.c standard osfmk/i386/commpage/commpage_asm.s standard osfmk/i386/commpage/fifo_queues.s standard -osfmk/i386/AT386/conf.c standard osfmk/i386/AT386/model_dep.c standard osfmk/i386/lapic.c standard @@ -114,7 +114,6 @@ osfmk/kern/hv_support.c optional hypervisor # Kernel performance monitoring osfmk/kperf/x86_64/kperf_mp.c optional kperf -osfmk/kperf/x86_64/kperf_meminfo.c optional kperf osfmk/x86_64/kpc_x86.c optional kpc osfmk/x86_64/monotonic_x86_64.c optional monotonic diff --git a/osfmk/console/serial_console.c b/osfmk/console/serial_console.c index 8161ef280..2a74280b8 100644 --- a/osfmk/console/serial_console.c +++ b/osfmk/console/serial_console.c @@ -51,6 +51,13 @@ #include #endif +#ifdef CONFIG_XNUPOST +#include +kern_return_t console_serial_test(void); +kern_return_t 
console_serial_alloc_rel_tests(void); +kern_return_t console_serial_parallel_log_tests(void); +#define MAX_CPU_SLOTS (MAX_CPUS + 2) +#endif #ifndef MAX_CPU_SLOTS #define MAX_CPU_SLOTS (MAX_CPUS) @@ -127,7 +134,7 @@ SECURITY_READ_ONLY_EARLY(uint32_t) nconsops = (sizeof cons_ops / sizeof cons_ops uint32_t cons_ops_index = VC_CONS_OPS; -#ifdef __arm__ +#if defined(__x86_64__) || defined(__arm__) // NMI static variables #define NMI_STRING_SIZE 32 char nmi_string[NMI_STRING_SIZE] = "afDIGHr84A84jh19Kphgp428DNPdnapq"; @@ -598,7 +605,7 @@ _serial_getc(__unused int a, __unused int b, boolean_t wait, __unused boolean_t c = serial_getc(); } while (wait && c < 0); -#ifdef __arm__ +#if defined(__x86_64__) || defined(__arm__) // Check for the NMI string if (c == nmi_string[nmi_counter]) { nmi_counter++; @@ -645,3 +652,174 @@ vcgetc(__unused int l, __unused int u, __unused boolean_t wait, __unused boolean return 0; } +#ifdef CONFIG_XNUPOST +static uint32_t cons_test_ops_count = 0; + +/* + * Try to do multiple cpu buffer allocs and free and intentionally + * allow for pre-emption. 
+ */ +static void +alloc_free_func(void * arg, wait_result_t wres __unused) +{ + console_buf_t * cbp = NULL; + int count = (int)arg; + + T_LOG("Doing %d iterations of console cpu alloc and free.", count); + + while (count-- > 0) { + (void)hw_atomic_add(&cons_test_ops_count, 1); + cbp = (console_buf_t *)console_cpu_alloc(0); + if (cbp == NULL) { + T_ASSERT_NOTNULL(cbp, "cpu allocation failed"); + } + console_cpu_free(cbp); + cbp = NULL; + /* give chance to another thread to come in */ + delay(10); + } +} + +/* + * Log to console by multiple methods - printf, unbuffered write, console_write() + */ +static void +log_to_console_func(void * arg __unused, wait_result_t wres __unused) +{ + uint64_t thread_id = current_thread()->thread_id; + char somedata[10] = "123456789"; + for (int i = 0; i < 26; i++) { + (void)hw_atomic_add(&cons_test_ops_count, 1); + printf(" thid: %llu printf iteration %d\n", thread_id, i); + cnputc_unbuffered((char)('A' + i)); + cnputc_unbuffered('\n'); + console_write((char *)somedata, sizeof(somedata)); + delay(10); + } + printf("finished the log_to_console_func operations\n\n"); +} + +kern_return_t +console_serial_parallel_log_tests(void) +{ + thread_t thread; + kern_return_t kr; + cons_test_ops_count = 0; + + kr = kernel_thread_start(log_to_console_func, NULL, &thread); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully"); + + delay(100); + + log_to_console_func(NULL, 0); + + /* wait until other thread has also finished */ + while (cons_test_ops_count < 52) { + delay(1000); + } + + thread_deallocate(thread); + T_LOG("parallel_logging tests is now complete. 
From this point forward we expect full lines\n"); + return KERN_SUCCESS; +} + +kern_return_t +console_serial_alloc_rel_tests(void) +{ + unsigned long i, free_buf_count = 0; + uint32_t * p; + console_buf_t * cbp; + thread_t thread; + kern_return_t kr; + + T_LOG("doing alloc/release tests"); + + for (i = 0; i < MAX_CPU_SLOTS; i++) { + p = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t))); + cbp = (console_buf_t *)(void *)p; + /* p should either be allocated cpu buffer or have CPU_BUF_FREE_HEX in it */ + T_ASSERT(*p == CPU_BUF_FREE_HEX || cbp->buf_base == &cbp->buf[0], ""); + if (*p == CPU_BUF_FREE_HEX) { + free_buf_count++; + } + } + + T_ASSERT_GE_ULONG(free_buf_count, 2, "At least 2 buffers should be free"); + cons_test_ops_count = 0; + + kr = kernel_thread_start(alloc_free_func, (void *)1000, &thread); + T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully"); + + /* yeild cpu to give other thread chance to get on-core */ + delay(100); + + alloc_free_func((void *)1000, 0); + + /* wait until other thread finishes its tasks */ + while (cons_test_ops_count < 2000) { + delay(1000); + } + + thread_deallocate(thread); + /* verify again that atleast 2 slots are free */ + free_buf_count = 0; + for (i = 0; i < MAX_CPU_SLOTS; i++) { + p = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t))); + cbp = (console_buf_t *)(void *)p; + /* p should either be allocated cpu buffer or have CPU_BUF_FREE_HEX in it */ + T_ASSERT(*p == CPU_BUF_FREE_HEX || cbp->buf_base == &cbp->buf[0], ""); + if (*p == CPU_BUF_FREE_HEX) { + free_buf_count++; + } + } + T_ASSERT_GE_ULONG(free_buf_count, 2, "At least 2 buffers should be free after alloc free tests"); + + return KERN_SUCCESS; +} + +kern_return_t +console_serial_test(void) +{ + unsigned long i; + char buffer[CPU_BUFFER_LEN]; + uint32_t * p; + console_buf_t * cbp; + + T_LOG("Checking console_ring status."); + 
T_ASSERT_EQ_INT(console_ring.len, KERN_CONSOLE_RING_SIZE, "Console ring size is not correct."); + T_ASSERT_GT_INT(KERN_CONSOLE_BUF_SIZE, KERN_CONSOLE_RING_SIZE, "kernel console buffer size is < allocation."); + + /* select the next slot from the per cpu buffers at end of console_ring.buffer */ + for (i = 0; i < MAX_CPU_SLOTS; i++) { + p = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t))); + cbp = (console_buf_t *)(void *)p; + /* p should either be allocated cpu buffer or have CPU_BUF_FREE_HEX in it */ + T_ASSERT(*p == CPU_BUF_FREE_HEX || cbp->buf_base == &cbp->buf[0], "verified initialization of cpu buffers p=%p", (void *)p); + } + + /* setup buffer to be chars */ + for (i = 0; i < CPU_BUFFER_LEN; i++) { + buffer[i] = (char)('0' + (i % 10)); + } + buffer[CPU_BUFFER_LEN - 1] = '\0'; + + T_LOG("Printing %d char string to serial one char at a time.", CPU_BUFFER_LEN); + for (i = 0; i < CPU_BUFFER_LEN; i++) { + printf("%c", buffer[i]); + } + printf("End\n"); + T_LOG("Printing %d char string to serial as a whole", CPU_BUFFER_LEN); + printf("%s\n", buffer); + + T_LOG("Using console_write call repeatedly for 100 iterations"); + for (i = 0; i < 100; i++) { + console_write(&buffer[0], 14); + if ((i % 6) == 0) + printf("\n"); + } + printf("\n"); + + T_LOG("Using T_LOG to print buffer %s", buffer); + return KERN_SUCCESS; +} +#endif diff --git a/osfmk/corecrypto/cc/src/cc_try_abort.c b/osfmk/corecrypto/cc/src/cc_try_abort.c index 2a0437671..a9e11890a 100644 --- a/osfmk/corecrypto/cc/src/cc_try_abort.c +++ b/osfmk/corecrypto/cc/src/cc_try_abort.c @@ -41,10 +41,10 @@ #include void cc_try_abort(const char * msg CC_UNUSED , ...) { - panic(msg); + panic("%s", msg); } -#elif CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKIT +#elif CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKIT || CC_RTKITROM void cc_try_abort(const char * msg CC_UNUSED, ...) 
{ //Do nothing and return because we don't have panic() in those diff --git a/osfmk/corecrypto/ccaes/src/aes_tab.c b/osfmk/corecrypto/ccaes/src/aes_tab.c deleted file mode 100644 index 0fe7b19a6..000000000 --- a/osfmk/corecrypto/ccaes/src/aes_tab.c +++ /dev/null @@ -1,1061 +0,0 @@ -/* - * aes_tab.c - * corecrypto - * - * Created on 12/12/2010 - * - * Copyright (c) 2010,2015 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* LibTomCrypt, modular cryptographic library -- Tom St Denis - * - * LibTomCrypt is a library that provides various cryptographic - * algorithms in a highly modular and flexible manner. - * - * The library is free for all purposes without any express - * guarantee it works. 
- * - * Tom St Denis, tomstdenis@gmail.com, http://libtom.org - */ - -/* The precomputed tables for AES */ -/* -Te0[x] = S [x].[02, 01, 01, 03]; -Te1[x] = S [x].[03, 02, 01, 01]; -Te2[x] = S [x].[01, 03, 02, 01]; -Te3[x] = S [x].[01, 01, 03, 02]; -Te4[x] = S [x].[01, 01, 01, 01]; - -Td0[x] = Si[x].[0e, 09, 0d, 0b]; -Td1[x] = Si[x].[0b, 0e, 09, 0d]; -Td2[x] = Si[x].[0d, 0b, 0e, 09]; -Td3[x] = Si[x].[09, 0d, 0b, 0e]; -Td4[x] = Si[x].[01, 01, 01, 01]; -*/ - -#include - -/*! - @file aes_tab.c - AES tables -*/ -static const uint32_t TE0[256] = { - 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, - 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554, - 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d, - 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a, - 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87, - 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b, - 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea, - 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b, - 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a, - 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f, - 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108, - 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f, - 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e, - 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5, - 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d, - 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f, - 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e, - 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb, - 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce, - 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497, - 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c, - 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed, - 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b, - 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a, - 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16, - 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594, - 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81, - 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3, - 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a, - 
0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504, - 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163, - 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d, - 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f, - 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739, - 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47, - 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395, - 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f, - 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883, - 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c, - 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76, - 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e, - 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4, - 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6, - 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b, - 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7, - 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0, - 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25, - 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818, - 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72, - 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651, - 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21, - 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85, - 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa, - 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12, - 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0, - 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9, - 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133, - 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7, - 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920, - 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a, - 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17, - 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8, - 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11, - 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a, -}; - -#ifndef PELI_TAB -static const uint32_t Te4[256] = { - 0x63636363, 0x7c7c7c7c, 0x77777777, 0x7b7b7b7b, - 0xf2f2f2f2, 0x6b6b6b6b, 0x6f6f6f6f, 0xc5c5c5c5, - 0x30303030, 0x01010101, 0x67676767, 0x2b2b2b2b, - 0xfefefefe, 0xd7d7d7d7, 0xabababab, 
0x76767676, - 0xcacacaca, 0x82828282, 0xc9c9c9c9, 0x7d7d7d7d, - 0xfafafafa, 0x59595959, 0x47474747, 0xf0f0f0f0, - 0xadadadad, 0xd4d4d4d4, 0xa2a2a2a2, 0xafafafaf, - 0x9c9c9c9c, 0xa4a4a4a4, 0x72727272, 0xc0c0c0c0, - 0xb7b7b7b7, 0xfdfdfdfd, 0x93939393, 0x26262626, - 0x36363636, 0x3f3f3f3f, 0xf7f7f7f7, 0xcccccccc, - 0x34343434, 0xa5a5a5a5, 0xe5e5e5e5, 0xf1f1f1f1, - 0x71717171, 0xd8d8d8d8, 0x31313131, 0x15151515, - 0x04040404, 0xc7c7c7c7, 0x23232323, 0xc3c3c3c3, - 0x18181818, 0x96969696, 0x05050505, 0x9a9a9a9a, - 0x07070707, 0x12121212, 0x80808080, 0xe2e2e2e2, - 0xebebebeb, 0x27272727, 0xb2b2b2b2, 0x75757575, - 0x09090909, 0x83838383, 0x2c2c2c2c, 0x1a1a1a1a, - 0x1b1b1b1b, 0x6e6e6e6e, 0x5a5a5a5a, 0xa0a0a0a0, - 0x52525252, 0x3b3b3b3b, 0xd6d6d6d6, 0xb3b3b3b3, - 0x29292929, 0xe3e3e3e3, 0x2f2f2f2f, 0x84848484, - 0x53535353, 0xd1d1d1d1, 0x00000000, 0xedededed, - 0x20202020, 0xfcfcfcfc, 0xb1b1b1b1, 0x5b5b5b5b, - 0x6a6a6a6a, 0xcbcbcbcb, 0xbebebebe, 0x39393939, - 0x4a4a4a4a, 0x4c4c4c4c, 0x58585858, 0xcfcfcfcf, - 0xd0d0d0d0, 0xefefefef, 0xaaaaaaaa, 0xfbfbfbfb, - 0x43434343, 0x4d4d4d4d, 0x33333333, 0x85858585, - 0x45454545, 0xf9f9f9f9, 0x02020202, 0x7f7f7f7f, - 0x50505050, 0x3c3c3c3c, 0x9f9f9f9f, 0xa8a8a8a8, - 0x51515151, 0xa3a3a3a3, 0x40404040, 0x8f8f8f8f, - 0x92929292, 0x9d9d9d9d, 0x38383838, 0xf5f5f5f5, - 0xbcbcbcbc, 0xb6b6b6b6, 0xdadadada, 0x21212121, - 0x10101010, 0xffffffff, 0xf3f3f3f3, 0xd2d2d2d2, - 0xcdcdcdcd, 0x0c0c0c0c, 0x13131313, 0xecececec, - 0x5f5f5f5f, 0x97979797, 0x44444444, 0x17171717, - 0xc4c4c4c4, 0xa7a7a7a7, 0x7e7e7e7e, 0x3d3d3d3d, - 0x64646464, 0x5d5d5d5d, 0x19191919, 0x73737373, - 0x60606060, 0x81818181, 0x4f4f4f4f, 0xdcdcdcdc, - 0x22222222, 0x2a2a2a2a, 0x90909090, 0x88888888, - 0x46464646, 0xeeeeeeee, 0xb8b8b8b8, 0x14141414, - 0xdededede, 0x5e5e5e5e, 0x0b0b0b0b, 0xdbdbdbdb, - 0xe0e0e0e0, 0x32323232, 0x3a3a3a3a, 0x0a0a0a0a, - 0x49494949, 0x06060606, 0x24242424, 0x5c5c5c5c, - 0xc2c2c2c2, 0xd3d3d3d3, 0xacacacac, 0x62626262, - 0x91919191, 0x95959595, 0xe4e4e4e4, 
0x79797979, - 0xe7e7e7e7, 0xc8c8c8c8, 0x37373737, 0x6d6d6d6d, - 0x8d8d8d8d, 0xd5d5d5d5, 0x4e4e4e4e, 0xa9a9a9a9, - 0x6c6c6c6c, 0x56565656, 0xf4f4f4f4, 0xeaeaeaea, - 0x65656565, 0x7a7a7a7a, 0xaeaeaeae, 0x08080808, - 0xbabababa, 0x78787878, 0x25252525, 0x2e2e2e2e, - 0x1c1c1c1c, 0xa6a6a6a6, 0xb4b4b4b4, 0xc6c6c6c6, - 0xe8e8e8e8, 0xdddddddd, 0x74747474, 0x1f1f1f1f, - 0x4b4b4b4b, 0xbdbdbdbd, 0x8b8b8b8b, 0x8a8a8a8a, - 0x70707070, 0x3e3e3e3e, 0xb5b5b5b5, 0x66666666, - 0x48484848, 0x03030303, 0xf6f6f6f6, 0x0e0e0e0e, - 0x61616161, 0x35353535, 0x57575757, 0xb9b9b9b9, - 0x86868686, 0xc1c1c1c1, 0x1d1d1d1d, 0x9e9e9e9e, - 0xe1e1e1e1, 0xf8f8f8f8, 0x98989898, 0x11111111, - 0x69696969, 0xd9d9d9d9, 0x8e8e8e8e, 0x94949494, - 0x9b9b9b9b, 0x1e1e1e1e, 0x87878787, 0xe9e9e9e9, - 0xcececece, 0x55555555, 0x28282828, 0xdfdfdfdf, - 0x8c8c8c8c, 0xa1a1a1a1, 0x89898989, 0x0d0d0d0d, - 0xbfbfbfbf, 0xe6e6e6e6, 0x42424242, 0x68686868, - 0x41414141, 0x99999999, 0x2d2d2d2d, 0x0f0f0f0f, - 0xb0b0b0b0, 0x54545454, 0xbbbbbbbb, 0x16161616, -}; -#endif - -#ifndef ENCRYPT_ONLY - -static const uint32_t TD0[256] = { - 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96, - 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393, - 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25, - 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f, - 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1, - 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6, - 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da, - 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844, - 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd, - 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4, - 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45, - 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94, - 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7, - 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a, - 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5, - 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c, - 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1, - 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a, - 0x342e539d, 
0xa2f355a0, 0x058ae132, 0xa4f6eb75, - 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051, - 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46, - 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff, - 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77, - 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb, - 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000, - 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e, - 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927, - 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a, - 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e, - 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16, - 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d, - 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8, - 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd, - 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34, - 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163, - 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120, - 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d, - 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0, - 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422, - 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef, - 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36, - 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4, - 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662, - 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5, - 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3, - 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b, - 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8, - 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6, - 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6, - 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0, - 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815, - 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f, - 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df, - 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f, - 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e, - 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713, - 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89, - 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c, - 0x9cd2df59, 
0x55f2733f, 0x1814ce79, 0x73c737bf, - 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86, - 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f, - 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541, - 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190, - 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742, -}; - -static const uint32_t Td4[256] = { - 0x52525252, 0x09090909, 0x6a6a6a6a, 0xd5d5d5d5, - 0x30303030, 0x36363636, 0xa5a5a5a5, 0x38383838, - 0xbfbfbfbf, 0x40404040, 0xa3a3a3a3, 0x9e9e9e9e, - 0x81818181, 0xf3f3f3f3, 0xd7d7d7d7, 0xfbfbfbfb, - 0x7c7c7c7c, 0xe3e3e3e3, 0x39393939, 0x82828282, - 0x9b9b9b9b, 0x2f2f2f2f, 0xffffffff, 0x87878787, - 0x34343434, 0x8e8e8e8e, 0x43434343, 0x44444444, - 0xc4c4c4c4, 0xdededede, 0xe9e9e9e9, 0xcbcbcbcb, - 0x54545454, 0x7b7b7b7b, 0x94949494, 0x32323232, - 0xa6a6a6a6, 0xc2c2c2c2, 0x23232323, 0x3d3d3d3d, - 0xeeeeeeee, 0x4c4c4c4c, 0x95959595, 0x0b0b0b0b, - 0x42424242, 0xfafafafa, 0xc3c3c3c3, 0x4e4e4e4e, - 0x08080808, 0x2e2e2e2e, 0xa1a1a1a1, 0x66666666, - 0x28282828, 0xd9d9d9d9, 0x24242424, 0xb2b2b2b2, - 0x76767676, 0x5b5b5b5b, 0xa2a2a2a2, 0x49494949, - 0x6d6d6d6d, 0x8b8b8b8b, 0xd1d1d1d1, 0x25252525, - 0x72727272, 0xf8f8f8f8, 0xf6f6f6f6, 0x64646464, - 0x86868686, 0x68686868, 0x98989898, 0x16161616, - 0xd4d4d4d4, 0xa4a4a4a4, 0x5c5c5c5c, 0xcccccccc, - 0x5d5d5d5d, 0x65656565, 0xb6b6b6b6, 0x92929292, - 0x6c6c6c6c, 0x70707070, 0x48484848, 0x50505050, - 0xfdfdfdfd, 0xedededed, 0xb9b9b9b9, 0xdadadada, - 0x5e5e5e5e, 0x15151515, 0x46464646, 0x57575757, - 0xa7a7a7a7, 0x8d8d8d8d, 0x9d9d9d9d, 0x84848484, - 0x90909090, 0xd8d8d8d8, 0xabababab, 0x00000000, - 0x8c8c8c8c, 0xbcbcbcbc, 0xd3d3d3d3, 0x0a0a0a0a, - 0xf7f7f7f7, 0xe4e4e4e4, 0x58585858, 0x05050505, - 0xb8b8b8b8, 0xb3b3b3b3, 0x45454545, 0x06060606, - 0xd0d0d0d0, 0x2c2c2c2c, 0x1e1e1e1e, 0x8f8f8f8f, - 0xcacacaca, 0x3f3f3f3f, 0x0f0f0f0f, 0x02020202, - 0xc1c1c1c1, 0xafafafaf, 0xbdbdbdbd, 0x03030303, - 0x01010101, 0x13131313, 0x8a8a8a8a, 0x6b6b6b6b, - 0x3a3a3a3a, 0x91919191, 0x11111111, 0x41414141, - 0x4f4f4f4f, 
0x67676767, 0xdcdcdcdc, 0xeaeaeaea, - 0x97979797, 0xf2f2f2f2, 0xcfcfcfcf, 0xcececece, - 0xf0f0f0f0, 0xb4b4b4b4, 0xe6e6e6e6, 0x73737373, - 0x96969696, 0xacacacac, 0x74747474, 0x22222222, - 0xe7e7e7e7, 0xadadadad, 0x35353535, 0x85858585, - 0xe2e2e2e2, 0xf9f9f9f9, 0x37373737, 0xe8e8e8e8, - 0x1c1c1c1c, 0x75757575, 0xdfdfdfdf, 0x6e6e6e6e, - 0x47474747, 0xf1f1f1f1, 0x1a1a1a1a, 0x71717171, - 0x1d1d1d1d, 0x29292929, 0xc5c5c5c5, 0x89898989, - 0x6f6f6f6f, 0xb7b7b7b7, 0x62626262, 0x0e0e0e0e, - 0xaaaaaaaa, 0x18181818, 0xbebebebe, 0x1b1b1b1b, - 0xfcfcfcfc, 0x56565656, 0x3e3e3e3e, 0x4b4b4b4b, - 0xc6c6c6c6, 0xd2d2d2d2, 0x79797979, 0x20202020, - 0x9a9a9a9a, 0xdbdbdbdb, 0xc0c0c0c0, 0xfefefefe, - 0x78787878, 0xcdcdcdcd, 0x5a5a5a5a, 0xf4f4f4f4, - 0x1f1f1f1f, 0xdddddddd, 0xa8a8a8a8, 0x33333333, - 0x88888888, 0x07070707, 0xc7c7c7c7, 0x31313131, - 0xb1b1b1b1, 0x12121212, 0x10101010, 0x59595959, - 0x27272727, 0x80808080, 0xecececec, 0x5f5f5f5f, - 0x60606060, 0x51515151, 0x7f7f7f7f, 0xa9a9a9a9, - 0x19191919, 0xb5b5b5b5, 0x4a4a4a4a, 0x0d0d0d0d, - 0x2d2d2d2d, 0xe5e5e5e5, 0x7a7a7a7a, 0x9f9f9f9f, - 0x93939393, 0xc9c9c9c9, 0x9c9c9c9c, 0xefefefef, - 0xa0a0a0a0, 0xe0e0e0e0, 0x3b3b3b3b, 0x4d4d4d4d, - 0xaeaeaeae, 0x2a2a2a2a, 0xf5f5f5f5, 0xb0b0b0b0, - 0xc8c8c8c8, 0xebebebeb, 0xbbbbbbbb, 0x3c3c3c3c, - 0x83838383, 0x53535353, 0x99999999, 0x61616161, - 0x17171717, 0x2b2b2b2b, 0x04040404, 0x7e7e7e7e, - 0xbabababa, 0x77777777, 0xd6d6d6d6, 0x26262626, - 0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363, - 0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d, -}; - -#endif /* ENCRYPT_ONLY */ - -#ifdef LTC_SMALL_CODE - -#define Te0(x) TE0[x] -#define Te1(x) RORc(TE0[x], 8) -#define Te2(x) RORc(TE0[x], 16) -#define Te3(x) RORc(TE0[x], 24) - -#define Td0(x) TD0[x] -#define Td1(x) RORc(TD0[x], 8) -#define Td2(x) RORc(TD0[x], 16) -#define Td3(x) RORc(TD0[x], 24) - -#define Te4_0 0x000000FF & Te4 -#define Te4_1 0x0000FF00 & Te4 -#define Te4_2 0x00FF0000 & Te4 -#define Te4_3 0xFF000000 & Te4 - -#else - -#define Te0(x) 
TE0[x] -#define Te1(x) TE1[x] -#define Te2(x) TE2[x] -#define Te3(x) TE3[x] - -#define Td0(x) TD0[x] -#define Td1(x) TD1[x] -#define Td2(x) TD2[x] -#define Td3(x) TD3[x] - -static const uint32_t TE1[256] = { - 0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b, - 0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5, - 0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b, - 0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676, - 0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d, - 0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0, - 0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf, - 0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0, - 0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626, - 0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc, - 0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1, - 0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515, - 0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3, - 0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a, - 0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2, - 0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575, - 0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a, - 0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0, - 0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3, - 0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484, - 0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded, - 0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b, - 0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939, - 0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf, - 0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb, - 0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585, - 0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f, - 0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8, - 0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f, - 0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5, - 0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121, - 0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2, - 0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec, - 0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717, - 0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d, - 0xacc86464, 0xe7ba5d5d, 0x2b321919, 
0x95e67373, - 0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc, - 0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888, - 0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414, - 0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb, - 0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a, - 0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c, - 0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262, - 0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979, - 0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d, - 0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9, - 0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea, - 0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808, - 0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e, - 0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6, - 0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f, - 0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a, - 0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666, - 0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e, - 0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9, - 0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e, - 0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111, - 0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494, - 0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9, - 0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf, - 0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d, - 0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868, - 0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f, - 0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616, -}; -static const uint32_t TE2[256] = { - 0x63a5c663, 0x7c84f87c, 0x7799ee77, 0x7b8df67b, - 0xf20dfff2, 0x6bbdd66b, 0x6fb1de6f, 0xc55491c5, - 0x30506030, 0x01030201, 0x67a9ce67, 0x2b7d562b, - 0xfe19e7fe, 0xd762b5d7, 0xabe64dab, 0x769aec76, - 0xca458fca, 0x829d1f82, 0xc94089c9, 0x7d87fa7d, - 0xfa15effa, 0x59ebb259, 0x47c98e47, 0xf00bfbf0, - 0xadec41ad, 0xd467b3d4, 0xa2fd5fa2, 0xafea45af, - 0x9cbf239c, 0xa4f753a4, 0x7296e472, 0xc05b9bc0, - 0xb7c275b7, 0xfd1ce1fd, 0x93ae3d93, 0x266a4c26, - 0x365a6c36, 0x3f417e3f, 0xf702f5f7, 0xcc4f83cc, - 0x345c6834, 0xa5f451a5, 0xe534d1e5, 
0xf108f9f1, - 0x7193e271, 0xd873abd8, 0x31536231, 0x153f2a15, - 0x040c0804, 0xc75295c7, 0x23654623, 0xc35e9dc3, - 0x18283018, 0x96a13796, 0x050f0a05, 0x9ab52f9a, - 0x07090e07, 0x12362412, 0x809b1b80, 0xe23ddfe2, - 0xeb26cdeb, 0x27694e27, 0xb2cd7fb2, 0x759fea75, - 0x091b1209, 0x839e1d83, 0x2c74582c, 0x1a2e341a, - 0x1b2d361b, 0x6eb2dc6e, 0x5aeeb45a, 0xa0fb5ba0, - 0x52f6a452, 0x3b4d763b, 0xd661b7d6, 0xb3ce7db3, - 0x297b5229, 0xe33edde3, 0x2f715e2f, 0x84971384, - 0x53f5a653, 0xd168b9d1, 0x00000000, 0xed2cc1ed, - 0x20604020, 0xfc1fe3fc, 0xb1c879b1, 0x5bedb65b, - 0x6abed46a, 0xcb468dcb, 0xbed967be, 0x394b7239, - 0x4ade944a, 0x4cd4984c, 0x58e8b058, 0xcf4a85cf, - 0xd06bbbd0, 0xef2ac5ef, 0xaae54faa, 0xfb16edfb, - 0x43c58643, 0x4dd79a4d, 0x33556633, 0x85941185, - 0x45cf8a45, 0xf910e9f9, 0x02060402, 0x7f81fe7f, - 0x50f0a050, 0x3c44783c, 0x9fba259f, 0xa8e34ba8, - 0x51f3a251, 0xa3fe5da3, 0x40c08040, 0x8f8a058f, - 0x92ad3f92, 0x9dbc219d, 0x38487038, 0xf504f1f5, - 0xbcdf63bc, 0xb6c177b6, 0xda75afda, 0x21634221, - 0x10302010, 0xff1ae5ff, 0xf30efdf3, 0xd26dbfd2, - 0xcd4c81cd, 0x0c14180c, 0x13352613, 0xec2fc3ec, - 0x5fe1be5f, 0x97a23597, 0x44cc8844, 0x17392e17, - 0xc45793c4, 0xa7f255a7, 0x7e82fc7e, 0x3d477a3d, - 0x64acc864, 0x5de7ba5d, 0x192b3219, 0x7395e673, - 0x60a0c060, 0x81981981, 0x4fd19e4f, 0xdc7fa3dc, - 0x22664422, 0x2a7e542a, 0x90ab3b90, 0x88830b88, - 0x46ca8c46, 0xee29c7ee, 0xb8d36bb8, 0x143c2814, - 0xde79a7de, 0x5ee2bc5e, 0x0b1d160b, 0xdb76addb, - 0xe03bdbe0, 0x32566432, 0x3a4e743a, 0x0a1e140a, - 0x49db9249, 0x060a0c06, 0x246c4824, 0x5ce4b85c, - 0xc25d9fc2, 0xd36ebdd3, 0xacef43ac, 0x62a6c462, - 0x91a83991, 0x95a43195, 0xe437d3e4, 0x798bf279, - 0xe732d5e7, 0xc8438bc8, 0x37596e37, 0x6db7da6d, - 0x8d8c018d, 0xd564b1d5, 0x4ed29c4e, 0xa9e049a9, - 0x6cb4d86c, 0x56faac56, 0xf407f3f4, 0xea25cfea, - 0x65afca65, 0x7a8ef47a, 0xaee947ae, 0x08181008, - 0xbad56fba, 0x7888f078, 0x256f4a25, 0x2e725c2e, - 0x1c24381c, 0xa6f157a6, 0xb4c773b4, 0xc65197c6, - 0xe823cbe8, 0xdd7ca1dd, 0x749ce874, 
0x1f213e1f, - 0x4bdd964b, 0xbddc61bd, 0x8b860d8b, 0x8a850f8a, - 0x7090e070, 0x3e427c3e, 0xb5c471b5, 0x66aacc66, - 0x48d89048, 0x03050603, 0xf601f7f6, 0x0e121c0e, - 0x61a3c261, 0x355f6a35, 0x57f9ae57, 0xb9d069b9, - 0x86911786, 0xc15899c1, 0x1d273a1d, 0x9eb9279e, - 0xe138d9e1, 0xf813ebf8, 0x98b32b98, 0x11332211, - 0x69bbd269, 0xd970a9d9, 0x8e89078e, 0x94a73394, - 0x9bb62d9b, 0x1e223c1e, 0x87921587, 0xe920c9e9, - 0xce4987ce, 0x55ffaa55, 0x28785028, 0xdf7aa5df, - 0x8c8f038c, 0xa1f859a1, 0x89800989, 0x0d171a0d, - 0xbfda65bf, 0xe631d7e6, 0x42c68442, 0x68b8d068, - 0x41c38241, 0x99b02999, 0x2d775a2d, 0x0f111e0f, - 0xb0cb7bb0, 0x54fca854, 0xbbd66dbb, 0x163a2c16, -}; -static const uint32_t TE3[256] = { - - 0x6363a5c6, 0x7c7c84f8, 0x777799ee, 0x7b7b8df6, - 0xf2f20dff, 0x6b6bbdd6, 0x6f6fb1de, 0xc5c55491, - 0x30305060, 0x01010302, 0x6767a9ce, 0x2b2b7d56, - 0xfefe19e7, 0xd7d762b5, 0xababe64d, 0x76769aec, - 0xcaca458f, 0x82829d1f, 0xc9c94089, 0x7d7d87fa, - 0xfafa15ef, 0x5959ebb2, 0x4747c98e, 0xf0f00bfb, - 0xadadec41, 0xd4d467b3, 0xa2a2fd5f, 0xafafea45, - 0x9c9cbf23, 0xa4a4f753, 0x727296e4, 0xc0c05b9b, - 0xb7b7c275, 0xfdfd1ce1, 0x9393ae3d, 0x26266a4c, - 0x36365a6c, 0x3f3f417e, 0xf7f702f5, 0xcccc4f83, - 0x34345c68, 0xa5a5f451, 0xe5e534d1, 0xf1f108f9, - 0x717193e2, 0xd8d873ab, 0x31315362, 0x15153f2a, - 0x04040c08, 0xc7c75295, 0x23236546, 0xc3c35e9d, - 0x18182830, 0x9696a137, 0x05050f0a, 0x9a9ab52f, - 0x0707090e, 0x12123624, 0x80809b1b, 0xe2e23ddf, - 0xebeb26cd, 0x2727694e, 0xb2b2cd7f, 0x75759fea, - 0x09091b12, 0x83839e1d, 0x2c2c7458, 0x1a1a2e34, - 0x1b1b2d36, 0x6e6eb2dc, 0x5a5aeeb4, 0xa0a0fb5b, - 0x5252f6a4, 0x3b3b4d76, 0xd6d661b7, 0xb3b3ce7d, - 0x29297b52, 0xe3e33edd, 0x2f2f715e, 0x84849713, - 0x5353f5a6, 0xd1d168b9, 0x00000000, 0xeded2cc1, - 0x20206040, 0xfcfc1fe3, 0xb1b1c879, 0x5b5bedb6, - 0x6a6abed4, 0xcbcb468d, 0xbebed967, 0x39394b72, - 0x4a4ade94, 0x4c4cd498, 0x5858e8b0, 0xcfcf4a85, - 0xd0d06bbb, 0xefef2ac5, 0xaaaae54f, 0xfbfb16ed, - 0x4343c586, 0x4d4dd79a, 0x33335566, 
0x85859411, - 0x4545cf8a, 0xf9f910e9, 0x02020604, 0x7f7f81fe, - 0x5050f0a0, 0x3c3c4478, 0x9f9fba25, 0xa8a8e34b, - 0x5151f3a2, 0xa3a3fe5d, 0x4040c080, 0x8f8f8a05, - 0x9292ad3f, 0x9d9dbc21, 0x38384870, 0xf5f504f1, - 0xbcbcdf63, 0xb6b6c177, 0xdada75af, 0x21216342, - 0x10103020, 0xffff1ae5, 0xf3f30efd, 0xd2d26dbf, - 0xcdcd4c81, 0x0c0c1418, 0x13133526, 0xecec2fc3, - 0x5f5fe1be, 0x9797a235, 0x4444cc88, 0x1717392e, - 0xc4c45793, 0xa7a7f255, 0x7e7e82fc, 0x3d3d477a, - 0x6464acc8, 0x5d5de7ba, 0x19192b32, 0x737395e6, - 0x6060a0c0, 0x81819819, 0x4f4fd19e, 0xdcdc7fa3, - 0x22226644, 0x2a2a7e54, 0x9090ab3b, 0x8888830b, - 0x4646ca8c, 0xeeee29c7, 0xb8b8d36b, 0x14143c28, - 0xdede79a7, 0x5e5ee2bc, 0x0b0b1d16, 0xdbdb76ad, - 0xe0e03bdb, 0x32325664, 0x3a3a4e74, 0x0a0a1e14, - 0x4949db92, 0x06060a0c, 0x24246c48, 0x5c5ce4b8, - 0xc2c25d9f, 0xd3d36ebd, 0xacacef43, 0x6262a6c4, - 0x9191a839, 0x9595a431, 0xe4e437d3, 0x79798bf2, - 0xe7e732d5, 0xc8c8438b, 0x3737596e, 0x6d6db7da, - 0x8d8d8c01, 0xd5d564b1, 0x4e4ed29c, 0xa9a9e049, - 0x6c6cb4d8, 0x5656faac, 0xf4f407f3, 0xeaea25cf, - 0x6565afca, 0x7a7a8ef4, 0xaeaee947, 0x08081810, - 0xbabad56f, 0x787888f0, 0x25256f4a, 0x2e2e725c, - 0x1c1c2438, 0xa6a6f157, 0xb4b4c773, 0xc6c65197, - 0xe8e823cb, 0xdddd7ca1, 0x74749ce8, 0x1f1f213e, - 0x4b4bdd96, 0xbdbddc61, 0x8b8b860d, 0x8a8a850f, - 0x707090e0, 0x3e3e427c, 0xb5b5c471, 0x6666aacc, - 0x4848d890, 0x03030506, 0xf6f601f7, 0x0e0e121c, - 0x6161a3c2, 0x35355f6a, 0x5757f9ae, 0xb9b9d069, - 0x86869117, 0xc1c15899, 0x1d1d273a, 0x9e9eb927, - 0xe1e138d9, 0xf8f813eb, 0x9898b32b, 0x11113322, - 0x6969bbd2, 0xd9d970a9, 0x8e8e8907, 0x9494a733, - 0x9b9bb62d, 0x1e1e223c, 0x87879215, 0xe9e920c9, - 0xcece4987, 0x5555ffaa, 0x28287850, 0xdfdf7aa5, - 0x8c8c8f03, 0xa1a1f859, 0x89898009, 0x0d0d171a, - 0xbfbfda65, 0xe6e631d7, 0x4242c684, 0x6868b8d0, - 0x4141c382, 0x9999b029, 0x2d2d775a, 0x0f0f111e, - 0xb0b0cb7b, 0x5454fca8, 0xbbbbd66d, 0x16163a2c, -}; - -#ifndef PELI_TAB -static const uint32_t Te4_0[] = { -0x00000063, 0x0000007c, 
0x00000077, 0x0000007b, 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5, -0x00000030, 0x00000001, 0x00000067, 0x0000002b, 0x000000fe, 0x000000d7, 0x000000ab, 0x00000076, -0x000000ca, 0x00000082, 0x000000c9, 0x0000007d, 0x000000fa, 0x00000059, 0x00000047, 0x000000f0, -0x000000ad, 0x000000d4, 0x000000a2, 0x000000af, 0x0000009c, 0x000000a4, 0x00000072, 0x000000c0, -0x000000b7, 0x000000fd, 0x00000093, 0x00000026, 0x00000036, 0x0000003f, 0x000000f7, 0x000000cc, -0x00000034, 0x000000a5, 0x000000e5, 0x000000f1, 0x00000071, 0x000000d8, 0x00000031, 0x00000015, -0x00000004, 0x000000c7, 0x00000023, 0x000000c3, 0x00000018, 0x00000096, 0x00000005, 0x0000009a, -0x00000007, 0x00000012, 0x00000080, 0x000000e2, 0x000000eb, 0x00000027, 0x000000b2, 0x00000075, -0x00000009, 0x00000083, 0x0000002c, 0x0000001a, 0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0, -0x00000052, 0x0000003b, 0x000000d6, 0x000000b3, 0x00000029, 0x000000e3, 0x0000002f, 0x00000084, -0x00000053, 0x000000d1, 0x00000000, 0x000000ed, 0x00000020, 0x000000fc, 0x000000b1, 0x0000005b, -0x0000006a, 0x000000cb, 0x000000be, 0x00000039, 0x0000004a, 0x0000004c, 0x00000058, 0x000000cf, -0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb, 0x00000043, 0x0000004d, 0x00000033, 0x00000085, -0x00000045, 0x000000f9, 0x00000002, 0x0000007f, 0x00000050, 0x0000003c, 0x0000009f, 0x000000a8, -0x00000051, 0x000000a3, 0x00000040, 0x0000008f, 0x00000092, 0x0000009d, 0x00000038, 0x000000f5, -0x000000bc, 0x000000b6, 0x000000da, 0x00000021, 0x00000010, 0x000000ff, 0x000000f3, 0x000000d2, -0x000000cd, 0x0000000c, 0x00000013, 0x000000ec, 0x0000005f, 0x00000097, 0x00000044, 0x00000017, -0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d, 0x00000064, 0x0000005d, 0x00000019, 0x00000073, -0x00000060, 0x00000081, 0x0000004f, 0x000000dc, 0x00000022, 0x0000002a, 0x00000090, 0x00000088, -0x00000046, 0x000000ee, 0x000000b8, 0x00000014, 0x000000de, 0x0000005e, 0x0000000b, 0x000000db, -0x000000e0, 0x00000032, 0x0000003a, 0x0000000a, 0x00000049, 0x00000006, 0x00000024, 
0x0000005c, -0x000000c2, 0x000000d3, 0x000000ac, 0x00000062, 0x00000091, 0x00000095, 0x000000e4, 0x00000079, -0x000000e7, 0x000000c8, 0x00000037, 0x0000006d, 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9, -0x0000006c, 0x00000056, 0x000000f4, 0x000000ea, 0x00000065, 0x0000007a, 0x000000ae, 0x00000008, -0x000000ba, 0x00000078, 0x00000025, 0x0000002e, 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6, -0x000000e8, 0x000000dd, 0x00000074, 0x0000001f, 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a, -0x00000070, 0x0000003e, 0x000000b5, 0x00000066, 0x00000048, 0x00000003, 0x000000f6, 0x0000000e, -0x00000061, 0x00000035, 0x00000057, 0x000000b9, 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e, -0x000000e1, 0x000000f8, 0x00000098, 0x00000011, 0x00000069, 0x000000d9, 0x0000008e, 0x00000094, -0x0000009b, 0x0000001e, 0x00000087, 0x000000e9, 0x000000ce, 0x00000055, 0x00000028, 0x000000df, -0x0000008c, 0x000000a1, 0x00000089, 0x0000000d, 0x000000bf, 0x000000e6, 0x00000042, 0x00000068, -0x00000041, 0x00000099, 0x0000002d, 0x0000000f, 0x000000b0, 0x00000054, 0x000000bb, 0x00000016 -}; - -static const uint32_t Te4_1[] = { -0x00006300, 0x00007c00, 0x00007700, 0x00007b00, 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500, -0x00003000, 0x00000100, 0x00006700, 0x00002b00, 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600, -0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00, 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000, -0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00, 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000, -0x0000b700, 0x0000fd00, 0x00009300, 0x00002600, 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00, -0x00003400, 0x0000a500, 0x0000e500, 0x0000f100, 0x00007100, 0x0000d800, 0x00003100, 0x00001500, -0x00000400, 0x0000c700, 0x00002300, 0x0000c300, 0x00001800, 0x00009600, 0x00000500, 0x00009a00, -0x00000700, 0x00001200, 0x00008000, 0x0000e200, 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500, -0x00000900, 0x00008300, 0x00002c00, 0x00001a00, 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000, 
-0x00005200, 0x00003b00, 0x0000d600, 0x0000b300, 0x00002900, 0x0000e300, 0x00002f00, 0x00008400, -0x00005300, 0x0000d100, 0x00000000, 0x0000ed00, 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00, -0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900, 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00, -0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00, 0x00004300, 0x00004d00, 0x00003300, 0x00008500, -0x00004500, 0x0000f900, 0x00000200, 0x00007f00, 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800, -0x00005100, 0x0000a300, 0x00004000, 0x00008f00, 0x00009200, 0x00009d00, 0x00003800, 0x0000f500, -0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100, 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200, -0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00, 0x00005f00, 0x00009700, 0x00004400, 0x00001700, -0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00, 0x00006400, 0x00005d00, 0x00001900, 0x00007300, -0x00006000, 0x00008100, 0x00004f00, 0x0000dc00, 0x00002200, 0x00002a00, 0x00009000, 0x00008800, -0x00004600, 0x0000ee00, 0x0000b800, 0x00001400, 0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00, -0x0000e000, 0x00003200, 0x00003a00, 0x00000a00, 0x00004900, 0x00000600, 0x00002400, 0x00005c00, -0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200, 0x00009100, 0x00009500, 0x0000e400, 0x00007900, -0x0000e700, 0x0000c800, 0x00003700, 0x00006d00, 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900, -0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00, 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800, -0x0000ba00, 0x00007800, 0x00002500, 0x00002e00, 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600, -0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00, 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00, -0x00007000, 0x00003e00, 0x0000b500, 0x00006600, 0x00004800, 0x00000300, 0x0000f600, 0x00000e00, -0x00006100, 0x00003500, 0x00005700, 0x0000b900, 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00, -0x0000e100, 0x0000f800, 0x00009800, 0x00001100, 0x00006900, 0x0000d900, 0x00008e00, 0x00009400, -0x00009b00, 0x00001e00, 0x00008700, 0x0000e900, 
0x0000ce00, 0x00005500, 0x00002800, 0x0000df00, -0x00008c00, 0x0000a100, 0x00008900, 0x00000d00, 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800, -0x00004100, 0x00009900, 0x00002d00, 0x00000f00, 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600 -}; - -static const uint32_t Te4_2[] = { -0x00630000, 0x007c0000, 0x00770000, 0x007b0000, 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000, -0x00300000, 0x00010000, 0x00670000, 0x002b0000, 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000, -0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000, 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000, -0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000, 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000, -0x00b70000, 0x00fd0000, 0x00930000, 0x00260000, 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000, -0x00340000, 0x00a50000, 0x00e50000, 0x00f10000, 0x00710000, 0x00d80000, 0x00310000, 0x00150000, -0x00040000, 0x00c70000, 0x00230000, 0x00c30000, 0x00180000, 0x00960000, 0x00050000, 0x009a0000, -0x00070000, 0x00120000, 0x00800000, 0x00e20000, 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000, -0x00090000, 0x00830000, 0x002c0000, 0x001a0000, 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000, -0x00520000, 0x003b0000, 0x00d60000, 0x00b30000, 0x00290000, 0x00e30000, 0x002f0000, 0x00840000, -0x00530000, 0x00d10000, 0x00000000, 0x00ed0000, 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000, -0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000, 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000, -0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000, 0x00430000, 0x004d0000, 0x00330000, 0x00850000, -0x00450000, 0x00f90000, 0x00020000, 0x007f0000, 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000, -0x00510000, 0x00a30000, 0x00400000, 0x008f0000, 0x00920000, 0x009d0000, 0x00380000, 0x00f50000, -0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000, 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000, -0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000, 0x005f0000, 0x00970000, 0x00440000, 0x00170000, -0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000, 0x00640000, 
0x005d0000, 0x00190000, 0x00730000, -0x00600000, 0x00810000, 0x004f0000, 0x00dc0000, 0x00220000, 0x002a0000, 0x00900000, 0x00880000, -0x00460000, 0x00ee0000, 0x00b80000, 0x00140000, 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000, -0x00e00000, 0x00320000, 0x003a0000, 0x000a0000, 0x00490000, 0x00060000, 0x00240000, 0x005c0000, -0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000, 0x00910000, 0x00950000, 0x00e40000, 0x00790000, -0x00e70000, 0x00c80000, 0x00370000, 0x006d0000, 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000, -0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000, 0x00650000, 0x007a0000, 0x00ae0000, 0x00080000, -0x00ba0000, 0x00780000, 0x00250000, 0x002e0000, 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000, -0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000, 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000, -0x00700000, 0x003e0000, 0x00b50000, 0x00660000, 0x00480000, 0x00030000, 0x00f60000, 0x000e0000, -0x00610000, 0x00350000, 0x00570000, 0x00b90000, 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000, -0x00e10000, 0x00f80000, 0x00980000, 0x00110000, 0x00690000, 0x00d90000, 0x008e0000, 0x00940000, -0x009b0000, 0x001e0000, 0x00870000, 0x00e90000, 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000, -0x008c0000, 0x00a10000, 0x00890000, 0x000d0000, 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000, -0x00410000, 0x00990000, 0x002d0000, 0x000f0000, 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000 -}; - -static const uint32_t Te4_3[] = { -0x63000000, 0x7c000000, 0x77000000, 0x7b000000, 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000, -0x30000000, 0x01000000, 0x67000000, 0x2b000000, 0xfe000000, 0xd7000000, 0xab000000, 0x76000000, -0xca000000, 0x82000000, 0xc9000000, 0x7d000000, 0xfa000000, 0x59000000, 0x47000000, 0xf0000000, -0xad000000, 0xd4000000, 0xa2000000, 0xaf000000, 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000, -0xb7000000, 0xfd000000, 0x93000000, 0x26000000, 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000, -0x34000000, 0xa5000000, 0xe5000000, 0xf1000000, 0x71000000, 0xd8000000, 
0x31000000, 0x15000000, -0x04000000, 0xc7000000, 0x23000000, 0xc3000000, 0x18000000, 0x96000000, 0x05000000, 0x9a000000, -0x07000000, 0x12000000, 0x80000000, 0xe2000000, 0xeb000000, 0x27000000, 0xb2000000, 0x75000000, -0x09000000, 0x83000000, 0x2c000000, 0x1a000000, 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000, -0x52000000, 0x3b000000, 0xd6000000, 0xb3000000, 0x29000000, 0xe3000000, 0x2f000000, 0x84000000, -0x53000000, 0xd1000000, 0x00000000, 0xed000000, 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000, -0x6a000000, 0xcb000000, 0xbe000000, 0x39000000, 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000, -0xd0000000, 0xef000000, 0xaa000000, 0xfb000000, 0x43000000, 0x4d000000, 0x33000000, 0x85000000, -0x45000000, 0xf9000000, 0x02000000, 0x7f000000, 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000, -0x51000000, 0xa3000000, 0x40000000, 0x8f000000, 0x92000000, 0x9d000000, 0x38000000, 0xf5000000, -0xbc000000, 0xb6000000, 0xda000000, 0x21000000, 0x10000000, 0xff000000, 0xf3000000, 0xd2000000, -0xcd000000, 0x0c000000, 0x13000000, 0xec000000, 0x5f000000, 0x97000000, 0x44000000, 0x17000000, -0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000, 0x64000000, 0x5d000000, 0x19000000, 0x73000000, -0x60000000, 0x81000000, 0x4f000000, 0xdc000000, 0x22000000, 0x2a000000, 0x90000000, 0x88000000, -0x46000000, 0xee000000, 0xb8000000, 0x14000000, 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000, -0xe0000000, 0x32000000, 0x3a000000, 0x0a000000, 0x49000000, 0x06000000, 0x24000000, 0x5c000000, -0xc2000000, 0xd3000000, 0xac000000, 0x62000000, 0x91000000, 0x95000000, 0xe4000000, 0x79000000, -0xe7000000, 0xc8000000, 0x37000000, 0x6d000000, 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000, -0x6c000000, 0x56000000, 0xf4000000, 0xea000000, 0x65000000, 0x7a000000, 0xae000000, 0x08000000, -0xba000000, 0x78000000, 0x25000000, 0x2e000000, 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000, -0xe8000000, 0xdd000000, 0x74000000, 0x1f000000, 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000, -0x70000000, 0x3e000000, 
0xb5000000, 0x66000000, 0x48000000, 0x03000000, 0xf6000000, 0x0e000000, -0x61000000, 0x35000000, 0x57000000, 0xb9000000, 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000, -0xe1000000, 0xf8000000, 0x98000000, 0x11000000, 0x69000000, 0xd9000000, 0x8e000000, 0x94000000, -0x9b000000, 0x1e000000, 0x87000000, 0xe9000000, 0xce000000, 0x55000000, 0x28000000, 0xdf000000, -0x8c000000, 0xa1000000, 0x89000000, 0x0d000000, 0xbf000000, 0xe6000000, 0x42000000, 0x68000000, -0x41000000, 0x99000000, 0x2d000000, 0x0f000000, 0xb0000000, 0x54000000, 0xbb000000, 0x16000000 -}; -#endif /* pelimac */ - -#ifndef ENCRYPT_ONLY - -static const uint32_t TD1[256] = { - 0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e, - 0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303, - 0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c, - 0xfc4fe5d7, 0xd7c52acb, 0x80263544, 0x8fb562a3, - 0x49deb15a, 0x6725ba1b, 0x9845ea0e, 0xe15dfec0, - 0x02c32f75, 0x12814cf0, 0xa38d4697, 0xc66bd3f9, - 0xe7038f5f, 0x9515929c, 0xebbf6d7a, 0xda955259, - 0x2dd4be83, 0xd3587421, 0x2949e069, 0x448ec9c8, - 0x6a75c289, 0x78f48e79, 0x6b99583e, 0xdd27b971, - 0xb6bee14f, 0x17f088ad, 0x66c920ac, 0xb47dce3a, - 0x1863df4a, 0x82e51a31, 0x60975133, 0x4562537f, - 0xe0b16477, 0x84bb6bae, 0x1cfe81a0, 0x94f9082b, - 0x58704868, 0x198f45fd, 0x8794de6c, 0xb7527bf8, - 0x23ab73d3, 0xe2724b02, 0x57e31f8f, 0x2a6655ab, - 0x07b2eb28, 0x032fb5c2, 0x9a86c57b, 0xa5d33708, - 0xf2302887, 0xb223bfa5, 0xba02036a, 0x5ced1682, - 0x2b8acf1c, 0x92a779b4, 0xf0f307f2, 0xa14e69e2, - 0xcd65daf4, 0xd50605be, 0x1fd13462, 0x8ac4a6fe, - 0x9d342e53, 0xa0a2f355, 0x32058ae1, 0x75a4f6eb, - 0x390b83ec, 0xaa4060ef, 0x065e719f, 0x51bd6e10, - 0xf93e218a, 0x3d96dd06, 0xaedd3e05, 0x464de6bd, - 0xb591548d, 0x0571c45d, 0x6f0406d4, 0xff605015, - 0x241998fb, 0x97d6bde9, 0xcc894043, 0x7767d99e, - 0xbdb0e842, 0x8807898b, 0x38e7195b, 0xdb79c8ee, - 0x47a17c0a, 0xe97c420f, 0xc9f8841e, 0x00000000, - 0x83098086, 0x48322bed, 0xac1e1170, 0x4e6c5a72, - 0xfbfd0eff, 0x560f8538, 0x1e3daed5, 0x27362d39, - 
0x640a0fd9, 0x21685ca6, 0xd19b5b54, 0x3a24362e, - 0xb10c0a67, 0x0f9357e7, 0xd2b4ee96, 0x9e1b9b91, - 0x4f80c0c5, 0xa261dc20, 0x695a774b, 0x161c121a, - 0x0ae293ba, 0xe5c0a02a, 0x433c22e0, 0x1d121b17, - 0x0b0e090d, 0xadf28bc7, 0xb92db6a8, 0xc8141ea9, - 0x8557f119, 0x4caf7507, 0xbbee99dd, 0xfda37f60, - 0x9ff70126, 0xbc5c72f5, 0xc544663b, 0x345bfb7e, - 0x768b4329, 0xdccb23c6, 0x68b6edfc, 0x63b8e4f1, - 0xcad731dc, 0x10426385, 0x40139722, 0x2084c611, - 0x7d854a24, 0xf8d2bb3d, 0x11aef932, 0x6dc729a1, - 0x4b1d9e2f, 0xf3dcb230, 0xec0d8652, 0xd077c1e3, - 0x6c2bb316, 0x99a970b9, 0xfa119448, 0x2247e964, - 0xc4a8fc8c, 0x1aa0f03f, 0xd8567d2c, 0xef223390, - 0xc787494e, 0xc1d938d1, 0xfe8ccaa2, 0x3698d40b, - 0xcfa6f581, 0x28a57ade, 0x26dab78e, 0xa43fadbf, - 0xe42c3a9d, 0x0d507892, 0x9b6a5fcc, 0x62547e46, - 0xc2f68d13, 0xe890d8b8, 0x5e2e39f7, 0xf582c3af, - 0xbe9f5d80, 0x7c69d093, 0xa96fd52d, 0xb3cf2512, - 0x3bc8ac99, 0xa710187d, 0x6ee89c63, 0x7bdb3bbb, - 0x09cd2678, 0xf46e5918, 0x01ec9ab7, 0xa8834f9a, - 0x65e6956e, 0x7eaaffe6, 0x0821bccf, 0xe6ef15e8, - 0xd9bae79b, 0xce4a6f36, 0xd4ea9f09, 0xd629b07c, - 0xaf31a4b2, 0x312a3f23, 0x30c6a594, 0xc035a266, - 0x37744ebc, 0xa6fc82ca, 0xb0e090d0, 0x1533a7d8, - 0x4af10498, 0xf741ecda, 0x0e7fcd50, 0x2f1791f6, - 0x8d764dd6, 0x4d43efb0, 0x54ccaa4d, 0xdfe49604, - 0xe39ed1b5, 0x1b4c6a88, 0xb8c12c1f, 0x7f466551, - 0x049d5eea, 0x5d018c35, 0x73fa8774, 0x2efb0b41, - 0x5ab3671d, 0x5292dbd2, 0x33e91056, 0x136dd647, - 0x8c9ad761, 0x7a37a10c, 0x8e59f814, 0x89eb133c, - 0xeecea927, 0x35b761c9, 0xede11ce5, 0x3c7a47b1, - 0x599cd2df, 0x3f55f273, 0x791814ce, 0xbf73c737, - 0xea53f7cd, 0x5b5ffdaa, 0x14df3d6f, 0x867844db, - 0x81caaff3, 0x3eb968c4, 0x2c382434, 0x5fc2a340, - 0x72161dc3, 0x0cbce225, 0x8b283c49, 0x41ff0d95, - 0x7139a801, 0xde080cb3, 0x9cd8b4e4, 0x906456c1, - 0x617bcb84, 0x70d532b6, 0x74486c5c, 0x42d0b857, -}; -static const uint32_t TD2[256] = { - 0xa75051f4, 0x65537e41, 0xa4c31a17, 0x5e963a27, - 0x6bcb3bab, 0x45f11f9d, 0x58abacfa, 0x03934be3, - 
0xfa552030, 0x6df6ad76, 0x769188cc, 0x4c25f502, - 0xd7fc4fe5, 0xcbd7c52a, 0x44802635, 0xa38fb562, - 0x5a49deb1, 0x1b6725ba, 0x0e9845ea, 0xc0e15dfe, - 0x7502c32f, 0xf012814c, 0x97a38d46, 0xf9c66bd3, - 0x5fe7038f, 0x9c951592, 0x7aebbf6d, 0x59da9552, - 0x832dd4be, 0x21d35874, 0x692949e0, 0xc8448ec9, - 0x896a75c2, 0x7978f48e, 0x3e6b9958, 0x71dd27b9, - 0x4fb6bee1, 0xad17f088, 0xac66c920, 0x3ab47dce, - 0x4a1863df, 0x3182e51a, 0x33609751, 0x7f456253, - 0x77e0b164, 0xae84bb6b, 0xa01cfe81, 0x2b94f908, - 0x68587048, 0xfd198f45, 0x6c8794de, 0xf8b7527b, - 0xd323ab73, 0x02e2724b, 0x8f57e31f, 0xab2a6655, - 0x2807b2eb, 0xc2032fb5, 0x7b9a86c5, 0x08a5d337, - 0x87f23028, 0xa5b223bf, 0x6aba0203, 0x825ced16, - 0x1c2b8acf, 0xb492a779, 0xf2f0f307, 0xe2a14e69, - 0xf4cd65da, 0xbed50605, 0x621fd134, 0xfe8ac4a6, - 0x539d342e, 0x55a0a2f3, 0xe132058a, 0xeb75a4f6, - 0xec390b83, 0xefaa4060, 0x9f065e71, 0x1051bd6e, - 0x8af93e21, 0x063d96dd, 0x05aedd3e, 0xbd464de6, - 0x8db59154, 0x5d0571c4, 0xd46f0406, 0x15ff6050, - 0xfb241998, 0xe997d6bd, 0x43cc8940, 0x9e7767d9, - 0x42bdb0e8, 0x8b880789, 0x5b38e719, 0xeedb79c8, - 0x0a47a17c, 0x0fe97c42, 0x1ec9f884, 0x00000000, - 0x86830980, 0xed48322b, 0x70ac1e11, 0x724e6c5a, - 0xfffbfd0e, 0x38560f85, 0xd51e3dae, 0x3927362d, - 0xd9640a0f, 0xa621685c, 0x54d19b5b, 0x2e3a2436, - 0x67b10c0a, 0xe70f9357, 0x96d2b4ee, 0x919e1b9b, - 0xc54f80c0, 0x20a261dc, 0x4b695a77, 0x1a161c12, - 0xba0ae293, 0x2ae5c0a0, 0xe0433c22, 0x171d121b, - 0x0d0b0e09, 0xc7adf28b, 0xa8b92db6, 0xa9c8141e, - 0x198557f1, 0x074caf75, 0xddbbee99, 0x60fda37f, - 0x269ff701, 0xf5bc5c72, 0x3bc54466, 0x7e345bfb, - 0x29768b43, 0xc6dccb23, 0xfc68b6ed, 0xf163b8e4, - 0xdccad731, 0x85104263, 0x22401397, 0x112084c6, - 0x247d854a, 0x3df8d2bb, 0x3211aef9, 0xa16dc729, - 0x2f4b1d9e, 0x30f3dcb2, 0x52ec0d86, 0xe3d077c1, - 0x166c2bb3, 0xb999a970, 0x48fa1194, 0x642247e9, - 0x8cc4a8fc, 0x3f1aa0f0, 0x2cd8567d, 0x90ef2233, - 0x4ec78749, 0xd1c1d938, 0xa2fe8cca, 0x0b3698d4, - 0x81cfa6f5, 0xde28a57a, 0x8e26dab7, 0xbfa43fad, - 
0x9de42c3a, 0x920d5078, 0xcc9b6a5f, 0x4662547e, - 0x13c2f68d, 0xb8e890d8, 0xf75e2e39, 0xaff582c3, - 0x80be9f5d, 0x937c69d0, 0x2da96fd5, 0x12b3cf25, - 0x993bc8ac, 0x7da71018, 0x636ee89c, 0xbb7bdb3b, - 0x7809cd26, 0x18f46e59, 0xb701ec9a, 0x9aa8834f, - 0x6e65e695, 0xe67eaaff, 0xcf0821bc, 0xe8e6ef15, - 0x9bd9bae7, 0x36ce4a6f, 0x09d4ea9f, 0x7cd629b0, - 0xb2af31a4, 0x23312a3f, 0x9430c6a5, 0x66c035a2, - 0xbc37744e, 0xcaa6fc82, 0xd0b0e090, 0xd81533a7, - 0x984af104, 0xdaf741ec, 0x500e7fcd, 0xf62f1791, - 0xd68d764d, 0xb04d43ef, 0x4d54ccaa, 0x04dfe496, - 0xb5e39ed1, 0x881b4c6a, 0x1fb8c12c, 0x517f4665, - 0xea049d5e, 0x355d018c, 0x7473fa87, 0x412efb0b, - 0x1d5ab367, 0xd25292db, 0x5633e910, 0x47136dd6, - 0x618c9ad7, 0x0c7a37a1, 0x148e59f8, 0x3c89eb13, - 0x27eecea9, 0xc935b761, 0xe5ede11c, 0xb13c7a47, - 0xdf599cd2, 0x733f55f2, 0xce791814, 0x37bf73c7, - 0xcdea53f7, 0xaa5b5ffd, 0x6f14df3d, 0xdb867844, - 0xf381caaf, 0xc43eb968, 0x342c3824, 0x405fc2a3, - 0xc372161d, 0x250cbce2, 0x498b283c, 0x9541ff0d, - 0x017139a8, 0xb3de080c, 0xe49cd8b4, 0xc1906456, - 0x84617bcb, 0xb670d532, 0x5c74486c, 0x5742d0b8, -}; -static const uint32_t TD3[256] = { - 0xf4a75051, 0x4165537e, 0x17a4c31a, 0x275e963a, - 0xab6bcb3b, 0x9d45f11f, 0xfa58abac, 0xe303934b, - 0x30fa5520, 0x766df6ad, 0xcc769188, 0x024c25f5, - 0xe5d7fc4f, 0x2acbd7c5, 0x35448026, 0x62a38fb5, - 0xb15a49de, 0xba1b6725, 0xea0e9845, 0xfec0e15d, - 0x2f7502c3, 0x4cf01281, 0x4697a38d, 0xd3f9c66b, - 0x8f5fe703, 0x929c9515, 0x6d7aebbf, 0x5259da95, - 0xbe832dd4, 0x7421d358, 0xe0692949, 0xc9c8448e, - 0xc2896a75, 0x8e7978f4, 0x583e6b99, 0xb971dd27, - 0xe14fb6be, 0x88ad17f0, 0x20ac66c9, 0xce3ab47d, - 0xdf4a1863, 0x1a3182e5, 0x51336097, 0x537f4562, - 0x6477e0b1, 0x6bae84bb, 0x81a01cfe, 0x082b94f9, - 0x48685870, 0x45fd198f, 0xde6c8794, 0x7bf8b752, - 0x73d323ab, 0x4b02e272, 0x1f8f57e3, 0x55ab2a66, - 0xeb2807b2, 0xb5c2032f, 0xc57b9a86, 0x3708a5d3, - 0x2887f230, 0xbfa5b223, 0x036aba02, 0x16825ced, - 0xcf1c2b8a, 0x79b492a7, 0x07f2f0f3, 0x69e2a14e, - 
0xdaf4cd65, 0x05bed506, 0x34621fd1, 0xa6fe8ac4, - 0x2e539d34, 0xf355a0a2, 0x8ae13205, 0xf6eb75a4, - 0x83ec390b, 0x60efaa40, 0x719f065e, 0x6e1051bd, - 0x218af93e, 0xdd063d96, 0x3e05aedd, 0xe6bd464d, - 0x548db591, 0xc45d0571, 0x06d46f04, 0x5015ff60, - 0x98fb2419, 0xbde997d6, 0x4043cc89, 0xd99e7767, - 0xe842bdb0, 0x898b8807, 0x195b38e7, 0xc8eedb79, - 0x7c0a47a1, 0x420fe97c, 0x841ec9f8, 0x00000000, - 0x80868309, 0x2bed4832, 0x1170ac1e, 0x5a724e6c, - 0x0efffbfd, 0x8538560f, 0xaed51e3d, 0x2d392736, - 0x0fd9640a, 0x5ca62168, 0x5b54d19b, 0x362e3a24, - 0x0a67b10c, 0x57e70f93, 0xee96d2b4, 0x9b919e1b, - 0xc0c54f80, 0xdc20a261, 0x774b695a, 0x121a161c, - 0x93ba0ae2, 0xa02ae5c0, 0x22e0433c, 0x1b171d12, - 0x090d0b0e, 0x8bc7adf2, 0xb6a8b92d, 0x1ea9c814, - 0xf1198557, 0x75074caf, 0x99ddbbee, 0x7f60fda3, - 0x01269ff7, 0x72f5bc5c, 0x663bc544, 0xfb7e345b, - 0x4329768b, 0x23c6dccb, 0xedfc68b6, 0xe4f163b8, - 0x31dccad7, 0x63851042, 0x97224013, 0xc6112084, - 0x4a247d85, 0xbb3df8d2, 0xf93211ae, 0x29a16dc7, - 0x9e2f4b1d, 0xb230f3dc, 0x8652ec0d, 0xc1e3d077, - 0xb3166c2b, 0x70b999a9, 0x9448fa11, 0xe9642247, - 0xfc8cc4a8, 0xf03f1aa0, 0x7d2cd856, 0x3390ef22, - 0x494ec787, 0x38d1c1d9, 0xcaa2fe8c, 0xd40b3698, - 0xf581cfa6, 0x7ade28a5, 0xb78e26da, 0xadbfa43f, - 0x3a9de42c, 0x78920d50, 0x5fcc9b6a, 0x7e466254, - 0x8d13c2f6, 0xd8b8e890, 0x39f75e2e, 0xc3aff582, - 0x5d80be9f, 0xd0937c69, 0xd52da96f, 0x2512b3cf, - 0xac993bc8, 0x187da710, 0x9c636ee8, 0x3bbb7bdb, - 0x267809cd, 0x5918f46e, 0x9ab701ec, 0x4f9aa883, - 0x956e65e6, 0xffe67eaa, 0xbccf0821, 0x15e8e6ef, - 0xe79bd9ba, 0x6f36ce4a, 0x9f09d4ea, 0xb07cd629, - 0xa4b2af31, 0x3f23312a, 0xa59430c6, 0xa266c035, - 0x4ebc3774, 0x82caa6fc, 0x90d0b0e0, 0xa7d81533, - 0x04984af1, 0xecdaf741, 0xcd500e7f, 0x91f62f17, - 0x4dd68d76, 0xefb04d43, 0xaa4d54cc, 0x9604dfe4, - 0xd1b5e39e, 0x6a881b4c, 0x2c1fb8c1, 0x65517f46, - 0x5eea049d, 0x8c355d01, 0x877473fa, 0x0b412efb, - 0x671d5ab3, 0xdbd25292, 0x105633e9, 0xd647136d, - 0xd7618c9a, 0xa10c7a37, 0xf8148e59, 0x133c89eb, - 
0xa927eece, 0x61c935b7, 0x1ce5ede1, 0x47b13c7a, - 0xd2df599c, 0xf2733f55, 0x14ce7918, 0xc737bf73, - 0xf7cdea53, 0xfdaa5b5f, 0x3d6f14df, 0x44db8678, - 0xaff381ca, 0x68c43eb9, 0x24342c38, 0xa3405fc2, - 0x1dc37216, 0xe2250cbc, 0x3c498b28, 0x0d9541ff, - 0xa8017139, 0x0cb3de08, 0xb4e49cd8, 0x56c19064, - 0xcb84617b, 0x32b670d5, 0x6c5c7448, 0xb85742d0, -}; - -static const uint32_t Tks0[] = { -0x00000000, 0x0e090d0b, 0x1c121a16, 0x121b171d, 0x3824342c, 0x362d3927, 0x24362e3a, 0x2a3f2331, -0x70486858, 0x7e416553, 0x6c5a724e, 0x62537f45, 0x486c5c74, 0x4665517f, 0x547e4662, 0x5a774b69, -0xe090d0b0, 0xee99ddbb, 0xfc82caa6, 0xf28bc7ad, 0xd8b4e49c, 0xd6bde997, 0xc4a6fe8a, 0xcaaff381, -0x90d8b8e8, 0x9ed1b5e3, 0x8ccaa2fe, 0x82c3aff5, 0xa8fc8cc4, 0xa6f581cf, 0xb4ee96d2, 0xbae79bd9, -0xdb3bbb7b, 0xd532b670, 0xc729a16d, 0xc920ac66, 0xe31f8f57, 0xed16825c, 0xff0d9541, 0xf104984a, -0xab73d323, 0xa57ade28, 0xb761c935, 0xb968c43e, 0x9357e70f, 0x9d5eea04, 0x8f45fd19, 0x814cf012, -0x3bab6bcb, 0x35a266c0, 0x27b971dd, 0x29b07cd6, 0x038f5fe7, 0x0d8652ec, 0x1f9d45f1, 0x119448fa, -0x4be30393, 0x45ea0e98, 0x57f11985, 0x59f8148e, 0x73c737bf, 0x7dce3ab4, 0x6fd52da9, 0x61dc20a2, -0xad766df6, 0xa37f60fd, 0xb16477e0, 0xbf6d7aeb, 0x955259da, 0x9b5b54d1, 0x894043cc, 0x87494ec7, -0xdd3e05ae, 0xd33708a5, 0xc12c1fb8, 0xcf2512b3, 0xe51a3182, 0xeb133c89, 0xf9082b94, 0xf701269f, -0x4de6bd46, 0x43efb04d, 0x51f4a750, 0x5ffdaa5b, 0x75c2896a, 0x7bcb8461, 0x69d0937c, 0x67d99e77, -0x3daed51e, 0x33a7d815, 0x21bccf08, 0x2fb5c203, 0x058ae132, 0x0b83ec39, 0x1998fb24, 0x1791f62f, -0x764dd68d, 0x7844db86, 0x6a5fcc9b, 0x6456c190, 0x4e69e2a1, 0x4060efaa, 0x527bf8b7, 0x5c72f5bc, -0x0605bed5, 0x080cb3de, 0x1a17a4c3, 0x141ea9c8, 0x3e218af9, 0x302887f2, 0x223390ef, 0x2c3a9de4, -0x96dd063d, 0x98d40b36, 0x8acf1c2b, 0x84c61120, 0xaef93211, 0xa0f03f1a, 0xb2eb2807, 0xbce2250c, -0xe6956e65, 0xe89c636e, 0xfa877473, 0xf48e7978, 0xdeb15a49, 0xd0b85742, 0xc2a3405f, 0xccaa4d54, -0x41ecdaf7, 0x4fe5d7fc, 0x5dfec0e1, 0x53f7cdea, 
0x79c8eedb, 0x77c1e3d0, 0x65daf4cd, 0x6bd3f9c6, -0x31a4b2af, 0x3fadbfa4, 0x2db6a8b9, 0x23bfa5b2, 0x09808683, 0x07898b88, 0x15929c95, 0x1b9b919e, -0xa17c0a47, 0xaf75074c, 0xbd6e1051, 0xb3671d5a, 0x99583e6b, 0x97513360, 0x854a247d, 0x8b432976, -0xd134621f, 0xdf3d6f14, 0xcd267809, 0xc32f7502, 0xe9105633, 0xe7195b38, 0xf5024c25, 0xfb0b412e, -0x9ad7618c, 0x94de6c87, 0x86c57b9a, 0x88cc7691, 0xa2f355a0, 0xacfa58ab, 0xbee14fb6, 0xb0e842bd, -0xea9f09d4, 0xe49604df, 0xf68d13c2, 0xf8841ec9, 0xd2bb3df8, 0xdcb230f3, 0xcea927ee, 0xc0a02ae5, -0x7a47b13c, 0x744ebc37, 0x6655ab2a, 0x685ca621, 0x42638510, 0x4c6a881b, 0x5e719f06, 0x5078920d, -0x0a0fd964, 0x0406d46f, 0x161dc372, 0x1814ce79, 0x322bed48, 0x3c22e043, 0x2e39f75e, 0x2030fa55, -0xec9ab701, 0xe293ba0a, 0xf088ad17, 0xfe81a01c, 0xd4be832d, 0xdab78e26, 0xc8ac993b, 0xc6a59430, -0x9cd2df59, 0x92dbd252, 0x80c0c54f, 0x8ec9c844, 0xa4f6eb75, 0xaaffe67e, 0xb8e4f163, 0xb6edfc68, -0x0c0a67b1, 0x02036aba, 0x10187da7, 0x1e1170ac, 0x342e539d, 0x3a275e96, 0x283c498b, 0x26354480, -0x7c420fe9, 0x724b02e2, 0x605015ff, 0x6e5918f4, 0x44663bc5, 0x4a6f36ce, 0x587421d3, 0x567d2cd8, -0x37a10c7a, 0x39a80171, 0x2bb3166c, 0x25ba1b67, 0x0f853856, 0x018c355d, 0x13972240, 0x1d9e2f4b, -0x47e96422, 0x49e06929, 0x5bfb7e34, 0x55f2733f, 0x7fcd500e, 0x71c45d05, 0x63df4a18, 0x6dd64713, -0xd731dcca, 0xd938d1c1, 0xcb23c6dc, 0xc52acbd7, 0xef15e8e6, 0xe11ce5ed, 0xf307f2f0, 0xfd0efffb, -0xa779b492, 0xa970b999, 0xbb6bae84, 0xb562a38f, 0x9f5d80be, 0x91548db5, 0x834f9aa8, 0x8d4697a3 -}; - -static const uint32_t Tks1[] = { -0x00000000, 0x0b0e090d, 0x161c121a, 0x1d121b17, 0x2c382434, 0x27362d39, 0x3a24362e, 0x312a3f23, -0x58704868, 0x537e4165, 0x4e6c5a72, 0x4562537f, 0x74486c5c, 0x7f466551, 0x62547e46, 0x695a774b, -0xb0e090d0, 0xbbee99dd, 0xa6fc82ca, 0xadf28bc7, 0x9cd8b4e4, 0x97d6bde9, 0x8ac4a6fe, 0x81caaff3, -0xe890d8b8, 0xe39ed1b5, 0xfe8ccaa2, 0xf582c3af, 0xc4a8fc8c, 0xcfa6f581, 0xd2b4ee96, 0xd9bae79b, -0x7bdb3bbb, 0x70d532b6, 0x6dc729a1, 0x66c920ac, 0x57e31f8f, 
0x5ced1682, 0x41ff0d95, 0x4af10498, -0x23ab73d3, 0x28a57ade, 0x35b761c9, 0x3eb968c4, 0x0f9357e7, 0x049d5eea, 0x198f45fd, 0x12814cf0, -0xcb3bab6b, 0xc035a266, 0xdd27b971, 0xd629b07c, 0xe7038f5f, 0xec0d8652, 0xf11f9d45, 0xfa119448, -0x934be303, 0x9845ea0e, 0x8557f119, 0x8e59f814, 0xbf73c737, 0xb47dce3a, 0xa96fd52d, 0xa261dc20, -0xf6ad766d, 0xfda37f60, 0xe0b16477, 0xebbf6d7a, 0xda955259, 0xd19b5b54, 0xcc894043, 0xc787494e, -0xaedd3e05, 0xa5d33708, 0xb8c12c1f, 0xb3cf2512, 0x82e51a31, 0x89eb133c, 0x94f9082b, 0x9ff70126, -0x464de6bd, 0x4d43efb0, 0x5051f4a7, 0x5b5ffdaa, 0x6a75c289, 0x617bcb84, 0x7c69d093, 0x7767d99e, -0x1e3daed5, 0x1533a7d8, 0x0821bccf, 0x032fb5c2, 0x32058ae1, 0x390b83ec, 0x241998fb, 0x2f1791f6, -0x8d764dd6, 0x867844db, 0x9b6a5fcc, 0x906456c1, 0xa14e69e2, 0xaa4060ef, 0xb7527bf8, 0xbc5c72f5, -0xd50605be, 0xde080cb3, 0xc31a17a4, 0xc8141ea9, 0xf93e218a, 0xf2302887, 0xef223390, 0xe42c3a9d, -0x3d96dd06, 0x3698d40b, 0x2b8acf1c, 0x2084c611, 0x11aef932, 0x1aa0f03f, 0x07b2eb28, 0x0cbce225, -0x65e6956e, 0x6ee89c63, 0x73fa8774, 0x78f48e79, 0x49deb15a, 0x42d0b857, 0x5fc2a340, 0x54ccaa4d, -0xf741ecda, 0xfc4fe5d7, 0xe15dfec0, 0xea53f7cd, 0xdb79c8ee, 0xd077c1e3, 0xcd65daf4, 0xc66bd3f9, -0xaf31a4b2, 0xa43fadbf, 0xb92db6a8, 0xb223bfa5, 0x83098086, 0x8807898b, 0x9515929c, 0x9e1b9b91, -0x47a17c0a, 0x4caf7507, 0x51bd6e10, 0x5ab3671d, 0x6b99583e, 0x60975133, 0x7d854a24, 0x768b4329, -0x1fd13462, 0x14df3d6f, 0x09cd2678, 0x02c32f75, 0x33e91056, 0x38e7195b, 0x25f5024c, 0x2efb0b41, -0x8c9ad761, 0x8794de6c, 0x9a86c57b, 0x9188cc76, 0xa0a2f355, 0xabacfa58, 0xb6bee14f, 0xbdb0e842, -0xd4ea9f09, 0xdfe49604, 0xc2f68d13, 0xc9f8841e, 0xf8d2bb3d, 0xf3dcb230, 0xeecea927, 0xe5c0a02a, -0x3c7a47b1, 0x37744ebc, 0x2a6655ab, 0x21685ca6, 0x10426385, 0x1b4c6a88, 0x065e719f, 0x0d507892, -0x640a0fd9, 0x6f0406d4, 0x72161dc3, 0x791814ce, 0x48322bed, 0x433c22e0, 0x5e2e39f7, 0x552030fa, -0x01ec9ab7, 0x0ae293ba, 0x17f088ad, 0x1cfe81a0, 0x2dd4be83, 0x26dab78e, 0x3bc8ac99, 0x30c6a594, -0x599cd2df, 
0x5292dbd2, 0x4f80c0c5, 0x448ec9c8, 0x75a4f6eb, 0x7eaaffe6, 0x63b8e4f1, 0x68b6edfc, -0xb10c0a67, 0xba02036a, 0xa710187d, 0xac1e1170, 0x9d342e53, 0x963a275e, 0x8b283c49, 0x80263544, -0xe97c420f, 0xe2724b02, 0xff605015, 0xf46e5918, 0xc544663b, 0xce4a6f36, 0xd3587421, 0xd8567d2c, -0x7a37a10c, 0x7139a801, 0x6c2bb316, 0x6725ba1b, 0x560f8538, 0x5d018c35, 0x40139722, 0x4b1d9e2f, -0x2247e964, 0x2949e069, 0x345bfb7e, 0x3f55f273, 0x0e7fcd50, 0x0571c45d, 0x1863df4a, 0x136dd647, -0xcad731dc, 0xc1d938d1, 0xdccb23c6, 0xd7c52acb, 0xe6ef15e8, 0xede11ce5, 0xf0f307f2, 0xfbfd0eff, -0x92a779b4, 0x99a970b9, 0x84bb6bae, 0x8fb562a3, 0xbe9f5d80, 0xb591548d, 0xa8834f9a, 0xa38d4697 -}; - -static const uint32_t Tks2[] = { -0x00000000, 0x0d0b0e09, 0x1a161c12, 0x171d121b, 0x342c3824, 0x3927362d, 0x2e3a2436, 0x23312a3f, -0x68587048, 0x65537e41, 0x724e6c5a, 0x7f456253, 0x5c74486c, 0x517f4665, 0x4662547e, 0x4b695a77, -0xd0b0e090, 0xddbbee99, 0xcaa6fc82, 0xc7adf28b, 0xe49cd8b4, 0xe997d6bd, 0xfe8ac4a6, 0xf381caaf, -0xb8e890d8, 0xb5e39ed1, 0xa2fe8cca, 0xaff582c3, 0x8cc4a8fc, 0x81cfa6f5, 0x96d2b4ee, 0x9bd9bae7, -0xbb7bdb3b, 0xb670d532, 0xa16dc729, 0xac66c920, 0x8f57e31f, 0x825ced16, 0x9541ff0d, 0x984af104, -0xd323ab73, 0xde28a57a, 0xc935b761, 0xc43eb968, 0xe70f9357, 0xea049d5e, 0xfd198f45, 0xf012814c, -0x6bcb3bab, 0x66c035a2, 0x71dd27b9, 0x7cd629b0, 0x5fe7038f, 0x52ec0d86, 0x45f11f9d, 0x48fa1194, -0x03934be3, 0x0e9845ea, 0x198557f1, 0x148e59f8, 0x37bf73c7, 0x3ab47dce, 0x2da96fd5, 0x20a261dc, -0x6df6ad76, 0x60fda37f, 0x77e0b164, 0x7aebbf6d, 0x59da9552, 0x54d19b5b, 0x43cc8940, 0x4ec78749, -0x05aedd3e, 0x08a5d337, 0x1fb8c12c, 0x12b3cf25, 0x3182e51a, 0x3c89eb13, 0x2b94f908, 0x269ff701, -0xbd464de6, 0xb04d43ef, 0xa75051f4, 0xaa5b5ffd, 0x896a75c2, 0x84617bcb, 0x937c69d0, 0x9e7767d9, -0xd51e3dae, 0xd81533a7, 0xcf0821bc, 0xc2032fb5, 0xe132058a, 0xec390b83, 0xfb241998, 0xf62f1791, -0xd68d764d, 0xdb867844, 0xcc9b6a5f, 0xc1906456, 0xe2a14e69, 0xefaa4060, 0xf8b7527b, 0xf5bc5c72, -0xbed50605, 0xb3de080c, 
0xa4c31a17, 0xa9c8141e, 0x8af93e21, 0x87f23028, 0x90ef2233, 0x9de42c3a, -0x063d96dd, 0x0b3698d4, 0x1c2b8acf, 0x112084c6, 0x3211aef9, 0x3f1aa0f0, 0x2807b2eb, 0x250cbce2, -0x6e65e695, 0x636ee89c, 0x7473fa87, 0x7978f48e, 0x5a49deb1, 0x5742d0b8, 0x405fc2a3, 0x4d54ccaa, -0xdaf741ec, 0xd7fc4fe5, 0xc0e15dfe, 0xcdea53f7, 0xeedb79c8, 0xe3d077c1, 0xf4cd65da, 0xf9c66bd3, -0xb2af31a4, 0xbfa43fad, 0xa8b92db6, 0xa5b223bf, 0x86830980, 0x8b880789, 0x9c951592, 0x919e1b9b, -0x0a47a17c, 0x074caf75, 0x1051bd6e, 0x1d5ab367, 0x3e6b9958, 0x33609751, 0x247d854a, 0x29768b43, -0x621fd134, 0x6f14df3d, 0x7809cd26, 0x7502c32f, 0x5633e910, 0x5b38e719, 0x4c25f502, 0x412efb0b, -0x618c9ad7, 0x6c8794de, 0x7b9a86c5, 0x769188cc, 0x55a0a2f3, 0x58abacfa, 0x4fb6bee1, 0x42bdb0e8, -0x09d4ea9f, 0x04dfe496, 0x13c2f68d, 0x1ec9f884, 0x3df8d2bb, 0x30f3dcb2, 0x27eecea9, 0x2ae5c0a0, -0xb13c7a47, 0xbc37744e, 0xab2a6655, 0xa621685c, 0x85104263, 0x881b4c6a, 0x9f065e71, 0x920d5078, -0xd9640a0f, 0xd46f0406, 0xc372161d, 0xce791814, 0xed48322b, 0xe0433c22, 0xf75e2e39, 0xfa552030, -0xb701ec9a, 0xba0ae293, 0xad17f088, 0xa01cfe81, 0x832dd4be, 0x8e26dab7, 0x993bc8ac, 0x9430c6a5, -0xdf599cd2, 0xd25292db, 0xc54f80c0, 0xc8448ec9, 0xeb75a4f6, 0xe67eaaff, 0xf163b8e4, 0xfc68b6ed, -0x67b10c0a, 0x6aba0203, 0x7da71018, 0x70ac1e11, 0x539d342e, 0x5e963a27, 0x498b283c, 0x44802635, -0x0fe97c42, 0x02e2724b, 0x15ff6050, 0x18f46e59, 0x3bc54466, 0x36ce4a6f, 0x21d35874, 0x2cd8567d, -0x0c7a37a1, 0x017139a8, 0x166c2bb3, 0x1b6725ba, 0x38560f85, 0x355d018c, 0x22401397, 0x2f4b1d9e, -0x642247e9, 0x692949e0, 0x7e345bfb, 0x733f55f2, 0x500e7fcd, 0x5d0571c4, 0x4a1863df, 0x47136dd6, -0xdccad731, 0xd1c1d938, 0xc6dccb23, 0xcbd7c52a, 0xe8e6ef15, 0xe5ede11c, 0xf2f0f307, 0xfffbfd0e, -0xb492a779, 0xb999a970, 0xae84bb6b, 0xa38fb562, 0x80be9f5d, 0x8db59154, 0x9aa8834f, 0x97a38d46 -}; - -static const uint32_t Tks3[] = { -0x00000000, 0x090d0b0e, 0x121a161c, 0x1b171d12, 0x24342c38, 0x2d392736, 0x362e3a24, 0x3f23312a, -0x48685870, 0x4165537e, 0x5a724e6c, 
0x537f4562, 0x6c5c7448, 0x65517f46, 0x7e466254, 0x774b695a, -0x90d0b0e0, 0x99ddbbee, 0x82caa6fc, 0x8bc7adf2, 0xb4e49cd8, 0xbde997d6, 0xa6fe8ac4, 0xaff381ca, -0xd8b8e890, 0xd1b5e39e, 0xcaa2fe8c, 0xc3aff582, 0xfc8cc4a8, 0xf581cfa6, 0xee96d2b4, 0xe79bd9ba, -0x3bbb7bdb, 0x32b670d5, 0x29a16dc7, 0x20ac66c9, 0x1f8f57e3, 0x16825ced, 0x0d9541ff, 0x04984af1, -0x73d323ab, 0x7ade28a5, 0x61c935b7, 0x68c43eb9, 0x57e70f93, 0x5eea049d, 0x45fd198f, 0x4cf01281, -0xab6bcb3b, 0xa266c035, 0xb971dd27, 0xb07cd629, 0x8f5fe703, 0x8652ec0d, 0x9d45f11f, 0x9448fa11, -0xe303934b, 0xea0e9845, 0xf1198557, 0xf8148e59, 0xc737bf73, 0xce3ab47d, 0xd52da96f, 0xdc20a261, -0x766df6ad, 0x7f60fda3, 0x6477e0b1, 0x6d7aebbf, 0x5259da95, 0x5b54d19b, 0x4043cc89, 0x494ec787, -0x3e05aedd, 0x3708a5d3, 0x2c1fb8c1, 0x2512b3cf, 0x1a3182e5, 0x133c89eb, 0x082b94f9, 0x01269ff7, -0xe6bd464d, 0xefb04d43, 0xf4a75051, 0xfdaa5b5f, 0xc2896a75, 0xcb84617b, 0xd0937c69, 0xd99e7767, -0xaed51e3d, 0xa7d81533, 0xbccf0821, 0xb5c2032f, 0x8ae13205, 0x83ec390b, 0x98fb2419, 0x91f62f17, -0x4dd68d76, 0x44db8678, 0x5fcc9b6a, 0x56c19064, 0x69e2a14e, 0x60efaa40, 0x7bf8b752, 0x72f5bc5c, -0x05bed506, 0x0cb3de08, 0x17a4c31a, 0x1ea9c814, 0x218af93e, 0x2887f230, 0x3390ef22, 0x3a9de42c, -0xdd063d96, 0xd40b3698, 0xcf1c2b8a, 0xc6112084, 0xf93211ae, 0xf03f1aa0, 0xeb2807b2, 0xe2250cbc, -0x956e65e6, 0x9c636ee8, 0x877473fa, 0x8e7978f4, 0xb15a49de, 0xb85742d0, 0xa3405fc2, 0xaa4d54cc, -0xecdaf741, 0xe5d7fc4f, 0xfec0e15d, 0xf7cdea53, 0xc8eedb79, 0xc1e3d077, 0xdaf4cd65, 0xd3f9c66b, -0xa4b2af31, 0xadbfa43f, 0xb6a8b92d, 0xbfa5b223, 0x80868309, 0x898b8807, 0x929c9515, 0x9b919e1b, -0x7c0a47a1, 0x75074caf, 0x6e1051bd, 0x671d5ab3, 0x583e6b99, 0x51336097, 0x4a247d85, 0x4329768b, -0x34621fd1, 0x3d6f14df, 0x267809cd, 0x2f7502c3, 0x105633e9, 0x195b38e7, 0x024c25f5, 0x0b412efb, -0xd7618c9a, 0xde6c8794, 0xc57b9a86, 0xcc769188, 0xf355a0a2, 0xfa58abac, 0xe14fb6be, 0xe842bdb0, -0x9f09d4ea, 0x9604dfe4, 0x8d13c2f6, 0x841ec9f8, 0xbb3df8d2, 0xb230f3dc, 0xa927eece, 0xa02ae5c0, 
-0x47b13c7a, 0x4ebc3774, 0x55ab2a66, 0x5ca62168, 0x63851042, 0x6a881b4c, 0x719f065e, 0x78920d50, -0x0fd9640a, 0x06d46f04, 0x1dc37216, 0x14ce7918, 0x2bed4832, 0x22e0433c, 0x39f75e2e, 0x30fa5520, -0x9ab701ec, 0x93ba0ae2, 0x88ad17f0, 0x81a01cfe, 0xbe832dd4, 0xb78e26da, 0xac993bc8, 0xa59430c6, -0xd2df599c, 0xdbd25292, 0xc0c54f80, 0xc9c8448e, 0xf6eb75a4, 0xffe67eaa, 0xe4f163b8, 0xedfc68b6, -0x0a67b10c, 0x036aba02, 0x187da710, 0x1170ac1e, 0x2e539d34, 0x275e963a, 0x3c498b28, 0x35448026, -0x420fe97c, 0x4b02e272, 0x5015ff60, 0x5918f46e, 0x663bc544, 0x6f36ce4a, 0x7421d358, 0x7d2cd856, -0xa10c7a37, 0xa8017139, 0xb3166c2b, 0xba1b6725, 0x8538560f, 0x8c355d01, 0x97224013, 0x9e2f4b1d, -0xe9642247, 0xe0692949, 0xfb7e345b, 0xf2733f55, 0xcd500e7f, 0xc45d0571, 0xdf4a1863, 0xd647136d, -0x31dccad7, 0x38d1c1d9, 0x23c6dccb, 0x2acbd7c5, 0x15e8e6ef, 0x1ce5ede1, 0x07f2f0f3, 0x0efffbfd, -0x79b492a7, 0x70b999a9, 0x6bae84bb, 0x62a38fb5, 0x5d80be9f, 0x548db591, 0x4f9aa883, 0x4697a38d -}; - -#endif /* ENCRYPT_ONLY */ - -#endif /* SMALL CODE */ - -static const uint32_t rcon[] = { - 0x01000000, 0x02000000, 0x04000000, 0x08000000, - 0x10000000, 0x20000000, 0x40000000, 0x80000000, - 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ -}; diff --git a/osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c b/osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c deleted file mode 100644 index 0772f6861..000000000 --- a/osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c +++ /dev/null @@ -1,421 +0,0 @@ -/* - * ccaes_ltc_ecb_encrypt_mode.c - * corecrypto - * - * Created on 12/12/2010 - * - * Copyright (c) 2010,2011,2015 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Parts of this code adapted from LibTomCrypt - * - * LibTomCrypt, modular cryptographic library -- Tom St Denis - * - * LibTomCrypt is a library that provides various cryptographic - * algorithms in a highly modular and flexible manner. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@gmail.com, http://libtom.org - */ - - -#include -#include - -typedef struct ltc_rijndael_key { - uint32_t eK[60], dK[60]; - int Nr; -} ltc_rijndael_keysched; - -#include "aes_tab.c" - -static uint32_t setup_mix(uint32_t temp) -{ - return (Te4_3[cc_byte(temp, 2)]) ^ - (Te4_2[cc_byte(temp, 1)]) ^ - (Te4_1[cc_byte(temp, 0)]) ^ - (Te4_0[cc_byte(temp, 3)]); -} - -/*! - Initialize the AES (Rijndael) block cipher - @param key The symmetric key you wish to pass - @param keylen The key length in bytes - @param num_rounds The number of rounds desired (0 for default) - @param skey The key in as scheduled by this function. 
- @return CRYPT_OK if successful - */ -static int ccaes_ltc_init(const unsigned char *key, int keylen, int num_rounds, - ccecb_ctx *skey) -{ - int i, j; - uint32_t temp, *rk; -#ifndef ENCRYPT_ONLY - uint32_t *rrk; -#endif - ltc_rijndael_keysched *rijndael; - - rijndael = (ltc_rijndael_keysched *)skey; - - if (keylen != 16 && keylen != 24 && keylen != 32) { - return -1; //CRYPT_INVALID_KEYSIZE; - } - - if (num_rounds != 0 && num_rounds != (10 + ((keylen/8)-2)*2)) { - return -1; //CRYPT_INVALID_ROUNDS; - } - - rijndael->Nr = 10 + ((keylen/8)-2)*2; - - /* setup the forward key */ - i = 0; - rk = rijndael->eK; - CC_LOAD32_BE(rk[0], key ); - CC_LOAD32_BE(rk[1], key + 4); - CC_LOAD32_BE(rk[2], key + 8); - CC_LOAD32_BE(rk[3], key + 12); - if (keylen == 16) { - j = 44; - for (;;) { - temp = rk[3]; - rk[4] = rk[0] ^ setup_mix(temp) ^ rcon[i]; - rk[5] = rk[1] ^ rk[4]; - rk[6] = rk[2] ^ rk[5]; - rk[7] = rk[3] ^ rk[6]; - if (++i == 10) { - break; - } - rk += 4; - } - } else if (keylen == 24) { - j = 52; - CC_LOAD32_BE(rk[4], key + 16); - CC_LOAD32_BE(rk[5], key + 20); - for (;;) { -#ifdef _MSC_VER - temp = rijndael->eK[rk - rijndael->eK + 5]; -#else - temp = rk[5]; -#endif - rk[ 6] = rk[ 0] ^ setup_mix(temp) ^ rcon[i]; - rk[ 7] = rk[ 1] ^ rk[ 6]; - rk[ 8] = rk[ 2] ^ rk[ 7]; - rk[ 9] = rk[ 3] ^ rk[ 8]; - if (++i == 8) { - break; - } - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; - rk += 6; - } - } else if (keylen == 32) { - j = 60; - CC_LOAD32_BE(rk[4], key + 16); - CC_LOAD32_BE(rk[5], key + 20); - CC_LOAD32_BE(rk[6], key + 24); - CC_LOAD32_BE(rk[7], key + 28); - for (;;) { -#ifdef _MSC_VER - temp = rijndael->eK[rk - rijndael->eK + 7]; -#else - temp = rk[7]; -#endif - rk[ 8] = rk[ 0] ^ setup_mix(temp) ^ rcon[i]; - rk[ 9] = rk[ 1] ^ rk[ 8]; - rk[10] = rk[ 2] ^ rk[ 9]; - rk[11] = rk[ 3] ^ rk[10]; - if (++i == 7) { - break; - } - temp = rk[11]; - rk[12] = rk[ 4] ^ setup_mix(CC_RORc(temp, 8)); - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ 
rk[14]; - rk += 8; - } - } else { - /* this can't happen */ - return -1; //CRYPT_ERROR; - } - -#ifndef ENCRYPT_ONLY - /* setup the inverse key now */ - rk = rijndael->dK; - rrk = rijndael->eK + j - 4; - - /* apply the inverse MixColumn transform to all round keys but the first and the last: */ - /* copy first */ - *rk++ = *rrk++; - *rk++ = *rrk++; - *rk++ = *rrk++; - *rk = *rrk; - rk -= 3; rrk -= 3; - - for (i = 1; i < rijndael->Nr; i++) { - rrk -= 4; - rk += 4; -#ifdef LTC_SMALL_CODE - temp = rrk[0]; - rk[0] = setup_mix2(temp); - temp = rrk[1]; - rk[1] = setup_mix2(temp); - temp = rrk[2]; - rk[2] = setup_mix2(temp); - temp = rrk[3]; - rk[3] = setup_mix2(temp); -#else - temp = rrk[0]; - rk[0] = - Tks0[cc_byte(temp, 3)] ^ - Tks1[cc_byte(temp, 2)] ^ - Tks2[cc_byte(temp, 1)] ^ - Tks3[cc_byte(temp, 0)]; - temp = rrk[1]; - rk[1] = - Tks0[cc_byte(temp, 3)] ^ - Tks1[cc_byte(temp, 2)] ^ - Tks2[cc_byte(temp, 1)] ^ - Tks3[cc_byte(temp, 0)]; - temp = rrk[2]; - rk[2] = - Tks0[cc_byte(temp, 3)] ^ - Tks1[cc_byte(temp, 2)] ^ - Tks2[cc_byte(temp, 1)] ^ - Tks3[cc_byte(temp, 0)]; - temp = rrk[3]; - rk[3] = - Tks0[cc_byte(temp, 3)] ^ - Tks1[cc_byte(temp, 2)] ^ - Tks2[cc_byte(temp, 1)] ^ - Tks3[cc_byte(temp, 0)]; -#endif - - } - - /* copy last */ - rrk -= 4; - rk += 4; - *rk++ = *rrk++; - *rk++ = *rrk++; - *rk++ = *rrk++; - *rk = *rrk; -#endif /* ENCRYPT_ONLY */ - - return 0; //CRYPT_OK; -} - -static int ccaes_ecb_encrypt_init(const struct ccmode_ecb *ecb CC_UNUSED, ccecb_ctx *key, - size_t rawkey_len, const void *rawkey) { - return ccaes_ltc_init(rawkey, (int)rawkey_len, 0, key); -} - -static void ccaes_ltc_ecb_encrypt(const ccecb_ctx *skey, const unsigned char *pt, - unsigned char *ct) -{ - uint32_t s0, s1, s2, s3, t0, t1, t2, t3; - const uint32_t *rk; - int Nr, r; - const ltc_rijndael_keysched *rijndael; - - rijndael = (const ltc_rijndael_keysched *)skey; - - Nr = rijndael->Nr; - rk = rijndael->eK; - - /* - * map byte array block to cipher state - * and add initial round key: - */ 
- CC_LOAD32_BE(s0, pt ); s0 ^= rk[0]; - CC_LOAD32_BE(s1, pt + 4); s1 ^= rk[1]; - CC_LOAD32_BE(s2, pt + 8); s2 ^= rk[2]; - CC_LOAD32_BE(s3, pt + 12); s3 ^= rk[3]; - -#ifdef LTC_SMALL_CODE - - for (r = 0; ; r++) { - rk += 4; - t0 = - Te0(cc_byte(s0, 3)) ^ - Te1(cc_byte(s1, 2)) ^ - Te2(cc_byte(s2, 1)) ^ - Te3(cc_byte(s3, 0)) ^ - rk[0]; - t1 = - Te0(cc_byte(s1, 3)) ^ - Te1(cc_byte(s2, 2)) ^ - Te2(cc_byte(s3, 1)) ^ - Te3(cc_byte(s0, 0)) ^ - rk[1]; - t2 = - Te0(cc_byte(s2, 3)) ^ - Te1(cc_byte(s3, 2)) ^ - Te2(cc_byte(s0, 1)) ^ - Te3(cc_byte(s1, 0)) ^ - rk[2]; - t3 = - Te0(cc_byte(s3, 3)) ^ - Te1(cc_byte(s0, 2)) ^ - Te2(cc_byte(s1, 1)) ^ - Te3(cc_byte(s2, 0)) ^ - rk[3]; - if (r == Nr-2) { - break; - } - s0 = t0; s1 = t1; s2 = t2; s3 = t3; - } - rk += 4; - -#else - - /* - * Nr - 1 full rounds: - */ - r = Nr >> 1; - for (;;) { - t0 = - Te0(cc_byte(s0, 3)) ^ - Te1(cc_byte(s1, 2)) ^ - Te2(cc_byte(s2, 1)) ^ - Te3(cc_byte(s3, 0)) ^ - rk[4]; - t1 = - Te0(cc_byte(s1, 3)) ^ - Te1(cc_byte(s2, 2)) ^ - Te2(cc_byte(s3, 1)) ^ - Te3(cc_byte(s0, 0)) ^ - rk[5]; - t2 = - Te0(cc_byte(s2, 3)) ^ - Te1(cc_byte(s3, 2)) ^ - Te2(cc_byte(s0, 1)) ^ - Te3(cc_byte(s1, 0)) ^ - rk[6]; - t3 = - Te0(cc_byte(s3, 3)) ^ - Te1(cc_byte(s0, 2)) ^ - Te2(cc_byte(s1, 1)) ^ - Te3(cc_byte(s2, 0)) ^ - rk[7]; - - rk += 8; - if (--r == 0) { - break; - } - - s0 = - Te0(cc_byte(t0, 3)) ^ - Te1(cc_byte(t1, 2)) ^ - Te2(cc_byte(t2, 1)) ^ - Te3(cc_byte(t3, 0)) ^ - rk[0]; - s1 = - Te0(cc_byte(t1, 3)) ^ - Te1(cc_byte(t2, 2)) ^ - Te2(cc_byte(t3, 1)) ^ - Te3(cc_byte(t0, 0)) ^ - rk[1]; - s2 = - Te0(cc_byte(t2, 3)) ^ - Te1(cc_byte(t3, 2)) ^ - Te2(cc_byte(t0, 1)) ^ - Te3(cc_byte(t1, 0)) ^ - rk[2]; - s3 = - Te0(cc_byte(t3, 3)) ^ - Te1(cc_byte(t0, 2)) ^ - Te2(cc_byte(t1, 1)) ^ - Te3(cc_byte(t2, 0)) ^ - rk[3]; - } - -#endif - - /* - * apply last round and - * map cipher state to byte array block: - */ - s0 = - (Te4_3[cc_byte(t0, 3)]) ^ - (Te4_2[cc_byte(t1, 2)]) ^ - (Te4_1[cc_byte(t2, 1)]) ^ - (Te4_0[cc_byte(t3, 0)]) ^ - rk[0]; - 
CC_STORE32_BE(s0, ct); - s1 = - (Te4_3[cc_byte(t1, 3)]) ^ - (Te4_2[cc_byte(t2, 2)]) ^ - (Te4_1[cc_byte(t3, 1)]) ^ - (Te4_0[cc_byte(t0, 0)]) ^ - rk[1]; - CC_STORE32_BE(s1, ct+4); - s2 = - (Te4_3[cc_byte(t2, 3)]) ^ - (Te4_2[cc_byte(t3, 2)]) ^ - (Te4_1[cc_byte(t0, 1)]) ^ - (Te4_0[cc_byte(t1, 0)]) ^ - rk[2]; - CC_STORE32_BE(s2, ct+8); - s3 = - (Te4_3[cc_byte(t3, 3)]) ^ - (Te4_2[cc_byte(t0, 2)]) ^ - (Te4_1[cc_byte(t1, 1)]) ^ - (Te4_0[cc_byte(t2, 0)]) ^ - rk[3]; - CC_STORE32_BE(s3, ct+12); -} - -static int ccaes_ecb_encrypt(const ccecb_ctx *key, size_t nblocks, - const void *in, void *out) { - if (nblocks) { - const unsigned char *p = in; - unsigned char *c = out; - for (;;) { - ccaes_ltc_ecb_encrypt(key, p, c); - if (--nblocks) { - p += CCAES_BLOCK_SIZE; - c += CCAES_BLOCK_SIZE; - } else { - break; - } - } - } - - return 0; -} - -const struct ccmode_ecb ccaes_ltc_ecb_encrypt_mode = { - .size = sizeof(ltc_rijndael_keysched), - .block_size = CCAES_BLOCK_SIZE, - .init = ccaes_ecb_encrypt_init, - .ecb = ccaes_ecb_encrypt, -}; diff --git a/osfmk/corecrypto/cchmac/src/cchmac_final.c b/osfmk/corecrypto/cchmac/src/cchmac_final.c index a7bfb84c0..cbd7db453 100644 --- a/osfmk/corecrypto/cchmac/src/cchmac_final.c +++ b/osfmk/corecrypto/cchmac/src/cchmac_final.c @@ -38,10 +38,16 @@ void cchmac_final(const struct ccdigest_info *di, cchmac_ctx_t hc, unsigned char *mac) { + + // Finalize the inner state of the data being HMAC'd, i.e., H((key \oplus ipad) || m) ccdigest_final(di, cchmac_digest_ctx(di, hc), cchmac_data(di, hc)); - /* typecast: output size will alwys fit in an unsigned int */ - cchmac_num(di, hc) = (unsigned int)di->output_size; + + // Set the HMAC output size based on the digest algorithm + cchmac_num(di, hc) = (unsigned int)di->output_size; /* typecast: output size will alwys fit in an unsigned int */ cchmac_nbits(di, hc) = di->block_size * 8; + + // Copy the pre-computed compress(key \oplus opad) back to digest state, + // and then run through the digest once more to 
finish the HMAC ccdigest_copy_state(di, cchmac_istate32(di, hc), cchmac_ostate32(di, hc)); ccdigest_final(di, cchmac_digest_ctx(di, hc), mac); } diff --git a/osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c b/osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c deleted file mode 100644 index 3efce7dfd..000000000 --- a/osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * ccmode_ctr_crypt.c - * corecrypto - * - * Created on 12/17/2010 - * - * Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include "ccmode_internal.h" - -int ccmode_ctr_crypt(ccctr_ctx *key, - size_t nbytes, const void *in, void *out) { - const struct ccmode_ecb *ecb = CCMODE_CTR_KEY_ECB(key); - const ccecb_ctx *ecb_key = CCMODE_CTR_KEY_ECB_KEY(key); - uint8_t *ctr = (uint8_t *)CCMODE_CTR_KEY_CTR(key); - uint8_t *pad = (uint8_t *)CCMODE_CTR_KEY_PAD(key); - size_t pad_offset = CCMODE_CTR_KEY_PAD_OFFSET(key); - const uint8_t *in_bytes = in; - // Counter is 64bit wide for cipher with block size of 64bit or more - // This is to match the assembly - const size_t counter_size=(CC_MIN(ecb->block_size,(typeof(ecb->block_size))8)); - uint8_t *out_bytes = out; - size_t n; - - while (nbytes) { - if (pad_offset == ecb->block_size) { - ecb->ecb(ecb_key, 1, ctr, pad); - pad_offset = 0; - - /* increment the big endian counter */ - inc_uint(ctr + ecb->block_size - counter_size, counter_size); - - if (nbytes==0) break; - } - - n = CC_MIN(nbytes, ecb->block_size - pad_offset); - cc_xor(n, out_bytes, in_bytes, pad + pad_offset); - nbytes -= n; - in_bytes += n; - out_bytes += n; - pad_offset += n; - } - CCMODE_CTR_KEY_PAD_OFFSET(key) = pad_offset; - - return 0; -} diff --git a/osfmk/corecrypto/ccmode/src/ccmode_internal.h b/osfmk/corecrypto/ccmode/src/ccmode_internal.h deleted file mode 100644 index 0f7f0c617..000000000 --- a/osfmk/corecrypto/ccmode/src/ccmode_internal.h +++ /dev/null @@ -1,297 +0,0 @@ -/* - * ccmode_internal.h - * corecrypto - * - * Created on 12/12/2010 - * - * Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved. - * - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _CORECRYPTO_CCMODE_INTERNAL_H_ -#define _CORECRYPTO_CCMODE_INTERNAL_H_ - -#include -#include -#include -#include - -#define CCMODE_INVALID_INPUT -1 -#define CCMODE_INVALID_CALL_SEQUENCE -2 -#define CCMODE_INTEGRITY_FAILURE -3 -#define CCMODE_NOT_SUPPORTED -4 -#define CCMODE_INTERNAL_ERROR -5 - -// VNG speed up for GCM's AES encrypton and finite fileld multiplication -#if \ -((CCAES_INTEL_ASM && defined(__x86_64__)) || (CCAES_ARM_ASM && defined(__ARM_NEON__))) -#define CCMODE_GCM_VNG_SPEEDUP 1 -#else -#define CCMODE_GCM_VNG_SPEEDUP 0 -#endif - - -#define CCMODE_GCM_USE_GF_LOOKUP_TABLES 1 - -/* Helper function used. TODO: Probably not specific to xts, since - gcm uses it too */ -void ccmode_xts_mult_alpha(cc_unit *tweak); - -/* Macros for accessing a CCMODE_CBC_KEY. 
- { - const struct ccmode_ecb *ecb - ccn_unit ecb_key[ecb->n] - } */ -#define _CCMODE_CBC_KEY(K) ((struct _ccmode_cbc_key *)(K)) -#define _CCMODE_CBC_KEY_CONST(K) ((const struct _ccmode_cbc_key *)(K)) -#define CCMODE_CBC_KEY_ECB(K) (_CCMODE_CBC_KEY(K)->ecb) -#define CCMODE_CBC_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CBC_KEY(K)->u[0]) - -CC_CONST CC_INLINE -const struct ccmode_ecb * ccmode_cbc_key_ecb(const cccbc_ctx *K) { - return ((const struct _ccmode_cbc_key *)K)->ecb; -} - -CC_CONST CC_INLINE -const ccecb_ctx * ccmode_cbc_key_ecb_key(const cccbc_ctx *K) { - return (const ccecb_ctx *)&((const struct _ccmode_cbc_key *)K)->u[0]; -} - -/* Macros for accessing a CCMODE_CFB_KEY. -{ - const struct ccmode_ecb *ecb - cc_size pad_len; - ccn_unit pad[ecb->block_size / CCN_UNIT_SIZE]; - ccn_unit iv[ecb->block_size / CCN_UNIT_SIZE]; - ccn_unit ecb_key[ecb->n] -} */ -#define _CCMODE_CFB_KEY(K) ((struct _ccmode_cfb_key *)(K)) -#define CCMODE_CFB_KEY_ECB(K) (_CCMODE_CFB_KEY(K)->ecb) -#define CCMODE_CFB_KEY_PAD_LEN(K) (_CCMODE_CFB_KEY(K)->pad_len) -#define CCMODE_CFB_KEY_PAD(K) (&_CCMODE_CFB_KEY(K)->u[0]) -#define CCMODE_CFB_KEY_IV(K) (&_CCMODE_CFB_KEY(K)->u[ccn_nof_size(CCMODE_CFB_KEY_ECB(K)->block_size)]) -#define CCMODE_CFB_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CFB_KEY(K)->u[2 * ccn_nof_size(CCMODE_CFB_KEY_ECB(K)->block_size)]) - -/* Macros for accessing a CCMODE_CFB8_KEY. 
-{ - const struct ccmode_ecb *ecb - ccn_unit pad[ecb->block_size / CCN_UNIT_SIZE]; - ccn_unit iv[ecb->block_size / CCN_UNIT_SIZE]; - ccn_unit ecb_key[ecb->n] -} */ -#define _CCMODE_CFB8_KEY(K) ((struct _ccmode_cfb8_key *)(K)) -#define CCMODE_CFB8_KEY_ECB(K) (_CCMODE_CFB8_KEY(K)->ecb) -#define CCMODE_CFB8_KEY_PAD(K) (&_CCMODE_CFB8_KEY(K)->u[0]) -#define CCMODE_CFB8_KEY_IV(K) (&_CCMODE_CFB8_KEY(K)->u[ccn_nof_size(CCMODE_CFB8_KEY_ECB(K)->block_size)]) -#define CCMODE_CFB8_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CFB8_KEY(K)->u[2 * ccn_nof_size(CCMODE_CFB8_KEY_ECB(K)->block_size)]) - - -/* Macros for accessing a CCMODE_CTR_KEY. -{ - const struct ccmode_ecb *ecb - cc_size pad_offset; - ccn_unit pad[ecb->block_size / CCN_UNIT_SIZE]; - ccn_unit ctr[ecb->block_size / CCN_UNIT_SIZE]; - ccn_unit ecb_key[ecb->n] -} */ -#define _CCMODE_CTR_KEY(K) ((struct _ccmode_ctr_key *)(K)) -#define CCMODE_CTR_KEY_ECB(K) (_CCMODE_CTR_KEY(K)->ecb) -#define CCMODE_CTR_KEY_PAD_OFFSET(K) (_CCMODE_CTR_KEY(K)->pad_offset) -#define CCMODE_CTR_KEY_PAD(K) (&_CCMODE_CTR_KEY(K)->u[0]) -#define CCMODE_CTR_KEY_CTR(K) (&_CCMODE_CTR_KEY(K)->u[ccn_nof_size(CCMODE_CTR_KEY_ECB(K)->block_size)]) -#define CCMODE_CTR_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CTR_KEY(K)->u[2 * ccn_nof_size(CCMODE_CTR_KEY_ECB(K)->block_size)]) - -CC_INLINE int ccctr_setctr(const struct ccmode_ctr *mode, ccctr_ctx *ctx, const void *ctr) -{ - return mode->setctr(mode, ctx, ctr); -} - -/* Macros for accessing a CCMODE_OFB_KEY. 
-{ - const struct ccmode_ecb *ecb - cc_size pad_len; - ccn_unit iv[ecb->block_size / CCN_UNIT_SIZE]; - ccn_unit ecb_key[ecb->n] -} */ -#define _CCMODE_OFB_KEY(K) ((struct _ccmode_ofb_key *)(K)) -#define CCMODE_OFB_KEY_ECB(K) (_CCMODE_OFB_KEY(K)->ecb) -#define CCMODE_OFB_KEY_PAD_LEN(K) (_CCMODE_OFB_KEY(K)->pad_len) -#define CCMODE_OFB_KEY_IV(K) (&_CCMODE_OFB_KEY(K)->u[0]) -#define CCMODE_OFB_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_OFB_KEY(K)->u[ccn_nof_size(CCMODE_OFB_KEY_ECB(K)->block_size)]) - - -/* Macros for accessing a CCMODE_XTS_KEY. -{ - const struct ccmode_ecb *ecb - const struct ccmode_ecb *ecb_encrypt - ccn_unit data_key[ecb->size] - ccn_unit tweak_key[ecb_encrypt->size] -} */ -#define _CCMODE_XTS_KEY(K) ((struct _ccmode_xts_key *)(K)) -#define CCMODE_XTS_KEY_ECB(K) (_CCMODE_XTS_KEY(K)->ecb) -#define CCMODE_XTS_KEY_ECB_ENCRYPT(K) (_CCMODE_XTS_KEY(K)->ecb_encrypt) -#define CCMODE_XTS_KEY_DATA_KEY(K) ((ccecb_ctx *)&_CCMODE_XTS_KEY(K)->u[0]) -#define CCMODE_XTS_KEY_TWEAK_KEY(K) ((ccecb_ctx *)&_CCMODE_XTS_KEY(K)->u[ccn_nof_size(CCMODE_XTS_KEY_ECB(K)->size)]) - -CC_CONST CC_INLINE -const struct ccmode_ecb * ccmode_xts_key_ecb(const ccxts_ctx *K) { - return ((const struct _ccmode_xts_key *)K)->ecb; -} - -CC_CONST CC_INLINE -const struct ccmode_ecb * ccmode_xts_key_ecb_encrypt(const ccxts_ctx *K) { - return ((const struct _ccmode_xts_key *)K)->ecb_encrypt; -} - -CC_CONST CC_INLINE -const ccecb_ctx * ccmode_xts_key_data_key(const ccxts_ctx *K) { - return (const ccecb_ctx *)&((const struct _ccmode_xts_key *)K)->u[0]; -} - -CC_CONST CC_INLINE -const ccecb_ctx * ccmode_xts_key_tweak_key(const ccxts_ctx *K) { - return (const ccecb_ctx *)&((const struct _ccmode_xts_key *)K)->u[ccn_nof_size(ccmode_xts_key_ecb(K)->size)]; -} - -/* Macros for accessing a CCMODE_XTS_TWEAK. 
-{ - size_t blocks_processed; - uint8_t value[16]; -} */ -#define _CCMODE_XTS_TWEAK(T) ((struct _ccmode_xts_tweak *)(T)) -#define CCMODE_XTS_TWEAK_BLOCK_PROCESSED(T)(_CCMODE_XTS_TWEAK(T)->blocks_processed) -#define CCMODE_XTS_TWEAK_VALUE(T) (_CCMODE_XTS_TWEAK(T)->u) - - -/* Macros for accessing a CCMODE_GCM_KEY. - Common to the generic (factory) and the VNG implementation -*/ - -#define _CCMODE_GCM_KEY(K) ((struct _ccmode_gcm_key *)(K)) -#define CCMODE_GCM_KEY_H(K) (_CCMODE_GCM_KEY(K)->H) -#define CCMODE_GCM_KEY_X(K) (_CCMODE_GCM_KEY(K)->X) -#define CCMODE_GCM_KEY_Y(K) (_CCMODE_GCM_KEY(K)->Y) -#define CCMODE_GCM_KEY_Y_0(K) (_CCMODE_GCM_KEY(K)->Y_0) -#define CCMODE_GCM_KEY_PAD_LEN(K) (_CCMODE_GCM_KEY(K)->buf_nbytes) -#define CCMODE_GCM_KEY_PAD(K) (_CCMODE_GCM_KEY(K)->buf) - -#define _CCMODE_GCM_ECB_MODE(K) ((struct _ccmode_gcm_key *)(K)) -#define CCMODE_GCM_KEY_ECB(K) (_CCMODE_GCM_ECB_MODE(K)->ecb) -#define CCMODE_GCM_KEY_ECB_KEY(K) ((ccecb_ctx *)_CCMODE_GCM_ECB_MODE(K)->ecb_key) // set in init function - -#define CCMODE_GCM_STATE_IV 1 -#define CCMODE_GCM_STATE_AAD 2 -#define CCMODE_GCM_STATE_TEXT 3 -#define CCMODE_GCM_STATE_FINAL 4 - -#define CCMODE_STATE_INIT 2 //first call to init -#define CCMODE_STATE_IV_START 3 //first call to set_iv - -// rdar://problem/23523093 -//this allows users to bypass set_iv(). -//this is a temporary setting mainly to allow Security framework to adapt -//ccgcm_set_iv_legacy() and check the tack on decyption without -//need to change the Security twice -//#define CCMODE_STATE_IV_CONT 2 //subsequent calls to set_iv -#define CCMODE_STATE_IV_CONT CCMODE_STATE_IV_START - -#define CCMODE_STATE_AAD 4 -#define CCMODE_STATE_TEXT 5 - -#define CCMODE_CCM_STATE_IV 1 - -void ccmode_gcm_gf_mult(const unsigned char *a, const unsigned char *b, - unsigned char *c); -void ccmode_gcm_mult_h(ccgcm_ctx *key, unsigned char *I); - -/* Macros for accessing a CCMODE_CCM_KEY. 
*/ -#define _CCMODE_CCM_KEY(K) ((struct _ccmode_ccm_key *)(K)) -#define CCMODE_CCM_KEY_ECB(K) (_CCMODE_CCM_KEY(K)->ecb) -#define CCMODE_CCM_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CCM_KEY(K)->u[0]) - -#define _CCMODE_CCM_NONCE(N) ((struct _ccmode_ccm_nonce *)(N)) -#define CCMODE_CCM_KEY_MAC(N) (_CCMODE_CCM_NONCE(N)->MAC) -#define CCMODE_CCM_KEY_A_I(N) (_CCMODE_CCM_NONCE(N)->A_i) -#define CCMODE_CCM_KEY_B_I(N) (_CCMODE_CCM_NONCE(N)->B_i) -#define CCMODE_CCM_KEY_PAD_LEN(N) (_CCMODE_CCM_NONCE(N)->buflen) -#define CCMODE_CCM_KEY_PAD(N) (_CCMODE_CCM_NONCE(N)->buf) -#define CCMODE_CCM_KEY_MAC_LEN(N) (_CCMODE_CCM_NONCE(N)->mac_size) -#define CCMODE_CCM_KEY_NONCE_LEN(N) (_CCMODE_CCM_NONCE(N)->nonce_size) -#define CCMODE_CCM_KEY_AUTH_LEN(N) (_CCMODE_CCM_NONCE(N)->b_i_len) - -/* Macros for accessing a CCMODE_OMAC_KEY. -{ - const struct ccmode_ecb *ecb - cc_size tweak_size; - ccn_unit ecb_key1[ecb->n] - ccn_unit ecb_key2[ecb->n] -} */ -#define _CCMODE_OMAC_KEY(K) ((struct _ccmode_omac_key *)(K)) -#define CCMODE_OMAC_KEY_ECB(K) (_CCMODE_OMAC_KEY(K)->ecb) -#define CCMODE_OMAC_KEY_TWEAK_LEN(K) (_CCMODE_OMAC_KEY(K)->tweak_len) -#define CCMODE_OMAC_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_OMAC_KEY(K)->u[0]) - -CC_INLINE void inc_uint(uint8_t *buf, size_t nbytes) -{ - size_t i; - for (i = 0; i < nbytes; i += 1) { - if (++buf[nbytes-1-i] & 255) { break; } - } -} - -CC_INLINE void ccmode_gcm_update_pad(ccgcm_ctx *key) -{ - inc_uint(CCMODE_GCM_KEY_Y(key) + 12, 4); - CCMODE_GCM_KEY_ECB(key)->ecb(CCMODE_GCM_KEY_ECB_KEY(key), 1, - CCMODE_GCM_KEY_Y(key), - CCMODE_GCM_KEY_PAD(key)); -} - -CC_INLINE void ccmode_gcm_aad_finalize(ccgcm_ctx *key) -{ - if (_CCMODE_GCM_KEY(key)->state == CCMODE_GCM_STATE_AAD) { - if (_CCMODE_GCM_KEY(key)->aad_nbytes % CCGCM_BLOCK_NBYTES > 0) { - ccmode_gcm_mult_h(key, CCMODE_GCM_KEY_X(key)); - } - _CCMODE_GCM_KEY(key)->state = CCMODE_GCM_STATE_TEXT; - } -} - -CC_INLINE void xor_128bits(unsigned char *r, const unsigned char *a, const unsigned char *b) -{ - cc_unit *r1 
= (cc_unit *)r; - const cc_unit *a1 = (const cc_unit *)a; - const cc_unit *b1 = (const cc_unit *)b; - - for (int i=0; i<128/(CCN_UNIT_SIZE*8); i++) { - r1[i] = a1[i] ^ b1[i]; - } -} - - - -#endif /* _CORECRYPTO_CCMODE_INTERNAL_H_ */ diff --git a/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c b/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c index 8bec7daf7..be6acaa1a 100644 --- a/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c +++ b/osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c @@ -34,6 +34,7 @@ #include #include +#include "ccdigest_internal.h" /* This can be used for SHA1, SHA256 and SHA224 */ void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t ctx, diff --git a/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h b/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h new file mode 100644 index 000000000..bc3921e2e --- /dev/null +++ b/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h @@ -0,0 +1,47 @@ +/* + * ccdigest_internal.h + * corecrypto + * + * Created on 12/20/2017 + * + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _CORECRYPTO_CCDIGEST_INTERNAL_H_ +#define _CORECRYPTO_CCDIGEST_INTERNAL_H_ + +#include + +void ccdigest_final_common(const struct ccdigest_info *di, + ccdigest_ctx_t ctx, void *digest); +void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t, + unsigned char *digest); +void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t, + unsigned char *digest); + +#endif /* _CORECRYPTO_CCDIGEST_INTERNAL_H_ */ diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c b/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c index 3e945ad8c..a28e38f9f 100644 --- a/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c +++ b/osfmk/corecrypto/ccsha1/src/ccsha1_eay.c @@ -81,8 +81,10 @@ #include +#include "ccsha1_internal.h" #include #include +#include "ccdigest_internal.h" #ifndef SHA_LONG_LOG2 diff --git a/osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c b/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h similarity index 70% rename from osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c rename to osfmk/corecrypto/ccsha1/src/ccsha1_internal.h index 00e3ca6c1..323bbb2cd 100644 --- a/osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c +++ b/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h @@ -1,10 +1,10 @@ /* - * ccmode_ctr_init.c + * ccsha1_internal.h * corecrypto * - * Created on 12/17/2010 + * Created on 12/19/2017 * - * Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved. + * Copyright (c) 2017 Apple Inc. All rights reserved. 
* * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ @@ -32,18 +32,17 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include "ccmode_internal.h" +#ifndef _CORECRYPTO_CCSHA1_INTERNAL_H_ +#define _CORECRYPTO_CCSHA1_INTERNAL_H_ -int ccmode_ctr_init(const struct ccmode_ctr *ctr, ccctr_ctx *key, - size_t rawkey_len, const void *rawkey, - const void *iv) { - int rc; - const struct ccmode_ecb *ecb = ctr->custom; - CCMODE_CTR_KEY_ECB(key) = ecb; +#include +#include - rc = ecb->init(ecb, CCMODE_CTR_KEY_ECB_KEY(key), rawkey_len, rawkey); - - ccctr_setctr(ctr, key, iv); +extern const uint32_t ccsha1_initial_state[5]; - return rc; -} +#if CCSHA1_VNG_INTEL && defined(__x86_64__) +extern const struct ccdigest_info ccsha1_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha1_vng_intel_AVX1_di; +#endif + +#endif /* _CORECRYPTO_CCSHA1_INTERNAL_H_ */ diff --git a/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h b/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h new file mode 100644 index 000000000..bc3921e2e --- /dev/null +++ b/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h @@ -0,0 +1,47 @@ +/* + * ccdigest_internal.h + * corecrypto + * + * Created on 12/20/2017 + * + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _CORECRYPTO_CCDIGEST_INTERNAL_H_ +#define _CORECRYPTO_CCDIGEST_INTERNAL_H_ + +#include + +void ccdigest_final_common(const struct ccdigest_info *di, + ccdigest_ctx_t ctx, void *digest); +void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t, + unsigned char *digest); +void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t, + unsigned char *digest); + +#endif /* _CORECRYPTO_CCDIGEST_INTERNAL_H_ */ diff --git a/osfmk/corecrypto/ccsha2/src/ccsha256_di.c b/osfmk/corecrypto/ccsha2/src/ccsha256_di.c index c0b031a0d..c702b9736 100644 --- a/osfmk/corecrypto/ccsha2/src/ccsha256_di.c +++ b/osfmk/corecrypto/ccsha2/src/ccsha256_di.c @@ -33,6 +33,7 @@ */ #include +#include "ccsha2_internal.h" #include #include "corecrypto/fipspost_trace.h" @@ -43,8 +44,11 @@ const struct ccdigest_info *ccsha256_di(void) #if CCSHA2_VNG_INTEL #if defined (__x86_64__) - return ( (CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di : - ( (CC_HAS_AVX1() ? &ccsha256_vng_intel_AVX1_di : + if (CC_HAS_AVX512_AND_IN_KERNEL()) + return &ccsha256_vng_intel_SupplementalSSE3_di; + else + return ( (CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di : + ( (CC_HAS_AVX1() ? 
&ccsha256_vng_intel_AVX1_di : &ccsha256_vng_intel_SupplementalSSE3_di ) ) ) ) ; #else return &ccsha256_vng_intel_SupplementalSSE3_di; diff --git a/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c b/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c index b9ff54b87..fb301b446 100644 --- a/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c +++ b/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c @@ -50,6 +50,8 @@ #include #include "ccsha2_internal.h" +#if !CC_KERNEL || !CC_USE_ASM + // Various logical functions #define Ch(x,y,z) (z ^ (x & (y ^ z))) #define Maj(x,y,z) (((x | y) & z) | (x & y)) @@ -91,7 +93,7 @@ d += t0; \ h = t0 + t1; -// compress 512-bits +// compress 512-bits void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *in) { uint32_t W[64], t0, t1; @@ -136,7 +138,7 @@ void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *i RND(S2,S3,S4,S5,S6,S7,S0,S1,i+6); RND(S1,S2,S3,S4,S5,S6,S7,S0,i+7); } - + // feedback s[0] += S0; s[1] += S1; @@ -150,3 +152,5 @@ void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *i buf+=CCSHA256_BLOCK_SIZE/sizeof(buf[0]); } } + +#endif diff --git a/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c b/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c index 1e4109b60..7b9aef1ce 100644 --- a/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c +++ b/osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c @@ -34,8 +34,11 @@ #include #include +#include "ccdigest_internal.h" #include "ccsha2_internal.h" +#if !CC_KERNEL || !CC_USE_ASM + const struct ccdigest_info ccsha256_ltc_di = { .output_size = CCSHA256_OUTPUT_SIZE, .state_size = CCSHA256_STATE_SIZE, @@ -46,3 +49,5 @@ const struct ccdigest_info ccsha256_ltc_di = { .compress = ccsha256_ltc_compress, .final = ccdigest_final_64be, }; + +#endif diff --git a/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h b/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h index 14fd2d4fb..7bf64bc04 100644 --- a/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h 
+++ b/osfmk/corecrypto/ccsha2/src/ccsha2_internal.h @@ -2,9 +2,9 @@ * ccsha2_internal.h * corecrypto * - * Created on 12/07/2010 + * Created on 12/19/2017 * - * Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved. + * Copyright (c) 2017 Apple Inc. All rights reserved. * * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ @@ -37,8 +37,10 @@ #include -void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *buf); +extern const struct ccdigest_info ccsha256_v6m_di; void ccsha256_v6m_compress(ccdigest_state_t state, size_t nblocks, const void *buf); + +void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *buf); void ccsha512_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *in); #if CCSHA2_VNG_INTEL @@ -49,10 +51,31 @@ void ccsha256_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, c void ccsha512_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in); void ccsha512_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in); void ccsha512_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in); + +extern const struct ccdigest_info ccsha224_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha224_vng_intel_AVX1_di; +extern const struct ccdigest_info ccsha256_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha256_vng_intel_AVX1_di; +extern const struct ccdigest_info ccsha384_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha384_vng_intel_AVX1_di; +extern const struct ccdigest_info ccsha384_vng_intel_SupplementalSSE3_di; +extern const struct ccdigest_info ccsha512_vng_intel_AVX2_di; +extern const struct ccdigest_info ccsha512_vng_intel_AVX1_di; +extern const struct ccdigest_info ccsha512_vng_intel_SupplementalSSE3_di; #endif void ccsha256_vng_intel_sse3_compress(ccdigest_state_t state, size_t nblocks, const void *in); #endif +#if CCSHA2_VNG_ARMV7NEON +extern const struct ccdigest_info 
ccsha384_vng_arm64_di; +extern const struct ccdigest_info ccsha384_vng_armv7neon_di; +extern const struct ccdigest_info ccsha512_vng_arm64_di; +extern const struct ccdigest_info ccsha512_vng_armv7neon_di; +#endif + +extern const uint32_t ccsha256_K[64]; +extern const uint64_t ccsha512_K[80]; + void ccsha512_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest); extern const uint32_t ccsha224_initial_state[8]; @@ -60,4 +83,5 @@ extern const uint32_t ccsha256_initial_state[8]; extern const uint64_t ccsha384_initial_state[8]; extern const uint64_t ccsha512_initial_state[8]; + #endif /* _CORECRYPTO_CCSHA2_INTERNAL_H_ */ diff --git a/osfmk/corpses/corpse.c b/osfmk/corpses/corpse.c index b2078a91a..2941e2125 100644 --- a/osfmk/corpses/corpse.c +++ b/osfmk/corpses/corpse.c @@ -369,9 +369,11 @@ task_purge_all_corpses(void) * Final cleanup: * + no unnesting * + remove immutable mappings + * + allow gaps in the range */ (VM_MAP_REMOVE_NO_UNNESTING | - VM_MAP_REMOVE_IMMUTABLE)); + VM_MAP_REMOVE_IMMUTABLE | + VM_MAP_REMOVE_GAPS_OK)); } lck_mtx_unlock(&tasks_corpse_lock); @@ -413,7 +415,9 @@ task_generate_corpse( if (kr != KERN_SUCCESS) { return kr; } - assert(thread == THREAD_NULL); + if (thread != THREAD_NULL) { + thread_deallocate(thread); + } /* wait for all the threads in the task to terminate */ task_lock(new_task); @@ -476,7 +480,9 @@ task_enqueue_exception_with_corpse( kr = task_generate_corpse_internal(task, &new_task, &thread, etype, code[0], code[1], reason); if (kr == KERN_SUCCESS) { - assert(thread != THREAD_NULL); + if (thread == THREAD_NULL) { + return KERN_FAILURE; + } assert(new_task != TASK_NULL); assert(etype == EXC_RESOURCE || etype == EXC_GUARD); thread_exception_enqueue(new_task, thread, etype); @@ -512,7 +518,8 @@ task_generate_corpse_internal( thread_t thread_next = THREAD_NULL; kern_return_t kr; struct proc *p = NULL; - int is64bit; + int is_64bit_addr; + int is_64bit_data; int t_flags; uint64_t *udata_buffer = NULL; int size 
= 0; @@ -543,8 +550,13 @@ task_generate_corpse_internal( goto error_task_generate_corpse; } - is64bit = IS_64BIT_PROCESS(p); - t_flags = TF_CORPSE_FORK | TF_PENDING_CORPSE | TF_CORPSE | (is64bit ? TF_64B_ADDR : TF_NONE); + is_64bit_addr = IS_64BIT_PROCESS(p); + is_64bit_data = (task == TASK_NULL) ? is_64bit_addr : task_get_64bit_data(task); + t_flags = TF_CORPSE_FORK | + TF_PENDING_CORPSE | + TF_CORPSE | + (is_64bit_addr ? TF_64B_ADDR : TF_NONE) | + (is_64bit_data ? TF_64B_DATA : TF_NONE); #if CONFIG_MACF /* Create the corpse label credentials from the process. */ @@ -555,7 +567,8 @@ task_generate_corpse_internal( kr = task_create_internal(task, NULL, TRUE, - is64bit, + is_64bit_addr, + is_64bit_data, t_flags, TPF_NONE, &new_task); diff --git a/osfmk/corpses/task_corpse.h b/osfmk/corpses/task_corpse.h index 59ab07592..120aeb97c 100644 --- a/osfmk/corpses/task_corpse.h +++ b/osfmk/corpses/task_corpse.h @@ -68,8 +68,6 @@ extern kern_return_t task_deliver_crash_notification(task_t, thread_t, exception extern kcdata_descriptor_t task_get_corpseinfo(task_t task); -extern unsigned long total_corposes_count(void) __attribute__((pure)); - extern kcdata_descriptor_t task_crashinfo_alloc_init( mach_vm_address_t crash_data_p, unsigned size, uint32_t kc_u_flags, unsigned kc_flags); diff --git a/osfmk/i386/AT386/conf.c b/osfmk/i386/AT386/conf.c deleted file mode 100644 index 8fffcc657..000000000 --- a/osfmk/i386/AT386/conf.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * @OSF_COPYRIGHT@ - */ -/* - * Mach Operating System - * Copyright (c) 1991,1990,1989 Carnegie Mellon University - * All Rights Reserved. - * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" - * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR - * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * - * Carnegie Mellon requests users of this software to return to - * - * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU - * School of Computer Science - * Carnegie Mellon University - * Pittsburgh PA 15213-3890 - * - * any improvements or extensions that they make and grant Carnegie Mellon - * the rights to redistribute these changes. - */ -/* - */ - -/* - * Device switch for i386 AT bus. - */ - -#include -#include -#include - -/* - * Clock device subsystem configuration. The clock_list[] - * table contains the clock structures for all clocks in - * the system. - */ - -extern const struct clock_ops sysclk_ops, calend_ops; - -/* - * List of clock devices. - */ -SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = { - - /* SYSTEM_CLOCK */ - { &sysclk_ops, 0, 0 }, - - /* CALENDAR_CLOCK */ - { &calend_ops, 0, 0 } -}; -int clock_count = sizeof(clock_list) / sizeof(clock_list[0]); diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 614b310eb..3976f1f6b 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -125,8 +125,7 @@ #include #include - -#include +#include #if DEBUG || DEVELOPMENT #define DPRINTF(x...) kprintf(x) @@ -342,112 +341,9 @@ machine_conf(void) machine_info.memory_size = (typeof(machine_info.memory_size))mem_size; } - extern void *gPEEFIRuntimeServices; extern void *gPEEFISystemTable; -/*- - * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or - * code or tables extracted from it, as desired without restriction. - * - * First, the polynomial itself and its table of feedback terms. The - * polynomial is - * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 - * - * Note that we take it "backwards" and put the highest-order term in - * the lowest-order bit. The X^32 term is "implied"; the LSB is the - * X^31 term, etc. 
The X^0 term (usually shown as "+1") results in - * the MSB being 1 - * - * Note that the usual hardware shift register implementation, which - * is what we're using (we're merely optimizing it by doing eight-bit - * chunks at a time) shifts bits into the lowest-order term. In our - * implementation, that means shifting towards the right. Why do we - * do it this way? Because the calculated CRC must be transmitted in - * order from highest-order term to lowest-order term. UARTs transmit - * characters in order from LSB to MSB. By storing the CRC this way - * we hand it to the UART in the order low-byte to high-byte; the UART - * sends each low-bit to hight-bit; and the result is transmission bit - * by bit from highest- to lowest-order term without requiring any bit - * shuffling on our part. Reception works similarly - * - * The feedback terms table consists of 256, 32-bit entries. Notes - * - * The table can be generated at runtime if desired; code to do so - * is shown later. It might not be obvious, but the feedback - * terms simply represent the results of eight shift/xor opera - * tions for all combinations of data and CRC register values - * - * The values must be right-shifted by eight bits by the "updcrc - * logic; the shift must be unsigned (bring in zeroes). On some - * hardware you could probably optimize the shift in assembler by - * using byte-swap instructions - * polynomial $edb88320 - * - * - * CRC32 code derived from work by Gary S. Brown. 
- */ - -static uint32_t crc32_tab[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 
0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}; - -static uint32_t -crc32(uint32_t crc, const void *buf, size_t size) -{ - const uint8_t *p; - - p = buf; - crc = crc ^ ~0U; - - while (size--) - crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); - - return crc ^ ~0U; -} - static void efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table) { @@ -1017,9 +913,12 @@ RecordPanicStackshot() void SavePanicInfo( - __unused const char *message, uint64_t panic_options) + __unused const char *message, void *panic_data, uint64_t panic_options) { - void *stackptr; + void *stackptr = NULL; + thread_t thread_to_trace = (thread_t) panic_data; + cframe_t synthetic_stack_frame = { }; + char *debugger_msg = NULL; int cn = cpu_number(); /* @@ -1028,15 +927,37 @@ SavePanicInfo( */ panic_io_port_read(); - /* Obtain current frame pointer */ - __asm__ 
volatile("movq %%rbp, %0" : "=m" (stackptr)); + /* Obtain frame pointer for stack to trace */ + if (panic_options & DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE) { + if (!mp_kdp_all_cpus_halted()) { + debugger_msg = "Backtracing panicked thread because failed to halt all CPUs\n"; + } else if (thread_to_trace == THREAD_NULL) { + debugger_msg = "Backtracing panicked thread because no thread pointer provided\n"; + } else if (kvtophys((vm_offset_t)thread_to_trace) == 0ULL) { + debugger_msg = "Backtracing panicked thread because unable to access specified thread\n"; + } else if (thread_to_trace->kernel_stack == 0) { + debugger_msg = "Backtracing panicked thread because kernel_stack is NULL for specified thread\n"; + } else if (kvtophys((vm_offset_t)STACK_IKS(thread_to_trace->kernel_stack)) == 0ULL) { + debugger_msg = "Backtracing panicked thread because unable to access kernel_stack for specified thread\n"; + } else { + debugger_msg = "Backtracing specified thread\n"; + /* We construct a synthetic stack frame so we can include the current instruction pointer */ + synthetic_stack_frame.prev = (cframe_t *)STACK_IKS(thread_to_trace->kernel_stack)->k_rbp; + synthetic_stack_frame.caller = (uintptr_t) STACK_IKS(thread_to_trace->kernel_stack)->k_rip; + stackptr = (void *) &synthetic_stack_frame; + } + } + + if (stackptr == NULL) { + __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr)); + } /* Print backtrace - callee is internally synchronized */ if (panic_options & DEBUGGER_OPTION_INITPROC_PANIC) { /* Special handling of launchd died panics */ print_launchd_info(); } else { - panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL); + panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 
80: 48), debugger_msg, FALSE, NULL); } if (panic_options & DEBUGGER_OPTION_COPROC_INITIATED_PANIC) { diff --git a/osfmk/i386/Makefile b/osfmk/i386/Makefile index 4df5e8510..12e9c6025 100644 --- a/osfmk/i386/Makefile +++ b/osfmk/i386/Makefile @@ -22,6 +22,7 @@ EXPORT_ONLY_FILES = \ lapic.h \ lock.h \ locks.h \ + locks_i386_inlines.h \ machine_routines.h \ machine_cpu.h \ mtrr.h \ diff --git a/osfmk/i386/acpi.c b/osfmk/i386/acpi.c index 5ceff836b..5a991c597 100644 --- a/osfmk/i386/acpi.c +++ b/osfmk/i386/acpi.c @@ -318,7 +318,7 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon) * The sleep implementation uses indirect noreturn calls, so we miss stack * unpoisoning. Do it explicitly. */ - __asan_handle_no_return(); + kasan_unpoison_curstack(true); #endif #if HIBERNATION diff --git a/osfmk/i386/atomic.h b/osfmk/i386/atomic.h index ef4652d5c..940e5fcf2 100644 --- a/osfmk/i386/atomic.h +++ b/osfmk/i386/atomic.h @@ -51,7 +51,7 @@ #ifdef ATOMIC_PRIVATE -static boolean_t +static inline boolean_t atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, enum memory_order ord, boolean_t wait) { @@ -59,22 +59,14 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval, return __c11_atomic_compare_exchange_strong((_Atomic uintptr_t *)target, &oldval, newval, ord, memory_order_relaxed); } -#endif // ATOMIC_PRIVATE - -#define os_atomic_rmw_loop(p, ov, nv, m, ...) 
({ \ - bool _result = false; \ - typeof(p) _p = (p); \ - ov = atomic_load_explicit(_p, memory_order_relaxed); \ - do { \ - __VA_ARGS__; \ - typeof(ov) _r = (ov); \ - _result = atomic_compare_exchange_weak_explicit(_p, &_r, nv, \ - memory_order_##m, memory_order_relaxed); \ - (ov) = _r; \ - } while (__builtin_expect(!_result, 0)); \ - _result; \ - }) +static inline boolean_t +atomic_compare_exchange32(uint32_t *target, uint32_t oldval, uint32_t newval, + enum memory_order ord, boolean_t wait) +{ + (void)wait; + return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &oldval, newval, ord, memory_order_relaxed); +} -#define os_atomic_rmw_loop_give_up(expr) ({ expr; __builtin_trap(); }) +#endif // ATOMIC_PRIVATE #endif // _I386_ATOMIC_H_ diff --git a/osfmk/i386/bsd_i386.c b/osfmk/i386/bsd_i386.c index b855d1c66..805cbc1de 100644 --- a/osfmk/i386/bsd_i386.c +++ b/osfmk/i386/bsd_i386.c @@ -211,7 +211,7 @@ thread_set_child(thread_t child, int pid) { pal_register_cache_state(child, DIRTY); - if (thread_is_64bit(child)) { + if (thread_is_64bit_addr(child)) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(child); @@ -609,7 +609,7 @@ thread_setuserstack( mach_vm_address_t user_stack) { pal_register_cache_state(thread, DIRTY); - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(thread); @@ -636,7 +636,7 @@ thread_adjuserstack( int adjust) { pal_register_cache_state(thread, DIRTY); - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(thread); @@ -665,7 +665,7 @@ void thread_setentrypoint(thread_t thread, mach_vm_address_t entry) { pal_register_cache_state(thread, DIRTY); - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(thread); @@ -685,7 +685,7 @@ kern_return_t thread_setsinglestep(thread_t thread, int on) { pal_register_cache_state(thread, DIRTY); - if 
(thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(thread); diff --git a/osfmk/i386/bsd_i386_native.c b/osfmk/i386/bsd_i386_native.c index 4ec100eff..6b8e1d124 100644 --- a/osfmk/i386/bsd_i386_native.c +++ b/osfmk/i386/bsd_i386_native.c @@ -75,7 +75,8 @@ kern_return_t machine_thread_dup( thread_t parent, - thread_t child + thread_t child, + __unused boolean_t is_corpse ) { @@ -85,7 +86,7 @@ machine_thread_dup( /* * Copy over the x86_saved_state registers */ - if (thread_is_64bit(parent)) + if (thread_is_64bit_addr(parent)) bcopy(USER_REGS64(parent), USER_REGS64(child), sizeof(x86_saved_state64_t)); else bcopy(USER_REGS32(parent), USER_REGS32(child), sizeof(x86_saved_state32_t)); @@ -101,7 +102,7 @@ machine_thread_dup( * Copy the parent's cthread id and USER_CTHREAD descriptor, if 32-bit. */ child_pcb->cthread_self = parent_pcb->cthread_self; - if (!thread_is_64bit(parent)) + if (!thread_is_64bit_addr(parent)) child_pcb->cthread_desc = parent_pcb->cthread_desc; /* @@ -125,7 +126,7 @@ thread_set_parent(thread_t parent, int pid) { pal_register_cache_state(parent, DIRTY); - if (thread_is_64bit(parent)) { + if (thread_is_64bit_addr(parent)) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(parent); diff --git a/osfmk/i386/commpage/commpage.c b/osfmk/i386/commpage/commpage.c index 81b962c1b..8d93e9456 100644 --- a/osfmk/i386/commpage/commpage.c +++ b/osfmk/i386/commpage/commpage.c @@ -126,22 +126,25 @@ commpage_allocate( vm_map_entry_t entry; ipc_port_t handle; kern_return_t kr; + vm_map_kernel_flags_t vmk_flags; if (submap == NULL) panic("commpage submap is null"); - if ((kr = vm_map_kernel(kernel_map, - &kernel_addr, - area_used, - 0, - VM_FLAGS_ANYWHERE, - VM_KERN_MEMORY_OSFMK, - NULL, - 0, - FALSE, - VM_PROT_ALL, - VM_PROT_ALL, - VM_INHERIT_NONE))) + kr = vm_map_kernel(kernel_map, + &kernel_addr, + area_used, + 0, + VM_FLAGS_ANYWHERE, + VM_MAP_KERNEL_FLAGS_NONE, + VM_KERN_MEMORY_OSFMK, + NULL, + 0, + 
FALSE, + VM_PROT_ALL, + VM_PROT_ALL, + VM_INHERIT_NONE); + if (kr != KERN_SUCCESS) panic("cannot allocate commpage %d", kr); if ((kr = vm_map_wire_kernel(kernel_map, @@ -171,18 +174,31 @@ commpage_allocate( NULL ))) // parent_entry (what is this?) panic("cannot make entry for commpage %d", kr); - if ((kr = vm_map_64_kernel( submap, // target map (shared submap) - &zero, // address (map into 1st page in submap) - area_used, // size - 0, // mask - VM_FLAGS_FIXED, // flags (it must be 1st page in submap) - VM_KERN_MEMORY_NONE, - handle, // port is the memory entry we just made - 0, // offset (map 1st page in memory entry) - FALSE, // copy - uperm, // cur_protection (R-only in user map) - uperm, // max_protection - VM_INHERIT_SHARE ))) // inheritance + vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + if (uperm == (VM_PROT_READ | VM_PROT_EXECUTE)) { + /* + * Mark this unsigned executable mapping as "jit" to avoid + * code-signing violations when attempting to execute unsigned + * code. + */ + vmk_flags.vmkf_map_jit = TRUE; + } + + kr = vm_map_64_kernel( + submap, // target map (shared submap) + &zero, // address (map into 1st page in submap) + area_used, // size + 0, // mask + VM_FLAGS_FIXED, // flags (it must be 1st page in submap) + vmk_flags, + VM_KERN_MEMORY_NONE, + handle, // port is the memory entry we just made + 0, // offset (map 1st page in memory entry) + FALSE, // copy + uperm, // cur_protection (R-only in user map) + uperm, // max_protection + VM_INHERIT_SHARE); // inheritance + if (kr != KERN_SUCCESS) panic("cannot map commpage %d", kr); ipc_port_release(handle); @@ -307,9 +323,9 @@ commpage_init_cpu_capabilities( void ) CPUID_LEAF7_FEATURE_HLE); setif(bits, kHasAVX2_0, cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX2); - setif(bits, kHasRDSEED, cpuid_features() & + setif(bits, kHasRDSEED, cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED); - setif(bits, kHasADX, cpuid_features() & + setif(bits, kHasADX, cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_ADX); #if 0 /* 
The kernel doesn't support MPX or SGX */ diff --git a/osfmk/i386/cpu_capabilities.h b/osfmk/i386/cpu_capabilities.h index dff0ae545..5caa412da 100644 --- a/osfmk/i386/cpu_capabilities.h +++ b/osfmk/i386/cpu_capabilities.h @@ -218,7 +218,7 @@ int _NumCPUs( void ) /* Align following entries to next cache line */ #define _COMM_PAGE_CONT_TIMEBASE (_COMM_PAGE_START_ADDRESS+0x0C0) /* used by mach_continuous_time() */ #define _COMM_PAGE_BOOTTIME_USEC (_COMM_PAGE_START_ADDRESS+0x0C8) /* uint64_t boottime */ -#define _COMM_PAGE_NEWTIMEOFDAY_DATA (_COMM_PAGE_START_ADDRESS+0x0D0) /* used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40*/ +#define _COMM_PAGE_NEWTIMEOFDAY_DATA (_COMM_PAGE_START_ADDRESS+0x0D0) /* used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40 */ #define _COMM_PAGE_END (_COMM_PAGE_START_ADDRESS+0xfff) /* end of common page */ diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index 32ffc1d83..4201068f4 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -268,7 +268,6 @@ typedef struct cpu_data #if CONFIG_MCA struct mca_state *cpu_mca_state; /* State at MC fault */ #endif - struct prngContext *cpu_prng; /* PRNG's context */ int cpu_type; int cpu_subtype; int cpu_threadtype; @@ -289,6 +288,7 @@ typedef struct cpu_data uint64_t cpu_exit_cr3; uint64_t cpu_pcid_last_cr3; #endif + boolean_t cpu_rendezvous_in_progress; } cpu_data_t; extern cpu_data_t *cpu_data_ptr[]; @@ -365,12 +365,37 @@ extern cpu_data_t *cpu_data_ptr[]; * inline versions of these routines. Everyone outside, must call * the real thing, */ + + +/* + * The "volatile" flavor of current_thread() is intended for use by + * scheduler code which may need to update the thread pointer in the + * course of a context switch. Any call to current_thread() made + * prior to the thread pointer update should be safe to optimize away + * as it should be consistent with that thread's state to the extent + * the compiler can reason about it. 
Likewise, the context switch + * path will eventually result in an arbitrary branch to the new + * thread's pc, about which the compiler won't be able to reason. + * Thus any compile-time optimization of current_thread() calls made + * within the new thread should be safely encapsulated in its + * register/stack state. The volatile form therefore exists to cover + * the window between the thread pointer update and the branch to + * the new pc. + */ static inline thread_t +get_active_thread_volatile(void) +{ + CPU_DATA_GET(cpu_active_thread,thread_t) +} + +static inline __pure2 thread_t get_active_thread(void) { CPU_DATA_GET(cpu_active_thread,thread_t) } + #define current_thread_fast() get_active_thread() +#define current_thread_volatile() get_active_thread_volatile() #define current_thread() current_thread_fast() #define cpu_mode_is64bit() TRUE diff --git a/osfmk/i386/fp_simd.s b/osfmk/i386/fp_simd.s index b6182d4a3..5679190e3 100644 --- a/osfmk/i386/fp_simd.s +++ b/osfmk/i386/fp_simd.s @@ -40,6 +40,33 @@ Entry(vzeroall) vzeroall ret +Entry(avx512_zero) + vzeroall + + VPX %zmm16 + VPX %zmm17 + VPX %zmm18 + VPX %zmm19 + + VPX %zmm20 + VPX %zmm21 + VPX %zmm22 + VPX %zmm23 + + VPX %zmm24 + VPX %zmm25 + VPX %zmm26 + VPX %zmm27 + + VPX %zmm28 + VPX %zmm29 + VPX %zmm30 + VPX %zmm31 + + xor %eax, %eax + kmovw %eax, %k1 + ret + Entry(xmmzeroall) PX %xmm0 PX %xmm1 diff --git a/osfmk/i386/fpu.c b/osfmk/i386/fpu.c index 51c89b832..c0883c821 100644 --- a/osfmk/i386/fpu.c +++ b/osfmk/i386/fpu.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -557,7 +556,7 @@ static void fpu_load_registers(void *fstate) { fp_save_layout_t layout = ifps->fp_save_layout; assert(current_task() == NULL || \ - (thread_is_64bit(current_thread()) ? \ + (thread_is_64bit_addr(current_thread()) ? 
\ (layout == FXSAVE64 || layout == XSAVE64) : \ (layout == FXSAVE32 || layout == XSAVE32))); assert(ALIGNED(ifps, 64)); @@ -701,7 +700,10 @@ fpu_switch_context(thread_t old, thread_t new) */ clear_ts(); /* registers are in FPU - save to memory */ - fpu_store_registers(ifps, (thread_is_64bit(old) && is_saved_state64(old->machine.iss))); + boolean_t is64 = (thread_is_64bit_addr(old) && + is_saved_state64(old->machine.iss)); + + fpu_store_registers(ifps, is64); ifps->fp_valid = TRUE; if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) { @@ -839,13 +841,13 @@ fpu_set_fxstate( panic("fpu_set_fxstate() UNDEFINED xstate"); break; case FP: - ifps->fp_save_layout = thread_is_64bit(thr_act) ? FXSAVE64 : FXSAVE32; + ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32; break; case AVX: { struct x86_avx_thread_state *iavx = (void *) ifps; x86_avx_state64_t *xs = (x86_avx_state64_t *) state; - iavx->fp.fp_save_layout = thread_is_64bit(thr_act) ? XSAVE64 : XSAVE32; + iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32; /* Sanitize XSAVE header */ bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd)); @@ -870,7 +872,7 @@ fpu_set_fxstate( x86_avx512_state64_t *s64; } xs = { .ts = tstate }; - iavx->fp.fp_save_layout = thread_is_64bit(thr_act) ? XSAVE64 : XSAVE32; + iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32; /* Sanitize XSAVE header */ bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd)); @@ -1182,7 +1184,7 @@ fpnoextflt(void) ifps = fp_state_alloc(xstate); bcopy((char *)&initial_fp_state, (char *)ifps, fp_state_size[xstate]); - if (!thread_is_64bit(thr_act)) { + if (!thread_is_64bit_addr(thr_act)) { ifps->fp_save_layout = fpu_YMM_capable ? 
XSAVE32 : FXSAVE32; } else @@ -1343,7 +1345,7 @@ fp_save( assert((get_cr0() & CR0_TS) == 0); /* registers are in FPU */ ifps->fp_valid = TRUE; - fpu_store_registers(ifps, thread_is_64bit(thr_act)); + fpu_store_registers(ifps, thread_is_64bit_addr(thr_act)); } } diff --git a/osfmk/i386/fpu.h b/osfmk/i386/fpu.h index 8868fe29c..0ed1dda80 100644 --- a/osfmk/i386/fpu.h +++ b/osfmk/i386/fpu.h @@ -134,5 +134,6 @@ extern void fpUDflt(user_addr_t rip); extern uint32_t thread_fpsimd_hash(thread_t); extern void vzeroall(void); extern void xmmzeroall(void); +extern void avx512_zero(void); #endif /* MKP */ #endif /* _I386_FPU_H_ */ diff --git a/osfmk/i386/i386_init.c b/osfmk/i386/i386_init.c index 5a7655de9..8eb6b7edf 100644 --- a/osfmk/i386/i386_init.c +++ b/osfmk/i386/i386_init.c @@ -416,7 +416,7 @@ vstart(vm_offset_t boot_args_start) #endif #if MONOTONIC - mt_init(); + mt_early_init(); #endif /* MONOTONIC */ first_avail = (vm_offset_t)ID_MAP_VTOP(physfree); @@ -729,6 +729,7 @@ void doublemap_init(void) { dblmap_dist = dblmap_base - hdescb; idt64_hndl_table0[1] = DBLMAP(idt64_hndl_table0[1]); + idt64_hndl_table0[6] = (uint64_t)(uintptr_t)&kernel_stack_mask; extern cpu_data_t cpshadows[], scdatas[]; uintptr_t cd1 = (uintptr_t) &cpshadows[0]; diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index d657afaee..ac0726918 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -41,7 +41,7 @@ #include #include #include - + #include "assym.s" #define PAUSE rep; nop @@ -51,29 +51,9 @@ #define LEAF_ENTRY(name) \ Entry(name) -#define LEAF_ENTRY2(n1,n2) \ - Entry(n1); \ - Entry(n2) - #define LEAF_RET \ ret -/* Non-leaf routines always have a stack frame: */ - -#define NONLEAF_ENTRY(name) \ - Entry(name); \ - FRAME - -#define NONLEAF_ENTRY2(n1,n2) \ - Entry(n1); \ - Entry(n2); \ - FRAME - -#define NONLEAF_RET \ - EMARF; \ - ret - - /* For x86_64, the varargs ABI requires that %al indicate * how many SSE register contain arguments. 
In our case, 0 */ #define ALIGN_STACK() and $0xFFFFFFFFFFFFFFF0, %rsp ; @@ -82,85 +62,6 @@ #define LOAD_PTR_ARG1(x) mov x, %rsi ; #define CALL_PANIC() xorb %al,%al ; call EXT(panic) ; -#define CHECK_UNLOCK(current, owner) \ - cmp current, owner ; \ - je 1f ; \ - ALIGN_STACK() ; \ - LOAD_STRING_ARG0(2f) ; \ - CALL_PANIC() ; \ - hlt ; \ - .data ; \ -2: String "Mutex unlock attempted from non-owner thread"; \ - .text ; \ -1: - -#if MACH_LDEBUG -/* - * Routines for general lock debugging. - */ - -/* - * Checks for expected lock types and calls "panic" on - * mismatch. Detects calls to Mutex functions with - * type simplelock and vice versa. - */ -#define CHECK_MUTEX_TYPE() \ - cmpl $ MUTEX_TAG,M_TYPE ; \ - je 1f ; \ - ALIGN_STACK() ; \ - LOAD_STRING_ARG0(2f) ; \ - CALL_PANIC() ; \ - hlt ; \ - .data ; \ -2: String "not a mutex!" ; \ - .text ; \ -1: - -#define CHECK_MYLOCK(current, owner) \ - cmp current, owner ; \ - jne 1f ; \ - ALIGN_STACK() ; \ - LOAD_STRING_ARG0(2f) ; \ - CALL_PANIC() ; \ - hlt ; \ - .data ; \ -2: String "Attempt to recursively lock a non-recursive lock"; \ - .text ; \ -1: - -#else /* MACH_LDEBUG */ -#define CHECK_MUTEX_TYPE() -#define CHECK_MYLOCK(thd) -#endif /* MACH_LDEBUG */ - -#if DEVELOPMENT || DEBUG -/* - * If one or more simplelocks are currently held by a thread, - * an attempt to acquire a mutex will cause this check to fail - * (since a mutex lock may context switch, holding a simplelock - * is not a good thing). - */ -#define CHECK_PREEMPTION_LEVEL() \ - cmpl $0,%gs:CPU_PREEMPTION_LEVEL ; \ - je 1f ; \ - cmpl $0,EXT(LckDisablePreemptCheck)(%rip) ; \ - jne 1f ; \ - cmpl $0,%gs:CPU_HIBERNATE ; \ - jne 1f ; \ - ALIGN_STACK() ; \ - movl %gs:CPU_PREEMPTION_LEVEL, %eax ; \ - LOAD_ARG1(%eax) ; \ - LOAD_STRING_ARG0(2f) ; \ - CALL_PANIC() ; \ - hlt ; \ - .data ; \ -2: String "preemption_level(%d) != 0!" 
; \ - .text ; \ -1: -#else /* DEVELOPMENT || DEBUG */ -#define CHECK_PREEMPTION_LEVEL() -#endif /* DEVELOPMENT || DEBUG */ - #define PREEMPTION_DISABLE \ incl %gs:CPU_PREEMPTION_LEVEL @@ -200,45 +101,6 @@ 19: #endif - -#if CONFIG_DTRACE - - .globl _lockstat_probe - .globl _lockstat_probemap - -/* - * LOCKSTAT_LABEL creates a dtrace symbol which contains - * a pointer into the lock code function body. At that - * point is a "ret" instruction that can be patched into - * a "nop" - */ - -#define LOCKSTAT_LABEL(lab) \ - .data ;\ - .globl lab ;\ - lab: ;\ - .quad 9f ;\ - .text ;\ - 9: - -#define LOCKSTAT_RECORD(id, lck) \ - push %rbp ; \ - mov %rsp,%rbp ; \ - movl _lockstat_probemap + (id * 4)(%rip),%eax ; \ - test %eax,%eax ; \ - je 9f ; \ - mov lck, %rsi ; \ - mov %rax, %rdi ; \ - mov $0, %rdx ; \ - mov $0, %rcx ; \ - mov $0, %r8 ; \ - mov $0, %r9 ; \ - call *_lockstat_probe(%rip) ; \ -9: leave - /* ret - left to subsequent code, e.g. return values */ - -#endif /* CONFIG_DTRACE */ - /* * For most routines, the hw_lock_t pointer is loaded into a * register initially, and then either a byte or register-sized @@ -286,801 +148,6 @@ LEAF_ENTRY(hw_lock_byte_unlock) movb $0, (%rdi) /* Clear the lock byte */ PREEMPTION_ENABLE LEAF_RET - -/* - * N.B.: On x86, statistics are currently recorded for all indirect mutexes. - * Also, only the acquire attempt count (GRP_MTX_STAT_UTIL) is maintained - * as a 64-bit quantity (this matches the existing PowerPC implementation, - * and the new x86 specific statistics are also maintained as 32-bit - * quantities). - * - * - * Enable this preprocessor define to record the first miss alone - * By default, we count every miss, hence multiple misses may be - * recorded for a single lock acquire attempt via lck_mtx_lock - */ -#undef LOG_FIRST_MISS_ALONE - -/* - * This preprocessor define controls whether the R-M-W update of the - * per-group statistics elements are atomic (LOCK-prefixed) - * Enabled by default. 
- */ -#define ATOMIC_STAT_UPDATES 1 - -#if defined(ATOMIC_STAT_UPDATES) -#define LOCK_IF_ATOMIC_STAT_UPDATES lock -#else -#define LOCK_IF_ATOMIC_STAT_UPDATES -#endif /* ATOMIC_STAT_UPDATES */ - - -/* - * For most routines, the lck_mtx_t pointer is loaded into a - * register initially, and the owner field checked for indirection. - * Eventually the lock owner is loaded into a register and examined. - */ - -#define M_OWNER MUTEX_OWNER -#define M_PTR MUTEX_PTR -#define M_STATE MUTEX_STATE - - -#define LMTX_ENTER_EXTENDED \ - mov M_PTR(%rdx), %rdx ; \ - xor %r11, %r11 ; \ - mov MUTEX_GRP(%rdx), %r10 ; \ - LOCK_IF_ATOMIC_STAT_UPDATES ; \ - incq GRP_MTX_STAT_UTIL(%r10) - - -#if LOG_FIRST_MISS_ALONE -#define LMTX_UPDATE_MISS \ - test $1, %r11 ; \ - jnz 11f ; \ - LOCK_IF_ATOMIC_STAT_UPDATES ; \ - incl GRP_MTX_STAT_MISS(%r10) ; \ - or $1, %r11 ; \ -11: -#else -#define LMTX_UPDATE_MISS \ - LOCK_IF_ATOMIC_STAT_UPDATES ; \ - incl GRP_MTX_STAT_MISS(%r10) -#endif - - -#if LOG_FIRST_MISS_ALONE -#define LMTX_UPDATE_WAIT \ - test $2, %r11 ; \ - jnz 11f ; \ - LOCK_IF_ATOMIC_STAT_UPDATES ; \ - incl GRP_MTX_STAT_WAIT(%r10) ; \ - or $2, %r11 ; \ -11: -#else -#define LMTX_UPDATE_WAIT \ - LOCK_IF_ATOMIC_STAT_UPDATES ; \ - incl GRP_MTX_STAT_WAIT(%r10) -#endif - - -/* - * Record the "direct wait" statistic, which indicates if a - * miss proceeded to block directly without spinning--occurs - * if the owner of the mutex isn't running on another processor - * at the time of the check. 
- */ -#define LMTX_UPDATE_DIRECT_WAIT \ - LOCK_IF_ATOMIC_STAT_UPDATES ; \ - incl GRP_MTX_STAT_DIRECT_WAIT(%r10) - - -#define LMTX_CALLEXT1(func_name) \ - cmp %rdx, %rdi ; \ - je 12f ; \ - push %r10 ; \ - push %r11 ; \ -12: push %rdi ; \ - push %rdx ; \ - mov %rdx, %rdi ; \ - call EXT(func_name) ; \ - pop %rdx ; \ - pop %rdi ; \ - cmp %rdx, %rdi ; \ - je 12f ; \ - pop %r11 ; \ - pop %r10 ; \ -12: - -#define LMTX_CALLEXT2(func_name, reg) \ - cmp %rdx, %rdi ; \ - je 12f ; \ - push %r10 ; \ - push %r11 ; \ -12: push %rdi ; \ - push %rdx ; \ - mov reg, %rsi ; \ - mov %rdx, %rdi ; \ - call EXT(func_name) ; \ - pop %rdx ; \ - pop %rdi ; \ - cmp %rdx, %rdi ; \ - je 12f ; \ - pop %r11 ; \ - pop %r10 ; \ -12: - - -#define M_WAITERS_MSK 0x0000ffff -#define M_PRIORITY_MSK 0x00ff0000 -#define M_ILOCKED_MSK 0x01000000 -#define M_MLOCKED_MSK 0x02000000 -#define M_PROMOTED_MSK 0x04000000 -#define M_SPIN_MSK 0x08000000 - -/* - * void lck_mtx_assert(lck_mtx_t* l, unsigned int) - * Takes the address of a lock, and an assertion type as parameters. - * The assertion can take one of two forms determine by the type - * parameter: either the lock is held by the current thread, and the - * type is LCK_MTX_ASSERT_OWNED, or it isn't and the type is - * LCK_MTX_ASSERT_NOTOWNED. Calls panic on assertion failure. - * - */ - -NONLEAF_ENTRY(lck_mtx_assert) - mov %rdi, %rdx /* Load lock address */ - mov %gs:CPU_ACTIVE_THREAD, %rax /* Load current thread */ - - mov M_STATE(%rdx), %ecx - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex? */ - jne 0f - mov M_PTR(%rdx), %rdx /* If so, take indirection */ -0: - mov M_OWNER(%rdx), %rcx /* Load owner */ - cmp $(MUTEX_ASSERT_OWNED), %rsi - jne 2f /* Assert ownership? */ - cmp %rax, %rcx /* Current thread match? */ - jne 3f /* no, go panic */ - testl $(M_ILOCKED_MSK | M_MLOCKED_MSK), M_STATE(%rdx) - je 3f -1: /* yes, we own it */ - NONLEAF_RET -2: - cmp %rax, %rcx /* Current thread match? 
*/ - jne 1b /* No, return */ - ALIGN_STACK() - LOAD_PTR_ARG1(%rdx) - LOAD_STRING_ARG0(mutex_assert_owned_str) - jmp 4f -3: - ALIGN_STACK() - LOAD_PTR_ARG1(%rdx) - LOAD_STRING_ARG0(mutex_assert_not_owned_str) -4: - CALL_PANIC() - - -lck_mtx_destroyed: - ALIGN_STACK() - LOAD_PTR_ARG1(%rdx) - LOAD_STRING_ARG0(mutex_interlock_destroyed_str) - CALL_PANIC() - - -.data -mutex_assert_not_owned_str: - .asciz "mutex (%p) not owned\n" -mutex_assert_owned_str: - .asciz "mutex (%p) owned\n" -mutex_interlock_destroyed_str: - .asciz "trying to interlock destroyed mutex (%p)" -.text - - - -/* - * lck_mtx_lock() - * lck_mtx_try_lock() - * lck_mtx_unlock() - * lck_mtx_lock_spin() - * lck_mtx_lock_spin_always() - * lck_mtx_try_lock_spin() - * lck_mtx_try_lock_spin_always() - * lck_mtx_convert_spin() - */ -NONLEAF_ENTRY(lck_mtx_lock_spin_always) - mov %rdi, %rdx /* fetch lock pointer */ - jmp Llmls_avoid_check - -NONLEAF_ENTRY(lck_mtx_lock_spin) - mov %rdi, %rdx /* fetch lock pointer */ - - CHECK_PREEMPTION_LEVEL() -Llmls_avoid_check: - mov M_STATE(%rdx), %ecx - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx /* is the interlock or mutex held */ - jnz Llmls_slow -Llmls_try: /* no - can't be INDIRECT, DESTROYED or locked */ - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK | M_SPIN_MSK), %ecx - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne Llmls_busy_disabled - - mov %gs:CPU_ACTIVE_THREAD, %rax - mov %rax, M_OWNER(%rdx) /* record owner of interlock */ -#if MACH_LDEBUG - test %rax, %rax - jz 1f - incl TH_MUTEX_COUNT(%rax) /* lock statistic */ -1: -#endif /* MACH_LDEBUG */ - - /* return with the interlock held and preemption disabled */ - leave -#if CONFIG_DTRACE - LOCKSTAT_LABEL(_lck_mtx_lock_spin_lockstat_patch_point) - ret - /* inherit lock pointer in %rdx above */ - LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, %rdx) -#endif - ret - -Llmls_slow: - test $M_ILOCKED_MSK, %ecx /* is the interlock held */ - jz 
Llml_contended /* no, must have been the mutex */ - - cmp $(MUTEX_DESTROYED), %ecx /* check to see if its marked destroyed */ - je lck_mtx_destroyed - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex */ - jne Llmls_loop /* no... must be interlocked */ - - LMTX_ENTER_EXTENDED - - mov M_STATE(%rdx), %ecx - test $(M_SPIN_MSK), %ecx - jz Llmls_loop1 - - LMTX_UPDATE_MISS /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ -Llmls_loop: - PAUSE - mov M_STATE(%rdx), %ecx -Llmls_loop1: - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx - jz Llmls_try - test $(M_MLOCKED_MSK), %ecx - jnz Llml_contended /* mutex owned by someone else, go contend for it */ - jmp Llmls_loop - -Llmls_busy_disabled: - PREEMPTION_ENABLE - jmp Llmls_loop - - - -NONLEAF_ENTRY(lck_mtx_lock) - mov %rdi, %rdx /* fetch lock pointer */ - - CHECK_PREEMPTION_LEVEL() - - mov M_STATE(%rdx), %ecx - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx /* is the interlock or mutex held */ - jnz Llml_slow -Llml_try: /* no - can't be INDIRECT, DESTROYED or locked */ - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne Llml_busy_disabled - - mov %gs:CPU_ACTIVE_THREAD, %rax - mov %rax, M_OWNER(%rdx) /* record owner of mutex */ -#if MACH_LDEBUG - test %rax, %rax - jz 1f - incl TH_MUTEX_COUNT(%rax) /* lock statistic */ -1: -#endif /* MACH_LDEBUG */ - - testl $(M_WAITERS_MSK), M_STATE(%rdx) - jz Llml_finish - - LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) - -Llml_finish: - andl $(~M_ILOCKED_MSK), M_STATE(%rdx) - PREEMPTION_ENABLE - - cmp %rdx, %rdi /* is this an extended mutex */ - jne 2f - - leave -#if CONFIG_DTRACE - LOCKSTAT_LABEL(_lck_mtx_lock_lockstat_patch_point) - ret - /* inherit lock pointer in %rdx above */ - LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, %rdx) -#endif - ret -2: - leave -#if CONFIG_DTRACE - LOCKSTAT_LABEL(_lck_mtx_lock_ext_lockstat_patch_point) - ret - /* 
inherit lock pointer in %rdx above */ - LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, %rdx) -#endif - ret - - -Llml_slow: - test $M_ILOCKED_MSK, %ecx /* is the interlock held */ - jz Llml_contended /* no, must have been the mutex */ - - cmp $(MUTEX_DESTROYED), %ecx /* check to see if its marked destroyed */ - je lck_mtx_destroyed - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex? */ - jne Llml_loop /* no... must be interlocked */ - - LMTX_ENTER_EXTENDED - - mov M_STATE(%rdx), %ecx - test $(M_SPIN_MSK), %ecx - jz Llml_loop1 - - LMTX_UPDATE_MISS /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ -Llml_loop: - PAUSE - mov M_STATE(%rdx), %ecx -Llml_loop1: - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx - jz Llml_try - test $(M_MLOCKED_MSK), %ecx - jnz Llml_contended /* mutex owned by someone else, go contend for it */ - jmp Llml_loop - -Llml_busy_disabled: - PREEMPTION_ENABLE - jmp Llml_loop - - -Llml_contended: - cmp %rdx, %rdi /* is this an extended mutex */ - je 0f - LMTX_UPDATE_MISS -0: - LMTX_CALLEXT1(lck_mtx_lock_spinwait_x86) - - test %rax, %rax - jz Llml_acquired /* acquired mutex, interlock held and preemption disabled */ - - cmp $1, %rax /* check for direct wait status */ - je 2f - cmp %rdx, %rdi /* is this an extended mutex */ - je 2f - LMTX_UPDATE_DIRECT_WAIT -2: - mov M_STATE(%rdx), %ecx - test $(M_ILOCKED_MSK), %ecx - jnz 6f - - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK), %ecx /* try to take the interlock */ - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne 5f - - test $(M_MLOCKED_MSK), %ecx /* we've got the interlock and */ - jnz 3f - or $(M_MLOCKED_MSK), %ecx /* the mutex is free... 
grab it directly */ - mov %ecx, M_STATE(%rdx) - - mov %gs:CPU_ACTIVE_THREAD, %rax - mov %rax, M_OWNER(%rdx) /* record owner of mutex */ -#if MACH_LDEBUG - test %rax, %rax - jz 1f - incl TH_MUTEX_COUNT(%rax) /* lock statistic */ -1: -#endif /* MACH_LDEBUG */ - -Llml_acquired: - testl $(M_WAITERS_MSK), M_STATE(%rdx) - jnz 1f - mov M_OWNER(%rdx), %rax - mov TH_WAS_PROMOTED_ON_WAKEUP(%rax), %eax - test %eax, %eax - jz Llml_finish -1: - LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) - jmp Llml_finish - -3: /* interlock held, mutex busy */ - cmp %rdx, %rdi /* is this an extended mutex */ - je 4f - LMTX_UPDATE_WAIT -4: - LMTX_CALLEXT1(lck_mtx_lock_wait_x86) - jmp Llml_contended -5: - PREEMPTION_ENABLE -6: - PAUSE - jmp 2b - - -NONLEAF_ENTRY(lck_mtx_try_lock_spin_always) - mov %rdi, %rdx /* fetch lock pointer */ - jmp Llmts_avoid_check - -NONLEAF_ENTRY(lck_mtx_try_lock_spin) - mov %rdi, %rdx /* fetch lock pointer */ - -Llmts_avoid_check: - mov M_STATE(%rdx), %ecx - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx /* is the interlock or mutex held */ - jnz Llmts_slow -Llmts_try: /* no - can't be INDIRECT, DESTROYED or locked */ - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK | M_SPIN_MSK), %rcx - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne Llmts_busy_disabled - - mov %gs:CPU_ACTIVE_THREAD, %rax - mov %rax, M_OWNER(%rdx) /* record owner of mutex */ -#if MACH_LDEBUG - test %rax, %rax - jz 1f - incl TH_MUTEX_COUNT(%rax) /* lock statistic */ -1: -#endif /* MACH_LDEBUG */ - - leave - -#if CONFIG_DTRACE - mov $1, %rax /* return success */ - LOCKSTAT_LABEL(_lck_mtx_try_lock_spin_lockstat_patch_point) - ret - /* inherit lock pointer in %rdx above */ - LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, %rdx) -#endif - mov $1, %rax /* return success */ - ret - -Llmts_slow: - test $(M_ILOCKED_MSK), %ecx /* is the interlock held */ - jz Llmts_fail /* no, must be held as a mutex */ - - cmp $(MUTEX_DESTROYED), %ecx 
/* check to see if its marked destroyed */ - je lck_mtx_destroyed - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex? */ - jne Llmts_loop1 - - LMTX_ENTER_EXTENDED -Llmts_loop: - PAUSE - mov M_STATE(%rdx), %ecx -Llmts_loop1: - test $(M_MLOCKED_MSK | M_SPIN_MSK), %ecx - jnz Llmts_fail - test $(M_ILOCKED_MSK), %ecx - jz Llmts_try - jmp Llmts_loop - -Llmts_busy_disabled: - PREEMPTION_ENABLE - jmp Llmts_loop - - - -NONLEAF_ENTRY(lck_mtx_try_lock) - mov %rdi, %rdx /* fetch lock pointer */ - - mov M_STATE(%rdx), %ecx - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx /* is the interlock or mutex held */ - jnz Llmt_slow -Llmt_try: /* no - can't be INDIRECT, DESTROYED or locked */ - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne Llmt_busy_disabled - - mov %gs:CPU_ACTIVE_THREAD, %rax - mov %rax, M_OWNER(%rdx) /* record owner of mutex */ -#if MACH_LDEBUG - test %rax, %rax - jz 1f - incl TH_MUTEX_COUNT(%rax) /* lock statistic */ -1: -#endif /* MACH_LDEBUG */ - - test $(M_WAITERS_MSK), %ecx - jz 0f - - LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) -0: - andl $(~M_ILOCKED_MSK), M_STATE(%rdx) - PREEMPTION_ENABLE - - leave -#if CONFIG_DTRACE - mov $1, %rax /* return success */ - /* Dtrace probe: LS_LCK_MTX_TRY_LOCK_ACQUIRE */ - LOCKSTAT_LABEL(_lck_mtx_try_lock_lockstat_patch_point) - ret - /* inherit lock pointer in %rdx from above */ - LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, %rdx) -#endif - mov $1, %rax /* return success */ - ret - -Llmt_slow: - test $(M_ILOCKED_MSK), %ecx /* is the interlock held */ - jz Llmt_fail /* no, must be held as a mutex */ - - cmp $(MUTEX_DESTROYED), %ecx /* check to see if its marked destroyed */ - je lck_mtx_destroyed - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex? 
*/ - jne Llmt_loop - - LMTX_ENTER_EXTENDED -Llmt_loop: - PAUSE - mov M_STATE(%rdx), %ecx -Llmt_loop1: - test $(M_MLOCKED_MSK | M_SPIN_MSK), %ecx - jnz Llmt_fail - test $(M_ILOCKED_MSK), %ecx - jz Llmt_try - jmp Llmt_loop - -Llmt_busy_disabled: - PREEMPTION_ENABLE - jmp Llmt_loop - - -Llmt_fail: -Llmts_fail: - cmp %rdx, %rdi /* is this an extended mutex */ - je 0f - LMTX_UPDATE_MISS -0: - xor %rax, %rax - NONLEAF_RET - - - -NONLEAF_ENTRY(lck_mtx_convert_spin) - mov %rdi, %rdx /* fetch lock pointer */ - - mov M_STATE(%rdx), %ecx - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex? */ - jne 0f - mov M_PTR(%rdx), %rdx /* If so, take indirection */ - mov M_STATE(%rdx), %ecx -0: - test $(M_MLOCKED_MSK), %ecx /* already owned as a mutex, just return */ - jnz 2f - test $(M_WAITERS_MSK), %ecx /* are there any waiters? */ - jz 1f - - LMTX_CALLEXT1(lck_mtx_lock_acquire_x86) - mov M_STATE(%rdx), %ecx -1: - and $(~(M_ILOCKED_MSK | M_SPIN_MSK)), %ecx /* convert from spin version to mutex */ - or $(M_MLOCKED_MSK), %ecx - mov %ecx, M_STATE(%rdx) /* since I own the interlock, I don't need an atomic update */ - - PREEMPTION_ENABLE -2: - NONLEAF_RET - - - -NONLEAF_ENTRY(lck_mtx_unlock) - mov %rdi, %rdx /* fetch lock pointer */ -Llmu_entry: - mov M_STATE(%rdx), %ecx -Llmu_prim: - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex? 
*/ - je Llmu_ext - -Llmu_chktype: - test $(M_MLOCKED_MSK), %ecx /* check for full mutex */ - jz Llmu_unlock -Llmu_mutex: - test $(M_ILOCKED_MSK), %rcx /* have to wait for interlock to clear */ - jnz Llmu_busy - - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - and $(~M_MLOCKED_MSK), %ecx /* drop mutex */ - or $(M_ILOCKED_MSK), %ecx /* pick up interlock */ - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne Llmu_busy_disabled /* branch on failure to spin loop */ - -Llmu_unlock: - xor %rax, %rax - mov %rax, M_OWNER(%rdx) - mov %rcx, %rax /* keep original state in %ecx for later evaluation */ - and $(~(M_ILOCKED_MSK | M_SPIN_MSK | M_PROMOTED_MSK)), %rax - - test $(M_WAITERS_MSK), %eax - jz 2f - dec %eax /* decrement waiter count */ -2: - mov %eax, M_STATE(%rdx) /* since I own the interlock, I don't need an atomic update */ - -#if MACH_LDEBUG - /* perform lock statistics after drop to prevent delay */ - mov %gs:CPU_ACTIVE_THREAD, %rax - test %rax, %rax - jz 1f - decl TH_MUTEX_COUNT(%rax) /* lock statistic */ -1: -#endif /* MACH_LDEBUG */ - - test $(M_PROMOTED_MSK | M_WAITERS_MSK), %ecx - jz 3f - - LMTX_CALLEXT2(lck_mtx_unlock_wakeup_x86, %rcx) -3: - PREEMPTION_ENABLE - - cmp %rdx, %rdi - jne 4f - - leave -#if CONFIG_DTRACE - /* Dtrace: LS_LCK_MTX_UNLOCK_RELEASE */ - LOCKSTAT_LABEL(_lck_mtx_unlock_lockstat_patch_point) - ret - /* inherit lock pointer in %rdx from above */ - LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, %rdx) -#endif - ret -4: - leave -#if CONFIG_DTRACE - /* Dtrace: LS_LCK_MTX_EXT_UNLOCK_RELEASE */ - LOCKSTAT_LABEL(_lck_mtx_ext_unlock_lockstat_patch_point) - ret - /* inherit lock pointer in %rdx from above */ - LOCKSTAT_RECORD(LS_LCK_MTX_EXT_UNLOCK_RELEASE, %rdx) -#endif - ret - - -Llmu_busy_disabled: - PREEMPTION_ENABLE -Llmu_busy: - PAUSE - mov M_STATE(%rdx), %ecx - jmp Llmu_mutex - -Llmu_ext: - mov M_PTR(%rdx), %rdx - mov M_OWNER(%rdx), %rax - mov %gs:CPU_ACTIVE_THREAD, %rcx - CHECK_UNLOCK(%rcx, 
%rax) - mov M_STATE(%rdx), %ecx - jmp Llmu_chktype - - - -LEAF_ENTRY(lck_mtx_ilk_try_lock) - mov %rdi, %rdx /* fetch lock pointer - no indirection here */ - - mov M_STATE(%rdx), %ecx - - test $(M_ILOCKED_MSK), %ecx /* can't have the interlock yet */ - jnz 3f - - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK), %ecx - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne 2f /* return failure after re-enabling preemption */ - - mov $1, %rax /* return success with preemption disabled */ - LEAF_RET -2: - PREEMPTION_ENABLE /* need to re-enable preemption */ -3: - xor %rax, %rax /* return failure */ - LEAF_RET - - -LEAF_ENTRY(lck_mtx_ilk_unlock) - mov %rdi, %rdx /* fetch lock pointer - no indirection here */ - - andl $(~M_ILOCKED_MSK), M_STATE(%rdx) - - PREEMPTION_ENABLE /* need to re-enable preemption */ - - LEAF_RET - - -LEAF_ENTRY(lck_mtx_lock_grab_mutex) - mov %rdi, %rdx /* fetch lock pointer - no indirection here */ - - mov M_STATE(%rdx), %ecx - - test $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx /* can't have the mutex yet */ - jnz 3f - - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx - - PREEMPTION_DISABLE - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne 2f /* branch on failure to spin loop */ - - mov %gs:CPU_ACTIVE_THREAD, %rax - mov %rax, M_OWNER(%rdx) /* record owner of mutex */ -#if MACH_LDEBUG - test %rax, %rax - jz 1f - incl TH_MUTEX_COUNT(%rax) /* lock statistic */ -1: -#endif /* MACH_LDEBUG */ - - mov $1, %rax /* return success */ - LEAF_RET -2: - PREEMPTION_ENABLE -3: - xor %rax, %rax /* return failure */ - LEAF_RET - - - -LEAF_ENTRY(lck_mtx_lock_mark_destroyed) - mov %rdi, %rdx -1: - mov M_STATE(%rdx), %ecx - cmp $(MUTEX_IND), %ecx /* Is this an indirect mutex? 
*/ - jne 2f - - movl $(MUTEX_DESTROYED), M_STATE(%rdx) /* convert to destroyed state */ - jmp 3f -2: - test $(M_ILOCKED_MSK), %rcx /* have to wait for interlock to clear */ - jnz 5f - - PREEMPTION_DISABLE - mov %rcx, %rax /* eax contains snapshot for cmpxchgl */ - or $(M_ILOCKED_MSK), %ecx - lock - cmpxchg %ecx, M_STATE(%rdx) /* atomic compare and exchange */ - jne 4f /* branch on failure to spin loop */ - movl $(MUTEX_DESTROYED), M_STATE(%rdx) /* convert to destroyed state */ - PREEMPTION_ENABLE -3: - LEAF_RET /* return with M_ILOCKED set */ -4: - PREEMPTION_ENABLE -5: - PAUSE - jmp 1b LEAF_ENTRY(preemption_underflow_panic) FRAME @@ -1093,4 +160,3 @@ LEAF_ENTRY(preemption_underflow_panic) 16: String "Preemption level underflow, possible cause unlocking an unlocked mutex or spinlock" .text - diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index e623d6004..9b74e8d44 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -113,6 +113,9 @@ vm_offset_t vm_prelink_einfo; vm_offset_t vm_slinkedit; vm_offset_t vm_elinkedit; +vm_offset_t vm_kernel_builtinkmod_text; +vm_offset_t vm_kernel_builtinkmod_text_end; + #define MAXLORESERVE (32 * 1024 * 1024) ppnum_t max_ppnum = 0; diff --git a/osfmk/i386/locks.h b/osfmk/i386/locks.h index 3d337a1c8..9bdd394cf 100644 --- a/osfmk/i386/locks.h +++ b/osfmk/i386/locks.h @@ -31,6 +31,7 @@ #include #include +#include #ifdef MACH_KERNEL_PRIVATE @@ -100,23 +101,39 @@ typedef struct _lck_mtx_ { }; } lck_mtx_t; +#define LCK_MTX_WAITERS_MSK 0x0000ffff +#define LCK_MTX_WAITER 0x00000001 +#define LCK_MTX_PRIORITY_MSK 0x00ff0000 +#define LCK_MTX_ILOCKED_MSK 0x01000000 +#define LCK_MTX_MLOCKED_MSK 0x02000000 +#define LCK_MTX_PROMOTED_MSK 0x04000000 +#define LCK_MTX_SPIN_MSK 0x08000000 + /* This pattern must subsume the interlocked, mlocked and spin bits */ #define LCK_MTX_TAG_INDIRECT 0x07ff1007 /* lock marked as Indirect */ #define LCK_MTX_TAG_DESTROYED 0x07fe2007 /* lock marked as Destroyed */ /* Adaptive 
spin before blocking */ extern uint64_t MutexSpin; -extern int lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex); -extern void lck_mtx_lock_wait_x86(lck_mtx_t *mutex); -extern void lck_mtx_lock_acquire_x86(lck_mtx_t *mutex); -extern void lck_mtx_unlock_wakeup_x86(lck_mtx_t *mutex, int prior_lock_state); - -extern void lck_mtx_lock_mark_destroyed(lck_mtx_t *mutex); -extern int lck_mtx_lock_grab_mutex(lck_mtx_t *mutex); -extern void hw_lock_byte_init(volatile uint8_t *lock_byte); -extern void hw_lock_byte_lock(volatile uint8_t *lock_byte); -extern void hw_lock_byte_unlock(volatile uint8_t *lock_byte); +typedef enum lck_mtx_spinwait_ret_type { + LCK_MTX_SPINWAIT_ACQUIRED = 0, + LCK_MTX_SPINWAIT_SPUN = 1, + LCK_MTX_SPINWAIT_NO_SPIN = 2, +} lck_mtx_spinwait_ret_type_t; + +extern lck_mtx_spinwait_ret_type_t lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex); +extern void lck_mtx_lock_wait_x86(lck_mtx_t *mutex); +extern void lck_mtx_lock_acquire_x86(lck_mtx_t *mutex); + +extern void lck_mtx_lock_slow(lck_mtx_t *lock); +extern boolean_t lck_mtx_try_lock_slow(lck_mtx_t *lock); +extern void lck_mtx_unlock_slow(lck_mtx_t *lock); +extern void lck_mtx_lock_spin_slow(lck_mtx_t *lock); +extern boolean_t lck_mtx_try_lock_spin_slow(lck_mtx_t *lock); +extern void hw_lock_byte_init(volatile uint8_t *lock_byte); +extern void hw_lock_byte_lock(volatile uint8_t *lock_byte); +extern void hw_lock_byte_unlock(volatile uint8_t *lock_byte); typedef struct { unsigned int type; @@ -176,7 +193,6 @@ typedef struct __lck_mtx_ext_t__ lck_mtx_ext_t; #endif #ifdef MACH_KERNEL_PRIVATE -#pragma pack(1) /* Make sure the structure stays as we defined it */ typedef union _lck_rw_t_internal_ { struct { volatile uint16_t lck_rw_shared_count; /* No. 
of accepted readers */ @@ -199,7 +215,9 @@ typedef union _lck_rw_t_internal_ { uint32_t lck_rw_pad12; }; } lck_rw_t; -#pragma pack() +#define LCK_RW_T_SIZE 16 + +static_assert(sizeof(lck_rw_t) == LCK_RW_T_SIZE); #define LCK_RW_SHARED_SHIFT 0 #define LCK_RW_INTERLOCK_BIT 16 @@ -244,6 +262,7 @@ typedef union _lck_rw_t_internal_ { #if LOCK_PRIVATE #define disable_preemption_for_thread(t) ((cpu_data_t GS_RELATIVE *)0UL)->cpu_preemption_level++ +#define preemption_disabled_for_thread(t) (((cpu_data_t GS_RELATIVE *)0UL)->cpu_preemption_level > 0) #define LCK_MTX_THREAD_TO_STATE(t) ((uintptr_t)t) #define PLATFORM_LCK_ILOCK 0 @@ -274,5 +293,4 @@ typedef struct __lck_rw_t__ lck_rw_t; extern void kernel_preempt_check (void); #endif /* MACH_KERNEL_PRIVATE */ - #endif /* _I386_LOCKS_H_ */ diff --git a/osfmk/i386/locks_i386.c b/osfmk/i386/locks_i386.c index 039584749..bc1669f7f 100644 --- a/osfmk/i386/locks_i386.c +++ b/osfmk/i386/locks_i386.c @@ -61,6 +61,9 @@ * Locking primitives implementation */ +#define ATOMIC_PRIVATE 1 +#define LOCK_PRIVATE 1 + #include #include @@ -79,9 +82,9 @@ #include #include #include - +#include #include -#include +#include /* * We need only enough declarations from the BSD-side to be able to @@ -160,14 +163,6 @@ typedef void *pc_t; #endif /* lint */ #endif /* USLOCK_DEBUG */ -// Enforce program order of loads and stores. -#define ordered_load(target) _Generic( (target),\ - uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \ - uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) ) -#define ordered_store(target, value) _Generic( (target),\ - uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_relaxed), \ - uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_relaxed) ) - /* * atomic exchange API is a low level abstraction of the operations * to atomically read, modify, and write a pointer. 
This abstraction works @@ -235,7 +230,6 @@ int usld_lock_common_checks(usimple_lock_t, char *); #define USLDBG(stmt) #endif /* USLOCK_DEBUG */ - /* * Forward definitions */ @@ -250,6 +244,14 @@ void lck_rw_clear_promotions_x86(thread_t thread); static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock); static boolean_t lck_rw_grab_want(lck_rw_t *lock); static boolean_t lck_rw_grab_shared(lck_rw_t *lock); +static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect); +static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state); +static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state); +static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state); +static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state); +static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state); +static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state); + /* * Routine: lck_spin_alloc_init @@ -1030,7 +1032,7 @@ static void lck_rw_lock_exclusive_gen( lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); uint64_t deadline = 0; int slept = 0; int gotlock = 0; @@ -1096,7 +1098,8 @@ lck_rw_lock_exclusive_gen( lck->lck_w_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1175,7 +1178,8 @@ lck_rw_lock_exclusive_gen( lck->lck_w_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite); - res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); + res = 
assert_wait(RW_LOCK_WRITER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1325,7 +1329,7 @@ lck_rw_done_gen( #endif if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags checked without lock, but will be rechecked while clearing */ - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, unslide_for_kdebug(lck)); } #if CONFIG_DTRACE @@ -1440,7 +1444,7 @@ static void lck_rw_lock_shared_gen( lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); uint64_t deadline = 0; int gotlock = 0; int slept = 0; @@ -1506,7 +1510,8 @@ lck_rw_lock_shared_gen( lck->lck_r_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead); - res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT); + res = assert_wait(RW_LOCK_READER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1634,7 +1639,7 @@ lck_rw_lock_shared_to_exclusive_failure( if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags checked without lock, but will be rechecked while clearing */ - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, unslide_for_kdebug(lck)); } KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE, @@ -1656,7 +1661,7 @@ static boolean_t lck_rw_lock_shared_to_exclusive_success( lck_rw_t *lck) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); uint64_t deadline = 0; int slept = 0; int still_shared = 0; @@ -1720,7 +1725,8 @@ lck_rw_lock_shared_to_exclusive_success( lck->lck_w_waiting = TRUE; thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade); - res = 
assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT); + res = assert_wait(RW_LOCK_WRITER_EVENT(lck), + THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_interlock_unlock(lck, istate); if (res == THREAD_WAITING) { @@ -1796,7 +1802,7 @@ lck_rw_lock_exclusive_to_shared_gen( lck_rw_t *lck, uint32_t prior_lock_state) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); lck_rw_t *fake_lck; fake_lck = (lck_rw_t *)&prior_lock_state; @@ -1956,7 +1962,7 @@ lck_rw_clear_promotions_x86(thread_t thread) #else /* Paper over the issue */ thread->rwlock_count = 0; - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, 0); #endif } @@ -1987,10 +1993,77 @@ kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) { return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE; } +/* + * Slow path routines for lck_mtx locking and unlocking functions. + * + * These functions were previously implemented in x86 assembly, + * and some optimizations are in place in this C code to obtain compiled code + * as performant and compact as the assembly version. + * + * To avoid inlining these functions on the fast path, all functions directly called by + * the fast paths have the __attribute__((noinline)) specified. Also they are all implemented + * in such a way that the fast path can tail call into them. In this way the return address + * does not need to be pushed on the caller stack and stack optimization can happen on the caller. + * + * Slow path code is structured in such a way that there are no calls to functions that will return + * in the context of the caller function, i.e. all functions called are either tail call functions + * or inline functions. The number of arguments of the tail call functions is less than six, + * so that they can be passed over registers and do not need to be pushed on stack. 
+ * This allows the compiler to not create a stack frame for the functions. + * + * __improbable and __probable are used to compile the slow path code in such a way that + * the fast path case will be on a sequence of instructions with as few jumps as possible, + * to make this case the most optimized even if falling through the slow path. + */ + +/* + * Intel lock invariants: + * + * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue + * lck_mtx_pri: contains the max priority of all waiters during a contention period + * not cleared on last unlock, but stomped over on next first contention + * lck_mtx_promoted: set when the current lock owner has been promoted + * cleared when lock owner unlocks, set on acquire or wait. + * + * The lock owner is promoted to the max priority of all its waiters only if it + * was a lower priority when it acquired or was an owner when a waiter waited. + * Max priority is capped at MAXPRI_PROMOTE. + * + * The last waiter will not be promoted as it is woken up, but the last + * lock owner may not have been the last thread to have been woken up depending on the + * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup + * flag set. + * + * TODO: Figure out an algorithm for stopping a lock holder which is already at the right + * priority from dropping priority in the future without having to take thread lock + * on acquire. + */ #ifdef MUTEX_ZONE extern zone_t lck_mtx_zone; #endif + +/* + * N.B.: On x86, statistics are currently recorded for all indirect mutexes. + * Also, only the acquire attempt count (GRP_MTX_STAT_UTIL) is maintained + * as a 64-bit quantity (the new x86 specific statistics are also maintained + * as 32-bit quantities). 
+ * + * + * Enable this preprocessor define to record the first miss alone + * By default, we count every miss, hence multiple misses may be + * recorded for a single lock acquire attempt via lck_mtx_lock + */ +#undef LOG_FIRST_MISS_ALONE + +/* + * This preprocessor define controls whether the R-M-W update of the + * per-group statistics elements are atomic (LOCK-prefixed) + * Enabled by default. + */ +#define ATOMIC_STAT_UPDATES 1 + + /* * Routine: lck_mtx_alloc_init */ @@ -2114,6 +2187,27 @@ lck_mtx_init_ext( lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX); } +static void +lck_mtx_lock_mark_destroyed( + lck_mtx_t *mutex, + boolean_t indirect) +{ + uint32_t state; + + if (indirect) { + /* convert to destroyed state */ + ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED); + return; + } + + state = ordered_load_mtx_state(mutex); + lck_mtx_interlock_lock(mutex, &state); + + ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED); + + enable_preemption(); +} + /* * Routine: lck_mtx_destroy */ @@ -2122,18 +2216,18 @@ lck_mtx_destroy( lck_mtx_t *lck, lck_grp_t *grp) { - boolean_t lck_is_indirect; + boolean_t indirect; if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) return; #if MACH_LDEBUG lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED); #endif - lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); + indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT); - lck_mtx_lock_mark_destroyed(lck); + lck_mtx_lock_mark_destroyed(lck, indirect); - if (lck_is_indirect) + if (indirect) kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t)); lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX); lck_grp_deallocate(grp); @@ -2141,29 +2235,133 @@ lck_mtx_destroy( } +#if DEVELOPMENT | DEBUG +__attribute__((noinline)) +void +lck_mtx_owner_check_panic( + lck_mtx_t *lock) +{ + thread_t owner = (thread_t)lock->lck_mtx_owner; + panic("Mutex unlock attempted from non-owner thread. 
Owner=%p lock=%p", owner, lock); +} +#endif + +__attribute__((always_inline)) +static boolean_t +get_indirect_mutex( + lck_mtx_t **lock, + uint32_t *state) +{ + *lock = &((*lock)->lck_mtx_ptr->lck_mtx); + *state = ordered_load_mtx_state(*lock); + return TRUE; +} + +/* + * Routine: lck_mtx_unlock_slow + * + * Unlocks a mutex held by current thread. + * + * It will wake up waiters if necessary and + * drop promotions. + * + * Interlock can be held. + */ +__attribute__((noinline)) +void +lck_mtx_unlock_slow( + lck_mtx_t *lock) +{ + thread_t thread; + uint32_t state, prev; + boolean_t indirect = FALSE; + + state = ordered_load_mtx_state(lock); + + /* Is this an indirect mutex? */ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + } + + thread = current_thread(); + +#if DEVELOPMENT | DEBUG + thread_t owner = (thread_t)lock->lck_mtx_owner; + if(__improbable(owner != thread)) + return lck_mtx_owner_check_panic(lock); +#endif + + /* check if it is held as a spinlock */ + if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) + goto unlock; + + lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state); + +unlock: + /* preemption disabled, interlock held and mutex not held */ + + /* clear owner */ + ordered_store_mtx_owner(lock, 0); + /* keep original state in prev for later evaluation */ + prev = state; + /* release interlock, promotion and clear spin flag */ + state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK)); + if ((state & LCK_MTX_WAITERS_MSK)) + state -= LCK_MTX_WAITER; /* decrement waiter count */ + ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */ + +#if MACH_LDEBUG + /* perform lock statistics after drop to prevent delay */ + if (thread) + thread->mutex_count--; /* lock statistic */ +#endif /* MACH_LDEBUG */ + + /* check if there are waiters to wake up or priority to drop */ + if ((prev & (LCK_MTX_PROMOTED_MSK | 
LCK_MTX_WAITERS_MSK))) + return lck_mtx_unlock_wakeup_tail(lock, prev, indirect); + + /* re-enable preemption */ + lck_mtx_unlock_finish_inline(lock, FALSE); + + return; +} + #define LCK_MTX_LCK_WAIT_CODE 0x20 #define LCK_MTX_LCK_WAKEUP_CODE 0x21 #define LCK_MTX_LCK_SPIN_CODE 0x22 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23 #define LCK_MTX_LCK_DEMOTE_CODE 0x24 - /* - * Routine: lck_mtx_unlock_wakeup_x86 + * Routine: lck_mtx_unlock_wakeup_tail * - * Invoked on unlock when there is - * contention (i.e. the assembly routine sees that - * that mutex->lck_mtx_waiters != 0 or - * that mutex->lck_mtx_promoted != 0... + * Invoked on unlock when there is + * contention, i.e. the assembly routine sees + * that mutex->lck_mtx_waiters != 0 or + * that mutex->lck_mtx_promoted != 0 * * neither the mutex or interlock is held + * + * Note that this routine might not be called if there are pending + * waiters which have previously been woken up, and they didn't + * end up boosting the old owner. + * + * assembly routine previously did the following to mutex: + * (after saving the state in prior_lock_state) + * cleared lck_mtx_promoted + * decremented lck_mtx_waiters if nonzero + * + * This function needs to be called as a tail call + * to optimize the compiled code. 
*/ -void -lck_mtx_unlock_wakeup_x86 ( +__attribute__((noinline)) +static void +lck_mtx_unlock_wakeup_tail ( lck_mtx_t *mutex, - int prior_lock_state) + int prior_lock_state, + boolean_t indirect) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); lck_mtx_t fake_lck; /* @@ -2175,56 +2373,50 @@ lck_mtx_unlock_wakeup_x86 ( fake_lck.lck_mtx_state = prior_lock_state; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START, - trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); + trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0); if (__probable(fake_lck.lck_mtx_waiters)) { + kern_return_t did_wake; + if (fake_lck.lck_mtx_waiters > 1) - thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri); + did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri); else - thread_wakeup_one(LCK_MTX_EVENT(mutex)); + did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex)); + /* + * The waiters count always precisely matches the number of threads on the waitqueue. + * i.e. we should never see did_wake == KERN_NOT_WAITING. 
+ */ + assert(did_wake == KERN_SUCCESS); } + /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */ if (__improbable(fake_lck.lck_mtx_promoted)) { - thread_t thread = current_thread(); + thread_t thread = current_thread(); + spl_t s = splsched(); + thread_lock(thread); KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE, - thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0); + thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0); + assert(thread->was_promoted_on_wakeup == 0); + assert(thread->promotions > 0); - if (thread->promotions > 0) { - spl_t s = splsched(); + assert_promotions_invariant(thread); - thread_lock(thread); + if (--thread->promotions == 0) + sched_thread_unpromote(thread, trace_lck); - if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) { + assert_promotions_invariant(thread); - thread->sched_flags &= ~TH_SFLAG_PROMOTED; - - if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { - /* Thread still has a RW lock promotion */ - } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0); - - set_sched_pri(thread, DEPRESSPRI); - } - else { - if (thread->base_pri < thread->sched_pri) { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, thread->base_pri, 0, trace_lck, 0); - - thread_recompute_sched_pri(thread, FALSE); - } - } - } - thread_unlock(thread); - splx(s); - } + thread_unlock(thread); + splx(s); } + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END, - trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); -} + trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); + lck_mtx_unlock_finish_inline(mutex, indirect); +} /* * Routine: lck_mtx_lock_acquire_x86 @@ -2236,14 +2428,13 @@ lck_mtx_unlock_wakeup_x86 ( 
* * mutex is owned... interlock is held... preemption is disabled */ -void -lck_mtx_lock_acquire_x86( +__attribute__((always_inline)) +static void +lck_mtx_lock_acquire_inline( lck_mtx_t *mutex) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); - thread_t thread; + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); integer_t priority; - spl_t s; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START, trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); @@ -2251,31 +2442,49 @@ lck_mtx_lock_acquire_x86( if (mutex->lck_mtx_waiters) priority = mutex->lck_mtx_pri; else - priority = 0; + priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */ - thread = (thread_t)mutex->lck_mtx_owner; /* faster then current_thread() */ + /* the priority must have been set correctly by wait */ + assert(priority <= MAXPRI_PROMOTE); + assert(priority == 0 || priority >= BASEPRI_DEFAULT); - if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) { + /* if the mutex wasn't owned, then the owner wasn't promoted */ + assert(mutex->lck_mtx_promoted == 0); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, - thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0); + thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */ - s = splsched(); + if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) { + spl_t s = splsched(); thread_lock(thread); - if (thread->sched_pri < priority) { - /* Do not promote past promotion ceiling */ - assert(priority <= MAXPRI_PROMOTE); - set_sched_pri(thread, priority); - } - if (mutex->lck_mtx_promoted == 0) { + if (thread->was_promoted_on_wakeup) + assert(thread->promotions > 0); + + /* Intel only promotes if priority goes up */ + if (thread->sched_pri < priority && thread->promotion_priority < priority) { + /* Remember that I 
need to drop this promotion on unlock */ mutex->lck_mtx_promoted = 1; - - thread->promotions++; - thread->sched_flags |= TH_SFLAG_PROMOTED; + + if (thread->promotions++ == 0) { + /* This is the first promotion for the owner */ + sched_thread_promote_to_pri(thread, priority, trace_lck); + } else { + /* + * Holder was previously promoted due to a different mutex, + * raise to match this one. + * Or, this thread was promoted on wakeup but someone else + * later contended on mutex at higher priority before we got here + */ + sched_thread_update_promotion_to_pri(thread, priority, trace_lck); + } } - thread->was_promoted_on_wakeup = 0; - + + if (thread->was_promoted_on_wakeup) { + thread->was_promoted_on_wakeup = 0; + if (--thread->promotions == 0) + sched_thread_unpromote(thread, trace_lck); + } + thread_unlock(thread); splx(s); } @@ -2283,28 +2492,794 @@ lck_mtx_lock_acquire_x86( trace_lck, 0, mutex->lck_mtx_waiters, 0, 0); } +void +lck_mtx_lock_acquire_x86( + lck_mtx_t *mutex) +{ + return lck_mtx_lock_acquire_inline(mutex); +} + +/* + * Tail call helpers for lock functions that perform + * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize + * the caller's compiled code. 
+ */ -static int -lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate) +__attribute__((noinline)) +static void +lck_mtx_lock_acquire_tail( + lck_mtx_t *mutex, + boolean_t indirect) +{ + lck_mtx_lock_acquire_inline(mutex); + lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect); +} + +__attribute__((noinline)) +static boolean_t +lck_mtx_try_lock_acquire_tail( + lck_mtx_t *mutex) +{ + lck_mtx_lock_acquire_inline(mutex); + lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex)); + + return TRUE; +} + +__attribute__((noinline)) +static void +lck_mtx_convert_spin_acquire_tail( + lck_mtx_t *mutex) +{ + lck_mtx_lock_acquire_inline(mutex); + lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex)); +} + +boolean_t +lck_mtx_ilk_unlock( + lck_mtx_t *mutex) +{ + lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex)); + return TRUE; +} + +static inline void +lck_mtx_interlock_lock_set_and_clear_flags( + lck_mtx_t *mutex, + uint32_t xor_flags, + uint32_t and_flags, + uint32_t *new_state) { - int retval; + uint32_t state, prev; + state = *new_state; + + for ( ; ; ) { + /* have to wait for interlock to clear */ + while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) { + cpu_pause(); + state = ordered_load_mtx_state(mutex); + } + prev = state; /* prev contains snapshot for exchange */ + state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */ + state &= ~and_flags; /* clear flags */ + + disable_preemption(); + if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) + break; + enable_preemption(); + cpu_pause(); + state = ordered_load_mtx_state(mutex); + } + *new_state = state; + return; +} + +static inline void +lck_mtx_interlock_lock_clear_flags( + lck_mtx_t *mutex, + uint32_t and_flags, + uint32_t *new_state) +{ + return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state); +} + +static inline void +lck_mtx_interlock_lock( + 
lck_mtx_t *mutex, + uint32_t *new_state) +{ + return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state); +} + +static inline int +lck_mtx_interlock_try_lock_set_flags( + lck_mtx_t *mutex, + uint32_t or_flags, + uint32_t *new_state) +{ + uint32_t state, prev; + state = *new_state; + + /* have to wait for interlock to clear */ + if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) { + return 0; + } + prev = state; /* prev contains snapshot for exchange */ + state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */ + disable_preemption(); + if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + *new_state = state; + return 1; + } + + enable_preemption(); + return 0; +} + +static inline int +lck_mtx_interlock_try_lock( + lck_mtx_t *mutex, + uint32_t *new_state) +{ + return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state); +} + +static inline int +lck_mtx_interlock_try_lock_disable_interrupts( + lck_mtx_t *mutex, + boolean_t *istate) +{ + uint32_t state; *istate = ml_set_interrupts_enabled(FALSE); - retval = lck_mtx_ilk_try_lock(mutex); + state = ordered_load_mtx_state(mutex); - if (retval == 0) + if (lck_mtx_interlock_try_lock(mutex, &state)) { + return 1; + } else { ml_set_interrupts_enabled(*istate); - - return retval; + return 0; + } } -static void -lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate) -{ +static inline void +lck_mtx_interlock_unlock_enable_interrupts( + lck_mtx_t *mutex, + boolean_t istate) +{ lck_mtx_ilk_unlock(mutex); ml_set_interrupts_enabled(istate); } +static void __inline__ +lck_mtx_inc_stats( + uint64_t* stat) +{ +#if ATOMIC_STAT_UPDATES + os_atomic_inc(stat, relaxed); +#else + (*stat)++; /* plain (non-atomic) read-modify-write of the counter */ +#endif +} + +static void __inline__ +lck_mtx_update_miss( + struct _lck_mtx_ext_ *lock, + int *first_miss) +{ +#if LOG_FIRST_MISS_ALONE + if ((*first_miss & 1) == 0) { +#else +#pragma unused(first_miss) +#endif + uint64_t* stat = &lock->lck_mtx_grp->lck_grp_miss; + 
lck_mtx_inc_stats(stat); + +#if LOG_FIRST_MISS_ALONE + *first_miss |= 1; + } +#endif +} + +static void __inline__ +lck_mtx_update_direct_wait( + struct _lck_mtx_ext_ *lock) +{ + uint64_t* stat = &lock->lck_mtx_grp->lck_grp_direct_wait; + lck_mtx_inc_stats(stat); +} + +static void __inline__ +lck_mtx_update_wait( + struct _lck_mtx_ext_ *lock, + int *first_miss) +{ +#if LOG_FIRST_MISS_ALONE + if ((*first_miss & 2) == 0) { +#else +#pragma unused(first_miss) +#endif + uint64_t* stat = &lock->lck_mtx_grp->lck_grp_wait; + lck_mtx_inc_stats(stat); + +#if LOG_FIRST_MISS_ALONE + *first_miss |= 2; + } +#endif +} + +static void __inline__ +lck_mtx_update_util( + struct _lck_mtx_ext_ *lock) +{ + uint64_t* stat = &lock->lck_mtx_grp->lck_grp_util; + lck_mtx_inc_stats(stat); +} + +__attribute__((noinline)) +static void +lck_mtx_lock_contended( + lck_mtx_t *lock, + boolean_t indirect, + boolean_t *first_miss) +{ + lck_mtx_spinwait_ret_type_t ret; + uint32_t state; + thread_t thread; + +try_again: + + if (indirect) { + lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss); + } + + ret = lck_mtx_lock_spinwait_x86(lock); + state = ordered_load_mtx_state(lock); + switch (ret) { + case LCK_MTX_SPINWAIT_NO_SPIN: + /* + * owner not on core, lck_mtx_lock_spinwait_x86 didn't even + * try to spin. + */ + if (indirect) { + lck_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock); + } + + /* just fall through case LCK_MTX_SPINWAIT_SPUN */ + case LCK_MTX_SPINWAIT_SPUN: + /* + * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin + * interlock not held + */ + lck_mtx_interlock_lock(lock, &state); + assert(state & LCK_MTX_ILOCKED_MSK); + + if (state & LCK_MTX_MLOCKED_MSK) { + if (indirect) { + lck_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss); + } + lck_mtx_lock_wait_x86(lock); + /* + * interlock is not held here. 
+ */ + goto try_again; + } else { + + /* grab the mutex */ + state |= LCK_MTX_MLOCKED_MSK; + ordered_store_mtx_state_release(lock, state); + thread = current_thread(); + ordered_store_mtx_owner(lock, (uintptr_t)thread); +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; + } +#endif /* MACH_LDEBUG */ + } + + break; + case LCK_MTX_SPINWAIT_ACQUIRED: + /* + * mutex has been acquired by lck_mtx_lock_spinwait_x86 + * interlock is held and preemption disabled + * owner is set and mutex marked as locked + * statistics updated too + */ + break; + default: + panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock); + } + + /* + * interlock is already acquired here + */ + + /* mutex has been acquired */ + thread = (thread_t)lock->lck_mtx_owner; + if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) { + return lck_mtx_lock_acquire_tail(lock, indirect); + } + + /* release the interlock */ + lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect); +} + +/* + * Helper noinline functions for calling + * panic to optimize compiled code. 
+ */ + +__attribute__((noinline)) +static void +lck_mtx_destroyed( + lck_mtx_t *lock) +{ + panic("trying to interlock destroyed mutex (%p)", lock); +} + +__attribute__((noinline)) +static boolean_t +lck_mtx_try_destroyed( + lck_mtx_t *lock) +{ + panic("trying to interlock destroyed mutex (%p)", lock); + return FALSE; +} + +__attribute__((always_inline)) +static boolean_t +lck_mtx_lock_wait_interlock_to_clear( + lck_mtx_t *lock, + uint32_t* new_state) +{ + uint32_t state; + + for ( ; ; ) { + cpu_pause(); + state = ordered_load_mtx_state(lock); + if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) { + *new_state = state; + return TRUE; + } + if (state & LCK_MTX_MLOCKED_MSK) { + /* if it is held as mutex, just fail */ + return FALSE; + } + } +} + +__attribute__((always_inline)) +static boolean_t +lck_mtx_try_lock_wait_interlock_to_clear( + lck_mtx_t *lock, + uint32_t* new_state) +{ + uint32_t state; + + for ( ; ; ) { + cpu_pause(); + state = ordered_load_mtx_state(lock); + if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) { + /* if it is held as mutex or spin, just fail */ + return FALSE; + } + if (!(state & LCK_MTX_ILOCKED_MSK)) { + *new_state = state; + return TRUE; + } + } +} + +/* + * Routine: lck_mtx_lock_slow + * + * Locks a mutex for current thread. + * If the lock is contended this function might + * sleep. + * + * Called with interlock not held. 
+ */ +__attribute__((noinline)) +void +lck_mtx_lock_slow( + lck_mtx_t *lock) +{ + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + /* no, must have been the mutex */ + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + + /* check to see if it is marked destroyed */ + if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { + return lck_mtx_destroyed(lock); + } + + /* Is this an indirect mutex? */ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_mtx_update_util((struct _lck_mtx_ext_*)lock); + + if (state & LCK_MTX_SPIN_MSK) { + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + assert(state & LCK_MTX_ILOCKED_MSK); + lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } + } + + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + } + + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) { + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + } + + /* lock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + /* + * Check if there are waiters to + * inherit their priority. 
+ */ + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { + return lck_mtx_lock_acquire_tail(lock, indirect); + } + + /* release the interlock */ + lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect); + + return; +} + +__attribute__((noinline)) +boolean_t +lck_mtx_try_lock_slow( + lck_mtx_t *lock) +{ + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + return FALSE; + } + + /* check to see if it is marked destroyed */ + if (__improbable(state == LCK_MTX_TAG_DESTROYED)) { + return lck_mtx_try_destroyed(lock); + } + + /* Is this an indirect mutex? 
*/ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_mtx_update_util((struct _lck_mtx_ext_*)lock); + } + + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) { + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* lock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + /* + * Check if there are waiters to + * inherit their priority. + */ + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { + return lck_mtx_try_lock_acquire_tail(lock); + } + + /* release the interlock */ + lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock)); + + return TRUE; + +} + +__attribute__((noinline)) +void +lck_mtx_lock_spin_slow( + lck_mtx_t *lock) +{ + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + /* no, must have been the mutex */ + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + + /* check to see if it is marked destroyed */ + if (__improbable(state == 
LCK_MTX_TAG_DESTROYED)) { + return lck_mtx_destroyed(lock); + } + + /* Is this an indirect mutex? */ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_mtx_update_util((struct _lck_mtx_ext_*)lock); + + if (state & LCK_MTX_SPIN_MSK) { + /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */ + assert(state & LCK_MTX_ILOCKED_MSK); + lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + } + } + + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + } + + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) { + if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) { + return lck_mtx_lock_contended(lock, indirect, &first_miss); + } + } + + /* lock as spinlock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0); +#endif + /* return with the interlock held and preemption disabled */ + return; +} + +__attribute__((noinline)) +boolean_t +lck_mtx_try_lock_spin_slow( + lck_mtx_t *lock) +{ + boolean_t indirect = FALSE; + uint32_t state; + int first_miss = 0; + + state = ordered_load_mtx_state(lock); + + /* is the interlock or mutex held */ + if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) { + /* + * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT + * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK + * set in state (state == lck_mtx_tag) + */ + + /* is the mutex already held and not indirect */ + if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){ + return FALSE; + } + + /* check to see if it is marked destroyed */ + if 
(__improbable(state == LCK_MTX_TAG_DESTROYED)) { + return lck_mtx_try_destroyed(lock); + } + + /* Is this an indirect mutex? */ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + indirect = get_indirect_mutex(&lock, &state); + + first_miss = 0; + lck_mtx_update_util((struct _lck_mtx_ext_*)lock); + } + + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* no - can't be INDIRECT, DESTROYED or locked */ + while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) { + if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) { + if (indirect) + lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss); + return FALSE; + } + } + + /* lock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0); +#endif + return TRUE; + +} + +__attribute__((noinline)) +void +lck_mtx_convert_spin( + lck_mtx_t *lock) +{ + uint32_t state; + + state = ordered_load_mtx_state(lock); + + /* Is this an indirect mutex? */ + if (__improbable(state == LCK_MTX_TAG_INDIRECT)) { + /* If so, take indirection */ + get_indirect_mutex(&lock, &state); + } + + assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner ); + + if (__improbable(state & LCK_MTX_MLOCKED_MSK)) { + /* already owned as a mutex, just return */ + return; + } + + assert(get_preemption_level() > 0); + assert(state & LCK_MTX_ILOCKED_MSK); + assert(state & LCK_MTX_SPIN_MSK); + + /* + * Check if there are waiters to + * inherit their priority. 
+ */ + if (__improbable(state & LCK_MTX_WAITERS_MSK)) { + return lck_mtx_convert_spin_acquire_tail(lock); + } + + lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock)); + + return; +} + +static inline boolean_t +lck_mtx_lock_grab_mutex( + lck_mtx_t *lock) +{ + uint32_t state; + + state = ordered_load_mtx_state(lock); + + if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) { + return FALSE; + } + + /* lock and interlock acquired */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + return TRUE; +} + +__attribute__((noinline)) +void +lck_mtx_assert( + lck_mtx_t *lock, + unsigned int type) +{ + thread_t thread, owner; + uint32_t state; + + thread = current_thread(); + state = ordered_load_mtx_state(lock); + + if (state == LCK_MTX_TAG_INDIRECT) { + get_indirect_mutex(&lock, &state); + } + + owner = (thread_t)lock->lck_mtx_owner; + + if (type == LCK_MTX_ASSERT_OWNED) { + if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) + panic("mutex (%p) not owned\n", lock); + } else { + assert (type == LCK_MTX_ASSERT_NOTOWNED); + if (owner == thread) + panic("mutex (%p) owned\n", lock); + } +} /* * Routine: lck_mtx_lock_spinwait_x86 @@ -2314,20 +3289,21 @@ lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate) * time waiting for the lock to be released. * * Called with the interlock unlocked. 
- * returns 0 if mutex acquired - * returns 1 if we spun - * returns 2 if we didn't spin due to the holder not running + * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired + * returns LCK_MTX_SPINWAIT_SPUN if we spun + * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running */ -int +__attribute__((noinline)) +lck_mtx_spinwait_ret_type_t lck_mtx_lock_spinwait_x86( lck_mtx_t *mutex) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); thread_t holder; uint64_t overall_deadline; uint64_t check_owner_deadline; uint64_t cur_time; - int retval = 1; + lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN; int loopcount = 0; KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START, @@ -2347,7 +3323,7 @@ lck_mtx_lock_spinwait_x86( */ do { if (__probable(lck_mtx_lock_grab_mutex(mutex))) { - retval = 0; + retval = LCK_MTX_SPINWAIT_ACQUIRED; break; } cur_time = mach_absolute_time(); @@ -2358,21 +3334,33 @@ lck_mtx_lock_spinwait_x86( if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) { boolean_t istate; - if (lck_mtx_interlock_try_lock(mutex, &istate)) { + /* + * We will repeatedly peek at the state of the lock while spinning, + * and we will acquire the interlock to do so. + * The thread that will unlock the mutex will also need to acquire + * the interlock, and we want to avoid to slow it down. + * To avoid to get an interrupt while holding the interlock + * and increase the time we are holding it, we + * will try to acquire the interlock with interrupts disabled. + * This is safe because it is a "try_lock", if we can't acquire + * the interlock we re-enable the interrupts and fail, so it is + * ok to call it even if the interlock was already held. 
+ */ + if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) { if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) { if ( !(holder->machine.specFlags & OnProc) || (holder->state & TH_IDLE)) { - lck_mtx_interlock_unlock(mutex, istate); + lck_mtx_interlock_unlock_enable_interrupts(mutex, istate); if (loopcount == 0) - retval = 2; + retval = LCK_MTX_SPINWAIT_NO_SPIN; break; } } - lck_mtx_interlock_unlock(mutex, istate); + lck_mtx_interlock_unlock_enable_interrupts(mutex, istate); check_owner_deadline = cur_time + (MutexSpin / 4); } @@ -2418,79 +3406,119 @@ lck_mtx_lock_spinwait_x86( * Invoked in order to wait on contention. * * Called with the interlock locked and - * preemption disabled... + * preemption disabled... * returns it unlocked and with preemption enabled + * + * lck_mtx_waiters is 1:1 with a wakeup needing to occur. + * A runnable waiter can exist between wait and acquire + * without a waiters count being set. + * This allows us to never make a spurious wakeup call. + * + * Priority: + * This avoids taking the thread lock if the owning thread is the same priority. + * This optimizes the case of same-priority threads contending on a lock. + * However, that allows the owning thread to drop in priority while holding the lock, + * because there is no state that the priority change can notice that + * says that the targeted thread holds a contended mutex. + * + * One possible solution: priority changes could look for some atomic tag + * on the thread saying 'holding contended lock', and then set up a promotion. + * Needs a story for dropping that promotion - the last contended unlock + * has to notice that this has happened. 
*/ +__attribute__((noinline)) void lck_mtx_lock_wait_x86 ( lck_mtx_t *mutex) { - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex); - thread_t self = current_thread(); - thread_t holder; - integer_t priority; - spl_t s; #if CONFIG_DTRACE - uint64_t sleep_start = 0; + uint64_t sleep_start = 0; if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) { sleep_start = mach_absolute_time(); } #endif + thread_t self = current_thread(); + assert(self->waiting_for_mutex == NULL); + + self->waiting_for_mutex = mutex; + + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); - priority = self->sched_pri; + integer_t waiter_pri = self->sched_pri; + waiter_pri = MAX(waiter_pri, self->base_pri); + waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT); + waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE); - if (priority < self->base_pri) - priority = self->base_pri; - if (priority < BASEPRI_DEFAULT) - priority = BASEPRI_DEFAULT; + assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE); - /* Do not promote past promotion ceiling */ - priority = MIN(priority, MAXPRI_PROMOTE); + /* Re-initialize lck_mtx_pri if this is the first contention */ + if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri) + mutex->lck_mtx_pri = waiter_pri; - if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri) - mutex->lck_mtx_pri = priority; - mutex->lck_mtx_waiters++; + thread_t holder = (thread_t)mutex->lck_mtx_owner; - if ( (holder = (thread_t)mutex->lck_mtx_owner) && - holder->sched_pri < mutex->lck_mtx_pri ) { - s = splsched(); + assert(holder != NULL); + + /* + * Intel only causes a promotion when priority needs to change, + * 
reducing thread lock holds but leaving us vulnerable to the holder + * dropping priority. + */ + if (holder->sched_pri < mutex->lck_mtx_pri) { + int promote_pri = mutex->lck_mtx_pri; + + spl_t s = splsched(); thread_lock(holder); - /* holder priority may have been bumped by another thread - * before thread_lock was taken - */ - if (holder->sched_pri < mutex->lck_mtx_pri) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE, - holder->sched_pri, priority, thread_tid(holder), trace_lck, 0); - /* Assert that we're not altering the priority of a - * thread above the MAXPRI_PROMOTE band - */ - assert(holder->sched_pri < MAXPRI_PROMOTE); - set_sched_pri(holder, priority); - + /* Check again in case sched_pri changed */ + if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) { if (mutex->lck_mtx_promoted == 0) { - holder->promotions++; - holder->sched_flags |= TH_SFLAG_PROMOTED; - + /* This is the first promotion for this mutex */ mutex->lck_mtx_promoted = 1; + + if (holder->promotions++ == 0) { + /* This is the first promotion for holder */ + sched_thread_promote_to_pri(holder, promote_pri, trace_lck); + } else { + /* + * Holder was previously promoted due to a different mutex, + * check if it needs to raise to match this one + */ + sched_thread_update_promotion_to_pri(holder, promote_pri, + trace_lck); + } + } else { + /* + * Holder was previously promoted due to this mutex, + * check if the pri needs to go up + */ + sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck); } } + thread_unlock(holder); splx(s); } + + mutex->lck_mtx_waiters++; + thread_set_pending_block_hint(self, kThreadWaitKernelMutex); - assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT); + assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_mtx_ilk_unlock(mutex); thread_block(THREAD_CONTINUE_NULL); + self->waiting_for_mutex = NULL; + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | 
DBG_FUNC_END, - trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); + trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), + mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0); #if CONFIG_DTRACE /* diff --git a/osfmk/i386/locks_i386_inlines.h b/osfmk/i386/locks_i386_inlines.h new file mode 100644 index 000000000..7e4aa5995 --- /dev/null +++ b/osfmk/i386/locks_i386_inlines.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 201 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _I386_LOCKS_I386_INLINES_H_ +#define _I386_LOCKS_I386_INLINES_H_ + +#include +/* + * We need only enough declarations from the BSD-side to be able to + * test if our probe is active, and to call __dtrace_probe(). Setting + * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in. + */ +#if CONFIG_DTRACE +#define NEED_DTRACE_DEFS +#include <../bsd/sys/lockstat.h> +#endif + +// Enforce program order of loads and stores. +#define ordered_load(target) _Generic( (target),\ + uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \ + uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) ) +#define ordered_store_release(target, value) _Generic( (target),\ + uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_release_smp), \ + uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_release_smp) ) +#define ordered_store_volatile(target, value) _Generic( (target),\ + volatile uint32_t* : __c11_atomic_store((_Atomic volatile uint32_t* )(target), (value), memory_order_relaxed), \ + volatile uintptr_t*: __c11_atomic_store((_Atomic volatile uintptr_t*)(target), (value), memory_order_relaxed) ) + +/* Enforce program order of loads and stores. 
*/ +#define ordered_load_mtx_state(lock) ordered_load(&(lock)->lck_mtx_state) +#define ordered_store_mtx_state_release(lock, value) ordered_store_release(&(lock)->lck_mtx_state, (value)) +#define ordered_store_mtx_owner(lock, value) ordered_store_volatile(&(lock)->lck_mtx_owner, (value)) + +#if DEVELOPMENT | DEBUG +void lck_mtx_owner_check_panic(lck_mtx_t *mutex); +#endif + +__attribute__((always_inline)) +static inline void +lck_mtx_ilk_unlock_inline( + lck_mtx_t *mutex, + uint32_t state) +{ + state &= ~LCK_MTX_ILOCKED_MSK; + ordered_store_mtx_state_release(mutex, state); + + enable_preemption(); +} + +__attribute__((always_inline)) +static inline void +lck_mtx_lock_finish_inline( + lck_mtx_t *mutex, + uint32_t state, + boolean_t indirect) +{ + assert(state & LCK_MTX_ILOCKED_MSK); + + /* release the interlock and re-enable preemption */ + lck_mtx_ilk_unlock_inline(mutex, state); + +#if CONFIG_DTRACE + if (indirect) { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, mutex, 0); + } else { + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, mutex, 0); + } +#endif +} + +__attribute__((always_inline)) +static inline void +lck_mtx_try_lock_finish_inline( + lck_mtx_t *mutex, + uint32_t state) +{ + /* release the interlock and re-enable preemption */ + lck_mtx_ilk_unlock_inline(mutex, state); + +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, mutex, 0); +#endif +} + +__attribute__((always_inline)) +static inline void +lck_mtx_convert_spin_finish_inline( + lck_mtx_t *mutex, + uint32_t state) +{ + /* release the interlock and acquire it as mutex */ + state &= ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK); + state |= LCK_MTX_MLOCKED_MSK; + + ordered_store_mtx_state_release(mutex, state); + enable_preemption(); +} + +__attribute__((always_inline)) +static inline void +lck_mtx_unlock_finish_inline( + lck_mtx_t *mutex, + boolean_t indirect) +{ + enable_preemption(); + +#if CONFIG_DTRACE + if (indirect) { + LOCKSTAT_RECORD(LS_LCK_MTX_EXT_UNLOCK_RELEASE, mutex, 0); + } else { + 
LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, mutex, 0); + } +#endif // CONFIG_DTRACE +} + +#endif /* _I386_LOCKS_I386_INLINES_H_ */ + diff --git a/osfmk/i386/locks_i386_opt.c b/osfmk/i386/locks_i386_opt.c new file mode 100644 index 000000000..90dcf06a1 --- /dev/null +++ b/osfmk/i386/locks_i386_opt.c @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#define ATOMIC_PRIVATE 1 +#define LOCK_PRIVATE 1 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* machine_timeout_suspended() */ +#include +#include +#include +#include +#include +#include + +/* + * Fast path routines for lck_mtx locking and unlocking functions. + * Fast paths will try a single compare and swap instruction to acquire/release the lock + * and interlock, and they will fall through to the slow path in case it fails. + * + * These functions were previously implemented in x86 assembly, + * and some optimizations are in place in this C code to obtain compiled code + * as performant and compact as the assembly version. + * + * To avoid inlining these functions and increasing the kernel text size, the fast path + * functions have the __attribute__((noinline)) specified. + * + * The code is structured in such a way that there are no calls to functions that will return + * to the context of the caller function, i.e. all functions called are either tail call functions + * or inline functions. The number of arguments of the tail call functions is less than six, + * so that they can be passed in registers and do not need to be pushed on the stack. + * This allows the compiler to not create a stack frame for the functions. + * + * The file is compiled with -momit-leaf-frame-pointer and -O2. + */ + +#if DEVELOPMENT || DEBUG + +/* + * If one or more simplelocks are currently held by a thread, + * an attempt to acquire a mutex will cause this check to fail + * (since a mutex lock may context switch, holding a simplelock + * is not a good thing). 
+ */ +void __inline__ +lck_mtx_check_preemption(void) +{ /* variant built under the DEVELOPMENT || DEBUG branch of the guard closed below */ + if (get_preemption_level() == 0) /* preemption level is zero: the check passes */ + return; + if (LckDisablePreemptCheck) /* the check is switched off by this flag */ + return; + if (current_cpu_datap()->cpu_hibernate) /* skipped while the current cpu's cpu_hibernate field is non-zero */ + return; + /* none of the exemptions applied: panic, reporting the non-zero preemption level */ + panic("preemption_level(%d) != 0\n", get_preemption_level()); +} + +#else /* DEVELOPMENT || DEBUG */ + +void __inline__ +lck_mtx_check_preemption(void) +{ /* variant for the #else branch of the guard: the check does nothing */ + return; +} + +#endif /* DEVELOPMENT || DEBUG */ + +/* + * Routine: lck_mtx_lock + * + * Locks a mutex for the current thread. + * It first runs lck_mtx_check_preemption(), then tries the fast path + * (a single compare-and-exchange on lck_mtx_state) and falls through + * to the slow path, lck_mtx_lock_slow(), when that exchange fails. + * + * Interlock or mutex cannot be already held by current thread. + * In case of contention it might sleep. + */ +__attribute__((noinline)) +void +lck_mtx_lock( + lck_mtx_t *lock) +{ + uint32_t prev, state; + /* prev: value the compare-exchange expects; state: first the loaded value, then the value to install */ + lck_mtx_check_preemption(); + state = ordered_load_mtx_state(lock); + + /* + * Fast path only if the mutex is not held, + * interlock is not contended and there are no waiters. + * Indirect mutexes will fall through the slow path as + * well as destroyed mutexes. + */ + /* + * prev is the loaded state with the interlock, mutex and waiters + * bits cleared, so the exchange below can only succeed when + * lck_mtx_state has none of those bits set. + * state becomes prev with the interlock and mutex bits set. + */ + prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK | LCK_MTX_WAITERS_MSK); + state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK; + /* preemption is disabled before the exchange; the failure branch re-enables it before the tail call */ + disable_preemption(); + if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + enable_preemption(); + return lck_mtx_lock_slow(lock); + } + + /* mutex acquired, interlock acquired and preemption disabled */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + + /* release interlock and re-enable preemption; state is the value installed by the exchange, FALSE is passed as the indirect argument */ + lck_mtx_lock_finish_inline(lock, state, FALSE); +} + +/* + * Routine: lck_mtx_try_lock + * + * Try to lock a mutex for current thread. + * It tries the fast path first and + * falls through the slow path in case + * of contention. 
+ * + * Interlock or mutex cannot be already held by current thread. + * + * In case the mutex is held (either as spin or mutex) + * the function will fail, it will acquire the mutex otherwise. + */ +__attribute__((noinline)) +boolean_t +lck_mtx_try_lock( + lck_mtx_t *lock) +{ + uint32_t prev, state; + /* prev: value the compare-exchange expects; state: first the loaded value, then the value to install */ + state = ordered_load_mtx_state(lock); + + /* + * Fast path only if the mutex is not held, + * interlock is not contended and there are no waiters. + * Indirect mutexes will fall through the slow path as + * well as destroyed mutexes. + */ + /* + * prev is the loaded state with the interlock, mutex and waiters + * bits cleared, so the exchange below can only succeed when + * lck_mtx_state has none of those bits set. + * state becomes prev with the interlock and mutex bits set. + */ + prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK | LCK_MTX_WAITERS_MSK); + state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK; + /* preemption is disabled before the exchange; the failure branch re-enables it before the tail call */ + disable_preemption(); + if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + enable_preemption(); + /* the result is then whatever lck_mtx_try_lock_slow() returns */ return lck_mtx_try_lock_slow(lock); + } + + /* mutex acquired, interlock acquired and preemption disabled */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + + /* release interlock and re-enable preemption; state is the value installed by the exchange */ + lck_mtx_try_lock_finish_inline(lock, state); + /* the fast path took the mutex */ + return TRUE; +} + +/* + * Routine: lck_mtx_lock_spin_always + * + * Try to lock a mutex as spin lock for current thread. + * It tries the fast path first and + * falls through the slow path in case + * of contention. + * + * Interlock or mutex cannot be already held by current thread. + * + * In case the mutex is held as mutex by another thread + * this function will switch behavior and try to acquire the lock as mutex. + * + * In case the mutex is held as spinlock it will spin contending + * for it. + * + * In case of contention it might sleep. 
+ */ +__attribute__((noinline)) +void +lck_mtx_lock_spin_always( + lck_mtx_t *lock) +{ + uint32_t prev, state; + /* prev: value the compare-exchange expects; state: first the loaded value, then the value to install */ + state = ordered_load_mtx_state(lock); + + /* + * Fast path only if the mutex is not held + * neither as mutex nor as spin and + * interlock is not contended. + * Indirect mutexes will fall through the slow path as + * well as destroyed mutexes. + */ + + /* + * Note LCK_MTX_SPIN_MSK is set only if LCK_MTX_ILOCKED_MSK is set. + * prev clears only the interlock and mutex bits; every other bit, + * the waiters bit included, keeps its loaded value. + * state becomes prev with the interlock and spin bits set. + */ + prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK); + state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK; + /* preemption is disabled before the exchange; the failure branch re-enables it before the tail call */ + disable_preemption(); + if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + enable_preemption(); + return lck_mtx_lock_spin_slow(lock); + } + + /* mutex acquired as spinlock, interlock acquired and preemption disabled */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0); +#endif + /* return with the interlock held and preemption disabled (no finish helper is called on this path) */ + return; +} + +/* + * Routine: lck_mtx_lock_spin + * + * Try to lock a mutex as spin lock for current thread. + * Runs lck_mtx_check_preemption() and then calls + * lck_mtx_lock_spin_always(), which tries the fast path first and + * falls through to the slow path in case of contention. + * + * Interlock or mutex cannot be already held by current thread. + * + * In case the mutex is held as mutex by another thread + * this function will switch behavior and try to acquire the lock as mutex. + * + * In case the mutex is held as spinlock it will spin contending + * for it. + * + * In case of contention it might sleep. + */ +void +lck_mtx_lock_spin( + lck_mtx_t *lock) +{ + lck_mtx_check_preemption(); + lck_mtx_lock_spin_always(lock); +} + +/* + * Routine: lck_mtx_try_lock_spin_always + * + * Try to lock a mutex as spin lock for current thread. 
+ * It tries the fast path first and + * falls through the slow path in case + * of contention. + * + * Interlock or mutex cannot be already held by current thread. + * + * In case the mutex is held (either as spin or mutex) + * the function will fail, it will acquire the mutex as spin lock + * otherwise. + * + */ +__attribute__((noinline)) +boolean_t +lck_mtx_try_lock_spin_always( + lck_mtx_t *lock) +{ + uint32_t prev, state; + /* prev: value the compare-exchange expects; state: first the loaded value, then the value to install */ + state = ordered_load_mtx_state(lock); + + /* + * Fast path only if the mutex is not held + * neither as mutex nor as spin and + * interlock is not contended. + * Indirect mutexes will fall through the slow path as + * well as destroyed mutexes. + */ + + /* + * Note LCK_MTX_SPIN_MSK is set only if LCK_MTX_ILOCKED_MSK is set. + * prev clears only the interlock and mutex bits; every other bit, + * the waiters bit included, keeps its loaded value. + * state becomes prev with the interlock and spin bits set. + */ + prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK); + state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK; + /* preemption is disabled before the exchange; the failure branch re-enables it before the tail call */ + disable_preemption(); + if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + enable_preemption(); + /* the result is then whatever lck_mtx_try_lock_spin_slow() returns */ return lck_mtx_try_lock_spin_slow(lock); + } + + /* mutex acquired as spinlock, interlock acquired and preemption disabled */ + + thread_t thread = current_thread(); + /* record owner of mutex */ + ordered_store_mtx_owner(lock, (uintptr_t)thread); + +#if MACH_LDEBUG + if (thread) { + thread->mutex_count++; /* lock statistic */ + } +#endif + +#if CONFIG_DTRACE + LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0); +#endif + + /* return with the interlock held and preemption disabled (no finish helper is called on this path) */ + return TRUE; +} + +/* + * Routine: lck_mtx_try_lock_spin + * + * Try to lock a mutex as spin lock for current thread. + * It tries the fast path first and + * falls through the slow path in case + * of contention. + * + * Interlock or mutex cannot be already held by current thread. + * + * In case the mutex is held (either as spin or mutex) + * the function will fail, it will acquire the mutex as spin lock + * otherwise. 
+ * + */ +boolean_t +lck_mtx_try_lock_spin( + lck_mtx_t *lock) +{ + return lck_mtx_try_lock_spin_always(lock); +} + +/* + * Routine: lck_mtx_unlock + * + * Unlocks a mutex held by current thread. + * It tries the fast path first, and falls + * through the slow path in case waiters need to + * be woken up or promotions need to be dropped. + * + * Interlock can be held, and the slow path will + * unlock the mutex for this case. + */ +__attribute__((noinline)) +void +lck_mtx_unlock( + lck_mtx_t *lock) +{ + uint32_t prev, state; + + state = ordered_load_mtx_state(lock); + + if (state & LCK_MTX_SPIN_MSK) + return lck_mtx_unlock_slow(lock); + + /* + * Only full mutex will go through the fast path + * (if the lock was acquired as a spinlock it will + * fall through the slow path). + * If there are waiters or promotions it will fall + * through the slow path. + * If it is indirect it will fall through the slow path. + */ + + /* + * Fast path state: + * interlock not held, no waiters, no promotion and mutex held. 
+ */ + prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_WAITERS_MSK | LCK_MTX_PROMOTED_MSK); + prev |= LCK_MTX_MLOCKED_MSK; + + state = prev | LCK_MTX_ILOCKED_MSK; + state &= ~LCK_MTX_MLOCKED_MSK; + + disable_preemption(); + + /* the memory order needs to be acquire because it is acquiring the interlock */ + if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) { + enable_preemption(); + return lck_mtx_unlock_slow(lock); + } + + /* mutex released, interlock acquired and preemption disabled */ + +#if DEVELOPMENT | DEBUG + thread_t owner = (thread_t)lock->lck_mtx_owner; + if(__improbable(owner != current_thread())) + return lck_mtx_owner_check_panic(lock); +#endif + + /* clear owner */ + ordered_store_mtx_owner(lock, 0); + /* release interlock */ + state &= ~LCK_MTX_ILOCKED_MSK; + ordered_store_mtx_state_release(lock, state); + +#if MACH_LDEBUG + thread_t thread = current_thread(); + if (thread) + thread->mutex_count--; +#endif /* MACH_LDEBUG */ + + /* re-enable preemption */ + lck_mtx_unlock_finish_inline(lock, FALSE); +} + diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index 68f9f7819..e62e821c6 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -130,6 +130,20 @@ ml_static_ptovirt( #endif } +vm_offset_t +ml_static_slide( + vm_offset_t vaddr) +{ + return VM_KERNEL_SLIDE(vaddr); +} + +vm_offset_t +ml_static_unslide( + vm_offset_t vaddr) +{ + return VM_KERNEL_UNSLIDE(vaddr); +} + /* * Routine: ml_static_mfree @@ -505,7 +519,7 @@ ml_processor_register( /* allocate and initialize other per-cpu structures */ if (!boot_cpu) { mp_cpus_call_cpu_init(cpunum); - prng_cpu_init(cpunum); + early_random_cpu_init(cpunum); } /* output arg */ @@ -796,7 +810,7 @@ boolean_t ml_is64bit(void) { boolean_t ml_thread_is64bit(thread_t thread) { - return (thread_is_64bit(thread)); + return (thread_is_64bit_addr(thread)); } @@ -873,20 +887,21 @@ kernel_preempt_check(void) 
assert(get_preemption_level() == 0); - __asm__ volatile("pushf; pop %0" : "=r" (flags)); - - intr = ((flags & EFL_IF) != 0); - - if ((*ast_pending() & AST_URGENT) && intr == TRUE) { + if (__improbable(*ast_pending() & AST_URGENT)) { /* * can handle interrupts and preemptions * at this point */ + __asm__ volatile("pushf; pop %0" : "=r" (flags)); + + intr = ((flags & EFL_IF) != 0); /* * now cause the PRE-EMPTION trap */ - __asm__ volatile ("int %0" :: "N" (T_PREEMPT)); + if (intr == TRUE){ + __asm__ volatile ("int %0" :: "N" (T_PREEMPT)); + } } } diff --git a/osfmk/i386/machine_routines.h b/osfmk/i386/machine_routines.h index da6db8347..802099052 100644 --- a/osfmk/i386/machine_routines.h +++ b/osfmk/i386/machine_routines.h @@ -100,6 +100,12 @@ void ml_static_mfree( vm_offset_t ml_static_malloc( vm_size_t size); +vm_offset_t ml_static_slide( + vm_offset_t vaddr); + +vm_offset_t ml_static_unslide( + vm_offset_t vaddr); + /* virtual to physical on wired pages */ vm_offset_t ml_vtophys( vm_offset_t vaddr); diff --git a/osfmk/i386/machine_task.c b/osfmk/i386/machine_task.c index f1cd81ce4..3edd363c7 100644 --- a/osfmk/i386/machine_task.c +++ b/osfmk/i386/machine_task.c @@ -76,7 +76,7 @@ machine_task_set_state( case x86_DEBUG_STATE32: { x86_debug_state32_t *tstate = (x86_debug_state32_t*) state; - if ((task_has_64BitAddr(task)) || + if ((task_has_64Bit_addr(task)) || (state_count != x86_DEBUG_STATE32_COUNT) || (!debug_state_is_valid32(tstate))) { return KERN_INVALID_ARGUMENT; @@ -94,7 +94,7 @@ machine_task_set_state( { x86_debug_state64_t *tstate = (x86_debug_state64_t*) state; - if ((!task_has_64BitAddr(task)) || + if ((!task_has_64Bit_addr(task)) || (state_count != x86_DEBUG_STATE64_COUNT) || (!debug_state_is_valid64(tstate))) { return KERN_INVALID_ARGUMENT; @@ -118,7 +118,7 @@ machine_task_set_state( if ((tstate->dsh.flavor == x86_DEBUG_STATE32) && (tstate->dsh.count == x86_DEBUG_STATE32_COUNT) && - (!task_has_64BitAddr(task)) && + (!task_has_64Bit_addr(task)) && 
debug_state_is_valid32(&tstate->uds.ds32)) { if (task->task_debug == NULL) { @@ -130,7 +130,7 @@ machine_task_set_state( } else if ((tstate->dsh.flavor == x86_DEBUG_STATE64) && (tstate->dsh.count == x86_DEBUG_STATE64_COUNT) && - task_has_64BitAddr(task) && + task_has_64Bit_addr(task) && debug_state_is_valid64(&tstate->uds.ds64)) { if (task->task_debug == NULL) { @@ -161,7 +161,7 @@ machine_task_get_state(task_t task, { x86_debug_state32_t *tstate = (x86_debug_state32_t*) state; - if ((task_has_64BitAddr(task)) || (*state_count != x86_DEBUG_STATE32_COUNT)) { + if ((task_has_64Bit_addr(task)) || (*state_count != x86_DEBUG_STATE32_COUNT)) { return KERN_INVALID_ARGUMENT; } @@ -177,7 +177,7 @@ machine_task_get_state(task_t task, { x86_debug_state64_t *tstate = (x86_debug_state64_t*) state; - if ((!task_has_64BitAddr(task)) || (*state_count != x86_DEBUG_STATE64_COUNT)) { + if ((!task_has_64Bit_addr(task)) || (*state_count != x86_DEBUG_STATE64_COUNT)) { return KERN_INVALID_ARGUMENT; } @@ -196,7 +196,7 @@ machine_task_get_state(task_t task, if (*state_count != x86_DEBUG_STATE_COUNT) return(KERN_INVALID_ARGUMENT); - if (task_has_64BitAddr(task)) { + if (task_has_64Bit_addr(task)) { tstate->dsh.flavor = x86_DEBUG_STATE64; tstate->dsh.count = x86_DEBUG_STATE64_COUNT; @@ -270,7 +270,7 @@ machine_thread_inherit_taskwide( int flavor; mach_msg_type_number_t count; - if (task_has_64BitAddr(parent_task)) { + if (task_has_64Bit_addr(parent_task)) { flavor = x86_DEBUG_STATE64; count = x86_DEBUG_STATE64_COUNT; } else { diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 3b7232687..e5c83e895 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -572,7 +572,7 @@ cpu_signal_handler(x86_saved_state_t *regs) } extern void kprintf_break_lock(void); -static int +int NMIInterruptHandler(x86_saved_state_t *regs) { void *stackptr; @@ -690,7 +690,7 @@ NMI_cpus(void) uint64_t tsc_timeout; intrs_enabled = ml_set_interrupts_enabled(FALSE); - + NMIPI_enable(TRUE); for (cpu = 0; cpu < real_ncpus; 
cpu++) { if (!cpu_is_running(cpu)) continue; @@ -707,6 +707,7 @@ NMI_cpus(void) } cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; } + NMIPI_enable(FALSE); ml_set_interrupts_enabled(intrs_enabled); } @@ -849,6 +850,13 @@ mp_rendezvous_action(__unused void *null) boolean_t intrs_enabled; uint64_t tsc_spin_start; + /* + * Note that mp_rv_lock was acquired by the thread that initiated the + * rendezvous and must have been acquired before we enter + * mp_rendezvous_action(). + */ + current_cpu_datap()->cpu_rendezvous_in_progress = TRUE; + /* setup function */ if (mp_rv_setup_func != NULL) mp_rv_setup_func(mp_rv_func_arg); @@ -886,6 +894,8 @@ mp_rendezvous_action(__unused void *null) if (mp_rv_teardown_func != NULL) mp_rv_teardown_func(mp_rv_func_arg); + current_cpu_datap()->cpu_rendezvous_in_progress = FALSE; + /* Bump completion count */ atomic_incl(&mp_rv_complete, 1); } @@ -909,7 +919,7 @@ mp_rendezvous(void (*setup_func)(void *), } /* obtain rendezvous lock */ - (void) mp_safe_spin_lock(&mp_rv_lock); + mp_rendezvous_lock(); /* set static function pointers */ mp_rv_setup_func = setup_func; @@ -948,6 +958,18 @@ mp_rendezvous(void (*setup_func)(void *), mp_rv_func_arg = NULL; /* release lock */ + mp_rendezvous_unlock(); +} + +void +mp_rendezvous_lock(void) +{ + (void) mp_safe_spin_lock(&mp_rv_lock); +} + +void +mp_rendezvous_unlock(void) +{ simple_unlock(&mp_rv_lock); } @@ -1024,7 +1046,8 @@ mp_call_head_lock(mp_call_queue_t *cqp) */ void NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why) { - unsigned int cpu, cpu_bit; + unsigned int cpu; + cpumask_t cpu_bit; uint64_t deadline; NMIPI_enable(TRUE); @@ -1575,7 +1598,7 @@ mp_kdp_enter(boolean_t proceed_on_failure) } if (proceed_on_failure) { if (mach_absolute_time() - start_time > 500000000ll) { - kprintf("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n"); + paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! 
#YOLO\n"); break; } locked = simple_lock_try(&x86_topo_lock); @@ -1648,13 +1671,16 @@ mp_kdp_enter(boolean_t proceed_on_failure) NMIPI_enable(TRUE); } if (mp_kdp_ncpus != ncpus) { - cpumask_t cpus_NMI_pending = 0; - DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu); + unsigned int wait_cycles = 0; + if (proceed_on_failure) + paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu); + else + DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu); for (cpu = 0; cpu < real_ncpus; cpu++) { if (cpu == my_cpu || !cpu_is_running(cpu)) continue; if (cpu_signal_pending(cpu, MP_KDP)) { - cpus_NMI_pending |= cpu_to_cpumask(cpu); + cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE; cpu_NMI_interrupt(cpu); } } @@ -1663,9 +1689,24 @@ mp_kdp_enter(boolean_t proceed_on_failure) while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) { handle_pending_TLB_flushes(); cpu_pause(); + ++wait_cycles; } if (mp_kdp_ncpus != ncpus) { - kdb_printf("mp_kdp_enter(): %llu, %lu, %u TIMED-OUT WAITING FOR NMI-ACK, PROCEEDING\n", cpus_NMI_pending, mp_kdp_ncpus, ncpus); + paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:"); + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged) + paniclog_append_noflush(" %d", cpu); + } + paniclog_append_noflush("\n"); + if (proceed_on_failure) { + paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI;" + "expected %u acks but received %lu after %u loops in %llu ticks\n", + (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC); + } else { + panic("mp_kdp_enter() timed-out during %s wait after NMI;" + "expected %u acks but received %lu after %u loops in %llu ticks", + (locked ? 
"locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC); + } } } } @@ -1686,6 +1727,22 @@ mp_kdp_enter(boolean_t proceed_on_failure) postcode(MP_KDP_ENTER); } +boolean_t +mp_kdp_all_cpus_halted() +{ + unsigned int ncpus = 0, cpu = 0, my_cpu = 0; + + my_cpu = cpu_number(); + ncpus = 1; /* current CPU */ + for (cpu = 0; cpu < real_ncpus; cpu++) { + if (cpu == my_cpu || !cpu_is_running(cpu)) + continue; + ncpus++; + } + + return (mp_kdp_ncpus == ncpus); +} + static boolean_t cpu_signal_pending(int cpu, mp_event_t event) { diff --git a/osfmk/i386/mp.h b/osfmk/i386/mp.h index 6f46c5d4a..705f41c18 100644 --- a/osfmk/i386/mp.h +++ b/osfmk/i386/mp.h @@ -112,8 +112,9 @@ extern uint32_t spinlock_timeout_NMI(uintptr_t thread_addr); extern uint64_t LastDebuggerEntryAllowance; -extern void mp_kdp_enter(boolean_t proceed_on_failure); -extern void mp_kdp_exit(void); +extern void mp_kdp_enter(boolean_t proceed_on_failure); +extern void mp_kdp_exit(void); +extern boolean_t mp_kdp_all_cpus_halted(void); extern boolean_t mp_recent_debugger_activity(void); extern void kernel_spin(uint64_t spin_ns); @@ -130,6 +131,8 @@ extern void mp_rendezvous_no_intrs( void (*action_func)(void *), void *arg); extern void mp_rendezvous_break_lock(void); +extern void mp_rendezvous_lock(void); +extern void mp_rendezvous_unlock(void); /* * All cpu broadcast. diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 788e71663..78c9e11d0 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -646,7 +646,7 @@ cpu_data_alloc(boolean_t is_boot_cpu) * started. 
*/ cdp->cpu_active_thread = (thread_t) (uintptr_t) cdp->cpu_number; - + cdp->cpu_NMI_acknowledged = TRUE; cdp->cpu_nanotime = &pal_rtc_nanotime_info; kprintf("cpu_data_alloc(%d) %p desc_table: %p " diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index 9a2ca4390..9f1471f36 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -721,6 +721,46 @@ get_thread_state64(thread_t thread, x86_thread_state64_t *ts) ts->gs = saved_state->gs; } +kern_return_t +machine_thread_state_convert_to_user( + __unused thread_t thread, + __unused thread_flavor_t flavor, + __unused thread_state_t tstate, + __unused mach_msg_type_number_t *count) +{ + // No conversion to userspace representation on this platform + return KERN_SUCCESS; +} + +kern_return_t +machine_thread_state_convert_from_user( + __unused thread_t thread, + __unused thread_flavor_t flavor, + __unused thread_state_t tstate, + __unused mach_msg_type_number_t count) +{ + // No conversion from userspace representation on this platform + return KERN_SUCCESS; +} + +kern_return_t +machine_thread_siguctx_pointer_convert_to_user( + __unused thread_t thread, + __unused user_addr_t *uctxp) +{ + // No conversion to userspace representation on this platform + return KERN_SUCCESS; +} + +kern_return_t +machine_thread_function_pointers_convert_from_user( + __unused thread_t thread, + __unused user_addr_t *fptrs, + __unused uint32_t count) +{ + // No conversion from userspace representation on this platform + return KERN_SUCCESS; +} /* * act_machine_set_state: @@ -744,7 +784,7 @@ machine_thread_set_state( if (count < x86_SAVED_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); state = (x86_saved_state32_t *) tstate; @@ -809,7 +849,7 @@ machine_thread_set_state( if (count < x86_SAVED_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thr_act)) + if (!thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); state = 
(x86_saved_state64_t *) tstate; @@ -879,7 +919,7 @@ machine_thread_set_state( if (count != _MachineStateCount[flavor]) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); return fpu_set_fxstate(thr_act, tstate, flavor); @@ -894,7 +934,7 @@ machine_thread_set_state( if (count != _MachineStateCount[flavor]) return(KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thr_act)) + if (!thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); return fpu_set_fxstate(thr_act, tstate, flavor); @@ -909,11 +949,11 @@ machine_thread_set_state( state = (x86_float_state_t *)tstate; if (state->fsh.flavor == x86_FLOAT_STATE64 && state->fsh.count == x86_FLOAT_STATE64_COUNT && - thread_is_64bit(thr_act)) { + thread_is_64bit_addr(thr_act)) { return fpu_set_fxstate(thr_act, (thread_state_t)&state->ufs.fs64, x86_FLOAT_STATE64); } if (state->fsh.flavor == x86_FLOAT_STATE32 && state->fsh.count == x86_FLOAT_STATE32_COUNT && - !thread_is_64bit(thr_act)) { + !thread_is_64bit_addr(thr_act)) { return fpu_set_fxstate(thr_act, (thread_state_t)&state->ufs.fs32, x86_FLOAT_STATE32); } return(KERN_INVALID_ARGUMENT); @@ -934,7 +974,7 @@ machine_thread_set_state( /* 64-bit flavor? */ if (state->ash.flavor == (flavor - 1) && state->ash.count == _MachineStateCount[flavor - 1] && - thread_is_64bit(thr_act)) { + thread_is_64bit_addr(thr_act)) { return fpu_set_fxstate(thr_act, (thread_state_t)&state->ufs.as64, flavor - 1); @@ -942,7 +982,7 @@ machine_thread_set_state( /* 32-bit flavor? 
*/ if (state->ash.flavor == (flavor - 2) && state->ash.count == _MachineStateCount[flavor - 2] && - !thread_is_64bit(thr_act)) { + !thread_is_64bit_addr(thr_act)) { return fpu_set_fxstate(thr_act, (thread_state_t)&state->ufs.as32, flavor - 2); @@ -955,7 +995,7 @@ machine_thread_set_state( if (count != x86_THREAD_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); return set_thread_state32(thr_act, (x86_thread_state32_t *)tstate); @@ -966,7 +1006,7 @@ machine_thread_set_state( if (count != x86_THREAD_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thr_act)) + if (!thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); return set_thread_state64(thr_act, (x86_thread_state64_t *)tstate); @@ -983,11 +1023,11 @@ machine_thread_set_state( if (state->tsh.flavor == x86_THREAD_STATE64 && state->tsh.count == x86_THREAD_STATE64_COUNT && - thread_is_64bit(thr_act)) { + thread_is_64bit_addr(thr_act)) { return set_thread_state64(thr_act, &state->uts.ts64); } else if (state->tsh.flavor == x86_THREAD_STATE32 && state->tsh.count == x86_THREAD_STATE32_COUNT && - !thread_is_64bit(thr_act)) { + !thread_is_64bit_addr(thr_act)) { return set_thread_state32(thr_act, &state->uts.ts32); } else return(KERN_INVALID_ARGUMENT); @@ -997,7 +1037,7 @@ machine_thread_set_state( x86_debug_state32_t *state; kern_return_t ret; - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); state = (x86_debug_state32_t *)tstate; @@ -1011,7 +1051,7 @@ machine_thread_set_state( x86_debug_state64_t *state; kern_return_t ret; - if (!thread_is_64bit(thr_act)) + if (!thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); state = (x86_debug_state64_t *)tstate; @@ -1031,13 +1071,13 @@ machine_thread_set_state( state = (x86_debug_state_t *)tstate; if (state->dsh.flavor == x86_DEBUG_STATE64 && state->dsh.count == x86_DEBUG_STATE64_COUNT && - 
thread_is_64bit(thr_act)) { + thread_is_64bit_addr(thr_act)) { ret = set_debug_state64(thr_act, &state->uds.ds64); } else if (state->dsh.flavor == x86_DEBUG_STATE32 && state->dsh.count == x86_DEBUG_STATE32_COUNT && - !thread_is_64bit(thr_act)) { + !thread_is_64bit_addr(thr_act)) { ret = set_debug_state32(thr_act, &state->uds.ds32); } return ret; @@ -1135,7 +1175,7 @@ machine_thread_get_state( if (*count < x86_SAVED_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); state = (x86_saved_state32_t *) tstate; @@ -1162,7 +1202,7 @@ machine_thread_get_state( if (*count < x86_SAVED_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thr_act)) + if (!thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); state = (x86_saved_state64_t *)tstate; @@ -1184,7 +1224,7 @@ machine_thread_get_state( if (*count < x86_FLOAT_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = x86_FLOAT_STATE32_COUNT; @@ -1197,7 +1237,7 @@ machine_thread_get_state( if (*count < x86_FLOAT_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); - if ( !thread_is_64bit(thr_act)) + if ( !thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = x86_FLOAT_STATE64_COUNT; @@ -1219,7 +1259,7 @@ machine_thread_get_state( * no need to bzero... 
currently * x86_FLOAT_STATE64_COUNT == x86_FLOAT_STATE32_COUNT */ - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { state->fsh.flavor = x86_FLOAT_STATE64; state->fsh.count = x86_FLOAT_STATE64_COUNT; @@ -1243,7 +1283,7 @@ machine_thread_get_state( if (*count != _MachineStateCount[flavor]) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = _MachineStateCount[flavor]; @@ -1259,7 +1299,7 @@ machine_thread_get_state( if (*count != _MachineStateCount[flavor]) return(KERN_INVALID_ARGUMENT); - if ( !thread_is_64bit(thr_act)) + if ( !thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = _MachineStateCount[flavor]; @@ -1283,7 +1323,7 @@ machine_thread_get_state( bzero((char *)state, *count * sizeof(int)); - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { flavor -= 1; /* 64-bit flavor */ fstate = (thread_state_t) &state->ufs.as64; } else { @@ -1301,7 +1341,7 @@ machine_thread_get_state( if (*count < x86_THREAD_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = x86_THREAD_STATE32_COUNT; @@ -1315,7 +1355,7 @@ machine_thread_get_state( if (*count < x86_THREAD_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); - if ( !thread_is_64bit(thr_act)) + if ( !thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = x86_THREAD_STATE64_COUNT; @@ -1335,7 +1375,7 @@ machine_thread_get_state( bzero((char *)state, sizeof(x86_thread_state_t)); - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { state->tsh.flavor = x86_THREAD_STATE64; state->tsh.count = x86_THREAD_STATE64_COUNT; @@ -1357,7 +1397,7 @@ machine_thread_get_state( if (*count < x86_EXCEPTION_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = 
x86_EXCEPTION_STATE32_COUNT; @@ -1376,7 +1416,7 @@ machine_thread_get_state( if (*count < x86_EXCEPTION_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); - if ( !thread_is_64bit(thr_act)) + if ( !thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); *count = x86_EXCEPTION_STATE64_COUNT; @@ -1401,7 +1441,7 @@ machine_thread_get_state( bzero((char *)state, sizeof(x86_exception_state_t)); - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { state->esh.flavor = x86_EXCEPTION_STATE64; state->esh.count = x86_EXCEPTION_STATE64_COUNT; @@ -1421,7 +1461,7 @@ machine_thread_get_state( if (*count < x86_DEBUG_STATE32_COUNT) return(KERN_INVALID_ARGUMENT); - if (thread_is_64bit(thr_act)) + if (thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); get_debug_state32(thr_act, (x86_debug_state32_t *)tstate); @@ -1435,7 +1475,7 @@ machine_thread_get_state( if (*count < x86_DEBUG_STATE64_COUNT) return(KERN_INVALID_ARGUMENT); - if (!thread_is_64bit(thr_act)) + if (!thread_is_64bit_addr(thr_act)) return(KERN_INVALID_ARGUMENT); get_debug_state64(thr_act, (x86_debug_state64_t *)tstate); @@ -1455,7 +1495,7 @@ machine_thread_get_state( bzero(state, sizeof *state); - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { state->dsh.flavor = x86_DEBUG_STATE64; state->dsh.count = x86_DEBUG_STATE64_COUNT; @@ -1661,7 +1701,7 @@ machine_thread_switch_addrmode(thread_t thread) machine_thread_create(thread, thread->task); /* Adjust FPU state */ - fpu_switch_addrmode(thread, task_has_64BitAddr(thread->task)); + fpu_switch_addrmode(thread, task_has_64Bit_addr(thread->task)); /* If we're switching ourselves, reset the pcb addresses etc. 
*/ if (thread == current_thread()) { @@ -1711,7 +1751,7 @@ get_useraddr(void) { thread_t thr_act = current_thread(); - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { x86_saved_state64_t *iss64; iss64 = USER_REGS64(thr_act); @@ -1849,7 +1889,7 @@ act_thread_csave(void) mach_msg_type_number_t val; thread_t thr_act = current_thread(); - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { struct x86_act_context64 *ic64; ic64 = (struct x86_act_context64 *)kalloc(sizeof(struct x86_act_context64)); @@ -1929,7 +1969,7 @@ act_thread_catt(void *ctx) if (ctx == (void *)NULL) return; - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { struct x86_act_context64 *ic64; ic64 = (struct x86_act_context64 *)ctx; diff --git a/osfmk/i386/pcb_native.c b/osfmk/i386/pcb_native.c index 34df7c119..748bde049 100644 --- a/osfmk/i386/pcb_native.c +++ b/osfmk/i386/pcb_native.c @@ -374,7 +374,7 @@ machine_thread_create( bzero((char *)pcb->iss, sizeof(x86_saved_state_t)); - if (task_has_64BitAddr(task)) { + if (task_has_64Bit_addr(task)) { pcb->iss->flavor = x86_SAVED_STATE64; pcb->iss->ss_64.isf.cs = USER64_CS; @@ -447,7 +447,7 @@ machine_thread_set_tsd_base( return KERN_INVALID_ARGUMENT; } - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { /* check for canonical address, set 0 otherwise */ if (!IS_USERADDR64_CANONICAL(tsd_base)) tsd_base = 0ULL; @@ -459,7 +459,7 @@ machine_thread_set_tsd_base( pcb_t pcb = THREAD_TO_PCB(thread); pcb->cthread_self = tsd_base; - if (!thread_is_64bit(thread)) { + if (!thread_is_64bit_addr(thread)) { /* Set up descriptor for later use */ struct real_descriptor desc = { .limit_low = 1, @@ -478,7 +478,7 @@ machine_thread_set_tsd_base( /* For current thread, make the TSD base active immediately */ if (thread == current_thread()) { - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { cpu_data_t *cdp; mp_disable_preemption(); diff --git a/osfmk/i386/pmap.h 
b/osfmk/i386/pmap.h index 3458ce7cf..076b69aa3 100644 --- a/osfmk/i386/pmap.h +++ b/osfmk/i386/pmap.h @@ -78,7 +78,6 @@ #include #include #include -#include #include #include @@ -433,6 +432,10 @@ extern boolean_t pmap_ept_support_ad; #define PMAP_ACTIVATE_CACHE 4 #define PMAP_NO_GUARD_CACHE 8 +/* Per-pmap ledger operations */ +#define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a) +#define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a) + #ifndef ASSEMBLER #include @@ -542,6 +545,7 @@ struct pmap { pmap_paddr_t pm_eptp; /* EPTP */ ledger_t ledger; /* ledger tracking phys mappings */ #if MACH_ASSERT + boolean_t pmap_stats_assert; int pmap_pid; char pmap_procname[17]; #endif /* MACH_ASSERT */ @@ -618,7 +622,8 @@ set_dirbase(pmap_t tpmap, thread_t thread, int my_cpu) { cpu_datap(ccpu)->cpu_ucr3 = ucr3; cpu_shadowp(ccpu)->cpu_ucr3 = ucr3; - cpu_datap(ccpu)->cpu_task_map = tpmap->pm_task_map; + cpu_datap(ccpu)->cpu_task_map = cpu_shadowp(ccpu)->cpu_task_map = + tpmap->pm_task_map; assert((get_preemption_level() > 0) || (ml_get_interrupts_enabled() == FALSE)); assert(ccpu == cpu_number()); diff --git a/osfmk/i386/pmap_internal.h b/osfmk/i386/pmap_internal.h index 4ddabaa20..1a7c75e32 100644 --- a/osfmk/i386/pmap_internal.h +++ b/osfmk/i386/pmap_internal.h @@ -917,10 +917,6 @@ pmap_pv_is_altacct( pvhash_idx = pvhashidx(pmap, vaddr); LOCK_PV_HASH(pvhash_idx); pvh_e = *(pvhash(pvhash_idx)); - if (PV_HASHED_ENTRY_NULL == pvh_e) { - panic("Possible memory corruption: pmap_pv_is_altacct(%p,0x%llx,0x%x): empty hash", - pmap, vaddr, ppn); - } while (PV_HASHED_ENTRY_NULL != pvh_e) { if (pvh_e->pmap == pmap && PVE_VA(pvh_e) == vaddr && diff --git a/osfmk/i386/pmap_x86_common.c b/osfmk/i386/pmap_x86_common.c index b66630233..443b97217 100644 --- a/osfmk/i386/pmap_x86_common.c +++ b/osfmk/i386/pmap_x86_common.c @@ -1969,12 +1969,14 @@ phys_attribute_clear( pte_bits &= ept_bits_to_clear; } } + if (options & PMAP_OPTIONS_CLEAR_WRITE) + pte_bits |= 
PTE_WRITE(is_ept); /* * Clear modify and/or reference bits. */ if (pte_bits) { - pmap_update_pte(pte, bits, 0); + pmap_update_pte(pte, pte_bits, 0); /* Ensure all processors using this translation * invalidate this TLB entry. The invalidation @@ -2472,13 +2474,15 @@ pmap_query_page_info( return KERN_SUCCESS; } -void pmap_set_jit_entitled(__unused pmap_t pmap) +void +pmap_set_jit_entitled(__unused pmap_t pmap) { /* The x86 pmap layer does not care if a map has a JIT entry. */ return; } -bool pmap_has_prot_policy(__unused vm_prot_t prot) +bool +pmap_has_prot_policy(__unused vm_prot_t prot) { /* * The x86 pmap layer does not apply any policy to any protection @@ -2487,8 +2491,43 @@ bool pmap_has_prot_policy(__unused vm_prot_t prot) return FALSE; } -void pmap_release_pages_fast(void) +uint64_t +pmap_release_pages_fast(void) +{ + return 0; +} + +void +pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused addr64_t nstart, __unused uint64_t size) { return; } +void pmap_ledger_alloc_init(size_t size) +{ + panic("%s: unsupported, " + "size=%lu", + __func__, size); +} + +ledger_t pmap_ledger_alloc(void) +{ + panic("%s: unsupported", + __func__); + + return NULL; +} + +void pmap_ledger_free(ledger_t ledger) +{ + panic("%s: unsupported, " + "ledger=%p", + __func__, ledger); +} + +size_t +pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused) +{ + return (size_t)-1; +} + diff --git a/osfmk/i386/trap.c b/osfmk/i386/trap.c index 9c0174a43..ef03460db 100644 --- a/osfmk/i386/trap.c +++ b/osfmk/i386/trap.c @@ -93,6 +93,7 @@ #include #endif #include +#include #include #include @@ -140,7 +141,7 @@ thread_syscall_return( pal_register_cache_state(thr_act, DIRTY); - if (thread_is_64bit(thr_act)) { + if (thread_is_64bit_addr(thr_act)) { x86_saved_state64_t *regs; regs = USER_REGS64(thr_act); @@ -211,7 +212,7 @@ user_page_fault_continue( thread_t thread = current_thread(); user_addr_t vaddr; - if 
(thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { x86_saved_state64_t *uregs; uregs = USER_REGS64(thread); @@ -383,7 +384,7 @@ interrupt(x86_saved_state_t *state) #if CONFIG_TELEMETRY if (telemetry_needs_record) { - telemetry_mark_curthread(user_mode); + telemetry_mark_curthread(user_mode, FALSE); } #endif @@ -454,13 +455,16 @@ interrupt(x86_saved_state_t *state) (long) depth, (long) VM_KERNEL_UNSLIDE(rip), 0, 0, 0); } } - + if (cnum == master_cpu) ml_entropy_collect(); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, - interrupt_num, 0, 0, 0, 0); +#if KPERF + kperf_interrupt(); +#endif /* KPERF */ + + KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END, + interrupt_num); assert(ml_get_interrupts_enabled() == FALSE); } @@ -884,8 +888,8 @@ user_trap( user_addr_t rip; unsigned long dr6 = 0; /* 32 bit for i386, 64 bit for x86_64 */ - assert((is_saved_state32(saved_state) && !thread_is_64bit(thread)) || - (is_saved_state64(saved_state) && thread_is_64bit(thread))); + assert((is_saved_state32(saved_state) && !thread_is_64bit_addr(thread)) || + (is_saved_state64(saved_state) && thread_is_64bit_addr(thread))); if (is_saved_state64(saved_state)) { x86_saved_state64_t *regs; @@ -967,7 +971,7 @@ user_trap( * because the high order bits are not * used on x86_64 */ - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { x86_debug_state64_t *ids = pcb->ids; ids->dr6 = dr6; } else { /* 32 bit thread */ @@ -1249,11 +1253,11 @@ extern void thread_exception_return_internal(void) __dead2; void thread_exception_return(void) { thread_t thread = current_thread(); ml_set_interrupts_enabled(FALSE); - if (thread_is_64bit(thread) != task_has_64BitAddr(thread->task)) { - panic("Task/thread bitness mismatch %p %p, task: %d, thread: %d", thread, thread->task, thread_is_64bit(thread), task_has_64BitAddr(thread->task)); + if (thread_is_64bit_addr(thread) != task_has_64Bit_addr(thread->task)) { + 
panic("Task/thread bitness mismatch %p %p, task: %d, thread: %d", thread, thread->task, thread_is_64bit_addr(thread), task_has_64Bit_addr(thread->task)); } - if (thread_is_64bit(thread)) { + if (thread_is_64bit_addr(thread)) { if ((gdt_desc_p(USER64_CS)->access & ACC_PL_U) == 0) { panic("64-GDT mismatch %p, descriptor: %p", thread, gdt_desc_p(USER64_CS)); } diff --git a/osfmk/ipc/ipc_entry.c b/osfmk/ipc/ipc_entry.c index 288ea6a58..facaf2af0 100644 --- a/osfmk/ipc/ipc_entry.c +++ b/osfmk/ipc/ipc_entry.c @@ -79,6 +79,7 @@ #include #include #include +#include /* * Routine: ipc_entry_lookup @@ -489,6 +490,14 @@ ipc_entry_modified( space->is_low_mod = index; if (index > space->is_high_mod) space->is_high_mod = index; + + KERNEL_DEBUG_CONSTANT( + MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_PORT_ENTRY_MODIFY) | DBG_FUNC_NONE, + space->is_task ? task_pid(space->is_task) : 0, + name, + entry->ie_bits, + 0, + 0); } #define IPC_ENTRY_GROW_STATS 1 diff --git a/osfmk/ipc/ipc_importance.c b/osfmk/ipc/ipc_importance.c index f89005cf3..7ab4eb355 100644 --- a/osfmk/ipc/ipc_importance.c +++ b/osfmk/ipc/ipc_importance.c @@ -86,11 +86,6 @@ static lck_spin_t ipc_importance_lock_data; /* single lock for now */ lck_spin_unlock(&ipc_importance_lock_data) #define ipc_importance_assert_held() \ lck_spin_assert(&ipc_importance_lock_data, LCK_ASSERT_OWNED) -#define ipc_importance_sleep(elem) lck_spin_sleep(&ipc_importance_lock_data, \ - LCK_SLEEP_DEFAULT, \ - (event_t)(elem), \ - THREAD_UNINT) -#define ipc_importance_wakeup(elem) thread_wakeup((event_t)(elem)) #if IIE_REF_DEBUG #define incr_ref_counter(x) (hw_atomic_add(&(x), 1)) @@ -1660,7 +1655,7 @@ ipc_importance_task_mark_live_donor(ipc_importance_task_t task_imp, boolean_t li } /* - * Routine: ipc_importance_task_marked_live_donor + * Routine: ipc_importance_task_is_marked_live_donor * Purpose: * Query the live donor and donor flags for the given task importance. 
* Conditions: @@ -2155,9 +2150,6 @@ ipc_importance_exec_switch_task( /* Create an importance linkage from old_task to new_task */ inherit = ipc_importance_inherit_from_task(old_task, new_task); - if (inherit == III_NULL) { - return inherit; - } /* Switch task importance base from old task to new task */ ipc_importance_lock(); @@ -2214,9 +2206,7 @@ ipc_importance_check_circularity( boolean_t imp_lock_held = FALSE; int assertcnt = 0; ipc_port_t base; - sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0}; - sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0}; - boolean_t update_knote = FALSE; + struct turnstile *send_turnstile = TURNSTILE_NULL; assert(port != IP_NULL); assert(dest != IP_NULL); @@ -2225,6 +2215,9 @@ ipc_importance_check_circularity( return TRUE; base = dest; + /* Check if destination needs a turnstile */ + ipc_port_send_turnstile_prepare(dest); + /* port is in limbo, so donation status is safe to latch */ if (port->ip_impdonation != 0) { imp_lock_held = TRUE; @@ -2302,22 +2295,24 @@ ipc_importance_check_circularity( assert(port->ip_receiver_name == MACH_PORT_NULL); assert(port->ip_destination == IP_NULL); - while (dest != IP_NULL) { + base = dest; + while (base != IP_NULL) { ipc_port_t next; - /* dest is in transit or in limbo */ + /* base is in transit or in limbo */ - assert(ip_active(dest)); - assert(dest->ip_receiver_name == MACH_PORT_NULL); + assert(ip_active(base)); + assert(base->ip_receiver_name == MACH_PORT_NULL); - next = dest->ip_destination; - ip_unlock(dest); - dest = next; + next = base->ip_destination; + ip_unlock(base); + base = next; } if (imp_lock_held) ipc_importance_unlock(); + ipc_port_send_turnstile_complete(dest); return TRUE; } @@ -2331,9 +2326,8 @@ ipc_importance_check_circularity( ipc_port_multiple_unlock(); not_circular: - imq_lock(&base->ip_messages); - /* port is in limbo */ + imq_lock(&port->ip_messages); assert(ip_active(port)); assert(port->ip_receiver_name == MACH_PORT_NULL); @@ -2359,10 +2353,22 @@ 
ipc_importance_check_circularity( /* take the port out of limbo w.r.t. assertions */ port->ip_tempowner = 0; - /* Capture the sync qos count delta */ - for (int i = 0; i < THREAD_QOS_LAST; i++) { - sync_qos_delta_add[i] = port_sync_qos(port, i); + /* + * Setup linkage for source port if it has a send turnstile i.e. it has + * a thread waiting in send or has a port enqueued in it or has sync ipc + * push from a special reply port. + */ + if (port_send_turnstile(port)) { + send_turnstile = turnstile_prepare((uintptr_t)port, + port_send_turnstile_address(port), + TURNSTILE_NULL, TURNSTILE_SYNC_IPC); + + turnstile_update_inheritor(send_turnstile, port_send_turnstile(dest), + (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE)); + + /* update complete and turnstile complete called after dropping all locks */ } + imq_unlock(&port->ip_messages); /* now unlock chain */ @@ -2370,9 +2376,9 @@ ipc_importance_check_circularity( for (;;) { + ipc_port_t next; /* every port along chain track assertions behind it */ ipc_port_impcount_delta(dest, assertcnt, base); - update_knote = ipc_port_sync_qos_delta(dest, sync_qos_delta_add, sync_qos_delta_sub); if (dest == base) break; @@ -2384,9 +2390,9 @@ ipc_importance_check_circularity( assert(dest->ip_destination != IP_NULL); assert(dest->ip_tempowner == 0); - port = dest->ip_destination; + next = dest->ip_destination; ip_unlock(dest); - dest = port; + dest = next; } /* base is not in transit */ @@ -2425,10 +2431,6 @@ ipc_importance_check_circularity( } } - if (update_knote) { - KNOTE(&base->ip_messages.imq_klist, 0); - } - imq_unlock(&base->ip_messages); ip_unlock(base); /* @@ -2457,6 +2459,18 @@ ipc_importance_check_circularity( if (imp_lock_held) ipc_importance_unlock(); + /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */ + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + + /* Take the mq lock to call turnstile complete */ 
+ imq_lock(&port->ip_messages); + turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL); + send_turnstile = TURNSTILE_NULL; + imq_unlock(&port->ip_messages); + turnstile_cleanup(); + } + if (imp_task != IIT_NULL) ipc_importance_task_release(imp_task); diff --git a/osfmk/ipc/ipc_init.c b/osfmk/ipc/ipc_init.c index 8d5ea071f..d8e0917e7 100644 --- a/osfmk/ipc/ipc_init.c +++ b/osfmk/ipc/ipc_init.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -131,8 +132,6 @@ lck_attr_t ipc_lck_attr; static lck_grp_attr_t ipc_lck_grp_attr; -extern void ikm_cache_init(void); - /* * Routine: ipc_bootstrap * Purpose: @@ -190,6 +189,7 @@ ipc_bootstrap(void) IKM_SAVED_KMSG_SIZE, "ipc kmsgs"); zone_change(ipc_kmsg_zone, Z_CALLERACCT, FALSE); + zone_change(ipc_kmsg_zone, Z_CACHING_ENABLED, TRUE); /* create special spaces */ @@ -275,6 +275,8 @@ ipc_init(void) msg_ool_size_small -= cpy_kdata_hdr_sz; ipc_host_init(); + ux_handler_init(); + } @@ -291,3 +293,4 @@ ipc_thread_call_init(void) ipc_importance_thread_call_init(); #endif } + diff --git a/osfmk/ipc/ipc_kmsg.c b/osfmk/ipc/ipc_kmsg.c index 736fe824c..81776c729 100644 --- a/osfmk/ipc/ipc_kmsg.c +++ b/osfmk/ipc/ipc_kmsg.c @@ -94,6 +94,8 @@ #include #include +#include + #include #include @@ -797,7 +799,7 @@ void ipc_kmsg_trace_send(ipc_kmsg_t kmsg, * Trailer contents */ trailer = (mach_msg_trailer_t *)((vm_offset_t)msg + - (vm_offset_t)msg->msgh_size); + round_msg((vm_offset_t)msg->msgh_size)); if (trailer->msgh_trailer_size <= sizeof(mach_msg_security_trailer_t)) { extern security_token_t KERNEL_SECURITY_TOKEN; mach_msg_security_trailer_t *strailer; @@ -849,11 +851,10 @@ void ipc_kmsg_clean_partial( vm_size_t length); mach_msg_return_t ipc_kmsg_copyin_body( - ipc_kmsg_t kmsg, - ipc_space_t space, - vm_map_t map); - -extern int thread_qos_from_pthread_priority(unsigned long, unsigned long *); + ipc_kmsg_t kmsg, + ipc_space_t space, + vm_map_t map, + mach_msg_option_t *optionp); /* * We keep 
a per-processor cache of kernel message buffers. @@ -914,21 +915,6 @@ ipc_kmsg_alloc( max_expanded_size = IKM_SAVED_MSG_SIZE; /* round up for ikm_cache */ if (max_expanded_size == IKM_SAVED_MSG_SIZE) { - struct ikm_cache *cache; - unsigned int i; - - disable_preemption(); - cache = &PROCESSOR_DATA(current_processor(), ikm_cache); - if ((i = cache->avail) > 0) { - assert(i <= IKM_STASH); - kmsg = cache->entries[--i]; - cache->avail = i; - enable_preemption(); - ikm_check_init(kmsg, max_expanded_size); - ikm_set_header(kmsg, msg_and_trailer_size); - return (kmsg); - } - enable_preemption(); kmsg = (ipc_kmsg_t)zalloc(ipc_kmsg_zone); } else { kmsg = (ipc_kmsg_t)kalloc(ikm_plus_overhead(max_expanded_size)); @@ -986,22 +972,7 @@ ipc_kmsg_free( ip_release(port); /* May be last reference */ } - /* - * Peek and see if it has to go back in the cache. - */ if (kmsg->ikm_size == IKM_SAVED_MSG_SIZE) { - struct ikm_cache *cache; - unsigned int i; - - disable_preemption(); - cache = &PROCESSOR_DATA(current_processor(), ikm_cache); - if ((i = cache->avail) < IKM_STASH) { - cache->entries[i] = kmsg; - cache->avail = i + 1; - enable_preemption(); - return; - } - enable_preemption(); zfree(ipc_kmsg_zone, kmsg); return; } @@ -1472,16 +1443,10 @@ ipc_kmsg_set_prealloc( assert(kmsg->ikm_prealloc == IP_NULL); kmsg->ikm_prealloc = IP_NULL; - /* take the mqueue lock since the sync qos is protected under it */ - imq_lock(&port->ip_messages); - /* copy the sync qos values to kmsg */ - for (int i = 0; i < THREAD_QOS_LAST; i++) { - kmsg->sync_qos[i] = port_sync_qos(port, i); - } - kmsg->special_port_qos = port_special_qos(port); + assert(port_send_turnstile(port) == TURNSTILE_NULL); + kmsg->ikm_turnstile = TURNSTILE_NULL; IP_SET_PREALLOC(port, kmsg); - imq_unlock(&port->ip_messages); } /* @@ -1496,20 +1461,11 @@ ipc_kmsg_clear_prealloc( ipc_kmsg_t kmsg, ipc_port_t port) { - assert(kmsg->ikm_prealloc == port); - - kmsg->ikm_prealloc = IP_NULL; - - /* take the mqueue lock since the sync qos is 
protected under it */ + /* take the mqueue lock since the turnstile is protected under it */ imq_lock(&port->ip_messages); IP_CLEAR_PREALLOC(port, kmsg); - - /* copy the sync qos values from kmsg to port */ - for (int i = 0; i < THREAD_QOS_LAST; i++) { - set_port_sync_qos(port, i, kmsg->sync_qos[i]); - } - set_port_special_qos(port, kmsg->special_port_qos); + set_port_send_turnstile(port, kmsg->ikm_turnstile); imq_unlock(&port->ip_messages); } @@ -1783,8 +1739,6 @@ ipc_kmsg_get_from_kernel( * MACH_SEND_INTERRUPTED Caller still has message. * MACH_SEND_INVALID_DEST Caller still has message. */ - - mach_msg_return_t ipc_kmsg_send( ipc_kmsg_t kmsg, @@ -1806,7 +1760,7 @@ ipc_kmsg_send( } #if IMPORTANCE_INHERITANCE - boolean_t did_importance = FALSE; + bool did_importance = false; #if IMPORTANCE_TRACE mach_msg_id_t imp_msgh_id = -1; int sender_pid = -1; @@ -1840,6 +1794,17 @@ ipc_kmsg_send( if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport)) flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE); #endif + if (did_importance) { + /* + * We're going to pretend we delivered this message + * successfully, and just eat the kmsg. However, the + * kmsg is actually visible via the importance_task! + * We need to cleanup this linkage before we destroy + * the message, and more importantly before we set the + * msgh_remote_port to NULL. See: 34302571 + */ + ipc_importance_clean(kmsg); + } ip_release(port); /* JMM - Future: release right, not just ref */ kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL; ipc_kmsg_destroy(kmsg); @@ -1885,8 +1850,8 @@ ipc_kmsg_send( * propagation. That routine can drop the port lock temporarily. * If it does we'll have to revalidate the destination. */ - if (did_importance == FALSE) { - did_importance = TRUE; + if (!did_importance) { + did_importance = true; if (ipc_importance_send(kmsg, option)) goto retry; } @@ -1901,6 +1866,9 @@ ipc_kmsg_send( * queue. Lock message queue while port is locked. 
*/ imq_lock(&port->ip_messages); + + set_ip_srp_msg_sent(port); + ip_unlock(port); error = ipc_mqueue_send(&port->ip_messages, kmsg, option, @@ -1908,7 +1876,7 @@ ipc_kmsg_send( } #if IMPORTANCE_INHERITANCE - if (did_importance == TRUE) { + if (did_importance) { __unused int importance_cleared = 0; switch (error) { case MACH_SEND_TIMED_OUT: @@ -2024,7 +1992,7 @@ ipc_kmsg_put( kmsg->ikm_header->msgh_id); #if defined(__LP64__) - if (current_task() != kernel_task) { /* don't if receiver expects fully-cooked in-kernel msg; ux_exception */ + if (current_task() != kernel_task) { /* don't if receiver expects fully-cooked in-kernel msg; */ mach_msg_legacy_header_t *legacy_header = (mach_msg_legacy_header_t *)((vm_offset_t)(kmsg->ikm_header) + LEGACY_HEADER_SIZE_DELTA); @@ -2106,7 +2074,20 @@ ipc_kmsg_put_to_kernel( ipc_kmsg_free(kmsg); } -unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t propagation); +static mach_msg_priority_t +ipc_get_current_thread_priority(void) +{ + thread_t thread = current_thread(); + thread_qos_t qos; + int relpri; + + qos = thread_get_requested_qos(thread, &relpri); + if (!qos) { + qos = thread_user_promotion_qos_for_pri(thread->base_pri); + relpri = 0; + } + return (mach_msg_priority_t)_pthread_priority_make_from_thread_qos(qos, relpri, 0); +} static kern_return_t ipc_kmsg_set_qos( @@ -2115,24 +2096,23 @@ ipc_kmsg_set_qos( mach_msg_priority_t override) { kern_return_t kr; - unsigned long flags = 0; ipc_port_t special_reply_port = kmsg->ikm_header->msgh_local_port; ipc_port_t dest_port = kmsg->ikm_header->msgh_remote_port; kr = ipc_get_pthpriority_from_kmsg_voucher(kmsg, &kmsg->ikm_qos); if (kr != KERN_SUCCESS) { - kmsg->ikm_qos = MACH_MSG_PRIORITY_UNSPECIFIED; + if (options & MACH_SEND_PROPAGATE_QOS) { + kmsg->ikm_qos = ipc_get_current_thread_priority(); + } else { + kmsg->ikm_qos = MACH_MSG_PRIORITY_UNSPECIFIED; + } } kmsg->ikm_qos_override = kmsg->ikm_qos; if (options & MACH_SEND_OVERRIDE) { - unsigned long 
canonical; - mach_msg_priority_t canon; - - canonical = pthread_priority_canonicalize(override, TRUE); - canon = (mach_msg_priority_t)canonical; - if (canon > kmsg->ikm_qos) - kmsg->ikm_qos_override = canon; + pthread_priority_t pp = _pthread_priority_normalize_for_ipc(override); + if (pp > kmsg->ikm_qos) + kmsg->ikm_qos_override = (mach_msg_priority_t)pp; } kr = KERN_SUCCESS; @@ -2140,16 +2120,10 @@ ipc_kmsg_set_qos( if (IP_VALID(special_reply_port) && MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) { /* - * Update the sync override count if the reply port is a special reply port, - * link the destination port to special reply port and update the qos count - * of destination port. - * - * Use the qos value passed by voucher and not the one passed by notify field. + * Link the destination port to special reply port and make sure that + * dest port has a send turnstile, else allocate one. */ - kr = ipc_port_link_special_reply_port_with_qos(special_reply_port, dest_port, - thread_qos_from_pthread_priority(kmsg->ikm_qos, &flags)); - } else { - kr = KERN_FAILURE; + ipc_port_link_special_reply_port(special_reply_port, dest_port); } } return kr; @@ -2243,6 +2217,9 @@ ipc_kmsg_copyin_header( (voucher_type != MACH_MSG_TYPE_MOVE_SEND && voucher_type != MACH_MSG_TYPE_COPY_SEND)) { is_write_unlock(space); + if ((*optionp & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(voucher_name, 0, 0, kGUARD_EXC_SEND_INVALID_VOUCHER); + } return MACH_SEND_INVALID_VOUCHER; } @@ -2252,6 +2229,9 @@ ipc_kmsg_copyin_header( (voucher_entry->ie_bits & MACH_PORT_TYPE_SEND) == 0 || io_kotype(voucher_entry->ie_object) != IKOT_VOUCHER) { is_write_unlock(space); + if ((*optionp & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(voucher_name, 0, 0, kGUARD_EXC_SEND_INVALID_VOUCHER); + } return MACH_SEND_INVALID_VOUCHER; } } else { @@ -2609,6 +2589,9 @@ ipc_kmsg_copyin_header( assert(voucher_port == IP_NULL); assert(voucher_soright == IP_NULL); + if 
((*optionp & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(reply_name, 0, 0, kGUARD_EXC_SEND_INVALID_REPLY); + } return MACH_SEND_INVALID_REPLY; invalid_dest: @@ -2627,23 +2610,26 @@ ipc_kmsg_copyin_header( } mach_msg_descriptor_t *ipc_kmsg_copyin_port_descriptor( - volatile mach_msg_port_descriptor_t *dsc, - mach_msg_legacy_port_descriptor_t *user_dsc, - ipc_space_t space, - ipc_object_t dest, - ipc_kmsg_t kmsg, - mach_msg_return_t *mr); + volatile mach_msg_port_descriptor_t *dsc, + mach_msg_legacy_port_descriptor_t *user_dsc, + ipc_space_t space, + ipc_object_t dest, + ipc_kmsg_t kmsg, + mach_msg_option_t *optionp, + mach_msg_return_t *mr); void ipc_print_type_name( int type_name); + mach_msg_descriptor_t * ipc_kmsg_copyin_port_descriptor( - volatile mach_msg_port_descriptor_t *dsc, - mach_msg_legacy_port_descriptor_t *user_dsc_in, - ipc_space_t space, - ipc_object_t dest, - ipc_kmsg_t kmsg, - mach_msg_return_t *mr) + volatile mach_msg_port_descriptor_t *dsc, + mach_msg_legacy_port_descriptor_t *user_dsc_in, + ipc_space_t space, + ipc_object_t dest, + ipc_kmsg_t kmsg, + mach_msg_option_t *optionp, + mach_msg_return_t *mr) { volatile mach_msg_legacy_port_descriptor_t *user_dsc = user_dsc_in; mach_msg_type_name_t user_disp; @@ -2659,6 +2645,9 @@ ipc_kmsg_copyin_port_descriptor( kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object); if (kr != KERN_SUCCESS) { + if ((*optionp & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT); + } *mr = MACH_SEND_INVALID_RIGHT; return NULL; } @@ -2681,24 +2670,27 @@ ipc_kmsg_copyin_port_descriptor( } mach_msg_descriptor_t * ipc_kmsg_copyin_ool_descriptor( - mach_msg_ool_descriptor_t *dsc, - mach_msg_descriptor_t *user_dsc, - int is_64bit, - vm_offset_t *paddr, - vm_map_copy_t *copy, - vm_size_t *space_needed, - vm_map_t map, - mach_msg_return_t *mr); + mach_msg_ool_descriptor_t *dsc, + mach_msg_descriptor_t *user_dsc, + int is_64bit, + vm_offset_t *paddr, + 
vm_map_copy_t *copy, + vm_size_t *space_needed, + vm_map_t map, + mach_msg_option_t *optionp, + mach_msg_return_t *mr); + mach_msg_descriptor_t * ipc_kmsg_copyin_ool_descriptor( - mach_msg_ool_descriptor_t *dsc, - mach_msg_descriptor_t *user_dsc, - int is_64bit, - vm_offset_t *paddr, - vm_map_copy_t *copy, - vm_size_t *space_needed, - vm_map_t map, - mach_msg_return_t *mr) + mach_msg_ool_descriptor_t *dsc, + mach_msg_descriptor_t *user_dsc, + int is_64bit, + vm_offset_t *paddr, + vm_map_copy_t *copy, + vm_size_t *space_needed, + vm_map_t map, + __unused mach_msg_option_t *optionp, + mach_msg_return_t *mr) { vm_size_t length; boolean_t dealloc; @@ -2794,36 +2786,39 @@ ipc_kmsg_copyin_ool_descriptor( } mach_msg_descriptor_t * ipc_kmsg_copyin_ool_ports_descriptor( - mach_msg_ool_ports_descriptor_t *dsc, - mach_msg_descriptor_t *user_dsc, - int is_64bit, - vm_map_t map, - ipc_space_t space, - ipc_object_t dest, - ipc_kmsg_t kmsg, - mach_msg_return_t *mr); + mach_msg_ool_ports_descriptor_t *dsc, + mach_msg_descriptor_t *user_dsc, + int is_64bit, + vm_map_t map, + ipc_space_t space, + ipc_object_t dest, + ipc_kmsg_t kmsg, + mach_msg_option_t *optionp, + mach_msg_return_t *mr); + mach_msg_descriptor_t * ipc_kmsg_copyin_ool_ports_descriptor( - mach_msg_ool_ports_descriptor_t *dsc, - mach_msg_descriptor_t *user_dsc, - int is_64bit, - vm_map_t map, - ipc_space_t space, - ipc_object_t dest, - ipc_kmsg_t kmsg, - mach_msg_return_t *mr) + mach_msg_ool_ports_descriptor_t *dsc, + mach_msg_descriptor_t *user_dsc, + int is_64bit, + vm_map_t map, + ipc_space_t space, + ipc_object_t dest, + ipc_kmsg_t kmsg, + mach_msg_option_t *optionp, + mach_msg_return_t *mr) { - void *data; - ipc_object_t *objects; - unsigned int i; - mach_vm_offset_t addr; - mach_msg_type_name_t user_disp; - mach_msg_type_name_t result_disp; - mach_msg_type_number_t count; - mach_msg_copy_options_t copy_option; - boolean_t deallocate; - mach_msg_descriptor_type_t type; - vm_size_t ports_length, names_length; + 
void *data; + ipc_object_t *objects; + unsigned int i; + mach_vm_offset_t addr; + mach_msg_type_name_t user_disp; + mach_msg_type_name_t result_disp; + mach_msg_type_number_t count; + mach_msg_copy_options_t copy_option; + boolean_t deallocate; + mach_msg_descriptor_type_t type; + vm_size_t ports_length, names_length; if (is_64bit) { mach_msg_ool_ports_descriptor64_t *user_ool_dsc = (typeof(user_ool_dsc))user_dsc; @@ -2928,6 +2923,9 @@ ipc_kmsg_copyin_ool_ports_descriptor( } kfree(data, ports_length); dsc->address = NULL; + if ((*optionp & MACH_SEND_KERNEL) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT); + } *mr = MACH_SEND_INVALID_RIGHT; return NULL; } @@ -2970,7 +2968,8 @@ mach_msg_return_t ipc_kmsg_copyin_body( ipc_kmsg_t kmsg, ipc_space_t space, - vm_map_t map) + vm_map_t map, + mach_msg_option_t *optionp) { ipc_object_t dest; mach_msg_body_t *body; @@ -3144,20 +3143,20 @@ ipc_kmsg_copyin_body( switch (user_addr->type.type) { case MACH_MSG_PORT_DESCRIPTOR: user_addr = ipc_kmsg_copyin_port_descriptor((mach_msg_port_descriptor_t *)kern_addr, - (mach_msg_legacy_port_descriptor_t *)user_addr, space, dest, kmsg, &mr); + (mach_msg_legacy_port_descriptor_t *)user_addr, space, dest, kmsg, optionp, &mr); kern_addr++; complex = TRUE; break; case MACH_MSG_OOL_VOLATILE_DESCRIPTOR: case MACH_MSG_OOL_DESCRIPTOR: user_addr = ipc_kmsg_copyin_ool_descriptor((mach_msg_ool_descriptor_t *)kern_addr, - user_addr, is_task_64bit, &paddr, &copy, &space_needed, map, &mr); + user_addr, is_task_64bit, &paddr, &copy, &space_needed, map, optionp, &mr); kern_addr++; complex = TRUE; break; case MACH_MSG_OOL_PORTS_DESCRIPTOR: user_addr = ipc_kmsg_copyin_ool_ports_descriptor((mach_msg_ool_ports_descriptor_t *)kern_addr, - user_addr, is_task_64bit, map, space, dest, kmsg, &mr); + user_addr, is_task_64bit, map, space, dest, kmsg, optionp, &mr); kern_addr++; complex = TRUE; break; @@ -3248,7 +3247,7 @@ ipc_kmsg_copyin( if ((kmsg->ikm_header->msgh_bits &
MACH_MSGH_BITS_COMPLEX) == 0) return MACH_MSG_SUCCESS; - mr = ipc_kmsg_copyin_body( kmsg, space, map); + mr = ipc_kmsg_copyin_body( kmsg, space, map, optionp); /* unreachable if !DEBUG */ __unreachable_ok_push @@ -4823,7 +4822,7 @@ ipc_kmsg_add_trailer(ipc_kmsg_t kmsg, ipc_space_t space __unused, trailer->msgh_seqno = seqno; trailer->msgh_context = context; - trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option); + trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(thread), option); if (minimal_trailer) { goto done; @@ -4846,7 +4845,7 @@ ipc_kmsg_add_trailer(ipc_kmsg_t kmsg, ipc_space_t space __unused, done: #ifdef __arm64__ - ipc_kmsg_munge_trailer(trailer, real_trailer_out, thread_is_64bit(thread)); + ipc_kmsg_munge_trailer(trailer, real_trailer_out, thread_is_64bit_addr(thread)); #endif /* __arm64__ */ return trailer->msgh_trailer_size; diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index f7ff4059c..f5598615f 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -108,8 +108,7 @@ struct ipc_kmsg { mach_msg_priority_t ikm_qos_override; /* qos override on this kmsg */ struct ipc_importance_elem *ikm_importance; /* inherited from */ queue_chain_t ikm_inheritance; /* inherited from link */ - sync_qos_count_t sync_qos[THREAD_QOS_LAST]; /* sync qos counters for ikm_prealloc port */ - sync_qos_count_t special_port_qos; /* special port qos for ikm_prealloc port */ + struct turnstile *ikm_turnstile; /* send turnstile for ikm_prealloc port */ #if MACH_FLIPC struct mach_node *ikm_node; /* Originating node - needed for ack */ #endif diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index 38af0db4c..685950c90 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -120,15 +120,14 @@ static void ipc_mqueue_peek_on_thread( void ipc_mqueue_init( ipc_mqueue_t mqueue, - boolean_t is_set, - uint64_t *reserved_link) + boolean_t is_set) { if (is_set) { 
waitq_set_init(&mqueue->imq_set_queue, SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, - reserved_link, NULL); + NULL, NULL); } else { - waitq_init(&mqueue->imq_wait_queue, SYNC_POLICY_FIFO); + waitq_init(&mqueue->imq_wait_queue, SYNC_POLICY_FIFO | SYNC_POLICY_PORT); ipc_kmsg_queue_init(&mqueue->imq_messages); mqueue->imq_seqno = 0; mqueue->imq_msgcount = 0; @@ -298,6 +297,7 @@ ipc_mqueue_add( kern_return_t kr; assert(reserved_link && *reserved_link != 0); + assert(waitqs_is_linked(set_waitq)); imq_lock(port_mqueue); @@ -371,7 +371,7 @@ ipc_mqueue_add( */ msize = ipc_kmsg_copyout_size(kmsg, th->map); if (th->ith_rsize < - (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(th), th->ith_option))) { + (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(th), th->ith_option))) { th->ith_state = MACH_RCV_TOO_LARGE; th->ith_msize = msize; if (th->ith_option & MACH_RCV_LARGE) { @@ -427,8 +427,25 @@ void ipc_mqueue_changed( ipc_mqueue_t mqueue) { - /* Indicate that this message queue is vanishing */ - knote_vanish(&mqueue->imq_klist); + if (IMQ_KLIST_VALID(mqueue)) { + /* + * Indicate that this message queue is vanishing + * + * When this is called, the associated receive right may be in flight + * between two tasks: the one it used to live in, and the one that armed + * a port destroyed notification for it. + * + * The new process may want to register the port it gets back with an + * EVFILT_MACHPORT filter again, and may have pending sync IPC on this + * port pending already, in which case we want the imq_klist field to be + * reusable for nefarious purposes (see IMQ_SET_INHERITOR). + * + * Fortunately, we really don't need this linkage anymore after this + * point as EV_VANISHED / EV_EOF will be the last thing delivered ever. 
+ */ + knote_vanish(&mqueue->imq_klist); + klist_init(&mqueue->imq_klist); + } waitq_wakeup64_all_locked(&mqueue->imq_wait_queue, IPC_MQUEUE_RECEIVE, @@ -439,13 +456,13 @@ ipc_mqueue_changed( } - + /* * Routine: ipc_mqueue_send * Purpose: * Send a message to a message queue. The message holds a reference - * for the destination port for this message queue in the + * for the destination port for this message queue in the * msgh_remote_port field. * * If unsuccessful, the caller still has possession of @@ -474,7 +491,7 @@ ipc_mqueue_send( * 3) Message is sent to a send-once right. */ if (!imq_full(mqueue) || - (!imq_full_kernel(mqueue) && + (!imq_full_kernel(mqueue) && ((option & MACH_SEND_ALWAYS) || (MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE)))) { @@ -483,9 +500,12 @@ ipc_mqueue_send( imq_unlock(mqueue); } else { thread_t cur_thread = current_thread(); + ipc_port_t port = ip_from_mq(mqueue); + struct turnstile *send_turnstile = TURNSTILE_NULL; + turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; uint64_t deadline; - /* + /* * We have to wait for space to be granted to us. 
*/ if ((option & MACH_SEND_TIMEOUT) && (send_timeout == 0)) { @@ -504,38 +524,65 @@ ipc_mqueue_send( deadline = 0; thread_set_pending_block_hint(cur_thread, kThreadWaitPortSend); - wresult = waitq_assert_wait64_locked( - &mqueue->imq_wait_queue, - IPC_MQUEUE_FULL, - THREAD_ABORTSAFE, - TIMEOUT_URGENCY_USER_NORMAL, - deadline, TIMEOUT_NO_LEEWAY, - cur_thread); + + send_turnstile = turnstile_prepare((uintptr_t)port, + port_send_turnstile_address(port), + TURNSTILE_NULL, TURNSTILE_SYNC_IPC); + + /* Check if the port is in transit, get the destination port's turnstile */ + if (ip_active(port) && + port->ip_receiver_name == MACH_PORT_NULL && + port->ip_destination != NULL) { + inheritor = port_send_turnstile(port->ip_destination); + } else { + inheritor = ipc_port_get_inheritor(port); + } + + turnstile_update_inheritor(send_turnstile, inheritor, + TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_TURNSTILE); + + wresult = waitq_assert_wait64_leeway( + &send_turnstile->ts_waitq, + IPC_MQUEUE_FULL, + THREAD_ABORTSAFE, + TIMEOUT_URGENCY_USER_NORMAL, + deadline, + TIMEOUT_NO_LEEWAY); imq_unlock(mqueue); - + turnstile_update_inheritor_complete(send_turnstile, + TURNSTILE_INTERLOCK_NOT_HELD); + if (wresult == THREAD_WAITING) { wresult = thread_block(THREAD_CONTINUE_NULL); counter(c_ipc_mqueue_send_block++); } - + + /* Call turnstile complete with interlock held */ + imq_lock(mqueue); + turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL); + imq_unlock(mqueue); + + /* Call cleanup after dropping the interlock */ + turnstile_cleanup(); + switch (wresult) { case THREAD_AWAKENED: - /* + /* * we can proceed - inherited msgcount from waker * or the message queue has been destroyed and the msgcount * has been reset to zero (will detect in ipc_mqueue_post()).
*/ break; - + case THREAD_TIMED_OUT: assert(option & MACH_SEND_TIMEOUT); return MACH_SEND_TIMED_OUT; - + case THREAD_INTERRUPTED: return MACH_SEND_INTERRUPTED; - + case THREAD_RESTART: /* mqueue is being destroyed */ return MACH_SEND_INVALID_DEST; @@ -569,12 +616,14 @@ extern void ipc_mqueue_override_send( imq_lock(mqueue); assert(imq_valid(mqueue)); assert(!imq_is_set(mqueue)); - + if (imq_full(mqueue)) { ipc_kmsg_t first = ipc_kmsg_queue_first(&mqueue->imq_messages); - if (first && ipc_kmsg_override_qos(&mqueue->imq_messages, first, override)) - KNOTE(&mqueue->imq_klist, 0); + if (first && ipc_kmsg_override_qos(&mqueue->imq_messages, first, override)) { + if (IMQ_KLIST_VALID(mqueue)) + KNOTE(&mqueue->imq_klist, 0); + } if (!first) full_queue_empty = TRUE; } @@ -608,26 +657,32 @@ extern void ipc_mqueue_override_send( void ipc_mqueue_release_msgcount(ipc_mqueue_t port_mq, ipc_mqueue_t set_mq) { + struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(port_mq)); (void)set_mq; assert(imq_held(port_mq)); assert(port_mq->imq_msgcount > 1 || ipc_kmsg_queue_empty(&port_mq->imq_messages)); port_mq->imq_msgcount--; - if (!imq_full(port_mq) && port_mq->imq_fullwaiters) { + if (!imq_full(port_mq) && port_mq->imq_fullwaiters && + send_turnstile != TURNSTILE_NULL) { /* * boost the priority of the awoken thread * (WAITQ_PROMOTE_PRIORITY) to ensure it uses * the message queue slot we've just reserved. * * NOTE: this will never prepost + * + * The wakeup happens on a turnstile waitq + * which will wakeup the highest priority waiter. + * A potential downside of this would be starving low + * priority senders if there is a constant churn of + * high priority threads trying to send to this port. 
*/ - if (waitq_wakeup64_one_locked(&port_mq->imq_wait_queue, + if (waitq_wakeup64_one(&send_turnstile->ts_waitq, IPC_MQUEUE_FULL, THREAD_AWAKENED, - NULL, - WAITQ_PROMOTE_PRIORITY, - WAITQ_KEEP_LOCKED) != KERN_SUCCESS) { + WAITQ_PROMOTE_PRIORITY) != KERN_SUCCESS) { port_mq->imq_fullwaiters = FALSE; } else { /* gave away our slot - add reference back */ @@ -694,24 +749,26 @@ ipc_mqueue_post( if (receiver == THREAD_NULL) { - /* + /* * no receivers; queue kmsg if space still reserved * Reservations are cancelled when the port goes inactive. * note that this will enqueue the message for any - * "peeking" receivers. + * "peeking" receivers. * * Also, post the knote to wake up any threads waiting * on that style of interface if this insertion is of * note (first insertion, or adjusted override qos all * the way to the head of the queue). - * + * * This is just for ports. portset knotes are stay-active, * and their threads get awakened through the !MACH_RCV_IN_PROGRESS * logic below). */ if (mqueue->imq_msgcount > 0) { - if (ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg)) - KNOTE(&mqueue->imq_klist, 0); + if (ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg)) { + if (IMQ_KLIST_VALID(mqueue)) + KNOTE(&mqueue->imq_klist, 0); + } break; } @@ -722,7 +779,7 @@ ipc_mqueue_post( destroy_msg = TRUE; goto out_unlock; } - + /* * If a thread is attempting a "peek" into the message queue * (MACH_PEEK_IN_PROGRESS), then we enqueue the message and set the @@ -753,7 +810,7 @@ ipc_mqueue_post( continue; } - + /* * We found a waiting thread. 
* If the message is too large or the scatter list is too small @@ -761,7 +818,7 @@ ipc_mqueue_post( */ msize = ipc_kmsg_copyout_size(kmsg, receiver->map); if (receiver->ith_rsize < - (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(receiver), receiver->ith_option))) { + (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(receiver), receiver->ith_option))) { receiver->ith_msize = msize; receiver->ith_state = MACH_RCV_TOO_LARGE; } else { @@ -924,7 +981,7 @@ ipc_mqueue_receive( return; if (wresult == THREAD_WAITING) { - counter((interruptible == THREAD_ABORTSAFE) ? + counter((interruptible == THREAD_ABORTSAFE) ? c_ipc_mqueue_receive_block_user++ : c_ipc_mqueue_receive_block_kernel++); @@ -986,6 +1043,8 @@ ipc_mqueue_receive_on_thread( { wait_result_t wresult; uint64_t deadline; + struct turnstile *rcv_turnstile = TURNSTILE_NULL; + turnstile_inheritor_t inheritor = NULL; /* called with mqueue locked */ @@ -1001,7 +1060,7 @@ ipc_mqueue_receive_on_thread( */ return THREAD_RESTART; } - + if (imq_is_set(mqueue)) { ipc_mqueue_t port_mq = IMQ_NULL; @@ -1040,7 +1099,7 @@ ipc_mqueue_receive_on_thread( /* * Receive on a single port. Just try to get the messages. */ - kmsgs = &mqueue->imq_messages; + kmsgs = &mqueue->imq_messages; if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL) { if (option & MACH_PEEK_MSG) ipc_mqueue_peek_on_thread(mqueue, option, thread); @@ -1054,7 +1113,7 @@ ipc_mqueue_receive_on_thread( panic("Unknown mqueue type 0x%x: likely memory corruption!\n", mqueue->imq_wait_queue.waitq_type); } - + /* * Looks like we'll have to block. The mqueue we will * block on (whether the set's or the local port's) is @@ -1082,6 +1141,37 @@ ipc_mqueue_receive_on_thread( else deadline = 0; + /* + * Threads waiting on a port (not portset) + * will wait on port's receive turnstile. + * Donate waiting thread's turnstile and + * setup inheritor for special reply port. 
+ * Based on the state of the special reply + * port, the inheritor would be the send + * turnstile of the connection port on which + * the send of sync ipc would happen or + * workloop's turnstile who would reply to + * the sync ipc message. + * + * Pass in mqueue wait in waitq_assert_wait to + * support port set wakeup. The mqueue waitq of port + * will be converted to turnstile waitq + * in waitq_assert_wait instead of global waitqs. + */ + if (imq_is_queue(mqueue)) { + ipc_port_t port = ip_from_mq(mqueue); + rcv_turnstile = turnstile_prepare((uintptr_t)port, + port_rcv_turnstile_address(port), + TURNSTILE_NULL, TURNSTILE_SYNC_IPC); + + if (port->ip_specialreply) { + inheritor = ipc_port_get_special_reply_port_inheritor(port); + } + + turnstile_update_inheritor(rcv_turnstile, inheritor, + (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_DELAYED_UPDATE)); + } + thread_set_pending_block_hint(thread, kThreadWaitPortReceive); wresult = waitq_assert_wait64_locked(&mqueue->imq_wait_queue, IPC_MQUEUE_RECEIVE, @@ -1096,6 +1186,12 @@ ipc_mqueue_receive_on_thread( imq_unlock(mqueue); + /* Check if it's a port mqueue and if it needs to call turnstile_update_inheritor_complete */ + if (rcv_turnstile != TURNSTILE_NULL) { + turnstile_update_inheritor_complete(rcv_turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + } + /* It's the caller's responsibility to call turnstile_complete to get the turnstile back */ + return wresult; } @@ -1175,7 +1271,7 @@ ipc_mqueue_select_on_thread( * (and size needed).
*/ msize = ipc_kmsg_copyout_size(kmsg, thread->map); - if (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option) > max_size) { + if (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(thread), option) > max_size) { mr = MACH_RCV_TOO_LARGE; if (option & MACH_RCV_LARGE) { thread->ith_receiver_name = port_mq->imq_receiver_name; @@ -1236,7 +1332,7 @@ ipc_mqueue_peek_locked(ipc_mqueue_t mq, if (seqno == 0) { seqno = mq->imq_seqno; msgoff = 0; - } else if (seqno >= mq->imq_seqno && + } else if (seqno >= mq->imq_seqno && seqno < mq->imq_seqno + mq->imq_msgcount) { msgoff = seqno - mq->imq_seqno; } else @@ -1259,7 +1355,7 @@ ipc_mqueue_peek_locked(ipc_mqueue_t mq, if (msg_idp != NULL) *msg_idp = kmsg->ikm_header->msgh_id; if (msg_trailerp != NULL) - memcpy(msg_trailerp, + memcpy(msg_trailerp, (mach_msg_max_trailer_t *)((vm_offset_t)kmsg->ikm_header + round_msg(kmsg->ikm_header->msgh_size)), sizeof(mach_msg_max_trailer_t)); @@ -1353,7 +1449,7 @@ static int mqueue_peek_iterator(void *ctx, struct waitq *waitq, (void)ctx; (void)wqset; - + if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL) return WQ_ITERATE_BREAK; /* break out of the prepost iteration */ @@ -1482,6 +1578,7 @@ ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue) ipc_kmsg_queue_t kmqueue; ipc_kmsg_t kmsg; boolean_t reap = FALSE; + struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(mqueue)); assert(!imq_is_set(mqueue)); @@ -1491,12 +1588,13 @@ ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue) * (never preposts) */ mqueue->imq_fullwaiters = FALSE; - waitq_wakeup64_all_locked(&mqueue->imq_wait_queue, - IPC_MQUEUE_FULL, - THREAD_RESTART, - NULL, - WAITQ_ALL_PRIORITIES, - WAITQ_KEEP_LOCKED); + + if (send_turnstile != TURNSTILE_NULL) { + waitq_wakeup64_all(&send_turnstile->ts_waitq, + IPC_MQUEUE_FULL, + THREAD_RESTART, + WAITQ_ALL_PRIORITIES); + } /* * Move messages from the specified queue to the per-thread @@ -1559,6 +1657,7 @@ ipc_mqueue_set_qlimit( imq_lock(mqueue); if (qlimit > mqueue->imq_qlimit) { 
mach_port_msgcount_t i, wakeup; + struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(mqueue)); /* caution: wakeup, qlimit are unsigned */ wakeup = qlimit - mqueue->imq_qlimit; @@ -1571,12 +1670,11 @@ ipc_mqueue_set_qlimit( * * NOTE: this will never prepost */ - if (waitq_wakeup64_one_locked(&mqueue->imq_wait_queue, - IPC_MQUEUE_FULL, - THREAD_AWAKENED, - NULL, - WAITQ_PROMOTE_PRIORITY, - WAITQ_KEEP_LOCKED) == KERN_NOT_WAITING) { + if (send_turnstile == TURNSTILE_NULL || + waitq_wakeup64_one(&send_turnstile->ts_waitq, + IPC_MQUEUE_FULL, + THREAD_AWAKENED, + WAITQ_PROMOTE_PRIORITY) == KERN_NOT_WAITING) { mqueue->imq_fullwaiters = FALSE; break; } diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index af0a534a4..05d952ef3 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -86,7 +86,7 @@ typedef struct ipc_mqueue { struct { struct waitq waitq; struct ipc_kmsg_queue messages; - mach_port_seqno_t seqno; + mach_port_seqno_t seqno; mach_port_name_t receiver_name; uint16_t msgcount; uint16_t qlimit; @@ -98,11 +98,31 @@ typedef struct ipc_mqueue { struct waitq_set setq; } pset; } data; - struct klist imq_klist; + union { + struct klist imq_klist; + uintptr_t imq_inheritor; + }; } *ipc_mqueue_t; #define IMQ_NULL ((ipc_mqueue_t) 0) +/* + * When a receive right is in flight, before it can ever be registered with + * a new knote, its imq_klist field can be overloaded to hold a pointer + * to the knote that the port is pushing on through his turnstile. + * + * if IMQ_KLIST_VALID() returns true, then the imq_klist field can be used, + * else IMQ_INHERITOR() can be used to get the pointer to the knote currently + * being the port turnstile inheritor. 
+ */ +#define IMQ_KLIST_VALID(imq) (((imq)->imq_inheritor & 1) == 0) +#define IMQ_INHERITOR(imq) ((struct turnstile *)((imq)->imq_inheritor ^ 1)) +#define IMQ_SET_INHERITOR(imq, inheritor) \ +MACRO_BEGIN \ + assert(((imq)->imq_inheritor & 1) || SLIST_EMPTY(&(imq)->imq_klist)); \ + ((imq)->imq_inheritor = (uintptr_t)(inheritor) | 1); \ +MACRO_END + #define imq_wait_queue data.port.waitq #define imq_messages data.port.messages #define imq_msgcount data.port.msgcount @@ -141,11 +161,11 @@ typedef struct ipc_mqueue { #define imq_from_waitq(waitq) (waitq_is_set(waitq) ? \ ((struct ipc_mqueue *)((void *)( \ (uintptr_t)(waitq) - \ - __offsetof(struct ipc_mqueue, imq_wait_queue)) \ + __offsetof(struct ipc_mqueue, imq_set_queue)) \ )) : \ ((struct ipc_mqueue *)((void *)( \ (uintptr_t)(waitq) - \ - __offsetof(struct ipc_mqueue, imq_set_queue)) \ + __offsetof(struct ipc_mqueue, imq_wait_queue)) \ )) \ ) @@ -171,8 +191,7 @@ extern int ipc_mqueue_full; /* Initialize a newly-allocated message queue */ extern void ipc_mqueue_init( ipc_mqueue_t mqueue, - boolean_t is_set, - uint64_t *reserved_link); + boolean_t is_set); /* de-initialize / cleanup an mqueue (specifically waitq resources) */ extern void ipc_mqueue_deinit( diff --git a/osfmk/ipc/ipc_notify.c b/osfmk/ipc/ipc_notify.c index d1f50d5c7..44e6ed6d9 100644 --- a/osfmk/ipc/ipc_notify.c +++ b/osfmk/ipc/ipc_notify.c @@ -158,7 +158,7 @@ void ipc_notify_send_once( ipc_port_t port) { - ipc_port_unlink_special_reply_port(port, IPC_PORT_UNLINK_SR_NONE); + ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE); (void)mach_notify_send_once(port); /* send-once right consumed */ diff --git a/osfmk/ipc/ipc_object.c b/osfmk/ipc/ipc_object.c index 5ff293fca..6b40e4761 100644 --- a/osfmk/ipc/ipc_object.c +++ b/osfmk/ipc/ipc_object.c @@ -202,11 +202,13 @@ ipc_object_translate_two( if ((entry1->ie_bits & MACH_PORT_TYPE(right1)) == MACH_PORT_TYPE_NONE) { is_read_unlock(space); + mach_port_guard_exception(name1, 0, 0, 
kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } if ((entry2->ie_bits & MACH_PORT_TYPE(right2)) == MACH_PORT_TYPE_NONE) { is_read_unlock(space); + mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } @@ -576,6 +578,7 @@ ipc_object_copyin_from_kernel( ipc_port_t port = (ipc_port_t) object; ip_lock(port); + imq_lock(&port->ip_messages); assert(ip_active(port)); if (port->ip_destination != IP_NULL) { assert(port->ip_receiver == ipc_space_kernel); @@ -586,6 +589,7 @@ ipc_object_copyin_from_kernel( port->ip_receiver_name = MACH_PORT_NULL; port->ip_destination = IP_NULL; } + imq_unlock(&port->ip_messages); ip_unlock(port); break; } @@ -748,6 +752,7 @@ ipc_object_copyout( boolean_t overflow, mach_port_name_t *namep) { + struct knote *kn = current_thread()->ith_knote; mach_port_name_t name; ipc_entry_t entry; kern_return_t kr; @@ -755,6 +760,11 @@ ipc_object_copyout( assert(IO_VALID(object)); assert(io_otype(object) == IOT_PORT); + if (ITH_KNOTE_VALID(kn, msgt_name)) { + filt_machport_turnstile_prepare_lazily(kn, + msgt_name, (ipc_port_t)object); + } + is_write_lock(space); for (;;) { @@ -842,6 +852,7 @@ ipc_object_copyout_name( ipc_entry_t oentry; ipc_entry_t entry; kern_return_t kr; + struct knote *kn = current_thread()->ith_knote; #if IMPORTANCE_INHERITANCE int assertcnt = 0; @@ -851,6 +862,11 @@ ipc_object_copyout_name( assert(IO_VALID(object)); assert(io_otype(object) == IOT_PORT); + if (ITH_KNOTE_VALID(kn, msgt_name)) { + filt_machport_turnstile_prepare_lazily(kn, + msgt_name, (ipc_port_t)object); + } + kr = ipc_entry_alloc_name(space, name, &entry); if (kr != KERN_SUCCESS) return kr; diff --git a/osfmk/ipc/ipc_object.h b/osfmk/ipc/ipc_object.h index 6aaf285a6..17e5abc02 100644 --- a/osfmk/ipc/ipc_object.h +++ b/osfmk/ipc/ipc_object.h @@ -236,7 +236,7 @@ io_release(ipc_object_t io) { } } -/* +/* * Retrieve a label for use in a kernel call that takes a security * label as a parameter. 
If necessary, io_getlabel acquires internal * (not io_lock) locks, and io_unlocklabel releases them. diff --git a/osfmk/ipc/ipc_port.c b/osfmk/ipc/ipc_port.c index 871f98f49..823abe3a7 100644 --- a/osfmk/ipc/ipc_port.c +++ b/osfmk/ipc/ipc_port.c @@ -92,6 +92,7 @@ #include #include #include +#include #include @@ -638,10 +639,14 @@ ipc_port_init( port->ip_impcount = 0; port->ip_specialreply = 0; - port->ip_link_sync_qos = 0; + port->ip_sync_link_state = PORT_SYNC_LINK_ANY; + + reset_ip_srp_bits(port); + + port->ip_send_turnstile = TURNSTILE_NULL; ipc_mqueue_init(&port->ip_messages, - FALSE /* !set */, NULL /* no reserved link */); + FALSE /* !set */); } /* @@ -919,13 +924,15 @@ ipc_port_destroy(ipc_port_t port) port->ip_pdrequest = IP_NULL; /* make port be in limbo */ + imq_lock(&port->ip_messages); port->ip_receiver_name = MACH_PORT_NULL; port->ip_destination = IP_NULL; + imq_unlock(&port->ip_messages); ip_unlock(port); if (special_reply) { - ipc_port_unlink_special_reply_port(port, - IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE); + ipc_port_adjust_special_reply_port(port, + IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE, FALSE); } /* consumes our refs for port and pdrequest */ ipc_notify_port_destroyed(pdrequest, port); @@ -933,8 +940,11 @@ ipc_port_destroy(ipc_port_t port) goto drop_assertions; } + /* port active bit needs to be guarded under mqueue lock for turnstiles */ + imq_lock(&port->ip_messages); port->ip_object.io_bits &= ~IO_BITS_ACTIVE; port->ip_timestamp = ipc_port_timestamp(); + imq_unlock(&port->ip_messages); nsrequest = port->ip_nsrequest; /* @@ -966,7 +976,7 @@ ipc_port_destroy(ipc_port_t port) kmsg = port->ip_premsg; assert(kmsg != IKM_NULL); inuse_port = ikm_prealloc_inuse_port(kmsg); - IP_CLEAR_PREALLOC(port, kmsg); + ipc_kmsg_clear_prealloc(kmsg, port); ip_unlock(port); if (inuse_port != IP_NULL) { assert(inuse_port == port); @@ -979,8 +989,8 @@ ipc_port_destroy(ipc_port_t port) /* unlink the kmsg from special reply port */ if (special_reply) { - 
ipc_port_unlink_special_reply_port(port, - IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE); + ipc_port_adjust_special_reply_port(port, + IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE, FALSE); } /* throw away no-senders request */ @@ -1056,9 +1066,6 @@ ipc_port_check_circularity( return ipc_importance_check_circularity(port, dest); #else ipc_port_t base; - sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0}; - sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0}; - boolean_t update_knote = FALSE; assert(port != IP_NULL); assert(dest != IP_NULL); @@ -1067,6 +1074,9 @@ ipc_port_check_circularity( return TRUE; base = dest; + /* Check if destination needs a turnstile */ + ipc_port_send_turnstile_prepare(dest); + /* * First try a quick check that can run in parallel. * No circularity if dest is not in transit. @@ -1115,19 +1125,21 @@ ipc_port_check_circularity( assert(port->ip_receiver_name == MACH_PORT_NULL); assert(port->ip_destination == IP_NULL); - while (dest != IP_NULL) { + base = dest; + while (base != IP_NULL) { ipc_port_t next; /* dest is in transit or in limbo */ - assert(ip_active(dest)); - assert(dest->ip_receiver_name == MACH_PORT_NULL); + assert(ip_active(base)); + assert(base->ip_receiver_name == MACH_PORT_NULL); - next = dest->ip_destination; - ip_unlock(dest); - dest = next; + next = base->ip_destination; + ip_unlock(base); + base = next; } + ipc_port_send_turnstile_complete(dest); return TRUE; } @@ -1141,7 +1153,7 @@ ipc_port_check_circularity( ipc_port_multiple_unlock(); not_circular: - imq_lock(&base->ip_messages); + imq_lock(&port->ip_messages); /* port is in limbo */ @@ -1152,18 +1164,27 @@ ipc_port_check_circularity( ip_reference(dest); port->ip_destination = dest; - /* Capture the sync qos count delta */ - for (int i = 0; i < THREAD_QOS_LAST; i++) { - sync_qos_delta_add[i] = port_sync_qos(port, i); + /* Setup linkage for source port if it has sync ipc push */ + struct turnstile *send_turnstile = TURNSTILE_NULL; + if (port_send_turnstile(port)) { + 
send_turnstile = turnstile_prepare((uintptr_t)port, + port_send_turnstile_address(port), + TURNSTILE_NULL, TURNSTILE_SYNC_IPC); + + turnstile_update_inheritor(send_turnstile, port_send_turnstile(dest), + (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE)); + + /* update complete and turnstile complete called after dropping all locks */ } + imq_unlock(&port->ip_messages); /* now unlock chain */ ip_unlock(port); for (;;) { - /* every port along chain tracks override behind it */ - update_knote = ipc_port_sync_qos_delta(dest, sync_qos_delta_add, sync_qos_delta_sub); + ipc_port_t next; + if (dest == base) break; @@ -1173,9 +1194,9 @@ ipc_port_check_circularity( assert(dest->ip_receiver_name == MACH_PORT_NULL); assert(dest->ip_destination != IP_NULL); - port = dest->ip_destination; + next = dest->ip_destination; ip_unlock(dest); - dest = port; + dest = next; } /* base is not in transit */ @@ -1183,456 +1204,476 @@ ipc_port_check_circularity( (base->ip_receiver_name != MACH_PORT_NULL) || (base->ip_destination == IP_NULL)); - if (update_knote) { - KNOTE(&base->ip_messages.imq_klist, 0); - } - imq_unlock(&base->ip_messages); - ip_unlock(base); + /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */ + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + + /* Take the mq lock to call turnstile complete */ + imq_lock(&port->ip_messages); + turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL); + send_turnstile = TURNSTILE_NULL; + imq_unlock(&port->ip_messages); + turnstile_cleanup(); + } + return FALSE; #endif /* !IMPORTANCE_INHERITANCE */ } -/* - * Routine: ipc_port_link_special_reply_port_with_qos - * Purpose: - * Link the special reply port with the destination port. - * Update the sync qos count of special reply port, - * destination port. - * - * Conditions: - * Nothing is locked. 
- */ -kern_return_t -ipc_port_link_special_reply_port_with_qos( - ipc_port_t special_reply_port, - ipc_port_t dest_port, - int qos) +struct turnstile * +ipc_port_get_inheritor(ipc_port_t port) { - ipc_port_t next, base; - sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0}; - sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0}; - boolean_t update_knote = FALSE; - boolean_t multiple_lock = FALSE; + ipc_mqueue_t mqueue = &port->ip_messages; + struct knote *kn; - ip_lock(dest_port); + assert(imq_held(mqueue)); - /* Check if dest is active */ - if (!ip_active(dest_port)) { - ip_unlock(dest_port); - return KERN_FAILURE; + if (!IMQ_KLIST_VALID(mqueue)) { + return IMQ_INHERITOR(mqueue); } - if ((dest_port->ip_receiver_name == MACH_PORT_NULL) && - (dest_port->ip_destination != IP_NULL)) { - /* dest_port is in transit; need to take the serialize lock */ - ip_unlock(dest_port); - goto take_multiple_lock; - } - - /* Check if the port is a special reply port */ - if (ip_lock_try(special_reply_port)) { - if (!special_reply_port->ip_specialreply || - !special_reply_port->ip_link_sync_qos || - (special_reply_port->ip_sync_qos_override_port != IP_NULL && - special_reply_port->ip_sync_qos_override_port != dest_port)) { - - boolean_t link_sync_qos = special_reply_port->ip_link_sync_qos; - ip_unlock(special_reply_port); - ip_unlock(dest_port); - /* return KERN_SUCCESS when link_sync_qos is not set */ - if (!link_sync_qos) { - return KERN_SUCCESS; - } - return KERN_FAILURE; - } else { - goto both_ports_locked; + SLIST_FOREACH(kn, &port->ip_messages.imq_klist, kn_selnext) { + if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) { + return filt_machport_kqueue_turnstile(kn); } } - ip_unlock(dest_port); - -take_multiple_lock: + return TURNSTILE_NULL; +} - ipc_port_multiple_lock(); /* massive serialization */ - multiple_lock = TRUE; +/* + * Routine: ipc_port_send_turnstile_prepare + * Purpose: + * Get a reference on port's send turnstile, if + * port does 
not have a send turnstile then allocate one. + * + * Conditions: + * Nothing is locked. + */ +void +ipc_port_send_turnstile_prepare(ipc_port_t port) +{ + struct turnstile *turnstile = TURNSTILE_NULL; + struct turnstile *inheritor = TURNSTILE_NULL; + struct turnstile *send_turnstile = TURNSTILE_NULL; - ip_lock(special_reply_port); +retry_alloc: + imq_lock(&port->ip_messages); - /* Check if the special reply port is marked regular */ - if (!special_reply_port->ip_specialreply || - !special_reply_port->ip_link_sync_qos || - (special_reply_port->ip_sync_qos_override_port != IP_NULL && - special_reply_port->ip_sync_qos_override_port != dest_port)) { + if (port_send_turnstile(port) == NULL || + port_send_turnstile(port)->ts_port_ref == 0) { - boolean_t link_sync_qos = special_reply_port->ip_link_sync_qos; - ip_unlock(special_reply_port); - ipc_port_multiple_unlock(); - /* return KERN_SUCCESS when link_sync_qos is not set */ - if (!link_sync_qos) { - return KERN_SUCCESS; + if (turnstile == TURNSTILE_NULL) { + imq_unlock(&port->ip_messages); + turnstile = turnstile_alloc(); + goto retry_alloc; } - return KERN_FAILURE; - } - ip_lock(dest_port); + send_turnstile = turnstile_prepare((uintptr_t)port, + port_send_turnstile_address(port), + turnstile, TURNSTILE_SYNC_IPC); + turnstile = TURNSTILE_NULL; -both_ports_locked: - next = dest_port; + /* + * if port in transit, setup linkage for its turnstile, + * otherwise the link it to WL turnstile. 
+ */ + if (ip_active(port) && + port->ip_receiver_name == MACH_PORT_NULL && + port->ip_destination != IP_NULL) { + assert(port->ip_receiver_name == MACH_PORT_NULL); + assert(port->ip_destination != IP_NULL); - /* Apply the qos to special reply port, capture the old qos */ - if (special_reply_port->ip_sync_qos_override_port != IP_NULL) { - /* Check if qos needs to be updated */ - if ((sync_qos_count_t)qos <= port_special_qos(special_reply_port)) { - imq_lock(&dest_port->ip_messages); - goto done_update; + inheritor = port_send_turnstile(port->ip_destination); + } else { + inheritor = ipc_port_get_inheritor(port); } - sync_qos_delta_sub[port_special_qos(special_reply_port)]++; + turnstile_update_inheritor(send_turnstile, inheritor, + TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE); + /* turnstile complete will be called in ipc_port_send_turnstile_complete */ } - set_port_special_qos(special_reply_port, (sync_qos_count_t)qos); - sync_qos_delta_add[qos]++; + /* Increment turnstile counter */ + port_send_turnstile(port)->ts_port_ref++; + imq_unlock(&port->ip_messages); - /* Link the special reply port to dest port */ - if (special_reply_port->ip_sync_qos_override_port == IP_NULL) { - /* take a reference on dest_port */ - ip_reference(dest_port); - special_reply_port->ip_sync_qos_override_port = dest_port; + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, + TURNSTILE_INTERLOCK_NOT_HELD); } - - /* Apply the sync qos delta to all in-transit ports */ - for (;;) { - boolean_t port_not_in_transit = FALSE; - if (!ip_active(next) || - (next->ip_receiver_name != MACH_PORT_NULL) || - (next->ip_destination == IP_NULL)) { - /* Get the mqueue lock for destination port to update knotes */ - imq_lock(&next->ip_messages); - port_not_in_transit = TRUE; - } - /* Apply the sync qos delta */ - update_knote = ipc_port_sync_qos_delta(next, sync_qos_delta_add, sync_qos_delta_sub); - - if (port_not_in_transit) - break; - - next = next->ip_destination; - 
ip_lock(next); + if (turnstile != TURNSTILE_NULL) { + turnstile_deallocate(turnstile); } -done_update: +} - if (multiple_lock) { - ipc_port_multiple_unlock(); - } - ip_unlock(special_reply_port); - base = next; - next = dest_port; +/* + * Routine: ipc_port_send_turnstile_complete + * Purpose: + * Drop a ref on the port's send turnstile, if the + * ref becomes zero, deallocate the turnstile. + * + * Conditions: + * The space might be locked, use safe deallocate. + */ +void +ipc_port_send_turnstile_complete(ipc_port_t port) +{ + struct turnstile *turnstile = TURNSTILE_NULL; - while (next != base) { - ipc_port_t prev = next; - next = next->ip_destination; + /* Drop turnstile count on dest port */ + imq_lock(&port->ip_messages); - ip_unlock(prev); + port_send_turnstile(port)->ts_port_ref--; + if (port_send_turnstile(port)->ts_port_ref == 0) { + turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), + &turnstile); + assert(turnstile != TURNSTILE_NULL); } + imq_unlock(&port->ip_messages); + turnstile_cleanup(); - if (update_knote) { - KNOTE(&base->ip_messages.imq_klist, 0); + if (turnstile != TURNSTILE_NULL) { + turnstile_deallocate_safe(turnstile); + turnstile = TURNSTILE_NULL; } - imq_unlock(&base->ip_messages); - ip_unlock(base); - return KERN_SUCCESS; } + /* - * Routine: ipc_port_unlink_special_reply_port_locked + * Routine: ipc_port_rcv_turnstile_waitq * Purpose: - * If the special port is linked to a port, adjust it's sync qos override and unlink the port. - * Condition: - * Special reply port locked on entry. - * Special reply port unlocked on return. - * Returns: - * None. + * Given the mqueue's waitq, find the port's + * rcv turnstile and return its waitq. + * + * Conditions: + * mqueue locked or thread waiting on turnstile is locked. 
*/ -void -ipc_port_unlink_special_reply_port_locked( - ipc_port_t special_reply_port, - struct knote *kn, - uint8_t flags) +struct waitq * +ipc_port_rcv_turnstile_waitq(struct waitq *waitq) { - ipc_port_t dest_port; - sync_qos_count_t sync_qos; - sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0}; - sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0}; + struct waitq *safeq; - /* Return if called from copy out in pseudo receive */ - if (kn == ITH_KNOTE_PSEUDO) { - ip_unlock(special_reply_port); - return; - } - - /* check if special port has a port linked to it */ - if (special_reply_port->ip_specialreply == 0 || - special_reply_port->ip_sync_qos_override_port == IP_NULL) { - set_port_special_qos(special_reply_port, 0); - if (flags & IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY) { - special_reply_port->ip_specialreply = 0; - } - if (flags & IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE) { - special_reply_port->ip_link_sync_qos = 1; - } - ip_unlock(special_reply_port); - return; - } - - /* - * port->ip_sync_qos_override_port is not null and it is safe - * to access it since ip_specialreply is set. 
- */ - dest_port = special_reply_port->ip_sync_qos_override_port; - sync_qos_delta_sub[port_special_qos(special_reply_port)]++; - sync_qos = port_special_qos(special_reply_port); - - /* Clear qos delta for special reply port */ - set_port_special_qos(special_reply_port, 0); - special_reply_port->ip_sync_qos_override_port = IP_NULL; - if (flags & IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY) { - special_reply_port->ip_specialreply = 0; - } + ipc_mqueue_t mqueue = imq_from_waitq(waitq); + ipc_port_t port = ip_from_mq(mqueue); + struct turnstile *rcv_turnstile = ipc_port_rcv_turnstile(port); - if (flags & IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE) { - special_reply_port->ip_link_sync_qos = 1; + /* Check if the port has a rcv turnstile */ + if (rcv_turnstile != TURNSTILE_NULL) { + safeq = &rcv_turnstile->ts_waitq; } else { - special_reply_port->ip_link_sync_qos = 0; + safeq = global_eventq(waitq); } - - ip_unlock(special_reply_port); - - /* Add the sync qos on knote */ - if (ITH_KNOTE_VALID(kn)) { - knote_adjust_sync_qos(kn, sync_qos, TRUE); - } - - /* Adjust the sync qos of destination */ - ipc_port_adjust_sync_qos(dest_port, sync_qos_delta_add, sync_qos_delta_sub); - ip_release(dest_port); + return safeq; } + /* - * Routine: ipc_port_unlink_special_reply_port + * Routine: ipc_port_rcv_turnstile * Purpose: - * If the special port is linked to a port, adjust it's sync qos override and unlink the port. - * Condition: - * Nothing locked. - * Returns: - * None. + * Get the port's receive turnstile + * + * Conditions: + * mqueue locked or thread waiting on turnstile is locked. 
*/ -void -ipc_port_unlink_special_reply_port( - ipc_port_t special_reply_port, - uint8_t flags) +struct turnstile * +ipc_port_rcv_turnstile(ipc_port_t port) { - ip_lock(special_reply_port); - ipc_port_unlink_special_reply_port_locked(special_reply_port, NULL, flags); - /* special_reply_port unlocked */ + return turnstile_lookup_by_proprietor((uintptr_t)port); } + /* - * Routine: ipc_port_sync_qos_delta + * Routine: ipc_port_link_special_reply_port * Purpose: - * Adjust the sync qos count associated with a port. + * Link the special reply port with the destination port. + * Allocates turnstile to dest port. * - * For now, be defensive during deductions to make sure the - * sync_qos count for the port doesn't underflow zero. - * Returns: - * TRUE: if max sync qos of the port changes. - * FALSE: otherwise. * Conditions: - * The port is referenced and locked. - * The mqueue is locked if port is not in-transit. + * Nothing is locked. */ -boolean_t -ipc_port_sync_qos_delta( - ipc_port_t port, - sync_qos_count_t *sync_qos_delta_add, - sync_qos_count_t *sync_qos_delta_sub) +void +ipc_port_link_special_reply_port( + ipc_port_t special_reply_port, + ipc_port_t dest_port) { - sync_qos_count_t max_sync_qos_index; + boolean_t drop_turnstile_ref = FALSE; - if (!ip_active(port)) { - return FALSE; - } + /* Check if dest_port needs a turnstile */ + ipc_port_send_turnstile_prepare(dest_port); + + /* Lock the special reply port and establish the linkage */ + ip_lock(special_reply_port); + imq_lock(&special_reply_port->ip_messages); - max_sync_qos_index = ipc_port_get_max_sync_qos_index(port); + /* Check if we need to drop the acquired turnstile ref on dest port */ + if (!special_reply_port->ip_specialreply || + special_reply_port->ip_sync_link_state != PORT_SYNC_LINK_ANY || + special_reply_port->ip_sync_inheritor_port != IPC_PORT_NULL) { + drop_turnstile_ref = TRUE; + } else { + /* take a reference on dest_port */ + ip_reference(dest_port); + 
special_reply_port->ip_sync_inheritor_port = dest_port; + special_reply_port->ip_sync_link_state = PORT_SYNC_LINK_PORT; + } - for (int i = 0; i < THREAD_QOS_LAST; i++) { - sync_qos_count_t port_sync_qos_count = port_sync_qos(port, i); - /* Do not let the sync qos underflow */ - if (sync_qos_delta_sub[i] > port_sync_qos_count) { - KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_UNDERFLOW), - i, VM_KERNEL_UNSLIDE_OR_PERM(port), - port_sync_qos_count, sync_qos_delta_sub[i]); + imq_unlock(&special_reply_port->ip_messages); + ip_unlock(special_reply_port); - set_port_sync_qos(port, i, 0); - } else if (sync_qos_delta_sub[i] != 0) { - KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_REMOVED), - i, VM_KERNEL_UNSLIDE_OR_PERM(port), - port_sync_qos_count, sync_qos_delta_sub[i]); + if (drop_turnstile_ref) { + ipc_port_send_turnstile_complete(dest_port); + } - set_port_sync_qos(port, i, (port_sync_qos_count - sync_qos_delta_sub[i])); - } + return; +} - port_sync_qos_count = port_sync_qos(port, i); - /* Do not let the sync qos overflow */ - if (UCHAR_MAX - sync_qos_delta_add[i] < port_sync_qos_count) { - KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_OVERFLOW), - i, VM_KERNEL_UNSLIDE_OR_PERM(port), - port_sync_qos_count, sync_qos_delta_add[i]); +#if DEVELOPMENT || DEBUG +inline void +reset_ip_srp_bits(ipc_port_t special_reply_port) +{ + special_reply_port->ip_srp_lost_link = 0; + special_reply_port->ip_srp_msg_sent = 0; +} - set_port_sync_qos(port, i, UCHAR_MAX); - } else if (sync_qos_delta_add[i] != 0) { - KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_APPLIED), - i, VM_KERNEL_UNSLIDE_OR_PERM(port), - port_sync_qos_count, sync_qos_delta_add[i]); +inline void +reset_ip_srp_msg_sent(ipc_port_t special_reply_port) +{ + if (special_reply_port->ip_specialreply == 1) { + special_reply_port->ip_srp_msg_sent = 0; + } +} - set_port_sync_qos(port, i, (port_sync_qos_count + sync_qos_delta_add[i])); - } +inline void 
+set_ip_srp_msg_sent(ipc_port_t special_reply_port) +{ + if (special_reply_port->ip_specialreply == 1) { + special_reply_port->ip_srp_msg_sent = 1; } - return (ipc_port_get_max_sync_qos_index(port) != max_sync_qos_index); } -/* - * Routine: ipc_port_get_max_sync_qos_index - * Purpose: - * Return the max sync qos of the port. - * - * Conditions: - */ -sync_qos_count_t -ipc_port_get_max_sync_qos_index( - ipc_port_t port) +inline void +set_ip_srp_lost_link(ipc_port_t special_reply_port) { - int i; - for (i = THREAD_QOS_LAST - 1; i >= 0; i--) { - if (port_sync_qos(port, i) != 0) { - return i; - } + if (special_reply_port->ip_specialreply == 1 && special_reply_port->ip_srp_msg_sent == 0) { + special_reply_port->ip_srp_lost_link = 1; } - return THREAD_QOS_UNSPECIFIED; } +#else /* DEVELOPMENT || DEBUG */ +inline void +reset_ip_srp_bits(__unused ipc_port_t special_reply_port) +{ + return; +} + +inline void +reset_ip_srp_msg_sent(__unused ipc_port_t special_reply_port) +{ + return; +} + +inline void +set_ip_srp_msg_sent(__unused ipc_port_t special_reply_port) +{ + return; +} + +inline void +set_ip_srp_lost_link(__unused ipc_port_t special_reply_port) +{ + return; +} +#endif /* DEVELOPMENT || DEBUG */ + /* - * Routine: ipc_port_adjust_sync_qos + * Routine: ipc_port_adjust_special_reply_port_locked * Purpose: - * Adjust sync qos of the port and it's destination - * port if the port is in transit. - * Conditions: - * Nothing locked. + * If the special port has a turnstile, update it's inheritor. + * Condition: + * Special reply port locked on entry. + * Special reply port unlocked on return. * Returns: * None. 
*/ void -ipc_port_adjust_sync_qos( - ipc_port_t port, - sync_qos_count_t *sync_qos_delta_add, - sync_qos_count_t *sync_qos_delta_sub) +ipc_port_adjust_special_reply_port_locked( + ipc_port_t special_reply_port, + struct knote *kn, + uint8_t flags, + boolean_t get_turnstile) { - boolean_t update_knote; - boolean_t multiple_lock = FALSE; - ipc_port_t dest, base, next; + ipc_port_t dest_port = IPC_PORT_NULL; + int sync_link_state = PORT_SYNC_LINK_NO_LINKAGE; + turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; + struct turnstile *dest_ts = TURNSTILE_NULL, *ts = TURNSTILE_NULL; - ip_lock(port); + imq_lock(&special_reply_port->ip_messages); - /* Check if the port is in transit */ - if (!ip_active(port) || - (port->ip_receiver_name != MACH_PORT_NULL) || - (port->ip_destination == IP_NULL)) { - /* lock the mqueue since port is not in-transit */ - imq_lock(&port->ip_messages); - update_knote = ipc_port_sync_qos_delta(port, sync_qos_delta_add, sync_qos_delta_sub); - if (update_knote) { - KNOTE(&port->ip_messages.imq_klist, 0); + if (flags & IPC_PORT_ADJUST_SR_RECEIVED_MSG) { + reset_ip_srp_msg_sent(special_reply_port); + } + + /* Check if the special reply port is marked non-special */ + if (special_reply_port->ip_specialreply == 0 || + special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY) { + if (get_turnstile) { + turnstile_complete((uintptr_t)special_reply_port, + port_rcv_turnstile_address(special_reply_port), + NULL); + } + imq_unlock(&special_reply_port->ip_messages); + ip_unlock(special_reply_port); + if (get_turnstile) { + turnstile_cleanup(); } - imq_unlock(&port->ip_messages); - ip_unlock(port); return; } - dest = port->ip_destination; - assert(dest != IP_NULL); + /* Clear thread's special reply port and clear linkage */ + if (flags & IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY) { + /* This option should only be specified by a non blocking thread */ + assert(get_turnstile == FALSE); + special_reply_port->ip_specialreply = 0; - if (ip_lock_try(dest)) { 
- if (!ip_active(dest) || - (dest->ip_receiver_name != MACH_PORT_NULL) || - (dest->ip_destination == IP_NULL)) { - update_knote = ipc_port_sync_qos_delta(port, sync_qos_delta_add, sync_qos_delta_sub); - ip_unlock(port); + reset_ip_srp_bits(special_reply_port); - /* lock the mqueue since dest is not in-transit */ - imq_lock(&dest->ip_messages); - update_knote = ipc_port_sync_qos_delta(dest, sync_qos_delta_add, sync_qos_delta_sub); - if (update_knote) { - KNOTE(&dest->ip_messages.imq_klist, 0); - } - imq_unlock(&dest->ip_messages); - ip_unlock(dest); + /* Check if need to break linkage */ + if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_NO_LINKAGE) { + imq_unlock(&special_reply_port->ip_messages); + ip_unlock(special_reply_port); return; } - - /* dest is in transit; need to take the serialize lock */ - ip_unlock(dest); + } else if (flags & IPC_PORT_ADJUST_SR_LINK_WORKLOOP) { + if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY || + special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_PORT) { + if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_SEND_ONCE)) { + inheritor = filt_machport_stash_port(kn, special_reply_port, + &sync_link_state); + } + } + } else if (flags & IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE) { + sync_link_state = PORT_SYNC_LINK_ANY; + } + + switch (special_reply_port->ip_sync_link_state) { + case PORT_SYNC_LINK_PORT: + dest_port = special_reply_port->ip_sync_inheritor_port; + special_reply_port->ip_sync_inheritor_port = IPC_PORT_NULL; + break; + case PORT_SYNC_LINK_WORKLOOP_KNOTE: + special_reply_port->ip_sync_inheritor_knote = NULL; + break; + case PORT_SYNC_LINK_WORKLOOP_STASH: + dest_ts = special_reply_port->ip_sync_inheritor_ts; + special_reply_port->ip_sync_inheritor_ts = NULL; + break; + } + + special_reply_port->ip_sync_link_state = sync_link_state; + + switch (sync_link_state) { + case PORT_SYNC_LINK_WORKLOOP_KNOTE: + special_reply_port->ip_sync_inheritor_knote = kn; + break; + case PORT_SYNC_LINK_WORKLOOP_STASH: + 
turnstile_reference(inheritor); + special_reply_port->ip_sync_inheritor_ts = inheritor; + break; + case PORT_SYNC_LINK_NO_LINKAGE: + if (flags & IPC_PORT_ADJUST_SR_ENABLE_EVENT) { + set_ip_srp_lost_link(special_reply_port); + } + break; } - ip_unlock(port); - - ipc_port_multiple_lock(); /* massive serialization */ - multiple_lock = TRUE; - - ip_lock(port); - next = port; - - /* Apply the sync qos delta to all in-transit ports */ - for (;;) { - boolean_t port_not_in_transit = FALSE; - - if (!ip_active(next) || - (next->ip_receiver_name != MACH_PORT_NULL) || - (next->ip_destination == IP_NULL)) { - /* Get the mqueue lock for destination port to update knotes */ - imq_lock(&next->ip_messages); - port_not_in_transit = TRUE; + /* Get thread's turnstile donated to special reply port */ + if (get_turnstile) { + turnstile_complete((uintptr_t)special_reply_port, + port_rcv_turnstile_address(special_reply_port), + NULL); + } else { + ts = ipc_port_rcv_turnstile(special_reply_port); + if (ts) { + turnstile_reference(ts); + turnstile_update_inheritor(ts, inheritor, + (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE)); } + } - /* Apply the sync qos delta */ - update_knote = ipc_port_sync_qos_delta(next, sync_qos_delta_add, sync_qos_delta_sub); - - if (port_not_in_transit) - break; + imq_unlock(&special_reply_port->ip_messages); + ip_unlock(special_reply_port); - next = next->ip_destination; - ip_lock(next); + if (get_turnstile) { + turnstile_cleanup(); + } else if (ts) { + /* Call turnstile cleanup after dropping the interlock */ + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate_safe(ts); } - if (multiple_lock) { - ipc_port_multiple_unlock(); + /* Release the ref on the dest port and it's turnstile */ + if (dest_port) { + ipc_port_send_turnstile_complete(dest_port); + /* release the reference on the dest port */ + ip_release(dest_port); } - base = next; - next = port; + if (dest_ts) { + turnstile_deallocate_safe(dest_ts); 
+ } +} - while (next != base) { - ipc_port_t prev = next; - next = next->ip_destination; +/* + * Routine: ipc_port_adjust_special_reply_port + * Purpose: + * If the special port has a turnstile, update it's inheritor. + * Condition: + * Nothing locked. + * Returns: + * None. + */ +void +ipc_port_adjust_special_reply_port( + ipc_port_t special_reply_port, + uint8_t flags, + boolean_t get_turnstile) +{ + ip_lock(special_reply_port); + ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL, flags, get_turnstile); + /* special_reply_port unlocked */ +} - ip_unlock(prev); - } +/* + * Routine: ipc_port_get_special_reply_port_inheritor + * Purpose: + * Returns the current inheritor of the special reply port + * Condition: + * mqueue is locked, port is a special reply port + * Returns: + * the current inheritor + */ +turnstile_inheritor_t +ipc_port_get_special_reply_port_inheritor( + ipc_port_t port) +{ + assert(port->ip_specialreply); + imq_held(&port->ip_messages); - if (update_knote) { - KNOTE(&base->ip_messages.imq_klist, 0); + switch (port->ip_sync_link_state) { + case PORT_SYNC_LINK_PORT: + if (port->ip_sync_inheritor_port != NULL) { + return port_send_turnstile(port->ip_sync_inheritor_port); + } + break; + case PORT_SYNC_LINK_WORKLOOP_KNOTE: + return filt_machport_stashed_special_reply_port_turnstile(port); + case PORT_SYNC_LINK_WORKLOOP_STASH: + return port->ip_sync_inheritor_ts; } - imq_unlock(&base->ip_messages); - ip_unlock(base); + return TURNSTILE_INHERITOR_NULL; } /* @@ -2052,6 +2093,40 @@ ipc_port_copyout_send( return name; } +/* + * Routine: ipc_port_copyout_name_send + * Purpose: + * Copyout a naked send right (possibly null/dead) to given name, + * or if that fails, destroy the right. + * Conditions: + * Nothing locked. 
+ */ + +mach_port_name_t +ipc_port_copyout_name_send( + ipc_port_t sright, + ipc_space_t space, + mach_port_name_t name) +{ + if (IP_VALID(sright)) { + kern_return_t kr; + + kr = ipc_object_copyout_name(space, (ipc_object_t) sright, + MACH_MSG_TYPE_PORT_SEND, TRUE, name); + if (kr != KERN_SUCCESS) { + ipc_port_release_send(sright); + + if (kr == KERN_INVALID_CAPABILITY) + name = MACH_PORT_DEAD; + else + name = MACH_PORT_NULL; + } + } else + name = CAST_MACH_PORT_TO_NAME(sright); + + return name; +} + /* * Routine: ipc_port_release_send * Purpose: @@ -2165,7 +2240,7 @@ ipc_port_release_sonce( if (!IP_VALID(port)) return; - ipc_port_unlink_special_reply_port(port, IPC_PORT_UNLINK_SR_NONE); + ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE); ip_lock(port); @@ -2205,8 +2280,10 @@ ipc_port_release_receive( ipc_port_destroy(port); /* consumes ref, unlocks */ - if (dest != IP_NULL) + if (dest != IP_NULL) { + ipc_port_send_turnstile_complete(dest); ip_release(dest); + } } /* @@ -2272,8 +2349,10 @@ ipc_port_dealloc_special( * the ipc_space_kernel check in ipc_mqueue_send. */ + imq_lock(&port->ip_messages); port->ip_receiver_name = MACH_PORT_NULL; port->ip_receiver = IS_NULL; + imq_unlock(&port->ip_messages); /* relevant part of ipc_port_clear_receiver */ ipc_port_set_mscount(port, 0); @@ -2297,7 +2376,13 @@ ipc_port_finalize( { ipc_port_request_t requests = port->ip_requests; - assert(!ip_active(port)); + assert(port_send_turnstile(port) == TURNSTILE_NULL); + assert(ipc_port_rcv_turnstile(port) == TURNSTILE_NULL); + + if (ip_active(port)) { + panic("Trying to free an active port. 
port %p", port); + } + if (requests != IPR_NULL) { ipc_table_size_t its = requests->ipr_size; it_requests_free(its, requests); @@ -2327,11 +2412,13 @@ ipc_port_finalize( void kdp_mqueue_send_find_owner(struct waitq * waitq, __assert_only event64_t event, thread_waitinfo_t * waitinfo) { + struct turnstile *turnstile; assert(waitinfo->wait_type == kThreadWaitPortSend); assert(event == IPC_MQUEUE_FULL); + assert(waitq_is_turnstile_queue(waitq)); - ipc_mqueue_t mqueue = imq_from_waitq(waitq); - ipc_port_t port = ip_from_mq(mqueue); /* we are blocking on send */ + turnstile = waitq_to_turnstile(waitq); + ipc_port_t port = (ipc_port_t)turnstile->ts_proprietor; /* we are blocking on send */ assert(kdp_is_in_zone(port, "ipc ports")); waitinfo->owner = 0; diff --git a/osfmk/ipc/ipc_port.h b/osfmk/ipc/ipc_port.h index c2be8199b..16addb831 100644 --- a/osfmk/ipc/ipc_port.h +++ b/osfmk/ipc/ipc_port.h @@ -84,6 +84,7 @@ #include #include +#include #include #include @@ -128,18 +129,18 @@ struct ipc_port { union { ipc_kobject_t kobject; ipc_importance_task_t imp_task; - ipc_port_t sync_qos_override_port; + ipc_port_t sync_inheritor_port; + struct knote *sync_inheritor_knote; + struct turnstile *sync_inheritor_ts; } kdata; - + struct ipc_port *ip_nsrequest; struct ipc_port *ip_pdrequest; struct ipc_port_request *ip_requests; union { struct ipc_kmsg *premsg; - struct { - sync_qos_count_t sync_qos[THREAD_QOS_LAST]; - sync_qos_count_t special_port_qos; - } qos_counter; + struct turnstile *send_turnstile; + SLIST_ENTRY(ipc_port) dealloc_elm; } kdata2; mach_vm_address_t ip_context; @@ -151,8 +152,8 @@ struct ipc_port { ip_guarded:1, /* port guarded (use context value as guard) */ ip_strict_guard:1, /* Strict guarding; Prevents user manipulation of context values directly */ ip_specialreply:1, /* port is a special reply port */ - ip_link_sync_qos:1, /* link the special reply port to destination port */ - ip_impcount:24; /* number of importance donations in nested queue */ + 
ip_sync_link_state:3, /* link the special reply port to destination port/ Workloop */ + ip_impcount:22; /* number of importance donations in nested queue */ mach_port_mscount_t ip_mscount; mach_port_rights_t ip_srights; @@ -167,6 +168,10 @@ struct ipc_port { uintptr_t ip_callstack[IP_CALLSTACK_MAX]; /* stack trace */ unsigned long ip_spares[IP_NSPARES]; /* for debugging */ #endif /* MACH_ASSERT */ +#if DEVELOPMENT || DEBUG + uint8_t ip_srp_lost_link:1, /* special reply port turnstile link chain broken */ + ip_srp_msg_sent:1; /* special reply port msg sent */ +#endif }; @@ -182,32 +187,63 @@ struct ipc_port { #define ip_kobject kdata.kobject #define ip_imp_task kdata.imp_task -#define ip_sync_qos_override_port kdata.sync_qos_override_port +#define ip_sync_inheritor_port kdata.sync_inheritor_port +#define ip_sync_inheritor_knote kdata.sync_inheritor_knote +#define ip_sync_inheritor_ts kdata.sync_inheritor_ts #define ip_premsg kdata2.premsg -#define ip_sync_qos kdata2.qos_counter.sync_qos -#define ip_special_port_qos kdata2.qos_counter.special_port_qos - -#define port_sync_qos(port, i) (IP_PREALLOC(port) ? (port)->ip_premsg->sync_qos[(i)] : (port)->ip_sync_qos[(i)]) -#define port_special_qos(port) (IP_PREALLOC(port) ? (port)->ip_premsg->special_port_qos : (port)->ip_special_port_qos) - -#define set_port_sync_qos(port, i, value) \ -MACRO_BEGIN \ -if (IP_PREALLOC(port)) { \ - (port)->ip_premsg->sync_qos[(i)] = (value); \ -} else { \ - (port)->ip_sync_qos[(i)] = (value); \ -} \ +#define ip_send_turnstile kdata2.send_turnstile +#define ip_dealloc_elm kdata2.dealloc_elm + +#define port_send_turnstile(port) (IP_PREALLOC(port) ? 
(port)->ip_premsg->ikm_turnstile : (port)->ip_send_turnstile) + +#define set_port_send_turnstile(port, value) \ +MACRO_BEGIN \ +if (IP_PREALLOC(port)) { \ + (port)->ip_premsg->ikm_turnstile = (value); \ +} else { \ + (port)->ip_send_turnstile = (value); \ +} \ MACRO_END -#define set_port_special_qos(port, value) \ -MACRO_BEGIN \ -if (IP_PREALLOC(port)) { \ - (port)->ip_premsg->special_port_qos = (value); \ -} else { \ - (port)->ip_special_port_qos = (value); \ -} \ -MACRO_END +#define port_send_turnstile_address(port) \ +(IP_PREALLOC(port) ? &((port)->ip_premsg->ikm_turnstile) : &((port)->ip_send_turnstile)) + +#define port_rcv_turnstile_address(port) (NULL) + + +/* + * SYNC IPC state flags for special reply port. + * + * PORT_SYNC_LINK_ANY + * Special reply port is not linked to any other port + * or WL and linkage should be allowed. + * + * PORT_SYNC_LINK_PORT + * Special reply port is linked to the port and + * ip_sync_inheritor_port contains the inheritor + * port. + * + * PORT_SYNC_LINK_WORKLOOP_KNOTE + * Special reply port is linked to a WL (via a knote). + * ip_sync_inheritor_knote contains a pointer to the knote + * the port is stashed on. + * + * PORT_SYNC_LINK_WORKLOOP_STASH + * Special reply port is linked to a WL (via a knote stash). + * ip_sync_inheritor_ts contains a pointer to the turnstile with a +1 + * the port is stashed on. + * + * PORT_SYNC_LINK_NO_LINKAGE + * Message sent to special reply port, do + * not allow any linkages till receive is + * complete. 
+ */ +#define PORT_SYNC_LINK_ANY (0) +#define PORT_SYNC_LINK_PORT (0x1) +#define PORT_SYNC_LINK_WORKLOOP_KNOTE (0x2) +#define PORT_SYNC_LINK_WORKLOOP_STASH (0x3) +#define PORT_SYNC_LINK_NO_LINKAGE (0x4) #define IP_NULL IPC_PORT_NULL #define IP_DEAD IPC_PORT_DEAD @@ -224,10 +260,8 @@ MACRO_END #define ip_release(port) io_release(&(port)->ip_object) /* get an ipc_port pointer from an ipc_mqueue pointer */ -#define ip_from_mq(mq) ((struct ipc_port *)((void *)( \ - (char *)(mq) - \ - __offsetof(struct ipc_port, ip_messages)) \ - )) +#define ip_from_mq(mq) \ + __container_of(mq, struct ipc_port, ip_messages) #define ip_reference_mq(mq) ip_reference(ip_from_mq(mq)) #define ip_release_mq(mq) ip_release(ip_from_mq(mq)) @@ -475,46 +509,60 @@ enum { }; /* link the destination port with special reply port */ -kern_return_t -ipc_port_link_special_reply_port_with_qos( +void +ipc_port_link_special_reply_port( ipc_port_t special_reply_port, - ipc_port_t dest_port, - int qos); + ipc_port_t dest_port); + +#define IPC_PORT_ADJUST_SR_NONE 0 +#define IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY 0x1 +#define IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE 0x2 +#define IPC_PORT_ADJUST_SR_LINK_WORKLOOP 0x4 + +#define IPC_PORT_ADJUST_SR_RECEIVED_MSG 0x8 +#define IPC_PORT_ADJUST_SR_ENABLE_EVENT 0x10 + +void +reset_ip_srp_bits(ipc_port_t special_reply_port); + +void +reset_ip_srp_msg_sent(ipc_port_t special_reply_port); + +void +set_ip_srp_msg_sent(ipc_port_t special_reply_port); + +void +set_ip_srp_lost_link(ipc_port_t special_reply_port); -/* link the destination port with locked special reply port */ -void ipc_port_unlink_special_reply_port_locked( +/* Adjust special reply port linkage */ +void ipc_port_adjust_special_reply_port_locked( ipc_port_t special_reply_port, struct knote *kn, - uint8_t flags); + uint8_t flags, + boolean_t get_turnstile); -/* Unlink the destination port from special reply port */ +/* Adjust special reply port linkage */ void -ipc_port_unlink_special_reply_port( 
+ipc_port_adjust_special_reply_port( ipc_port_t special_reply_port, - uint8_t flags); - -#define IPC_PORT_UNLINK_SR_NONE 0 -#define IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY 0x1 -#define IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE 0x2 + uint8_t flags, + boolean_t get_turnstile); -/* Get the max sync qos override index applied to the port */ -sync_qos_count_t -ipc_port_get_max_sync_qos_index( - ipc_port_t port); +turnstile_inheritor_t +ipc_port_get_special_reply_port_inheritor( + ipc_port_t special_reply_port); -/* Apply qos delta to the port */ -boolean_t -ipc_port_sync_qos_delta( - ipc_port_t port, - sync_qos_count_t *sync_qos_delta_add, - sync_qos_count_t *sync_qos_delta_sub); +void +ipc_port_send_turnstile_prepare(ipc_port_t port); -/* Adjust the sync qos of the port and it's destination port */ void -ipc_port_adjust_sync_qos( - ipc_port_t port, - sync_qos_count_t *sync_qos_delta_add, - sync_qos_count_t *sync_qos_delta_sub); +ipc_port_send_turnstile_complete(ipc_port_t port); + +struct waitq * +ipc_port_rcv_turnstile_waitq(struct waitq *waitq); + +struct turnstile * +ipc_port_rcv_turnstile(ipc_port_t port); /* apply importance delta to port only */ extern mach_port_delta_t @@ -561,6 +609,12 @@ extern mach_port_name_t ipc_port_copyout_send( ipc_port_t sright, ipc_space_t space); +/* Copyout a naked send right to given name */ +extern mach_port_name_t ipc_port_copyout_name_send( + ipc_port_t sright, + ipc_space_t space, + mach_port_name_t name); + #endif /* MACH_KERNEL_PRIVATE */ #if KERNEL_PRIVATE @@ -617,6 +671,9 @@ extern void ipc_port_track_dealloc( extern void ipc_port_debug_init(void); #endif /* MACH_ASSERT */ +extern struct turnstile *ipc_port_get_inheritor( + ipc_port_t port); + #define ipc_port_alloc_kernel() \ ipc_port_alloc_special(ipc_space_kernel) #define ipc_port_dealloc_kernel(port) \ diff --git a/osfmk/ipc/ipc_pset.c b/osfmk/ipc/ipc_pset.c index fe989b283..8a8e12979 100644 --- a/osfmk/ipc/ipc_pset.c +++ b/osfmk/ipc/ipc_pset.c @@ -101,24 +101,18 @@ 
ipc_pset_alloc( ipc_pset_t pset; mach_port_name_t name; kern_return_t kr; - uint64_t reserved_link; - - reserved_link = waitq_link_reserve(NULL); kr = ipc_object_alloc(space, IOT_PORT_SET, MACH_PORT_TYPE_PORT_SET, 0, &name, (ipc_object_t *) &pset); if (kr != KERN_SUCCESS) { - waitq_link_release(reserved_link); return kr; } /* pset and space are locked */ - ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link); + ipc_mqueue_init(&pset->ips_messages, TRUE /* set */); is_write_unlock(space); - waitq_link_release(reserved_link); - *namep = name; *psetp = pset; return KERN_SUCCESS; @@ -146,23 +140,16 @@ ipc_pset_alloc_name( { ipc_pset_t pset; kern_return_t kr; - uint64_t reserved_link; - - - reserved_link = waitq_link_reserve(NULL); kr = ipc_object_alloc_name(space, IOT_PORT_SET, MACH_PORT_TYPE_PORT_SET, 0, name, (ipc_object_t *) &pset); if (kr != KERN_SUCCESS) { - waitq_link_release(reserved_link); return kr; } /* pset is locked */ - ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link); - - waitq_link_release(reserved_link); + ipc_mqueue_init(&pset->ips_messages, TRUE /* set */); *psetp = pset; return KERN_SUCCESS; @@ -183,17 +170,13 @@ ipc_pset_alloc_special( __assert_only ipc_space_t space) { ipc_pset_t pset; - uint64_t reserved_link; assert(space != IS_NULL); assert(space->is_table == IE_NULL); assert(!is_active(space)); - reserved_link = waitq_link_reserve(NULL); - __IGNORE_WCASTALIGN(pset = (ipc_pset_t)io_alloc(IOT_PORT_SET)); if (pset == IPS_NULL) { - waitq_link_release(reserved_link); return IPS_NULL; } @@ -203,9 +186,7 @@ ipc_pset_alloc_special( pset->ips_references = 1; pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0); - ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link); - - waitq_link_release(reserved_link); + ipc_mqueue_init(&pset->ips_messages, TRUE /* set */); return pset; } @@ -250,7 +231,7 @@ ipc_pset_add( assert(ips_active(pset)); assert(ip_active(port)); - + kr = 
ipc_mqueue_add(&port->ip_messages, &pset->ips_messages, reserved_link, reserved_prepost); @@ -286,6 +267,55 @@ ipc_pset_remove( return kr; } +/* + * Routine: ipc_pset_lazy_allocate + * Purpose: + * lazily initialize the wqset of a port set. + * Conditions: + * Nothing locked. + */ + +kern_return_t +ipc_pset_lazy_allocate( + ipc_space_t space, + mach_port_name_t psname) +{ + kern_return_t kr; + ipc_entry_t entry; + ipc_object_t psobj; + ipc_pset_t pset; + + kr = ipc_right_lookup_read(space, psname, &entry); + if (kr != KERN_SUCCESS) + return kr; + + /* space is read-locked and active */ + if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) == 0) { + is_read_unlock(space); + kr = KERN_INVALID_RIGHT; + return kr; + } + + psobj = entry->ie_object; + __IGNORE_WCASTALIGN(pset = (ipc_pset_t) psobj); + assert(pset != NULL); + ipc_mqueue_t set_mqueue = &pset->ips_messages; + struct waitq_set *wqset = &set_mqueue->imq_set_queue; + + io_reference(psobj); + is_read_unlock(space); + + /* + * lazily initialize the wqset to avoid + * possible allocation while linking + * under spinlocks. + */ + waitq_set_lazy_init_link(wqset); + io_release(psobj); + + return KERN_SUCCESS; +} + /* * Routine: ipc_pset_remove_from_all * Purpose: @@ -347,26 +377,217 @@ ipc_pset_destroy( ips_release(pset); /* consume the ref our caller gave us */ } -/* Kqueue EVFILT_MACHPORT support */ +/* + * Kqueue EVFILT_MACHPORT support + * + * - kn_ptr.p_mqueue points to the monitored mqueue + * + * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer + * that can be used to direct-deliver messages when + * MACH_RCV_MSG is set in kn_sfflags + * + * - (in/out) ext[1] holds a mach_msg_size_t representing the size + * of the userspace buffer held in ext[0]. + * + * - (out) ext[2] is used to deliver qos information + * about the send queue to userspace. + * + * - (abused) ext[3] is used in kernel to hold a reference to the first port + * with a turnstile that participate to sync IPC override. 
+ * + * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor + * of turnstiles for rights copied out as part of direct message delivery + * when they can participate to sync IPC override. + * + * It is used to atomically neuter the sync IPC override when the knote is + * re-enabled. + * + */ #include #include -static int filt_machportattach(struct knote *kn, struct kevent_internal_s *kev); -static void filt_machportdetach(struct knote *kn); -static int filt_machport(struct knote *kn, long hint); -static int filt_machporttouch(struct knote *kn, struct kevent_internal_s *kev); -static int filt_machportprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev); -static unsigned filt_machportpeek(struct knote *kn); -SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = { - .f_adjusts_qos = 1, - .f_attach = filt_machportattach, - .f_detach = filt_machportdetach, - .f_event = filt_machport, - .f_touch = filt_machporttouch, - .f_process = filt_machportprocess, - .f_peek = filt_machportpeek, -}; +static int +filt_machport_adjust_qos(struct knote *kn, ipc_kmsg_t first) +{ + if (kn->kn_sfflags & MACH_RCV_MSG) { + int qos = _pthread_priority_thread_qos(first->ikm_qos_override); + return FILTER_ADJUST_EVENT_QOS(qos); + } + return 0; +} + +struct turnstile * +filt_machport_kqueue_turnstile(struct knote *kn) +{ + if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) { + return kqueue_turnstile(knote_get_kq(kn)); + } + return TURNSTILE_NULL; +} + +/* + * Stashes a port that participate to sync IPC override until the knote + * is being re-enabled. 
+ * + * It returns: + * - the turnstile to use as an inheritor for the stashed port + * - the kind of stash that happened as PORT_SYNC_* value among: + * o not stashed (no sync IPC support) + * o stashed in the knote (in kn_ext[3]) + * o to be hooked to the kn_hook knote + */ +struct turnstile * +filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link) +{ + struct turnstile *ts = filt_machport_kqueue_turnstile(kn); + + if (!ts) { + if (link) *link = PORT_SYNC_LINK_NO_LINKAGE; + } else if (kn->kn_ext[3] == 0) { + ip_reference(port); + kn->kn_ext[3] = (uintptr_t)port; + if (link) *link = PORT_SYNC_LINK_WORKLOOP_KNOTE; + } else { + ts = (struct turnstile *)kn->kn_hook; + if (link) *link = PORT_SYNC_LINK_WORKLOOP_STASH; + } + + return ts; +} + +struct turnstile * +filt_machport_stashed_special_reply_port_turnstile(ipc_port_t port) +{ + struct knote *kn = port->ip_sync_inheritor_knote; + + assert(port->ip_specialreply); + assert(port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE); + if (kn->kn_ext[3] == (uint64_t)port) { + return kqueue_turnstile(knote_get_kq(kn)); + } + return kn->kn_hook; +} + +/* + * Lazily prepare a turnstile so that filt_machport_stash_port() + * can be called with the mqueue lock held. + * + * It will allocate a turnstile in kn_hook if: + * - the knote supports sync IPC override, + * - we already stashed a port in kn_ext[3], + * - the object that will be copied out has a chance to ask to be stashed. + * + * It is setup so that its inheritor is the workloop turnstile that has been + * allocated when this knote was attached. 
+ */ +void +filt_machport_turnstile_prepare_lazily( + struct knote *kn, + mach_msg_type_name_t msgt_name, + ipc_port_t port) +{ + /* This is called from within filt_machportprocess */ + assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED)); + + struct turnstile *ts = filt_machport_kqueue_turnstile(kn); + if (ts == TURNSTILE_NULL || kn->kn_ext[3] == 0 || kn->kn_hook) + return; + + if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) || + (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) { + struct turnstile *kn_ts = turnstile_alloc(); + kn_ts = turnstile_prepare((uintptr_t)kn, + (struct turnstile **)&kn->kn_hook, kn_ts, TURNSTILE_KNOTE); + turnstile_update_inheritor(kn_ts, ts, + TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE); + turnstile_cleanup(); + } +} + +/* + * Other half of filt_machport_turnstile_prepare_lazily() + * + * This is serialized by the knote state machine. + */ +static void +filt_machport_turnstile_complete(struct knote *kn) +{ + struct turnstile *ts = TURNSTILE_NULL; + + if (kn->kn_ext[3]) { + ipc_port_t port = (ipc_port_t)kn->kn_ext[3]; + ipc_mqueue_t mqueue = &port->ip_messages; + + ip_lock(port); + if (port->ip_specialreply) { + /* + * If the reply has been sent to the special reply port already, + * then the special reply port may already be reused to do something + * entirely different. + * + * However, the only reason for it to still point to this knote is + * that it's still waiting for a reply, so when this is the case, + * neuter the linkage. + */ + if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE && + port->ip_sync_inheritor_knote == kn) { + ipc_port_adjust_special_reply_port_locked(port, NULL, + (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE); + } else { + ip_unlock(port); + } + } else { + struct turnstile *kq_ts = kqueue_turnstile(knote_get_kq(kn)); + + /* + * For receive rights, if their IMQ_INHERITOR() is still this + * workloop, then sever the link. 
+ * + * It has a theoretical hole: if the port is sent again to a new + * receive right that is also monitored by the same kqueue, + * we would sever the link incorrectly. + * + * However this would be a REALLY cumbersome thing to do. + */ + imq_lock(mqueue); + if (!IMQ_KLIST_VALID(mqueue) && IMQ_INHERITOR(mqueue) == kq_ts) { + turnstile_deallocate_safe(kq_ts); + klist_init(&mqueue->imq_klist); + ts = port_send_turnstile(port); + } + if (ts) { + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, + TURNSTILE_IMMEDIATE_UPDATE); + turnstile_reference(ts); + } + imq_unlock(mqueue); + ip_unlock(port); + + if (ts) { + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate(ts); + } + } + + ip_release(port); + kn->kn_ext[3] = 0; + } + + if (kn->kn_hook) { + ts = kn->kn_hook; + + turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL, + TURNSTILE_IMMEDIATE_UPDATE); + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD); + + turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts); + turnstile_cleanup(); + + assert(ts); + turnstile_deallocate(ts); + } +} static int filt_machportattach( @@ -377,6 +598,8 @@ filt_machportattach( uint64_t wq_link_id = waitq_link_reserve(NULL); ipc_space_t space = current_space(); ipc_kmsg_t first; + struct turnstile *turnstile = TURNSTILE_NULL; + struct turnstile *send_turnstile = TURNSTILE_NULL; int error; int result = 0; @@ -384,13 +607,48 @@ filt_machportattach( ipc_entry_t entry; ipc_mqueue_t mqueue; + kn->kn_flags &= ~EV_EOF; + kn->kn_ext[3] = 0; + + if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) { + /* + * If the filter is likely to support sync IPC override, + * and it happens to be attaching to a workloop, + * make sure the workloop has an allocated turnstile. 
+ */ + turnstile = kqueue_alloc_turnstile(knote_get_kq(kn)); + } + kr = ipc_right_lookup_read(space, name, &entry); + +check_lookup: if (kr == KERN_SUCCESS) { /* space is read-locked and active */ if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) { ipc_pset_t pset; + if (knote_link_waitqset_should_lazy_alloc(kn)) { + is_read_unlock(space); + + /* + * We need to link the portset of the kn, + * to insure that the link is allocated before taking + * any spinlocks. + */ + knote_link_waitqset_lazy_alloc(kn); + + /* + * We had to drop the space lock because knote_link_waitqset_lazy_alloc() + * could have allocated memory. The ipc_right_lookup_read() + * function returns with the space locked, so we need to revalidate state. + */ + kr = ipc_right_lookup_read(space, name, &entry); + if (!(kr == KERN_SUCCESS) || !(entry->ie_bits & MACH_PORT_TYPE_PORT_SET)) { + goto check_lookup; + } + } + __IGNORE_WCASTALIGN(pset = (ipc_pset_t)entry->ie_object); mqueue = &pset->ips_messages; ips_reference(pset); @@ -407,11 +665,10 @@ filt_machportattach( */ error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id); if (!error) { + assert(IMQ_KLIST_VALID(mqueue)); KNOTE_ATTACH(&mqueue->imq_klist, kn); imq_unlock(mqueue); - - } - else { + } else { kn->kn_ptr.p_mqueue = IMQ_NULL; imq_unlock(mqueue); ips_release(pset); @@ -440,18 +697,37 @@ filt_machportattach( * first message in the queue. */ imq_lock(mqueue); - kn->kn_ptr.p_mqueue = mqueue; + kn->kn_ptr.p_mqueue = mqueue; + if (!IMQ_KLIST_VALID(mqueue)) { + /* + * We're attaching a port that used to have an IMQ_INHERITOR, + * clobber this state, and set the inheritor of its turnstile + * to the kqueue it's now attached to. 
+ */ + turnstile_deallocate_safe(IMQ_INHERITOR(mqueue)); + klist_init(&mqueue->imq_klist); + } KNOTE_ATTACH(&mqueue->imq_klist, kn); + + /* Update the port's turnstile inheritor */ + send_turnstile = port_send_turnstile(port); + if (send_turnstile) { + turnstile_reference(send_turnstile); + turnstile_update_inheritor(send_turnstile, turnstile, + (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE)); + } + if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { - int sync_qos_override_index = ipc_port_get_max_sync_qos_index(port); - if (kn->kn_sfflags & MACH_RCV_MSG) - knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override, - sync_qos_override_index); - result = 1; + result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first); } imq_unlock(mqueue); - is_read_unlock(space); + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, + TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate(send_turnstile); + } + error = 0; } else { is_read_unlock(space); @@ -465,8 +741,7 @@ filt_machportattach( /* bail out on errors */ if (error) { - kn->kn_flags |= EV_ERROR; - kn->kn_data = error; + knote_set_error(kn, error); return 0; } @@ -485,24 +760,54 @@ filt_machportdetach( { ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; ipc_object_t object = mqueue_to_object(mqueue); + struct turnstile *send_turnstile = TURNSTILE_NULL; + + filt_machport_turnstile_complete(kn); imq_lock(mqueue); - KNOTE_DETACH(&mqueue->imq_klist, kn); + if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) { + /* + * ipc_mqueue_changed() already unhooked this knote from the mqueue, + */ + } else { + assert(IMQ_KLIST_VALID(mqueue)); + KNOTE_DETACH(&mqueue->imq_klist, kn); + } + + if (io_otype(object) == IOT_PORT) { + ipc_port_t port = ip_from_mq(mqueue); + + send_turnstile = port_send_turnstile(port); + if (send_turnstile) { + turnstile_reference(send_turnstile); + turnstile_update_inheritor(send_turnstile, + ipc_port_get_inheritor(port), + 
TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE); + } + } + + /* Clear the knote pointer once the knote has been removed from turnstile */ kn->kn_ptr.p_mqueue = IMQ_NULL; imq_unlock(mqueue); + if (send_turnstile) { + turnstile_update_inheritor_complete(send_turnstile, + TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate(send_turnstile); + } + if (io_otype(object) == IOT_PORT_SET) { /* * Unlink the portset wait queue from knote/kqueue. - * JMM - Does this need to be atomic under the mq lock? + * JMM - Does this need to be atomic under the mq lock? */ (void)knote_unlink_waitq(kn, &mqueue->imq_wait_queue); - } + } io_release(object); } /* - * filt_machport - deliver events into the mach port filter + * filt_machportevent - deliver events into the mach port filter * * Mach port message arrival events are currently only posted via the * kqueue filter routine for ports. Port sets are marked stay-active @@ -524,7 +829,7 @@ filt_machportdetach( * avoiding a conflict). */ static int -filt_machport( +filt_machportevent( struct knote *kn, long hint) { @@ -537,17 +842,11 @@ filt_machport( if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF | EV_ONESHOT; - result = 1; + result = FILTER_ACTIVE | FILTER_RESET_EVENT_QOS; } else if (imq_is_valid(mqueue)) { assert(!imq_is_set(mqueue)); if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { - ipc_port_t port = ip_from_mq(mqueue); - int sync_qos_override_index = ipc_port_get_max_sync_qos_index(port); - - if (kn->kn_sfflags & MACH_RCV_MSG) - knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override, - sync_qos_override_index); - result = 1; + result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first); } } @@ -556,21 +855,25 @@ filt_machport( static int filt_machporttouch( - struct knote *kn, + struct knote *kn, struct kevent_internal_s *kev) { ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; ipc_kmsg_t first; int result = 0; - imq_lock(mqueue); - /* copy in new settings and save off new input fflags */ 
kn->kn_sfflags = kev->fflags; kn->kn_ext[0] = kev->ext[0]; kn->kn_ext[1] = kev->ext[1]; - if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) - kn->kn_udata = kev->udata; + + if (kev->flags & EV_ENABLE) { + /* + * If the knote is being enabled, make sure there's no lingering + * IPC overrides from the previous message delivery. + */ + filt_machport_turnstile_complete(kn); + } /* * If the mqueue is a valid port and there is a message @@ -579,20 +882,12 @@ filt_machporttouch( * the event. If there are no more messages, reset the * QoS to the value provided by the kevent. */ + imq_lock(mqueue); if (imq_is_valid(mqueue) && !imq_is_set(mqueue) && (first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) { - ipc_port_t port = ip_from_mq(mqueue); - int sync_qos_override_index = ipc_port_get_max_sync_qos_index(port); - - if (kn->kn_sfflags & MACH_RCV_MSG) - knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override, - sync_qos_override_index); - result = 1; + result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first); } else if (kn->kn_sfflags & MACH_RCV_MSG) { - knote_adjust_qos(kn, - MACH_MSG_PRIORITY_UNSPECIFIED, - MACH_MSG_PRIORITY_UNSPECIFIED, - THREAD_QOS_UNSPECIFIED); + result = FILTER_RESET_EVENT_QOS; } imq_unlock(mqueue); @@ -615,16 +910,14 @@ filt_machportprocess( mach_vm_address_t addr; mach_msg_size_t size; - imq_lock(mqueue); - /* Capture current state */ *kev = kn->kn_kevent; + kev->ext[3] = 0; /* hide our port reference from userspace */ /* If already deallocated/moved return one last EOF event */ if (kev->flags & EV_EOF) { - imq_unlock(mqueue); - return 1; - } + return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS; + } /* * Only honor supported receive options. If no options are @@ -658,6 +951,8 @@ filt_machportprocess( size = 0; } + imq_lock(mqueue); + /* just use the reference from here on out */ io_reference(object); @@ -693,6 +988,7 @@ filt_machportprocess( * reference on the ipc_object and return zero. 
*/ if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) { + assert(self->turnstile != TURNSTILE_NULL); io_release(object); return 0; } @@ -710,7 +1006,7 @@ filt_machportprocess( assert(self->ith_kmsg == IKM_NULL); kev->data = self->ith_receiver_name; io_release(object); - return 1; + return FILTER_ACTIVE; } /* @@ -750,26 +1046,24 @@ filt_machportprocess( process_data->fp_data_out += size; } else { assert(option & MACH_RCV_STACK); - kev->ext[0] = process_data->fp_data_out + + kev->ext[0] = process_data->fp_data_out + process_data->fp_data_resid; } } /* * Apply message-based QoS values to output kevent as prescribed. - * The kev->qos field gets max(msg-qos, kn->kn_qos). * The kev->ext[2] field gets (msg-qos << 32) | (override-qos). * * The mach_msg_receive_results() call saved off the message * QoS values in the continuation save area on successful receive. */ if (kev->fflags == MACH_MSG_SUCCESS) { - kev->qos = mach_msg_priority_combine(self->ith_qos, kn->kn_qos); - kev->ext[2] = ((uint64_t)self->ith_qos << 32) | - (uint64_t)self->ith_qos_override; + kev->ext[2] = ((uint64_t)self->ith_qos << 32) | + (uint64_t)self->ith_qos_override; } - return 1; + return FILTER_ACTIVE; } /* @@ -785,10 +1079,21 @@ filt_machportprocess( * will catch changes in this status when the event gets posted * up to the knote's kqueue). */ -static unsigned +static int filt_machportpeek(struct knote *kn) { ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue; - return (ipc_mqueue_set_peek(mqueue)); + return ipc_mqueue_set_peek(mqueue) ? 
FILTER_ACTIVE : 0; } + +SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = { + .f_adjusts_qos = true, + .f_extended_codes = true, + .f_attach = filt_machportattach, + .f_detach = filt_machportdetach, + .f_event = filt_machportevent, + .f_touch = filt_machporttouch, + .f_process = filt_machportprocess, + .f_peek = filt_machportpeek, +}; diff --git a/osfmk/ipc/ipc_pset.h b/osfmk/ipc/ipc_pset.h index a18b9adcd..42008febf 100644 --- a/osfmk/ipc/ipc_pset.h +++ b/osfmk/ipc/ipc_pset.h @@ -94,10 +94,8 @@ struct ipc_pset { #define ips_release(pset) io_release(&(pset)->ips_object) /* get an ipc_pset pointer from an ipc_mqueue pointer */ -#define ips_from_mq(mq) ((struct ipc_pset *)((void *)( \ - (char *)(mq) - \ - __offsetof(struct ipc_pset, ips_messages)) \ - )) +#define ips_from_mq(mq) \ + __container_of(mq, struct ipc_pset, ips_messages) /* Allocate a port set */ extern kern_return_t ipc_pset_alloc( @@ -132,6 +130,11 @@ extern kern_return_t ipc_pset_remove( ipc_pset_t pset, ipc_port_t port); +/* lazily initialize the wqset of a port set */ +extern kern_return_t ipc_pset_lazy_allocate( + ipc_space_t space, + mach_port_name_t psname); + /* Remove a port from all its current port sets */ extern kern_return_t ipc_pset_remove_from_all( ipc_port_t port); @@ -140,4 +143,22 @@ extern kern_return_t ipc_pset_remove_from_all( extern void ipc_pset_destroy( ipc_pset_t pset); +#if MACH_KERNEL_PRIVATE +extern struct turnstile *filt_machport_kqueue_turnstile( + struct knote *kn); + +extern struct turnstile *filt_machport_stashed_special_reply_port_turnstile( + ipc_port_t port); + +extern void filt_machport_turnstile_prepare_lazily( + struct knote *kn, + mach_msg_type_name_t msgt_name, + ipc_port_t port); + +extern struct turnstile *filt_machport_stash_port( + struct knote *kn, + ipc_port_t port, + int *link); +#endif + #endif /* _IPC_IPC_PSET_H_ */ diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index 04043b3ea..d1925e69c 100644 --- a/osfmk/ipc/ipc_right.c +++ 
b/osfmk/ipc/ipc_right.c @@ -90,13 +90,6 @@ #include #include -/* Allow IPC to generate mach port guard exceptions */ -extern kern_return_t -mach_port_guard_exception( - mach_port_name_t name, - uint64_t inguard, - uint64_t portguard, - unsigned reason); /* * Routine: ipc_right_lookup_write * Purpose: @@ -170,10 +163,12 @@ ipc_right_lookup_two_write( if ((entry1 = ipc_entry_lookup(space, name1)) == IE_NULL) { is_write_unlock(space); + mach_port_guard_exception(name1, 0, 0, kGUARD_EXC_INVALID_NAME); return KERN_INVALID_NAME; } if ((entry2 = ipc_entry_lookup(space, name2)) == IE_NULL) { is_write_unlock(space); + mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_NAME); return KERN_INVALID_NAME; } *entryp1 = entry1; @@ -1042,6 +1037,7 @@ ipc_right_dealloc( default: is_write_unlock(space); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } @@ -1075,7 +1071,6 @@ ipc_right_delta( bits = entry->ie_bits; - /* * The following is used (for case MACH_PORT_RIGHT_DEAD_NAME) in the * switch below. 
It is used to keep track of those cases (in DIPC) @@ -1093,8 +1088,10 @@ ipc_right_delta( case MACH_PORT_RIGHT_PORT_SET: { ipc_pset_t pset; - if ((bits & MACH_PORT_TYPE_PORT_SET) == 0) + if ((bits & MACH_PORT_TYPE_PORT_SET) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); goto invalid_right; + } assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_PORT_SET); assert(IE_BITS_UREFS(bits) == 0); @@ -1123,8 +1120,10 @@ ipc_right_delta( case MACH_PORT_RIGHT_RECEIVE: { ipc_port_t request = IP_NULL; - if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) + if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); goto invalid_right; + } if (delta == 0) goto success; @@ -1230,6 +1229,7 @@ ipc_right_delta( if (ipc_right_check(space, port, name, entry)) { assert(!(entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE)); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); goto invalid_right; } /* port is locked and active */ @@ -1274,12 +1274,14 @@ ipc_right_delta( /* port is locked and active */ ip_unlock(port); port = IP_NULL; + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); goto invalid_right; } bits = entry->ie_bits; relport = port; port = IP_NULL; } else if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); goto invalid_right; } @@ -1334,8 +1336,13 @@ ipc_right_delta( ipc_port_t port_to_release = IP_NULL; mach_port_mscount_t mscount = 0; - if ((bits & MACH_PORT_TYPE_SEND) == 0) + if ((bits & MACH_PORT_TYPE_SEND) == 0) { + /* invalid right exception only when not live/dead confusion */ + if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); + } goto invalid_right; + } /* maximum urefs for send is MACH_PORT_UREFS_MAX */ @@ -1454,6 +1461,7 @@ ipc_right_delta( invalid_value: is_write_unlock(space); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE); return 
KERN_INVALID_VALUE; guard_failure: @@ -1491,12 +1499,13 @@ ipc_right_destruct( mach_port_mscount_t mscount = 0; bits = entry->ie_bits; - + assert(is_active(space)); if (((bits & MACH_PORT_TYPE_RECEIVE) == 0) || (srdelta && ((bits & MACH_PORT_TYPE_SEND) == 0))) { is_write_unlock(space); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT); return KERN_INVALID_RIGHT; } @@ -1636,8 +1645,8 @@ ipc_right_destruct( invalid_value: is_write_unlock(space); + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE); return KERN_INVALID_VALUE; - } @@ -1933,8 +1942,10 @@ ipc_right_copyin( ipc_entry_modified(space, name, entry); (void)ipc_port_clear_receiver(port, FALSE); /* don't destroy the port/mqueue */ + imq_lock(&port->ip_messages); port->ip_receiver_name = MACH_PORT_NULL; port->ip_destination = IP_NULL; + imq_unlock(&port->ip_messages); #if IMPORTANCE_INHERITANCE /* @@ -2545,8 +2556,8 @@ ipc_right_copyout( assert(port->ip_sorights > 0); if (port->ip_specialreply) { - ipc_port_unlink_special_reply_port_locked(port, - current_thread()->ith_knote, IPC_PORT_UNLINK_SR_NONE); + ipc_port_adjust_special_reply_port_locked(port, + current_thread()->ith_knote, IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE); /* port unlocked on return */ } else { ip_unlock(port); @@ -2610,23 +2621,17 @@ ipc_right_copyout( case MACH_MSG_TYPE_PORT_RECEIVE: { ipc_port_t dest; - sync_qos_count_t max_sync_qos = THREAD_QOS_UNSPECIFIED; - sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0}; - sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0}; + turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL; + struct turnstile *ts = TURNSTILE_NULL; #if IMPORTANCE_INHERITANCE natural_t assertcnt = port->ip_impcount; #endif /* IMPORTANCE_INHERITANCE */ - /* Capture the sync qos count delta */ - for (int i = 0; i < THREAD_QOS_LAST; i++) { - sync_qos_delta_sub[i] = port_sync_qos(port, i); - if (sync_qos_delta_sub[i] != 0) { - max_sync_qos = i; - } - } assert(port->ip_mscount == 0); 
assert(port->ip_receiver_name == MACH_PORT_NULL); + + imq_lock(&port->ip_messages); dest = port->ip_destination; port->ip_receiver_name = name; @@ -2634,6 +2639,24 @@ ipc_right_copyout( assert((bits & MACH_PORT_TYPE_RECEIVE) == 0); + /* Update the port's turnstile linkage to WL turnstile */ + ts = port_send_turnstile(port); + if (ts) { + struct knote *kn = current_thread()->ith_knote; + if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) { + inheritor = filt_machport_stash_port(kn, port, NULL); + if (inheritor) { + turnstile_reference(inheritor); + IMQ_SET_INHERITOR(&port->ip_messages, inheritor); + } + } + turnstile_reference(ts); + turnstile_update_inheritor(ts, inheritor, + (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE)); + } + + imq_unlock(&port->ip_messages); + if (bits & MACH_PORT_TYPE_SEND) { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND); assert(IE_BITS_UREFS(bits) > 0); @@ -2643,9 +2666,7 @@ ipc_right_copyout( ip_release(port); /* entry is locked holding ref, so can use port */ - - ipc_hash_delete(space, (ipc_object_t) port, - name, entry); + ipc_hash_delete(space, (ipc_object_t) port, name, entry); } else { assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE); assert(IE_BITS_UREFS(bits) == 0); @@ -2656,9 +2677,9 @@ ipc_right_copyout( entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE; ipc_entry_modified(space, name, entry); - /* update the sync qos count on knote */ - if (ITH_KNOTE_VALID(current_thread()->ith_knote)) { - knote_adjust_sync_qos(current_thread()->ith_knote, max_sync_qos, TRUE); + if (ts) { + turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD); + turnstile_deallocate_safe(ts); } if (dest != IP_NULL) { @@ -2673,8 +2694,9 @@ ipc_right_copyout( ipc_port_impcount_delta(dest, 0 - assertcnt, IP_NULL); ip_unlock(dest); #endif /* IMPORTANCE_INHERITANCE */ - /* Adjust the sync qos of destination */ - ipc_port_adjust_sync_qos(dest, sync_qos_delta_add, sync_qos_delta_sub); + + /* Drop turnstile ref on dest */ + 
ipc_port_send_turnstile_complete(dest); ip_release(dest); } break; @@ -2775,11 +2797,13 @@ ipc_right_rename( assert(port != IP_NULL); ip_lock(port); + imq_lock(&port->ip_messages); assert(ip_active(port)); assert(port->ip_receiver_name == oname); assert(port->ip_receiver == space); port->ip_receiver_name = nname; + imq_unlock(&port->ip_messages); ip_unlock(port); break; } diff --git a/osfmk/ipc/ipc_space.c b/osfmk/ipc/ipc_space.c index 8596ad530..9760d042e 100644 --- a/osfmk/ipc/ipc_space.c +++ b/osfmk/ipc/ipc_space.c @@ -149,12 +149,12 @@ ipc_space_rand_freelist( mach_port_index_t bottom, mach_port_index_t top) { + int at_start = (bottom == 0); #ifdef CONFIG_SEMI_RANDOM_ENTRIES /* * Only make sequential entries at the start of the table, and not when * we're growing the space. */ - int at_start = (bottom == 0); ipc_entry_num_t total = 0; #endif @@ -210,6 +210,11 @@ ipc_space_rand_freelist( table[curr].ie_object = IO_NULL; table[curr].ie_index = 0; table[curr].ie_bits = IE_BITS_GEN_MASK; + + /* The freelist head should always have generation number set to 0 */ + if (at_start) { + table[0].ie_bits = 0; + } } diff --git a/osfmk/ipc/ipc_voucher.c b/osfmk/ipc/ipc_voucher.c index ab0858c99..4e7b4b950 100644 --- a/osfmk/ipc/ipc_voucher.c +++ b/osfmk/ipc/ipc_voucher.c @@ -98,26 +98,22 @@ static lck_spin_t ivgt_lock_data; ipc_voucher_t iv_alloc(iv_index_t entries); void iv_dealloc(ipc_voucher_t iv, boolean_t unhash); -extern int thread_qos_from_pthread_priority(unsigned long, unsigned long *); -static inline iv_refs_t +os_refgrp_decl(static, iv_refgrp, "voucher", NULL); +os_refgrp_decl(static, ivac_refgrp, "voucher attribute control", NULL); + +static inline void iv_reference(ipc_voucher_t iv) { - iv_refs_t refs; - - refs = hw_atomic_add(&iv->iv_refs, 1); - return refs; + os_ref_retain(&iv->iv_refs); } static inline void iv_release(ipc_voucher_t iv) { - iv_refs_t refs; - - assert(0 < iv->iv_refs); - refs = hw_atomic_sub(&iv->iv_refs, 1); - if (0 == refs) + if 
(os_ref_release(&iv->iv_refs) == 0) { iv_dealloc(iv, TRUE); + } } /* @@ -242,7 +238,7 @@ iv_alloc(iv_index_t entries) if (IV_NULL == iv) return IV_NULL; - iv->iv_refs = 1; + os_ref_init(&iv->iv_refs, &iv_refgrp); iv->iv_sum = 0; iv->iv_hash = 0; iv->iv_port = IP_NULL; @@ -298,7 +294,7 @@ iv_dealloc(ipc_voucher_t iv, boolean_t unhash) */ if (unhash) { ivht_lock(); - assert(0 == iv->iv_refs); + assert(os_ref_get_count(&iv->iv_refs) == 0); assert(IV_HASH_BUCKETS > iv->iv_hash); queue_remove(&ivht_bucket[iv->iv_hash], iv, ipc_voucher_t, iv_hash_link); ivht_count--; @@ -307,8 +303,10 @@ iv_dealloc(ipc_voucher_t iv, boolean_t unhash) KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_DESTROY) | DBG_FUNC_NONE, VM_KERNEL_ADDRPERM((uintptr_t)iv), 0, ivht_count, 0, 0); - } else - assert(0 == --iv->iv_refs); + } else { + os_ref_count_t cnt __assert_only = os_ref_release(&iv->iv_refs); + assert(cnt == 0); + } /* * if a port was allocated for this voucher, @@ -451,13 +449,10 @@ convert_port_name_to_voucher( void ipc_voucher_reference(ipc_voucher_t voucher) { - iv_refs_t refs; - if (IPC_VOUCHER_NULL == voucher) return; - refs = iv_reference(voucher); - assert(1 < refs); + iv_reference(voucher); } void @@ -505,7 +500,7 @@ convert_voucher_to_port(ipc_voucher_t voucher) if (IV_NULL == voucher) return (IP_NULL); - assert(0 < voucher->iv_refs); + assert(os_ref_get_count(&voucher->iv_refs) > 0); /* create a port if needed */ port = voucher->iv_port; @@ -579,7 +574,7 @@ ivac_alloc(iv_index_t key_index) if (IVAC_NULL == ivac) return IVAC_NULL; - ivac->ivac_refs = 1; + os_ref_init(&ivac->ivac_refs, &ivac_refgrp); ivac->ivac_is_growing = FALSE; ivac->ivac_port = IP_NULL; @@ -617,7 +612,7 @@ ivac_dealloc(ipc_voucher_attr_control_t ivac) * that the reference count is still zero. 
*/ ivgt_lock(); - if (ivac->ivac_refs > 0) { + if (os_ref_get_count(&ivac->ivac_refs) > 0) { ivgt_unlock(); return; } @@ -1617,8 +1612,7 @@ iv_dedup(ipc_voucher_t new_iv) assert(iv->iv_hash == hash); /* if not already deallocating and sums match... */ - if (0 < iv->iv_refs && iv->iv_sum == sum) { - iv_refs_t refs; + if ((os_ref_get_count(&iv->iv_refs) > 0) && (iv->iv_sum == sum)) { iv_index_t i; assert(iv->iv_table_size <= new_iv->iv_table_size); @@ -1641,16 +1635,12 @@ iv_dedup(ipc_voucher_t new_iv) /* can we get a ref before it hits 0 * - * This is thread safe. The reference is just an atomic - * add. If the reference count is zero when we adjust it, - * no other thread can have a reference to the voucher. + * This is thread safe. If the reference count is zero before we + * adjust it, no other thread can have a reference to the voucher. * The dealloc code requires holding the ivht_lock, so * the voucher cannot be yanked out from under us. */ - refs = iv_reference(iv); - if (1 == refs) { - /* drats! going away. 
Put back to zero */ - iv->iv_refs = 0; + if (!os_ref_retain_try(&iv->iv_refs)) { continue; } @@ -1724,24 +1714,21 @@ iv_dedup(ipc_voucher_t new_iv) } } - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_CREATE) | DBG_FUNC_NONE, - voucher_addr, - new_iv->iv_table_size, ivht_count, payload_size, 0); + KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_VOUCHER_CREATE), + voucher_addr, new_iv->iv_table_size, ivht_count, + payload_size); uintptr_t index = 0; while (attr_tracepoints_needed--) { - KERNEL_DEBUG_CONSTANT1(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_CREATE_ATTR_DATA) | DBG_FUNC_NONE, - payload[index], - payload[index+1], - payload[index+2], - payload[index+3], - voucher_addr); + KDBG(MACHDBG_CODE(DBG_MACH_IPC, + MACH_IPC_VOUCHER_CREATE_ATTR_DATA), payload[index], + payload[index + 1], payload[index + 2], + payload[index + 3]); index += 4; } } else { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_CREATE) | DBG_FUNC_NONE, - voucher_addr, - new_iv->iv_table_size, ivht_count, 0, 0); + KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_VOUCHER_CREATE), + voucher_addr, new_iv->iv_table_size, ivht_count); } } #endif /* KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD */ @@ -2353,7 +2340,7 @@ mach_voucher_attr_control_get_values( key_index = control->ivac_key_index; - assert(0 < voucher->iv_refs); + assert(os_ref_get_count(&voucher->iv_refs) > 0); value_index = iv_lookup(voucher, key_index); ivace_lookup_values(key_index, value_index, out_values, in_out_size); diff --git a/osfmk/ipc/ipc_voucher.h b/osfmk/ipc/ipc_voucher.h index 3f637856c..b306487a4 100644 --- a/osfmk/ipc/ipc_voucher.h +++ b/osfmk/ipc/ipc_voucher.h @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef MACH_KERNEL_PRIVATE @@ -50,8 +51,6 @@ extern void ipc_voucher_init(void); typedef mach_voucher_attr_value_handle_t iv_value_handle_t; typedef mach_voucher_attr_value_reference_t iv_value_refs_t; -typedef natural_t iv_refs_t; - typedef natural_t iv_index_t; #define IV_UNUSED_VALINDEX 
((iv_index_t) 0) #define IV_UNUSED_KEYINDEX ((iv_index_t) ~0) @@ -71,7 +70,7 @@ typedef iv_index_t *iv_entry_t; struct ipc_voucher { iv_index_t iv_hash; /* checksum hash */ iv_index_t iv_sum; /* checksum of values */ - iv_refs_t iv_refs; /* reference count */ + os_refcnt_t iv_refs; /* reference count */ iv_index_t iv_table_size; /* size of the voucher table */ iv_index_t iv_inline_table[IV_ENTRIES_INLINE]; iv_entry_t iv_table; /* table of voucher attr entries */ @@ -142,7 +141,7 @@ typedef ivac_entry *ivac_entry_t; #define IVAC_ENTRIES_MAX 524288 struct ipc_voucher_attr_control { - iv_refs_t ivac_refs; + os_refcnt_t ivac_refs; boolean_t ivac_is_growing; /* is the table being grown */ ivac_entry_t ivac_table; /* table of voucher attr value entries */ iv_index_t ivac_table_size; /* size of the attr value table */ @@ -182,20 +181,20 @@ extern void ivac_dealloc(ipc_voucher_attr_control_t ivac); static inline void ivac_reference(ipc_voucher_attr_control_t ivac) { - (void)hw_atomic_add(&ivac->ivac_refs, 1); + if (ivac == IVAC_NULL) + return; + os_ref_retain(&ivac->ivac_refs); } static inline void ivac_release(ipc_voucher_attr_control_t ivac) { - iv_refs_t refs; - if (IVAC_NULL == ivac) return; - refs = hw_atomic_sub(&ivac->ivac_refs, 1); - if (refs == 0) + if (os_ref_release(&ivac->ivac_refs) == 0) { ivac_dealloc(ivac); + } } #define IVAM_NULL IPC_VOUCHER_ATTR_MANAGER_NULL diff --git a/osfmk/ipc/mach_debug.c b/osfmk/ipc/mach_debug.c index 972073a71..a7b47831b 100644 --- a/osfmk/ipc/mach_debug.c +++ b/osfmk/ipc/mach_debug.c @@ -545,3 +545,51 @@ mach_port_kernel_object( return kr; } #endif /* MACH_IPC_DEBUG */ + +#if (DEVELOPMENT || DEBUG) +kern_return_t +mach_port_special_reply_port_reset_link( + ipc_space_t space, + mach_port_name_t name, + boolean_t *srp_lost_link) +{ + ipc_port_t port; + kern_return_t kr; + thread_t thread = current_thread(); + + if (space != current_space()) + return KERN_INVALID_TASK; + + if (!MACH_PORT_VALID(name)) + return KERN_INVALID_NAME; + + if 
(!IP_VALID(thread->ith_special_reply_port)) + return KERN_INVALID_VALUE; + + kr = ipc_port_translate_receive(space, name, &port); + if (kr != KERN_SUCCESS) + return kr; + + if (thread->ith_special_reply_port != port) { + ip_unlock(port); + return KERN_INVALID_ARGUMENT; + } + + imq_lock(&port->ip_messages); + *srp_lost_link = (port->ip_srp_lost_link == 1)? TRUE : FALSE; + port->ip_srp_lost_link = 0; + imq_unlock(&port->ip_messages); + + ip_unlock(port); + return KERN_SUCCESS; +} +#else +kern_return_t +mach_port_special_reply_port_reset_link( + __unused ipc_space_t space, + __unused mach_port_name_t name, + __unused boolean_t *srp_lost_link) +{ + return KERN_NOT_SUPPORTED; +} +#endif diff --git a/osfmk/ipc/mach_kernelrpc.c b/osfmk/ipc/mach_kernelrpc.c index b4ee58fec..05fd050bb 100644 --- a/osfmk/ipc/mach_kernelrpc.c +++ b/osfmk/ipc/mach_kernelrpc.c @@ -268,6 +268,44 @@ _kernelrpc_mach_port_insert_right_trap(struct _kernelrpc_mach_port_insert_right_ return (rv); } +int +_kernelrpc_mach_port_get_attributes_trap(struct _kernelrpc_mach_port_get_attributes_args *args) +{ + task_inspect_t task = port_name_to_task_inspect(args->target); + int rv = MACH_SEND_INVALID_DEST; + mach_msg_type_number_t count; + + if (task != current_task()) + goto done; + + // MIG does not define the type or size of the mach_port_info_t out array + // anywhere, so derive them from the field in the generated reply struct +#define MACH_PORT_INFO_OUT (((__Reply__mach_port_get_attributes_t*)NULL)->port_info_out) +#define MACH_PORT_INFO_STACK_LIMIT 80 // current size is 68 == 17 * sizeof(integer_t) + _Static_assert(sizeof(MACH_PORT_INFO_OUT) < MACH_PORT_INFO_STACK_LIMIT, + "mach_port_info_t has grown significantly, reevaluate stack usage"); + const mach_msg_type_number_t max_count = (sizeof(MACH_PORT_INFO_OUT)/sizeof(MACH_PORT_INFO_OUT[0])); + typeof(MACH_PORT_INFO_OUT[0]) info[max_count]; + + if (copyin(CAST_USER_ADDR_T(args->count), &count, sizeof(count))) { + rv = MACH_SEND_INVALID_DATA; + goto 
done; + } + if (count > max_count) + count = max_count; + + rv = mach_port_get_attributes(task->itk_space, args->name, args->flavor, info, &count); + if (rv == KERN_SUCCESS) + rv = copyout(&count, CAST_USER_ADDR_T(args->count), sizeof(count)); + if (rv == KERN_SUCCESS && count > 0) + rv = copyout(info, CAST_USER_ADDR_T(args->info), count * sizeof(info[0])); + +done: + if (task) + task_deallocate(task); + return (rv); +} + int _kernelrpc_mach_port_insert_member_trap(struct _kernelrpc_mach_port_insert_member_args *args) { @@ -487,7 +525,8 @@ mach_voucher_extract_attr_recipe_trap(struct mach_voucher_extract_attr_recipe_ar kfree(krecipe, (vm_size_t)max_sz); } - kr = copyout(&sz, args->recipe_size, sizeof(sz)); + if (kr == KERN_SUCCESS) + kr = copyout(&sz, args->recipe_size, sizeof(sz)); done: ipc_voucher_release(voucher); diff --git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index 128cd9605..d17cb24c3 100644 --- a/osfmk/ipc/mach_msg.c +++ b/osfmk/ipc/mach_msg.c @@ -95,6 +95,7 @@ #include +#include #include #include #include @@ -151,8 +152,8 @@ mach_msg_rcv_link_special_reply_port( ipc_port_t special_reply_port, mach_port_name_t dest_name_port); -static void -mach_msg_rcv_unlink_special_reply_port(void); +void +mach_msg_receive_results_complete(ipc_object_t object); security_token_t KERNEL_SECURITY_TOKEN = KERNEL_SECURITY_TOKEN_VALUE; audit_token_t KERNEL_AUDIT_TOKEN = KERNEL_AUDIT_TOKEN_VALUE; @@ -205,6 +206,8 @@ mach_msg_send( mach_msg_size_t msg_and_trailer_size; mach_msg_max_trailer_t *trailer; + option |= MACH_SEND_KERNEL; + if ((send_size & 3) || send_size < sizeof(mach_msg_header_t) || (send_size < sizeof(mach_msg_base_t) && (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX))) @@ -320,8 +323,11 @@ mach_msg_receive_results( mach_msg_trailer_size_t trailer_size; mach_msg_size_t size = 0; - /* unlink the special_reply_port before releasing reference to object */ - mach_msg_rcv_unlink_special_reply_port(); + /* + * unlink the special_reply_port before releasing reference 
to object. + * get the thread's turnstile, if the thread donated it's turnstile to the port + */ + mach_msg_receive_results_complete(object); io_release(object); if (mr != MACH_MSG_SUCCESS) { @@ -415,33 +421,6 @@ mach_msg_receive_results( *sizep = size; return mr; } -#ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG -#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */ -#endif -#ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG -#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */ -#endif -#ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK -#define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */ -#endif - -/* JMM - this needs to invoke a pthread function to compute this */ -mach_msg_priority_t -mach_msg_priority_combine(mach_msg_priority_t msg_qos, - mach_msg_priority_t recv_qos) -{ - mach_msg_priority_t overcommit; - mach_msg_priority_t no_oc_qos; - mach_msg_priority_t res; - - assert(msg_qos < _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG); - overcommit = recv_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG; - no_oc_qos = recv_qos & ~overcommit; - res = (no_oc_qos > msg_qos) ? no_oc_qos : msg_qos; - res |= overcommit; - - return res; -} /* * Routine: mach_msg_receive [Kernel Internal] @@ -451,7 +430,7 @@ mach_msg_priority_combine(mach_msg_priority_t msg_qos, * Unlike being dispatched to by ipc_kobject_server() or the * reply part of mach_msg_rpc_from_kernel(), this routine * looks up the receive port name in the kernel's port - * namespace and copies out received port rights to that namespace + * namespace and copies out received port rights to that namespace * as well. Out-of-line memory is copied out the kernel's * address space (rather than just providing the vm_map_copy_t). 
* Conditions: @@ -586,6 +565,7 @@ mach_msg_overwrite_trap( mr = ipc_mqueue_copyin(space, rcv_name, &mqueue, &object); if (mr != MACH_MSG_SUCCESS) { + mach_port_guard_exception(rcv_name, 0, 0, kGUARD_EXC_RCV_INVALID_NAME); return mr; } /* hold ref for object */ @@ -640,7 +620,6 @@ mach_msg_rcv_link_special_reply_port( { ipc_port_t dest_port = IP_NULL; kern_return_t kr; - int qos; if (current_thread()->ith_special_reply_port != special_reply_port) { return MACH_RCV_INVALID_NOTIFY; @@ -660,12 +639,8 @@ mach_msg_rcv_link_special_reply_port( * do not fail the receive in that case. */ if (kr == KERN_SUCCESS && IP_VALID(dest_port)) { - - /* Get the effective qos of the thread */ - qos = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS); - - ipc_port_link_special_reply_port_with_qos(special_reply_port, - dest_port, qos); + ipc_port_link_special_reply_port(special_reply_port, + dest_port); /* release the send right */ ipc_port_release_send(dest_port); @@ -674,29 +649,47 @@ mach_msg_rcv_link_special_reply_port( } /* - * Routine: mach_msg_rcv_unlink_special_reply_port + * Routine: mach_msg_receive_results_complete * Purpose: - * Unlink the special reply port to the other end - * of the sync ipc channel. + * Get thread's turnstile back from the object and + * if object is a special reply port then reset its + * linkage. * Condition: * Nothing locked. * Returns: * None. */ -static void -mach_msg_rcv_unlink_special_reply_port(void) +void +mach_msg_receive_results_complete(ipc_object_t object) { thread_t self = current_thread(); - ipc_port_t special_reply_port = self->ith_special_reply_port; - mach_msg_option_t option = self->ith_option; + ipc_port_t port = IPC_PORT_NULL; + boolean_t get_turnstile = self->turnstile ? 
FALSE : TRUE; - if ((special_reply_port == IP_NULL) || - !(option & MACH_RCV_SYNC_WAIT)) { + if (io_otype(object) == IOT_PORT) { + __IGNORE_WCASTALIGN(port = (ipc_port_t) object); + } else { + assert(self->turnstile != TURNSTILE_NULL); return; } - ipc_port_unlink_special_reply_port(special_reply_port, - IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE); + uint8_t flags = IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE; + + /* + * Don't clear the ip_srp_msg_sent bit if... + */ + if (!((self->ith_state == MACH_RCV_TOO_LARGE && self->ith_option & MACH_RCV_LARGE) || //msg was too large and the next receive will get it + self->ith_state == MACH_RCV_INTERRUPTED || + self->ith_state == MACH_RCV_TIMED_OUT || + self->ith_state == MACH_RCV_PORT_CHANGED || + self->ith_state == MACH_PEEK_READY)) { + + flags |= IPC_PORT_ADJUST_SR_RECEIVED_MSG; + } + + ipc_port_adjust_special_reply_port(port, + flags, get_turnstile); + /* thread now has a turnstile */ } /* diff --git a/osfmk/ipc/mach_port.c b/osfmk/ipc/mach_port.c index ced4e6384..7d88c5481 100644 --- a/osfmk/ipc/mach_port.c +++ b/osfmk/ipc/mach_port.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #include @@ -120,14 +121,6 @@ void mach_port_gst_helper( mach_port_name_t *names, ipc_entry_num_t *actualp); - -kern_return_t -mach_port_guard_exception( - mach_port_name_t name, - uint64_t inguard, - uint64_t portguard, - unsigned reason); - /* Needs port locked */ void mach_port_get_status_helper( ipc_port_t port, @@ -464,8 +457,10 @@ mach_port_type( } kr = ipc_right_lookup_write(space, name, &entry); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; + } /* space is write-locked and active */ kr = ipc_right_info(space, name, entry, typep, &urefs); @@ -677,12 +672,14 @@ mach_port_allocate_full( } else { mach_msg_size_t size = qosp->len + MAX_TRAILER_SIZE; - if (right != MACH_PORT_RIGHT_RECEIVE) + if (right != MACH_PORT_RIGHT_RECEIVE) { return 
(KERN_INVALID_VALUE); + } kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size); - if (kmsg == IKM_NULL) + if (kmsg == IKM_NULL) { return (KERN_RESOURCE_SHORTAGE); + } } } @@ -763,8 +760,10 @@ mach_port_destroy( return KERN_SUCCESS; kr = ipc_right_lookup_write(space, name, &entry); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; + } /* space is write-locked and active */ kr = ipc_right_destroy(space, name, entry, TRUE, 0); /* unlocks space */ @@ -804,8 +803,10 @@ mach_port_deallocate( return KERN_SUCCESS; kr = ipc_right_lookup_write(space, name, &entry); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; + } /* space is write-locked */ kr = ipc_right_dealloc(space, name, entry); /* unlocks space */ @@ -857,8 +858,10 @@ mach_port_get_refs( } kr = ipc_right_lookup_write(space, name, &entry); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; + } /* space is write-locked and active */ kr = ipc_right_info(space, name, entry, &type, &urefs); @@ -937,8 +940,11 @@ mach_port_mod_refs( } kr = ipc_right_lookup_write(space, name, &entry); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; + } + /* space is write-locked and active */ kr = ipc_right_delta(space, name, entry, right, delta); /* unlocks */ @@ -1011,14 +1017,21 @@ mach_port_peek( * leaking the context pointer and to avoid variable-sized context issues. 
*/ if (GET_RCV_ELEMENTS(trailer_type) > MACH_RCV_TRAILER_AUDIT || - REQUESTED_TRAILER_SIZE(TRUE, trailer_type) > *trailer_sizep) + REQUESTED_TRAILER_SIZE(TRUE, trailer_type) > *trailer_sizep) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE); return KERN_INVALID_VALUE; + } *trailer_sizep = REQUESTED_TRAILER_SIZE(TRUE, trailer_type); kr = ipc_port_translate_receive(space, name, &port); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, + ((KERN_INVALID_NAME == kr) ? + kGUARD_EXC_INVALID_NAME : + kGUARD_EXC_INVALID_RIGHT)); return kr; + } /* Port locked and active */ @@ -1392,6 +1405,9 @@ mach_port_move_member( wq_link_id = waitq_link_reserve(NULL); wq_reserved_prepost = waitq_prepost_reserve(NULL, 10, WAITQ_DONT_LOCK); + kr = ipc_pset_lazy_allocate(space, after); + if (kr != KERN_SUCCESS) + goto done; } kr = ipc_right_lookup_read(space, member, &entry); @@ -2048,6 +2064,10 @@ mach_port_insert_member( wq_link_id = waitq_link_reserve(NULL); wq_reserved_prepost = waitq_prepost_reserve(NULL, 10, WAITQ_DONT_LOCK); + kr = ipc_pset_lazy_allocate(space, psname); + if (kr != KERN_SUCCESS) + goto done; + kr = ipc_object_translate_two(space, name, MACH_PORT_RIGHT_RECEIVE, &obj, @@ -2224,7 +2244,7 @@ mach_port_unguard_locked( * Returns: * KERN_FAILURE Thread marked with AST_GUARD. 
*/ -kern_return_t +void mach_port_guard_exception( mach_port_name_t name, __unused uint64_t inguard, @@ -2238,7 +2258,6 @@ mach_port_guard_exception( mach_exception_subcode_t subcode = (uint64_t)portguard; thread_t t = current_thread(); thread_guard_violation(t, code, subcode); - return KERN_FAILURE; } @@ -2253,16 +2272,65 @@ mach_port_guard_exception( */ void -mach_port_guard_ast(thread_t __unused t, +mach_port_guard_ast(thread_t t, mach_exception_data_type_t code, mach_exception_data_type_t subcode) { - assert(t->task != kernel_task); + unsigned int reason = EXC_GUARD_DECODE_GUARD_FLAVOR(code); + task_t task = t->task; + unsigned int behavior = task->task_exc_guard; + assert(task == current_task()); + assert(task != kernel_task); - /* Raise an EXC_GUARD exception */ - task_exception_notify(EXC_GUARD, code, subcode); + switch (reason) { + /* + * Fatal Mach port guards - always delivered synchronously + */ + case kGUARD_EXC_DESTROY: + case kGUARD_EXC_MOD_REFS: + case kGUARD_EXC_SET_CONTEXT: + case kGUARD_EXC_UNGUARDED: + case kGUARD_EXC_INCORRECT_GUARD: + task_exception_notify(EXC_GUARD, code, subcode); + task_bsdtask_kill(task); + break; + + default: + /* + * Mach port guards controlled by task settings. 
+ */ + + /* Is delivery enabled */ + if ((behavior & TASK_EXC_GUARD_MP_DELIVER) == 0) { + return; + } + + /* If only once, make sure we're that once */ + while (behavior & TASK_EXC_GUARD_MP_ONCE) { + uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_MP_DELIVER; + + if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) { + break; + } + behavior = task->task_exc_guard; + if ((behavior & TASK_EXC_GUARD_MP_DELIVER) == 0) { + return; + } + } + + /* Raise exception via corpse fork or synchronously */ + if ((task->task_exc_guard & TASK_EXC_GUARD_MP_CORPSE) && + (task->task_exc_guard & TASK_EXC_GUARD_MP_FATAL) == 0) { + task_violated_guard(code, subcode, NULL); + } else { + task_exception_notify(EXC_GUARD, code, subcode); + } - /* Terminate task which caused the exception */ - task_bsdtask_kill(current_task()); + /* Terminate the task if desired */ + if (task->task_exc_guard & TASK_EXC_GUARD_MP_FATAL) { + task_bsdtask_kill(task); + } + break; + } } /* @@ -2390,8 +2458,10 @@ mach_port_destruct( /* Remove reference for receive right */ kr = ipc_right_lookup_write(space, name, &entry); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME); return kr; + } /* space is write-locked and active */ kr = ipc_right_destruct(space, name, entry, srdelta, guard); /* unlocks */ @@ -2431,15 +2501,23 @@ mach_port_guard( /* Guard can be applied only to receive rights */ kr = ipc_port_translate_receive(space, name, &port); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, + ((KERN_INVALID_NAME == kr) ? 
+ kGUARD_EXC_INVALID_NAME : + kGUARD_EXC_INVALID_RIGHT)); return kr; + } /* Port locked and active */ kr = mach_port_guard_locked(port, guard, strict); ip_unlock(port); - return kr; + if (KERN_INVALID_ARGUMENT == kr) { + mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_ARGUMENT); + } + return kr; } /* @@ -2474,12 +2552,18 @@ mach_port_unguard( return KERN_INVALID_NAME; kr = ipc_port_translate_receive(space, name, &port); - if (kr != KERN_SUCCESS) + if (kr != KERN_SUCCESS) { + mach_port_guard_exception(name, 0, 0, + ((KERN_INVALID_NAME == kr) ? + kGUARD_EXC_INVALID_NAME : + kGUARD_EXC_INVALID_RIGHT)); return kr; + } /* Port locked and active */ kr = mach_port_unguard_locked(port, name, guard); ip_unlock(port); + return kr; } diff --git a/osfmk/ipc/port.h b/osfmk/ipc/port.h index 26dd9bcb1..7e25b7f05 100644 --- a/osfmk/ipc/port.h +++ b/osfmk/ipc/port.h @@ -86,4 +86,12 @@ #define MACH_PORT_UREFS_UNDERFLOW(urefs, delta) \ (((delta) < 0) && (((mach_port_urefs_t)-(delta)) > (urefs))) +__BEGIN_DECLS +extern void mach_port_guard_exception( + mach_port_name_t name, + uint64_t inguard, + uint64_t portguard, + unsigned reason); +__END_DECLS + #endif /* _IPC_PORT_H_ */ diff --git a/osfmk/kdp/kdp_core.c b/osfmk/kdp/kdp_core.c index c3ea1037c..d214c8e04 100644 --- a/osfmk/kdp/kdp_core.c +++ b/osfmk/kdp/kdp_core.c @@ -178,6 +178,10 @@ boolean_t kdp_has_polled_corefile(void) return (NULL != gIOPolledCoreFileVars); } +kern_return_t kdp_polled_corefile_error(void) +{ + return gIOPolledCoreFileOpenRet; +} #if CONFIG_EMBEDDED /* * Whenever we start a coredump, make sure the buffers @@ -461,7 +465,7 @@ kern_dump_disk_proc(unsigned int request, __unused char *corename, case KDP_DATA: err = IOPolledFileWrite(gIOPolledCoreFileVars, data, length, NULL); if (kIOReturnSuccess != err) { - kern_coredump_log(NULL, "IOPolledFileWrite(gIOPolledCoreFileVars, 0x%p, 0x%llx, NULL) returned 0x%x\n", + kern_coredump_log(NULL, "IOPolledFileWrite(gIOPolledCoreFileVars, %p, 0x%llx, NULL) returned 
0x%x\n", data, length, err); break; } @@ -510,7 +514,7 @@ kdp_core_zoutput(z_streamp strm, Bytef *buf, unsigned len) { if ((ret = (*vars->outproc)(KDP_DATA, NULL, len, buf)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(kdp_core_zoutput) outproc(KDP_DATA, NULL, 0x%x, 0x%p) returned 0x%x\n", + kern_coredump_log(NULL, "(kdp_core_zoutput) outproc(KDP_DATA, NULL, 0x%x, %p) returned 0x%x\n", len, buf, ret); vars->error = ret; } @@ -553,7 +557,7 @@ kdp_core_zoutputbuf(z_streamp strm, Bytef *inbuf, unsigned inlen) vars->outlen - vars->outremain, vars->outbuf)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(kdp_core_zoutputbuf) outproc(KDP_DATA, NULL, 0x%x, 0x%p) returned 0x%x\n", + kern_coredump_log(NULL, "(kdp_core_zoutputbuf) outproc(KDP_DATA, NULL, 0x%x, %p) returned 0x%x\n", (vars->outlen - vars->outremain), vars->outbuf, ret); vars->error = ret; } @@ -708,7 +712,7 @@ kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr, uintptr_t * pvphy vincr = kdp_core_ramdisk_size; } else -#if defined(__arm64__) +#if defined(__arm64__) && defined(CONFIG_XNUPOST) if (vaddr == _COMM_HIGH_PAGE64_BASE_ADDRESS) { /* not readable */ @@ -795,22 +799,25 @@ pmap_traverse_present_mappings(pmap_t __unused pmap, ppn = VM_PAGE_GET_PHYS_PAGE(m); break; } - m = (vm_page_t)vm_page_queue_next(&m->listq); - } - vcur = phystokv(ptoa(ppn)); - if (vcur != vprev) - { - ret = callback(vcurstart, vprev, context); - lastvavalid = FALSE; + m = (vm_page_t)vm_page_queue_next(&m->vmp_listq); } vincr = PAGE_SIZE_64; if (ppn == atop(avail_end)) { vm_object_unlock(&pmap_object_store); m = VM_PAGE_NULL; + // avail_end is not a valid physical address, + // so phystokv(avail_end) may not produce the expected result. 
+ vcur = phystokv(avail_start) + (avail_end - avail_start); + } else { + m = (vm_page_t)vm_page_queue_next(&m->vmp_listq); + vcur = phystokv(ptoa(ppn)); + } + if (vcur != vprev) + { + ret = callback(vcurstart, vprev, context); + lastvavalid = FALSE; } - else - m = (vm_page_t)vm_page_queue_next(&m->listq); } if (m == VM_PAGE_NULL) ppn = kernel_pmap_present_mapping(vcur, &vincr, NULL); @@ -1066,13 +1073,13 @@ kern_dump_update_header(struct kdp_core_out_vars *outvars) /* Write the file header -- first seek to the beginning of the file */ foffset = 0; if ((ret = (outvars->outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n", + kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n", sizeof(foffset), &foffset, foffset, ret); return ret; } if ((ret = (outvars->outproc)(KDP_DATA, NULL, sizeof(kdp_core_header), &kdp_core_header)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_DATA, NULL, %lu, 0x%p) returned 0x%x\n", + kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_DATA, NULL, %lu, %p) returned 0x%x\n", sizeof(kdp_core_header), &kdp_core_header, ret); return ret; } @@ -1125,7 +1132,7 @@ kern_dump_seek_to_next_file(void *kdp_core_out_vars, uint64_t next_file_offset) int ret; if ((ret = (outvars->outproc)(KDP_SEEK, NULL, sizeof(next_file_offset), &next_file_offset)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(kern_dump_seek_to_next_file) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n", + kern_coredump_log(NULL, "(kern_dump_seek_to_next_file) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n", sizeof(next_file_offset), &next_file_offset, next_file_offset, ret); } @@ -1186,7 +1193,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) /* Seek the 
calculated offset (we'll scrollback later to flush the logs and header) */ if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(do_kern_dump seek begin) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n", + kern_coredump_log(NULL, "(do_kern_dump seek begin) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n", sizeof(foffset), &foffset, foffset, ret); dump_succeeded = FALSE; goto exit; @@ -1237,11 +1244,11 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) kern_coredump_log(NULL, "Failed to reset outvars for stackshot with len 0x%zx, returned 0x%x\n", panic_stackshot_len, ret); dump_succeeded = FALSE; } else if ((ret = kdp_core_output(&outvars, panic_stackshot_len, (void *)panic_stackshot_buf)) != KERN_SUCCESS) { - kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_coreoutput(outvars, %lu, 0x%p) returned 0x%x\n", + kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_coreoutput(outvars, %lu, %p) returned 0x%x\n", panic_stackshot_len, (void *) panic_stackshot_buf, ret); dump_succeeded = FALSE; } else if ((ret = kdp_core_output(&outvars, 0, NULL)) != KERN_SUCCESS) { - kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(0x%p, 0, NULL) returned 0x%x\n", &outvars, ret); + kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", &outvars, ret); dump_succeeded = FALSE; } else if ((ret = kern_dump_record_file(&outvars, "panic_stackshot.kcdata", foffset, &compressed_stackshot_len)) != KERN_SUCCESS) { kern_coredump_log(NULL, "Failed to record panic stackshot in corefile header, kern_dump_record_file returned 0x%x\n", ret); @@ -1255,7 +1262,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) /* Write the debug log -- first seek to the end of the corefile header */ foffset = KERN_COREDUMP_HEADERSIZE; if ((ret = 
(*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(do_kern_dump seek logfile) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n", + kern_coredump_log(NULL, "(do_kern_dump seek logfile) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n", sizeof(foffset), &foffset, foffset, ret); dump_succeeded = FALSE; goto exit; @@ -1281,7 +1288,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) */ buf = debug_buf_base; if ((ret = (*outproc)(KDP_DATA, NULL, existing_log_size, buf)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(do_kern_dump paniclog) outproc(KDP_DATA, NULL, %lu, 0x%p) returned 0x%x\n", + kern_coredump_log(NULL, "(do_kern_dump paniclog) outproc(KDP_DATA, NULL, %lu, %p) returned 0x%x\n", existing_log_size, buf, ret); dump_succeeded = FALSE; goto exit; @@ -1302,7 +1309,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant) /* Write the coredump log */ if ((ret = (*outproc)(KDP_DATA, NULL, new_log_len, buf)) != kIOReturnSuccess) { - kern_coredump_log(NULL, "(do_kern_dump coredump log) outproc(KDP_DATA, NULL, %lu, 0x%p) returned 0x%x\n", + kern_coredump_log(NULL, "(do_kern_dump coredump log) outproc(KDP_DATA, NULL, %lu, %p) returned 0x%x\n", new_log_len, buf, ret); dump_succeeded = FALSE; goto exit; @@ -1382,17 +1389,18 @@ kern_dump(enum kern_dump_type kd_variant) } #if CONFIG_EMBEDDED -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wmissing-noreturn" void panic_spin_shmcon() { -#pragma clang diagnostic pop + if (hwsd_info == NULL) { + kern_coredump_log(NULL, "handshake structure not initialized\n"); + return; + } + kern_coredump_log(NULL, "\nPlease go to https://panic.apple.com to report this panic\n"); kern_coredump_log(NULL, "Waiting for hardware shared memory debugger, handshake structure is at virt: %p, phys %p\n", hwsd_info, (void *)kvtophys((vm_offset_t)hwsd_info)); - assert(hwsd_info != NULL); 
hwsd_info->xhsdci_status = XHSDCI_STATUS_KERNEL_READY; hwsd_info->xhsdci_seq_no = 0; FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info)); diff --git a/osfmk/kdp/kdp_core.h b/osfmk/kdp/kdp_core.h index 45cee67b1..7e0b17cfd 100644 --- a/osfmk/kdp/kdp_core.h +++ b/osfmk/kdp/kdp_core.h @@ -156,6 +156,7 @@ void kern_collectth_state_size(uint64_t * tstate_count, uint64_t * tstate_size); void kern_collectth_state(thread_t thread, void *buffer, uint64_t size, void **iter); boolean_t kdp_has_polled_corefile(void); +kern_return_t kdp_polled_corefile_error(void); void kdp_core_init(void); diff --git a/osfmk/kdp/ml/arm/kdp_machdep.c b/osfmk/kdp/ml/arm/kdp_machdep.c index 1bf25ac74..4e7fa0639 100644 --- a/osfmk/kdp/ml/arm/kdp_machdep.c +++ b/osfmk/kdp/ml/arm/kdp_machdep.c @@ -65,7 +65,8 @@ int machine_trace_thread64(thread_t thread, int nframes, boolean_t user_p, boolean_t trace_fp, - uint32_t * thread_trace_flags); + uint32_t * thread_trace_flags, + uint64_t *sp); void kdp_trap(unsigned int, struct arm_saved_state * saved_state); @@ -495,11 +496,22 @@ machine_trace_thread(thread_t thread, if(target_cpu_datap == (cpu_data_t *)NULL) continue; - if ((prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) || - (prevfp >= (target_cpu_datap->fiqstack_top-PAGE_SIZE) && prevfp < target_cpu_datap->fiqstack_top)) { + if (prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) { prev_in_interrupt_stack = TRUE; break; } + +#if defined(__arm__) + if (prevfp >= (target_cpu_datap->fiqstack_top-FIQSTACK_SIZE) && prevfp < target_cpu_datap->fiqstack_top) { + prev_in_interrupt_stack = TRUE; + break; + } +#elif defined(__arm64__) + if (prevfp >= (target_cpu_datap->excepstack_top-EXCEPSTACK_SIZE) && prevfp < target_cpu_datap->excepstack_top) { + prev_in_interrupt_stack = TRUE; + break; + } +#endif } } @@ -555,8 +567,10 @@ machine_trace_thread64(thread_t thread, int nframes, boolean_t user_p, 
boolean_t trace_fp, - uint32_t * thread_trace_flags) + uint32_t * thread_trace_flags, + uint64_t *sp_out) { +#pragma unused(sp_out) #if defined(__arm__) #pragma unused(thread, tracepos, tracebound, nframes, user_p, trace_fp, thread_trace_flags) return 0; @@ -577,6 +591,8 @@ machine_trace_thread64(thread_t thread, vm_offset_t kern_virt_addr = 0; vm_map_t bt_vm_map = VM_MAP_NULL; + const boolean_t is_64bit_addr = thread_is_64bit_addr(thread); + nframes = (tracebound > tracepos) ? MIN(nframes, (int)((tracebound - tracepos) / framesize)) : 0; if (!nframes) { return (0); @@ -586,8 +602,8 @@ machine_trace_thread64(thread_t thread, if (user_p) { /* Examine the user savearea */ state = thread->machine.upcb; - stacklimit = MACH_VM_MAX_ADDRESS; - stacklimit_bottom = MACH_VM_MIN_ADDRESS; + stacklimit = (is_64bit_addr) ? MACH_VM_MAX_ADDRESS : VM_MAX_ADDRESS; + stacklimit_bottom = (is_64bit_addr) ? MACH_VM_MIN_ADDRESS : VM_MIN_ADDRESS; /* Fake up a stack frame for the PC */ *tracebuf++ = get_saved_state_pc(state); @@ -666,12 +682,21 @@ machine_trace_thread64(thread_t thread, if(target_cpu_datap == (cpu_data_t *)NULL) continue; - if ((prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) || - (prevfp >= (target_cpu_datap->fiqstack_top-PAGE_SIZE) && prevfp < target_cpu_datap->fiqstack_top)) { + if (prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) { switched_stacks = TRUE; break; } - +#if defined(__arm__) + if (prevfp >= (target_cpu_datap->fiqstack_top-FIQSTACK_SIZE) && prevfp < target_cpu_datap->fiqstack_top) { + switched_stacks = TRUE; + break; + } +#elif defined(__arm64__) + if (prevfp >= (target_cpu_datap->excepstack_top-EXCEPSTACK_SIZE) && prevfp < target_cpu_datap->excepstack_top) { + switched_stacks = TRUE; + break; + } +#endif } } diff --git a/osfmk/kdp/ml/x86_64/kdp_machdep.c b/osfmk/kdp/ml/x86_64/kdp_machdep.c index 0d716b5d1..5cfc3be33 100644 --- 
a/osfmk/kdp/ml/x86_64/kdp_machdep.c +++ b/osfmk/kdp/ml/x86_64/kdp_machdep.c @@ -586,7 +586,8 @@ machine_trace_thread64(thread_t thread, int nframes, boolean_t user_p, boolean_t trace_fp, - uint32_t * thread_trace_flags) + uint32_t * thread_trace_flags, + uint64_t *sp) { uint64_t * tracebuf = (uint64_t *)tracepos; unsigned framesize = (trace_fp ? 2 : 1) * sizeof(addr64_t); @@ -607,6 +608,9 @@ machine_trace_thread64(thread_t thread, prev_rip = iss64->isf.rip; stackptr = iss64->rbp; bt_vm_map = thread->task->map; + if (sp && user_p) { + *sp = iss64->isf.rsp; + } } else { stackptr = STACK_IKS(thread->kernel_stack)->k_rbp; diff --git a/osfmk/kdp/processor_core.c b/osfmk/kdp/processor_core.c index f33d9915c..e1c40e141 100644 --- a/osfmk/kdp/processor_core.c +++ b/osfmk/kdp/processor_core.c @@ -258,7 +258,7 @@ coredump_save_summary(uint64_t core_segment_count, uint64_t core_byte_count, /* Send the core_header to the output procedure */ ret = kdp_core_output(core_context->core_outvars, sizeof(core_header), (caddr_t)&core_header); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(0x%p, %lu, 0x%p) returned error 0x%x\n", + kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(%p, %lu, %p) returned error 0x%x\n", core_context->core_outvars, sizeof(core_header), &core_header, ret); return ret; } @@ -280,7 +280,7 @@ coredump_save_summary(uint64_t core_segment_count, uint64_t core_byte_count, /* Send the core_header to the output procedure */ ret = kdp_core_output(core_context->core_outvars, sizeof(core_header), (caddr_t)&core_header); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(0x%p, %lu, 0x%p) returned error 0x%x\n", + kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(%p, %lu, %p) returned error 0x%x\n", 
core_context->core_outvars, sizeof(core_header), &core_header, ret); return ret; } @@ -303,13 +303,13 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end, uint64_t size = seg_end - seg_start; if (seg_end <= seg_start) { - kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : called with invalid addresses : start 0x%llx >= end 0x%llx\n", + kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : called with invalid addresses : start 0x%llx >= end 0x%llx\n", seg_start, seg_end, context, seg_start, seg_end); return KERN_INVALID_ARGUMENT; } if (core_context->core_segments_remaining == 0) { - kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : coredump_save_segment_descriptions() called too many times, %llu segment descriptions already recorded\n", + kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : coredump_save_segment_descriptions() called too many times, %llu segment descriptions already recorded\n", seg_start, seg_end, context, core_context->core_segment_count); return KERN_INVALID_ARGUMENT; } @@ -320,7 +320,7 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end, struct segment_command_64 seg_command = { }; if (core_context->core_cur_hoffset + sizeof(seg_command) > core_context->core_header_size) { - kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : ran out of space to save commands with %llu of %llu remaining\n", + kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : ran out of space to save commands with %llu of %llu remaining\n", seg_start, seg_end, context, core_context->core_segments_remaining, core_context->core_segment_count); return KERN_NO_SPACE; } @@ -338,7 +338,7 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end, /* Flush new command to output */ ret = 
kdp_core_output(core_context->core_outvars, sizeof(seg_command), (caddr_t)&seg_command); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : failed to write segment %llu of %llu. kdp_core_output(0x%p, %lu, 0x%p) returned error %d\n", + kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : failed to write segment %llu of %llu. kdp_core_output(%p, %lu, %p) returned error %d\n", seg_start, seg_end, context, core_context->core_segment_count - core_context->core_segments_remaining, core_context->core_segment_count, core_context->core_outvars, sizeof(seg_command), &seg_command, ret); return ret; @@ -351,13 +351,13 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end, struct segment_command seg_command = { }; if (seg_start > UINT32_MAX || seg_end > UINT32_MAX) { - kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : called with invalid addresses for 32-bit : start 0x%llx, end 0x%llx\n", + kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : called with invalid addresses for 32-bit : start 0x%llx, end 0x%llx\n", seg_start, seg_end, context, seg_start, seg_end); return KERN_INVALID_ARGUMENT; } if (core_context->core_cur_hoffset + sizeof(seg_command) > core_context->core_header_size) { - kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : ran out of space to save commands with %llu of %llu remaining\n", + kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : ran out of space to save commands with %llu of %llu remaining\n", seg_start, seg_end, context, core_context->core_segments_remaining, core_context->core_segment_count); return KERN_NO_SPACE; } @@ -375,7 +375,7 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end, /* Flush new command to output */ ret = kdp_core_output(core_context->core_outvars, 
sizeof(seg_command), (caddr_t)&seg_command); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : failed to write segment %llu of %llu : kdp_core_output(0x%p, %lu, 0x%p) returned error 0x%x\n", + kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : failed to write segment %llu of %llu : kdp_core_output(%p, %lu, %p) returned error 0x%x\n", seg_start, seg_end, context, core_context->core_segment_count - core_context->core_segments_remaining, core_context->core_segment_count, core_context->core_outvars, sizeof(seg_command), &seg_command, ret); return ret; @@ -404,20 +404,20 @@ coredump_save_thread_state(void *thread_state, void *context) int ret; if (tc->cmd != LC_THREAD) { - kern_coredump_log(context, "coredump_save_thread_state(0x%p, 0x%p) : found %d expected LC_THREAD (%d)\n", + kern_coredump_log(context, "coredump_save_thread_state(%p, %p) : found %d expected LC_THREAD (%d)\n", thread_state, context, tc->cmd, LC_THREAD); return KERN_INVALID_ARGUMENT; } if (core_context->core_cur_hoffset + core_context->core_thread_state_size > core_context->core_header_size) { - kern_coredump_log(context, "coredump_save_thread_state(0x%p, 0x%p) : ran out of space to save threads with %llu of %llu remaining\n", + kern_coredump_log(context, "coredump_save_thread_state(%p, %p) : ran out of space to save threads with %llu of %llu remaining\n", thread_state, context, core_context->core_threads_remaining, core_context->core_thread_count); return KERN_NO_SPACE; } ret = kdp_core_output(core_context->core_outvars, core_context->core_thread_state_size, (caddr_t)thread_state); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_thread_state(0x%p, 0x%p) : failed to write thread data : kdp_core_output(0x%p, %llu, 0x%p) returned 0x%x\n", + kern_coredump_log(context, "coredump_save_thread_state(%p, %p) : failed to write thread data : kdp_core_output(%p, %llu, %p) returned 0x%x\n", 
thread_state, context, core_context->core_outvars, core_context->core_thread_state_size, thread_state, ret); return ret; } @@ -436,13 +436,13 @@ coredump_save_sw_vers(void *sw_vers, uint64_t length, void *context) int ret; if (length > KERN_COREDUMP_VERSIONSTRINGMAXSIZE || !length) { - kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : called with invalid length %llu\n", + kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : called with invalid length %llu\n", sw_vers, length, context, length); return KERN_INVALID_ARGUMENT; } if (core_context->core_cur_hoffset + sizeof(struct ident_command) + length > core_context->core_header_size) { - kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : ran out of space to save data\n", + kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : ran out of space to save data\n", sw_vers, length, context); return KERN_NO_SPACE; } @@ -451,14 +451,14 @@ coredump_save_sw_vers(void *sw_vers, uint64_t length, void *context) ident.cmdsize = (uint32_t)(sizeof(struct ident_command) + KERN_COREDUMP_VERSIONSTRINGMAXSIZE); ret = kdp_core_output(core_context->core_outvars, sizeof(struct ident_command), (caddr_t)&ident); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : failed to write ident command : kdp_core_output(0x%p, %lu, 0x%p) returned 0x%x\n", + kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : failed to write ident command : kdp_core_output(%p, %lu, %p) returned 0x%x\n", sw_vers, length, context, core_context->core_outvars, sizeof(struct ident_command), &ident, ret); return ret; } ret = kdp_core_output(core_context->core_outvars, length, (caddr_t)sw_vers); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : failed to write version string : kdp_core_output(0x%p, %llu, 0x%p) returned 0x%x\n", + kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : failed to 
write version string : kdp_core_output(%p, %llu, %p) returned 0x%x\n", sw_vers, length, context, core_context->core_outvars, length, sw_vers, ret); return ret; } @@ -467,7 +467,7 @@ coredump_save_sw_vers(void *sw_vers, uint64_t length, void *context) /* Zero fill to the full command size */ ret = kdp_core_output(core_context->core_outvars, (KERN_COREDUMP_VERSIONSTRINGMAXSIZE - length), NULL); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : failed to write zero fill padding : kdp_core_output(0x%p, %llu, NULL) returned 0x%x\n", + kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : failed to write zero fill padding : kdp_core_output(%p, %llu, NULL) returned 0x%x\n", sw_vers, length, context, core_context->core_outvars, (KERN_COREDUMP_VERSIONSTRINGMAXSIZE - length), ret); return ret; } @@ -485,7 +485,7 @@ coredump_save_segment_data(void *seg_data, uint64_t length, void *context) processor_core_context *core_context = (processor_core_context *)context; if (length > core_context->core_segment_bytes_remaining) { - kern_coredump_log(context, "coredump_save_segment_data(0x%p, %llu, 0x%p) : called with too much data, %llu written, %llu left\n", + kern_coredump_log(context, "coredump_save_segment_data(%p, %llu, %p) : called with too much data, %llu written, %llu left\n", seg_data, length, context, core_context->core_segment_byte_total - core_context->core_segment_bytes_remaining, core_context->core_segment_bytes_remaining); return KERN_INVALID_ARGUMENT; @@ -493,7 +493,7 @@ coredump_save_segment_data(void *seg_data, uint64_t length, void *context) ret = kdp_core_output(core_context->core_outvars, length, (caddr_t)seg_data); if (ret != KERN_SUCCESS) { - kern_coredump_log(context, "coredump_save_segment_data(0x%p, %llu, 0x%p) : failed to write data (%llu bytes remaining) :%d\n", + kern_coredump_log(context, "coredump_save_segment_data(%p, %llu, %p) : failed to write data (%llu bytes remaining) :%d\n", seg_data, 
length, context, core_context->core_segment_bytes_remaining, ret); return ret; } @@ -595,7 +595,7 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor /* Zero fill between the end of the header and the beginning of the segment data file offset */ ret = kdp_core_output(context.core_outvars, (round_page(context.core_header_size) - context.core_header_size), NULL); if (ret != KERN_SUCCESS) { - kern_coredump_log(&context, "(kern_coredump_routine) : failed to write zero fill padding (%llu bytes remaining) : kdp_core_output(0x%p, %llu, NULL) returned 0x%x\n", + kern_coredump_log(&context, "(kern_coredump_routine) : failed to write zero fill padding (%llu bytes remaining) : kdp_core_output(%p, %llu, NULL) returned 0x%x\n", context.core_segment_bytes_remaining, context.core_outvars, (round_page(context.core_header_size) - context.core_header_size), ret); return ret; } @@ -618,7 +618,7 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor /* Flush the last data out */ ret = kdp_core_output(context.core_outvars, 0, NULL); if (ret != KERN_SUCCESS) { - kern_coredump_log(&context, "(kern_coredump_routine) : failed to flush final core data : kdp_core_output(0x%p, 0, NULL) returned 0x%x\n", + kern_coredump_log(&context, "(kern_coredump_routine) : failed to flush final core data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", context.core_outvars, ret); return ret; } diff --git a/osfmk/kern/Makefile b/osfmk/kern/Makefile index dcde0c50e..bf4cc9197 100644 --- a/osfmk/kern/Makefile +++ b/osfmk/kern/Makefile @@ -15,11 +15,13 @@ DATAFILES = \ PRIVATE_DATAFILES = \ cs_blobs.h \ + trustcache.h \ debug.h \ ecc.h \ block_hint.h \ monotonic.h \ - arithmetic_128.h + arithmetic_128.h \ + turnstile.h EXPORT_FILES = \ affinity.h \ @@ -27,6 +29,7 @@ EXPORT_FILES = \ audit_sessionport.h \ backtrace.h \ bits.h \ + btlog.h \ call_entry.h \ clock.h \ coalition.h \ @@ -52,6 +55,7 @@ EXPORT_FILES = \ policy_internal.h \ processor.h \ 
queue.h \ + priority_queue.h \ sched_prim.h \ sfi.h \ simple_lock.h \ @@ -72,6 +76,11 @@ PRIVATE_EXPORT_FILES = \ copyout_shim.h +XNU_ONLY_EXPORTS = \ + cpu_quiesce.h \ + ipc_kobject.h \ + ux_handler.h + INSTALL_MI_LIST = ${DATAFILES} INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES} @@ -80,7 +89,7 @@ INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} ${PRI INSTALL_MI_DIR = kern -EXPORT_MI_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} ipc_kobject.h +EXPORT_MI_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} ${XNU_ONLY_EXPORTS} EXPORT_MI_DIR = kern diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index 8f282ce58..a75d45dff 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -56,6 +56,7 @@ #include #include +#include #include #include #include @@ -297,7 +298,28 @@ ast_taken_user(void) } } + if (ast_consume(AST_UNQUIESCE) == AST_UNQUIESCE) { + cpu_quiescent_counter_ast(); + } + + cpu_quiescent_counter_assert_ast(); + splx(s); + + /* + * Here's a good place to put assertions of things which must be true + * upon return to userspace. 
+ */ + assert((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) == 0); + assert((thread->sched_flags & TH_SFLAG_RW_PROMOTED) == 0); + assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0); + assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0); + assert((thread->sched_flags & TH_SFLAG_DEPRESS) == 0); + + assert(thread->promotions == 0); + assert(thread->was_promoted_on_wakeup == 0); + assert(thread->waiting_for_mutex == NULL); + assert(thread->rwlock_count == 0); } /* diff --git a/osfmk/kern/ast.h b/osfmk/kern/ast.h index d41821e2d..1fd6916ab 100644 --- a/osfmk/kern/ast.h +++ b/osfmk/kern/ast.h @@ -122,11 +122,13 @@ typedef uint32_t ast_t; #define AST_GUARD 0x1000 #define AST_TELEMETRY_USER 0x2000 /* telemetry sample requested on interrupt from userspace */ #define AST_TELEMETRY_KERNEL 0x4000 /* telemetry sample requested on interrupt from kernel */ +#define AST_TELEMETRY_PMI 0x8000 /* telemetry sample requested on PMI */ #define AST_SFI 0x10000 /* Evaluate if SFI wait is needed before return to userspace */ #define AST_DTRACE 0x20000 #define AST_TELEMETRY_IO 0x40000 /* telemetry sample requested for I/O */ #define AST_KEVENT 0x80000 #define AST_REBALANCE 0x100000 /* thread context switched due to rebalancing */ +#define AST_UNQUIESCE 0x200000 /* catch unquiesced processor before returning to userspace */ #define AST_NONE 0x00 #define AST_ALL (~AST_NONE) @@ -134,7 +136,8 @@ typedef uint32_t ast_t; #define AST_SCHEDULING (AST_PREEMPTION | AST_YIELD | AST_HANDOFF) #define AST_PREEMPTION (AST_PREEMPT | AST_QUANTUM | AST_URGENT) -#define AST_TELEMETRY_ALL (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL | AST_TELEMETRY_IO) +#define AST_TELEMETRY_ALL (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL | \ + AST_TELEMETRY_PMI | AST_TELEMETRY_IO) /* Per-thread ASTs follow the thread at context-switch time. 
*/ #define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | AST_LEDGER | AST_GUARD | AST_TELEMETRY_ALL | AST_KEVENT) diff --git a/osfmk/kern/backtrace.c b/osfmk/kern/backtrace.c index b47ce7940..0588970f0 100644 --- a/osfmk/kern/backtrace.c +++ b/osfmk/kern/backtrace.c @@ -41,6 +41,7 @@ #endif + uint32_t __attribute__((noinline)) backtrace(uintptr_t *bt, uint32_t max_frames) { @@ -84,6 +85,7 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) while (fp != NULL && frame_index < max_frames) { uintptr_t *next_fp = (uintptr_t *)*fp; + uintptr_t ret_addr = *(fp + 1); /* return address is one word higher than frame pointer */ /* * If the frame pointer is 0, backtracing has reached the top of @@ -97,8 +99,7 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame) break; } - /* return address is one word higher than frame pointer */ - bt[frame_index++] = *(fp + 1); + bt[frame_index++] = ret_addr; /* stacks grow down; backtracing should be moving to higher addresses */ if (next_fp <= fp) { @@ -218,7 +219,7 @@ backtrace_interrupted(uintptr_t *bt, uint32_t max_frames) return 1; } - return backtrace_frame(bt + 1, max_frames - 1, (void *)fp); + return backtrace_frame(bt + 1, max_frames - 1, (void *)fp) + 1; } int @@ -235,16 +236,11 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, { bool user_64; uintptr_t pc, fp, next_fp; - vm_map_t map, old_map; + vm_map_t map = NULL, old_map = NULL; uint32_t frame_index = 0; int err = 0; size_t frame_size; - assert(ml_get_interrupts_enabled() == TRUE); - if (!ml_get_interrupts_enabled()) { - return EINVAL; - } - assert(bt != NULL); assert(max_frames > 0); assert(frames_out != NULL); @@ -302,15 +298,23 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, #error "backtrace_thread_user: unsupported architecture" #endif /* !defined(__arm__) */ - /* switch to the correct map, for copyin */ - if (thread != current_thread()) { - map = 
get_task_map_reference(get_threadtask(thread)); - if (map == NULL) { - return EINVAL; - } - old_map = vm_map_switch(map); - } else { - map = NULL; + if (max_frames == 0) { + goto out; + } + + bt[frame_index++] = pc; + + if (frame_index >= max_frames) { + goto out; + } + + if (INVALID_USER_FP(fp)) { + goto out; + } + + assert(ml_get_interrupts_enabled() == TRUE); + if (!ml_get_interrupts_enabled()) { + return EINVAL; } union { @@ -323,12 +327,18 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, uint32_t ret; } u32; } frame; - frame_size = 2 * (user_64 ? sizeof(uint64_t) : sizeof(uint32_t)); - bt[frame_index++] = pc; + frame_size = 2 * (user_64 ? sizeof(uint64_t) : sizeof(uint32_t)); - if (INVALID_USER_FP(fp)) { - goto out; + /* switch to the correct map, for copyin */ + if (thread != current_thread()) { + map = get_task_map_reference(get_threadtask(thread)); + if (map == NULL) { + return EINVAL; + } + old_map = vm_map_switch(map); + } else { + map = NULL; } while (fp != 0 && frame_index < max_frames) { @@ -343,7 +353,8 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames, break; } - bt[frame_index++] = user_64 ? frame.u64.ret : frame.u32.ret; + uintptr_t ret_addr = user_64 ? frame.u64.ret : frame.u32.ret; + bt[frame_index++] = ret_addr; /* stacks grow down; backtracing should be moving to higher addresses */ if (next_fp <= fp) { diff --git a/osfmk/kern/bits.h b/osfmk/kern/bits.h index 5c977497d..13ce948d2 100644 --- a/osfmk/kern/bits.h +++ b/osfmk/kern/bits.h @@ -39,7 +39,7 @@ typedef unsigned int uint; #define BIT(b) (1ULL << (b)) -#define mask(width) (BIT(width) - 1) +#define mask(width) (width >= 64 ? 
-1 : (BIT(width) - 1)) #define extract(x, shift, width) ((((uint64_t)(x)) >> (shift)) & mask(width)) #define bits(x, hi, lo) extract((x), (lo), (hi) - (lo) + 1) @@ -47,6 +47,31 @@ typedef unsigned int uint; #define bit_clear(x, b) ((x) &= ~BIT(b)) #define bit_test(x, b) ((bool)((x) & BIT(b))) +inline static uint64_t +bit_ror64(uint64_t bitmap, uint n) +{ +#if defined(__arm64__) + uint64_t result; + uint64_t _n = (uint64_t)n; + asm volatile("ror %0, %1, %2" : "=r" (result) : "r" (bitmap), "r" (_n)); + return result; +#else + n = n & 63; + return ((bitmap >> n) | (bitmap << (64 - n))); +#endif +} + +inline static uint64_t +bit_rol64(uint64_t bitmap, uint n) +{ +#if defined(__arm64__) + return bit_ror64(bitmap, 64U - n); +#else + n = n & 63; + return ((bitmap << n) | (bitmap >> (64 - n))); +#endif +} + /* Non-atomically clear the bit and returns whether the bit value was changed */ inline static bool bit_clear_if_set(uint64_t bitmap, int bit) diff --git a/osfmk/kern/block_hint.h b/osfmk/kern/block_hint.h index c52b9488f..f379d0850 100644 --- a/osfmk/kern/block_hint.h +++ b/osfmk/kern/block_hint.h @@ -29,7 +29,6 @@ #ifndef _KERN_BLOCK_HINT_H_ #define _KERN_BLOCK_HINT_H_ -/* This must fit inside a short */ typedef enum thread_snapshot_wait_flags { kThreadWaitNone = 0x00, kThreadWaitKernelMutex = 0x01, @@ -48,8 +47,12 @@ typedef enum thread_snapshot_wait_flags { kThreadWaitPThreadCondVar = 0x0e, kThreadWaitParkedWorkQueue = 0x0f, kThreadWaitWorkloopSyncWait = 0x10, + kThreadWaitOnProcess = 0x11, } __attribute__((packed)) block_hint_t; +_Static_assert(sizeof(block_hint_t) <= sizeof(short), + "block_hint_t must fit within a short"); + #ifdef XNU_KERNEL_PRIVATE struct waitq; @@ -66,6 +69,7 @@ extern void kdp_rwlck_find_owner(struct waitq * waitq, event64_t event, thread_w extern void kdp_pthread_find_owner(thread_t thread, thread_waitinfo_t *waitinfo); extern void *kdp_pthread_get_thread_kwq(thread_t thread); extern void kdp_workloop_sync_wait_find_owner(thread_t thread, 
event64_t event, thread_waitinfo_t *waitinfo); +extern void kdp_wait4_find_process(thread_t thread, event64_t event, thread_waitinfo_t *waitinfo); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/bsd_kern.c b/osfmk/kern/bsd_kern.c index f70820520..d017ae520 100644 --- a/osfmk/kern/bsd_kern.c +++ b/osfmk/kern/bsd_kern.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -308,7 +308,7 @@ int get_task_numacts(task_t t) /* does this machine need 64bit register set for signal handler */ int is_64signalregset(void) { - if (task_has_64BitData(current_task())) { + if (task_has_64Bit_data(current_task())) { return(1); } @@ -430,21 +430,24 @@ uint64_t get_task_phys_footprint(task_t task) return 0; } +#if CONFIG_LEDGER_INTERVAL_MAX /* * */ -uint64_t get_task_phys_footprint_recent_max(task_t task) +uint64_t get_task_phys_footprint_interval_max(task_t task, int reset) { kern_return_t ret; ledger_amount_t max; - ret = ledger_get_recent_max(task->ledger, task_ledgers.phys_footprint, &max); - if (KERN_SUCCESS == ret) { + ret = ledger_get_interval_max(task->ledger, task_ledgers.phys_footprint, &max, reset); + + if(KERN_SUCCESS == ret) { return max; } return 0; } +#endif /* CONFIG_LEDGER_INTERVAL_MAX */ /* * @@ -583,6 +586,46 @@ uint64_t get_task_iokit_mapped(task_t task) return 0; } +uint64_t get_task_network_nonvolatile(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.network_nonvolatile, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + +uint64_t get_task_network_nonvolatile_compressed(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.network_nonvolatile_compressed, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + 
} + + return 0; +} + +uint64_t get_task_wired_mem(task_t task) +{ + kern_return_t ret; + ledger_amount_t credit, debit; + + ret = ledger_get_entries(task->ledger, task_ledgers.wired_mem, &credit, &debit); + if (KERN_SUCCESS == ret) { + return (credit - debit); + } + + return 0; +} + + uint64_t get_task_cpu_time(task_t task) { kern_return_t ret; @@ -885,7 +928,7 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo) } int -fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_threadinfo_internal * ptinfo, void * vpp, int *vidp) +fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_threadinfo_internal * ptinfo, void * vpp, int *vidp) { thread_t thact; int err=0; @@ -898,7 +941,7 @@ fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_th for (thact = (thread_t)(void *)queue_first(&task->threads); !queue_end(&task->threads, (queue_entry_t)thact); ) { - addr = (thuniqueid==0)?thact->machine.cthread_self: thact->thread_id; + addr = (thuniqueid) ? thact->thread_id : thact->machine.cthread_self; if (addr == thaddr) { @@ -935,7 +978,7 @@ fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_th } int -fill_taskthreadlist(task_t task, void * buffer, int thcount) +fill_taskthreadlist(task_t task, void * buffer, int thcount, bool thuniqueid) { int numthr=0; thread_t thact; @@ -948,7 +991,7 @@ fill_taskthreadlist(task_t task, void * buffer, int thcount) for (thact = (thread_t)(void *)queue_first(&task->threads); !queue_end(&task->threads, (queue_entry_t)thact); ) { - thaddr = thact->machine.cthread_self; + thaddr = (thuniqueid) ? 
thact->thread_id : thact->machine.cthread_self; *uptr++ = thaddr; numthr++; if (numthr >= thcount) diff --git a/osfmk/kern/btlog.c b/osfmk/kern/btlog.c index 80a479961..a15aef980 100644 --- a/osfmk/kern/btlog.c +++ b/osfmk/kern/btlog.c @@ -837,4 +837,66 @@ btlog_copy_backtraces_for_elements(btlog_t * btlog, btlog_unlock(btlog); } +/* + * Returns the number of records in the btlog struct. + * + * Called by the mach_zone_get_btlog_records() MIG routine. + */ +size_t +get_btlog_records_count(btlog_t *btlog) +{ + if (btlog->btlog_buffersize < sizeof(btlog_t)) { + return 0; + } + return ((btlog->btlog_buffersize - sizeof(btlog_t))/btlog->btrecord_size); +} + +/* + * Copies out relevant info from btlog_record_t's to zone_btrecord_t's. 'numrecs' points to the number of records + * the 'records' buffer can hold. Upon return 'numrecs' points to the number of records actually copied out. + * + * Called by the mach_zone_get_btlog_records() MIG routine. + */ +void +get_btlog_records(btlog_t *btlog, zone_btrecord_t *records, unsigned int *numrecs) +{ + unsigned int count, recs_copied, frame; + zone_btrecord_t *current_rec; + btlog_record_t *zstack_record; + btlog_recordindex_t zstack_index = BTLOG_RECORDINDEX_NONE; + + btlog_lock(btlog); + + count = 0; + if (btlog->btlog_buffersize > sizeof(btlog_t)) { + count = (unsigned int)((btlog->btlog_buffersize - sizeof(btlog_t))/btlog->btrecord_size); + } + /* Copy out only as many records as the pre-allocated buffer size permits. 
*/ + if (count > *numrecs) { + count = *numrecs; + } + zstack_index = btlog->head; + + current_rec = &records[0]; + recs_copied = 0; + while (recs_copied < count && (zstack_index != BTLOG_RECORDINDEX_NONE)) { + zstack_record = lookup_btrecord(btlog, zstack_index); + current_rec->operation_type = (uint32_t)(zstack_record->operation); + current_rec->ref_count = zstack_record->ref_count; + + frame = 0; + while (frame < MIN(btlog->btrecord_btdepth, MAX_ZTRACE_DEPTH)) { + current_rec->bt[frame] = (uint64_t)VM_KERNEL_UNSLIDE(zstack_record->bt[frame]); + frame++; + } + + zstack_index = zstack_record->next; + recs_copied++; + current_rec++; + } + *numrecs = recs_copied; + + btlog_unlock(btlog); +} + #endif /* DEBUG || DEVELOPMENT */ diff --git a/osfmk/kern/btlog.h b/osfmk/kern/btlog.h index c9e937b60..3930703ab 100644 --- a/osfmk/kern/btlog.h +++ b/osfmk/kern/btlog.h @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef XNU_KERNEL_PRIVATE @@ -87,6 +88,13 @@ void btlog_copy_backtraces_for_elements(btlog_t * btlog, uint32_t zoneSize, leak_site_proc proc, void * refCon); + +size_t get_btlog_records_count(btlog_t *btlog); + +void get_btlog_records(btlog_t *btlog, + zone_btrecord_t *records, + unsigned int *numrecs); + #endif /* DEBUG || DEVELOPMENT */ #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/clock.c b/osfmk/kern/clock.c index 9bd9f3b0e..2cd05c562 100644 --- a/osfmk/kern/clock.c +++ b/osfmk/kern/clock.c @@ -272,14 +272,6 @@ static struct clock_calend { static uint64_t ticks_per_sec; /* ticks in a second (expressed in abs time) */ -#if DEVELOPMENT || DEBUG -clock_sec_t last_utc_sec = 0; -clock_usec_t last_utc_usec = 0; -clock_sec_t max_utc_sec = 0; -clock_sec_t last_sys_sec = 0; -clock_usec_t last_sys_usec = 0; -#endif - #if DEVELOPMENT || DEBUG extern int g_should_log_clock_adjustments; @@ -704,24 +696,6 @@ clock_gettimeofday_and_absolute_time( } } -static void -update_basesleep(struct bintime delta, bool forward) -{ - /* - * Update basesleep only if the 
platform does not have monotonic clock. - * In that case the sleep time computation will use the PMU time - * which offset gets modified by settimeofday. - * We don't need this for mononic clock because in that case the sleep - * time computation is independent from the offset value of the PMU. - */ - if (!has_monotonic_clock) { - if (forward) - bintime_add(&clock_calend.basesleep, &delta); - else - bintime_sub(&clock_calend.basesleep, &delta); - } -} - /* * clock_set_calendar_microtime: * @@ -792,34 +766,19 @@ clock_set_calendar_microtime( TIME_SUB(deltasecs, oldsecs, deltamicrosecs, oldmicrosecs, USEC_PER_SEC); -#if DEVELOPMENT || DEBUG - if (g_should_log_clock_adjustments) { - os_log(OS_LOG_DEFAULT, "%s delta requested %lu s %d u\n", - __func__, (unsigned long)deltasecs, deltamicrosecs); - } -#endif - TIME_ADD(clock_boottime, deltasecs, clock_boottime_usec, deltamicrosecs, USEC_PER_SEC); clock2bintime(&deltasecs, &deltamicrosecs, &bt); bintime_add(&clock_calend.boottime, &bt); - update_basesleep(bt, TRUE); } else { // moving backwards deltasecs = oldsecs; deltamicrosecs = oldmicrosecs; TIME_SUB(deltasecs, secs, deltamicrosecs, microsecs, USEC_PER_SEC); -#if DEVELOPMENT || DEBUG - if (g_should_log_clock_adjustments) { - os_log(OS_LOG_DEFAULT, "%s negative delta requested %lu s %d u\n", - __func__, (unsigned long)deltasecs, deltamicrosecs); - } -#endif TIME_SUB(clock_boottime, deltasecs, clock_boottime_usec, deltamicrosecs, USEC_PER_SEC); clock2bintime(&deltasecs, &deltamicrosecs, &bt); bintime_sub(&clock_calend.boottime, &bt); - update_basesleep(bt, FALSE); } clock_calend.bintime = clock_calend.boottime; @@ -1065,26 +1024,24 @@ clock_initialize_calendar(void) clock_usec_t microsys2, monotonic_usec; size_t size; - //Get PMU time with offset and corresponding sys time + //Get the UTC time and corresponding sys time PEGetUTCTimeOfDay(&secs, &microsecs); clock_get_system_microtime(&sys, &microsys); /* * If the platform has a monotonic clock, use kern.monotonicclock_usecs - *
to estimate the sleep/wake time, otherwise use the PMU and adjustments - * provided through settimeofday to estimate the sleep time. - * NOTE: the latter case relies that the kernel is the only component - * to set the PMU offset. + * to estimate the sleep/wake time, otherwise use the UTC time to estimate + * the sleep time. */ size = sizeof(monotonic_time); if (kernel_sysctlbyname("kern.monotonicclock_usecs", &monotonic_time, &size, NULL, 0) != 0) { has_monotonic_clock = 0; - os_log(OS_LOG_DEFAULT, "%s system does not have monotonic clock.\n", __func__); + os_log(OS_LOG_DEFAULT, "%s system does not have monotonic clock\n", __func__); } else { has_monotonic_clock = 1; monotonic_usec_total = monotonic_time.monotonic_time_usec; absolutetime_to_microtime(monotonic_time.mach_time, &sys2, &microsys2); - os_log(OS_LOG_DEFAULT, "%s system has monotonic clock.\n", __func__); + os_log(OS_LOG_DEFAULT, "%s system has monotonic clock\n", __func__); } s = splclock(); @@ -1095,15 +1052,6 @@ clock_initialize_calendar(void) utc_offset_secs = secs; utc_offset_microsecs = microsecs; -#if DEVELOPMENT || DEBUG - last_utc_sec = secs; - last_utc_usec = microsecs; - last_sys_sec = sys; - last_sys_usec = microsys; - if (secs > max_utc_sec) - max_utc_sec = secs; -#endif - /* * We normally expect the UTC clock to be always-on and produce * greater readings than the tick counter. There may be corner cases @@ -1112,18 +1060,17 @@ clock_initialize_calendar(void) * on error) in which that doesn't hold true. Bring the UTC measurements * in-line with the tick counter measurements as a best effort in that case.
*/ - //FIXME if the current time is prior than 1970 secs will be negative if ((sys > secs) || ((sys == secs) && (microsys > microsecs))) { - os_log(OS_LOG_DEFAULT, "%s WARNING: PMU offset is less then sys PMU %lu s %d u sys %lu s %d u\n", + os_log(OS_LOG_DEFAULT, "%s WARNING: UTC time is less then sys time, (%lu s %d u) UTC (%lu s %d u) sys\n", __func__, (unsigned long) secs, microsecs, (unsigned long)sys, microsys); secs = utc_offset_secs = sys; microsecs = utc_offset_microsecs = microsys; } - // PMU time with offset - sys + // UTC - sys // This macro stores the subtraction result in utc_offset_secs and utc_offset_microsecs TIME_SUB(utc_offset_secs, sys, utc_offset_microsecs, microsys, USEC_PER_SEC); - + // This function converts utc_offset_secs and utc_offset_microsecs in bintime clock2bintime(&utc_offset_secs, &utc_offset_microsecs, &bt); /* @@ -1151,16 +1098,13 @@ clock_initialize_calendar(void) monotonic_sec = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC; monotonic_usec = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC; - // PMU time without offset - sys + // monotonic clock - sys // This macro stores the subtraction result in monotonic_sec and monotonic_usec TIME_SUB(monotonic_sec, sys2, monotonic_usec, microsys2, USEC_PER_SEC); clock2bintime(&monotonic_sec, &monotonic_usec, &monotonic_bt); // set the baseleep as the difference between monotonic clock - sys clock_calend.basesleep = monotonic_bt; - } else { - // set the baseleep as the difference between PMU clock - sys - clock_calend.basesleep = bt; } commpage_update_mach_continuous_time(mach_absolutetime_asleep); @@ -1187,149 +1131,189 @@ clock_initialize_calendar(void) void clock_wakeup_calendar(void) { - clock_sec_t sys; - clock_sec_t secs; - clock_usec_t microsys; - clock_usec_t microsecs; + clock_sec_t wake_sys_sec; + clock_usec_t wake_sys_usec; + clock_sec_t wake_sec; + clock_usec_t wake_usec; + clock_sec_t wall_time_sec; + clock_usec_t wall_time_usec; + clock_sec_t diff_sec; + clock_usec_t 
diff_usec; + clock_sec_t var_s; + clock_usec_t var_us; spl_t s; struct bintime bt, last_sleep_bt; - clock_sec_t basesleep_s, last_sleep_sec; - clock_usec_t basesleep_us, last_sleep_usec; struct latched_time monotonic_time; uint64_t monotonic_usec_total; + uint64_t wake_abs; size_t size; - clock_sec_t secs_copy; - clock_usec_t microsecs_copy; -#if DEVELOPMENT || DEBUG - clock_sec_t utc_sec; - clock_usec_t utc_usec; - PEGetUTCTimeOfDay(&utc_sec, &utc_usec); -#endif /* * If the platform has the monotonic clock use that to * compute the sleep time. The monotonic clock does not have an offset * that can be modified, so nor kernel or userspace can change the time * of this clock, it can only monotonically increase over time. - * During sleep mach_absolute_time does not tick, - * so the sleep time is the difference betwen the current monotonic time + * During sleep mach_absolute_time (sys time) does not tick, + * so the sleep time is the difference between the current monotonic time * less the absolute time and the previous difference stored at wake time. * - * basesleep = monotonic - sys ---> computed at last wake + * basesleep = (monotonic - sys) ---> computed at last wake * sleep_time = (monotonic - sys) - basesleep * - * If the platform does not support monotonic time we use the PMU time - * to compute the last sleep. - * The PMU time is the monotonic clock + an offset that can be set + * If the platform does not support monotonic clock we set the wall time to what the + * UTC clock returns us. + * Setting the wall time to UTC time implies that we lose all the adjustments + * done during wake time through adjtime/ntp_adjtime. + * The UTC time is the monotonic clock + an offset that can be set * by kernel. + * The time slept in this case is the difference between wall time and UTC + * at wake.
* * IMPORTANT: - * We assume that only the kernel is setting the offset of the PMU and that + * We assume that only the kernel is setting the offset of the PMU/RTC and that * it is doing it only througth the settimeofday interface. - * - * basesleep is the different between the PMU time and the mach_absolute_time - * at wake. - * During awake time settimeofday can change the PMU offset by a delta, - * and basesleep is shifted by the same delta applyed to the PMU. So the sleep - * time computation becomes: - * - * PMU = monotonic + PMU_offset - * basesleep = PMU - sys ---> computed at last wake - * basesleep += settimeofday_delta - * PMU_offset += settimeofday_delta - * sleep_time = (PMU - sys) - basesleep */ if (has_monotonic_clock) { - //Get monotonic time with corresponding sys time + +#if DEVELOPMENT || DEBUG + /* + * Just for debugging, get the wake UTC time. + */ + PEGetUTCTimeOfDay(&var_s, &var_us); +#endif + /* + * Get monotonic time with corresponding sys time + */ size = sizeof(monotonic_time); if (kernel_sysctlbyname("kern.monotonicclock_usecs", &monotonic_time, &size, NULL, 0) != 0) { panic("%s: could not call kern.monotonicclock_usecs", __func__); } - monotonic_usec_total = monotonic_time.monotonic_time_usec; - absolutetime_to_microtime(monotonic_time.mach_time, &sys, &microsys); + wake_abs = monotonic_time.mach_time; + absolutetime_to_microtime(wake_abs, &wake_sys_sec, &wake_sys_usec); - secs = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC; - microsecs = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC; + monotonic_usec_total = monotonic_time.monotonic_time_usec; + wake_sec = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC; + wake_usec = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC; } else { - //Get PMU time with offset and corresponding sys time - PEGetUTCTimeOfDay(&secs, &microsecs); - clock_get_system_microtime(&sys, &microsys); - + /* + * Get UTC time and corresponding sys time + */ + PEGetUTCTimeOfDay(&wake_sec, &wake_usec); + wake_abs =
mach_absolute_time(); + absolutetime_to_microtime(wake_abs, &wake_sys_sec, &wake_sys_usec); } +#if DEVELOPMENT || DEBUG + os_log(OS_LOG_DEFAULT, "time at wake %lu s %d u from %s clock, abs %llu\n", (unsigned long)wake_sec, wake_usec, (has_monotonic_clock)?"monotonic":"UTC", wake_abs); + if (has_monotonic_clock) { + os_log(OS_LOG_DEFAULT, "UTC time %lu s %d u\n", (unsigned long)var_s, var_us); + } +#endif /* DEVELOPMENT || DEBUG */ + s = splclock(); clock_lock(); commpage_disable_timestamp(); - secs_copy = secs; - microsecs_copy = microsecs; - #if DEVELOPMENT || DEBUG struct clock_calend clock_calend_cp1 = clock_calend; #endif /* DEVELOPMENT || DEBUG */ -#if DEVELOPMENT || DEBUG - last_utc_sec = secs; - last_utc_usec = microsecs; - last_sys_sec = sys; - last_sys_usec = microsys; - if (secs > max_utc_sec) - max_utc_sec = secs; -#endif /* - * We normally expect the UTC clock to be always-on and produce - * greater readings than the tick counter. There may be corner cases - * due to differing clock resolutions (UTC clock is likely lower) and - * and errors reading the UTC clock (some implementations return 0 - * on error) in which that doesn't hold true. Bring the UTC measurements - * in-line with the tick counter measurements as a best effort in that case. + * We normally expect the UTC/monotonic clock to be always-on and produce + * greater readings than the sys counter. There may be corner cases + * due to differing clock resolutions (UTC/monotonic clock is likely lower) and + * and errors reading the UTC/monotonic clock (some implementations return 0 + * on error) in which that doesn't hold true. 
*/ - //FIXME if the current time is prior than 1970 secs will be negative - if ((sys > secs) || ((sys == secs) && (microsys > microsecs))) { - os_log(OS_LOG_DEFAULT, "%s WARNING: %s is less then sys %s %lu s %d u sys %lu s %d u\n", - __func__, (has_monotonic_clock)?"monotonic":"PMU", (has_monotonic_clock)?"monotonic":"PMU", (unsigned long)secs, microsecs, (unsigned long)sys, microsys); - secs = sys; - microsecs = microsys; + if ((wake_sys_sec > wake_sec) || ((wake_sys_sec == wake_sec) && (wake_sys_usec > wake_usec))) { + os_log_error(OS_LOG_DEFAULT, "WARNING: %s clock is less then sys clock at wake: %lu s %d u vs %lu s %d u, defaulting sleep time to zero\n", (has_monotonic_clock)?"monotonic":"UTC", (unsigned long)wake_sec, wake_usec, (unsigned long)wake_sys_sec, wake_sys_usec); + mach_absolutetime_last_sleep = 0; + goto done; } - // PMU or monotonic - sys - // This macro stores the subtraction result in secs and microsecs - TIME_SUB(secs, sys, microsecs, microsys, USEC_PER_SEC); - clock2bintime(&secs, &microsecs, &bt); + if (has_monotonic_clock) { + /* + * compute the difference monotonic - sys + * we already checked that monotonic time is + * greater than sys. + */ + diff_sec = wake_sec; + diff_usec = wake_usec; + // This macro stores the subtraction result in diff_sec and diff_usec + TIME_SUB(diff_sec, wake_sys_sec, diff_usec, wake_sys_usec, USEC_PER_SEC); + //This function converts diff_sec and diff_usec in bintime + clock2bintime(&diff_sec, &diff_usec, &bt); - /* - * Safety belt: the UTC clock will likely have a lower resolution than the tick counter. - * It's also possible that the device didn't fully transition to the powered-off state on - * the most recent sleep, so the tick counter may not have reset or may have only briefly - * tured off. In that case it's possible for the difference between the UTC clock and the - * tick counter to be less than the previously recorded value in clock.calend.basesleep. - * In that case simply record that we slept for 0 ticks.
- */ - if ((bt.sec > clock_calend.basesleep.sec) || - ((bt.sec == clock_calend.basesleep.sec) && (bt.frac > clock_calend.basesleep.frac))) { - - //last_sleep is the difference between current PMU or monotonic - abs and last wake PMU or monotonic - abs - last_sleep_bt = bt; - bintime_sub(&last_sleep_bt, &clock_calend.basesleep); - - //set baseseep to current PMU or monotonic - abs - clock_calend.basesleep = bt; - bintime2usclock(&last_sleep_bt, &last_sleep_sec, &last_sleep_usec); - bintime2absolutetime(&last_sleep_bt, &mach_absolutetime_last_sleep); - mach_absolutetime_asleep += mach_absolutetime_last_sleep; - - bintime_add(&clock_calend.offset, &last_sleep_bt); - bintime_add(&clock_calend.bintime, &last_sleep_bt); - - } else{ - mach_absolutetime_last_sleep = 0; - last_sleep_sec = last_sleep_usec = 0; - bintime2usclock(&clock_calend.basesleep, &basesleep_s, &basesleep_us); - os_log(OS_LOG_DEFAULT, "%s WARNING: basesleep (%lu s %d u) > %s-sys (%lu s %d u) \n", - __func__, (unsigned long) basesleep_s, basesleep_us, (has_monotonic_clock)?"monotonic":"PMU", (unsigned long) secs_copy, microsecs_copy ); - } + /* + * Safety belt: the monotonic clock will likely have a lower resolution than the sys counter. + * It's also possible that the device didn't fully transition to the powered-off state on + * the most recent sleep, so the sys counter may not have reset or may have only briefly + * turned off. In that case it's possible for the difference between the monotonic clock and the + * sys counter to be less than the previously recorded value in clock.calend.basesleep. + * In that case simply record that we slept for 0 ticks. 
+ */ + if ((bt.sec > clock_calend.basesleep.sec) || + ((bt.sec == clock_calend.basesleep.sec) && (bt.frac > clock_calend.basesleep.frac))) { + //last_sleep is the difference between (current monotonic - abs) and (last wake monotonic - abs) + last_sleep_bt = bt; + bintime_sub(&last_sleep_bt, &clock_calend.basesleep); + + bintime2absolutetime(&last_sleep_bt, &mach_absolutetime_last_sleep); + mach_absolutetime_asleep += mach_absolutetime_last_sleep; + + //set basesleep to current monotonic - abs + clock_calend.basesleep = bt; + + //update wall time + bintime_add(&clock_calend.offset, &last_sleep_bt); + bintime_add(&clock_calend.bintime, &last_sleep_bt); + + bintime2usclock(&last_sleep_bt, &var_s, &var_us); + os_log(OS_LOG_DEFAULT, "time_slept (%lu s %d u)\n", (unsigned long) var_s, var_us); + + } else { + bintime2usclock(&clock_calend.basesleep, &var_s, &var_us); + os_log_error(OS_LOG_DEFAULT, "WARNING: last wake monotonic-sys time (%lu s %d u) is greater then current monotonic-sys time(%lu s %d u), defaulting sleep time to zero\n", (unsigned long) var_s, var_us, (unsigned long) diff_sec, diff_usec); + + mach_absolutetime_last_sleep = 0; + } + } else { + /* + * set the wall time to UTC value + */ + bt = get_scaled_time(wake_abs); + bintime_add(&bt, &clock_calend.bintime); + bintime2usclock(&bt, &wall_time_sec, &wall_time_usec); + + if (wall_time_sec > wake_sec || (wall_time_sec == wake_sec && wall_time_usec > wake_usec) ) { + os_log(OS_LOG_DEFAULT, "WARNING: wall time (%lu s %d u) is greater than current UTC time (%lu s %d u), defaulting sleep time to zero\n", (unsigned long) wall_time_sec, wall_time_usec, (unsigned long) wake_sec, wake_usec); + + mach_absolutetime_last_sleep = 0; + } else { + diff_sec = wake_sec; + diff_usec = wake_usec; + // This macro stores the subtraction result in diff_sec and diff_usec + TIME_SUB(diff_sec, wall_time_sec, diff_usec, wall_time_usec, USEC_PER_SEC); + //This function converts diff_sec and diff_usec in bintime + 
clock2bintime(&diff_sec, &diff_usec, &bt); + + //time slept in this case is the difference between PMU/RTC and wall time + last_sleep_bt = bt; + + bintime2absolutetime(&last_sleep_bt, &mach_absolutetime_last_sleep); + mach_absolutetime_asleep += mach_absolutetime_last_sleep; + + //update wall time + bintime_add(&clock_calend.offset, &last_sleep_bt); + bintime_add(&clock_calend.bintime, &last_sleep_bt); + + bintime2usclock(&last_sleep_bt, &var_s, &var_us); + os_log(OS_LOG_DEFAULT, "time_slept (%lu s %d u)\n", (unsigned long)var_s, var_us); + } + } +done: KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_CLOCK,MACH_EPOCH_CHANGE) | DBG_FUNC_NONE, (uintptr_t) mach_absolutetime_last_sleep, @@ -1350,11 +1334,8 @@ clock_wakeup_calendar(void) #if DEVELOPMENT || DEBUG if (g_should_log_clock_adjustments) { - os_log(OS_LOG_DEFAULT, "PMU was %lu s %d u\n",(unsigned long) utc_sec, utc_usec); - os_log(OS_LOG_DEFAULT, "last sleep was %lu s %d u\n",(unsigned long) last_sleep_sec, last_sleep_usec); - print_all_clock_variables("clock_wakeup_calendar:BEFORE", - &secs_copy, &microsecs_copy, &sys, &microsys, &clock_calend_cp1); - print_all_clock_variables("clock_wakeup_calendar:AFTER", NULL, NULL, NULL, NULL, &clock_calend_cp); + print_all_clock_variables("clock_wakeup_calendar: BEFORE", NULL, NULL, NULL, NULL, &clock_calend_cp1); + print_all_clock_variables("clock_wakeup_calendar: AFTER", NULL, NULL, NULL, NULL, &clock_calend_cp); } #endif /* DEVELOPMENT || DEBUG */ diff --git a/osfmk/kern/clock_oldops.c b/osfmk/kern/clock_oldops.c index bc7357560..c77d40f28 100644 --- a/osfmk/kern/clock_oldops.c +++ b/osfmk/kern/clock_oldops.c @@ -158,6 +158,19 @@ SECURITY_READ_ONLY_EARLY(struct clock_ops) calend_ops = { calend_getattr, }; +/* + * List of clock devices.
+ */ +SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = { + + /* SYSTEM_CLOCK */ + { &sysclk_ops, 0, 0 }, + + /* CALENDAR_CLOCK */ + { &calend_ops, 0, 0 } +}; +int clock_count = sizeof(clock_list) / sizeof(clock_list[0]); + /* * Macros to lock/unlock clock system. */ diff --git a/osfmk/kern/coalition.c b/osfmk/kern/coalition.c index ed24958c1..26f9d33a4 100644 --- a/osfmk/kern/coalition.c +++ b/osfmk/kern/coalition.c @@ -220,7 +220,7 @@ struct i_jetsam_coalition { queue_head_t extensions; queue_head_t services; queue_head_t other; - thread_group_t thread_group; + struct thread_group *thread_group; }; diff --git a/osfmk/kern/coalition.h b/osfmk/kern/coalition.h index 10cc5b742..195ba05be 100644 --- a/osfmk/kern/coalition.h +++ b/osfmk/kern/coalition.h @@ -68,10 +68,10 @@ boolean_t task_coalition_adjust_focal_count(task_t task, int count, uint32_t *ne uint32_t task_coalition_focal_count(task_t task); boolean_t task_coalition_adjust_nonfocal_count(task_t task, int count, uint32_t *new_count); uint32_t task_coalition_nonfocal_count(task_t task); -thread_group_t task_coalition_get_thread_group(task_t task); -void coalition_set_thread_group(coalition_t coal, thread_group_t tg); -thread_group_t kdp_coalition_get_thread_group(coalition_t coal); -thread_group_t coalition_get_thread_group(coalition_t coal); +struct thread_group *task_coalition_get_thread_group(task_t task); +void coalition_set_thread_group(coalition_t coal, struct thread_group *tg); +struct thread_group *kdp_coalition_get_thread_group(coalition_t coal); +struct thread_group *coalition_get_thread_group(coalition_t coal); void task_coalition_thread_group_focal_update(task_t task); void coalition_for_each_task(coalition_t coal, void *ctx, diff --git a/osfmk/kern/cpu_quiesce.c b/osfmk/kern/cpu_quiesce.c new file mode 100644 index 000000000..977f5e50f --- /dev/null +++ b/osfmk/kern/cpu_quiesce.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifdef __x86_64__ +#error This file is only needed on weakly-ordered systems! +#endif + +#include +#include +#include + +#include +#include +#include + +#include + +/* + * CPU quiescing generation counter implemented with a checkin mask + * + * A tri-state bitfield, with 2 bits for each processor:; + * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier + * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle. + * + * When a processor causes the 'expected' bits to equal the 'checkin' bits, which + * indicates that all processors have executed the barrier, it ticks the algorithm + * and resets the state. 
+ * + * Idle CPUs won't check in, because they don't run, so the algorithm won't tick. + * However, they can't do anything in userspace while idle, so we don't need + * them to execute barriers, so we have them 'leave' the counter so that + * they don't delay the tick while idle. + * + * This bitfield currently limits MAX_CPUS to 32 on LP64. + * In the future, we can use double-wide atomics and int128 if we need 64 CPUS. + * + * The mask only guarantees ordering to code running in userspace. + * We defer joining the counter until we actually reach userspace, allowing + * processors that come out of idle and only run kernel code to avoid the overhead + * of participation. + * + * We additionally defer updating the counter for a minimum interval to + * reduce the frequency of executing the exclusive atomic operations. + * + * The longest delay between two checkins assuming that at least one processor + * joins is <checkin delay> + (<thread quantum> * 2) + */ + +typedef unsigned long checkin_mask_t; + +static _Atomic checkin_mask_t cpu_quiescing_checkin_state; + +static uint64_t cpu_checkin_last_commit; + +#define CPU_CHECKIN_MIN_INTERVAL_US 4000 /* 4ms */ +#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */ +static uint64_t cpu_checkin_min_interval; +uint32_t cpu_checkin_min_interval_us; + +#if __LP64__ +static_assert(MAX_CPUS <= 32); +#define CPU_CHECKIN_MASK 0x5555555555555555UL +#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK) +#else +/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */ +static_assert(MAX_CPUS <= 16); +#define CPU_CHECKIN_MASK 0x55555555UL +#define CPU_EXPECTED_MASK (~CPU_CHECKIN_MASK) +#endif + +static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1); + +static inline checkin_mask_t +cpu_checked_in_bit(int cpuid) +{ + return 1UL << (2 * cpuid); +} + +static inline checkin_mask_t +cpu_expected_bit(int cpuid) +{ + return 1UL << (2 * cpuid + 1); +} + +void +cpu_quiescent_counter_init(void) +{ + assert(CPU_CHECKIN_MASK &
cpu_checked_in_bit(MAX_CPUS)); + assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS)); + assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0); + assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0); + + cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US); +} + +void +cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us) +{ + /* clamp to something vaguely sane */ + if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) + new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US; + + cpu_checkin_min_interval_us = new_value_us; + + uint64_t abstime = 0; + clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us, + NSEC_PER_USEC, &abstime); + cpu_checkin_min_interval = abstime; +} + + +/* + * Called when all running CPUs have checked in. + * + * The commpage increment is protected by the 'lock' of having caused the tick, + * and it is published by the state reset release barrier. + */ +static void +cpu_quiescent_counter_commit(uint64_t ctime) +{ + __kdebug_only uint64_t old_gen; + __kdebug_only checkin_mask_t old_state; + + old_gen = commpage_increment_cpu_quiescent_counter(); + + cpu_checkin_last_commit = ctime; + + old_state = os_atomic_and(&cpu_quiescing_checkin_state, ~CPU_CHECKIN_MASK, release); + + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0); +} + +/* + * Have all the expected CPUs checked in? + */ +static bool +cpu_quiescent_counter_needs_commit(checkin_mask_t state) +{ + return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1); +} + +/* + * Called when a processor wants to start participating in the counter, e.g. + * 1) when context switching away from the idle thread + * 2) when coming up for the first time + * 3) when coming up after a shutdown + * + * Called with interrupts disabled. 
+ */ +void +cpu_quiescent_counter_join(__unused uint64_t ctime) +{ + processor_t processor = current_processor(); + __assert_only int cpuid = processor->cpu_id; + + assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_NONE || + processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_LEFT); + + assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) & + (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0); + + processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_PENDING_JOIN; + + /* + * Mark the processor to call cpu_quiescent_counter_ast before it + * ever returns to userspace. + */ + ast_on(AST_UNQUIESCE); +} + +/* + * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback + * It needs to acquire the counter to see data and the counter published by other CPUs. + */ +void +cpu_quiescent_counter_ast(void) +{ + processor_t processor = current_processor(); + int cpuid = processor->cpu_id; + + assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN); + + /* We had better not already be joined. */ + assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) & + (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0); + + /* + * No release barrier needed because we have no prior state to publish. + * Acquire barrier needed because we need this processor to see + * the latest counter value. + * + * The state may be in 'needs checkin' both before and after + * this atomic or. + * + * Additionally, if this is the first processor to come out of idle, + * it may need to kickstart the algorithm, otherwise it would + * stay in 'needs commit' perpetually with no processor assigned to + * actually do the commit. To do that, the first processor only adds + * its expected bit. 
+ */ + + processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_JOINED; + processor->cpu_quiesce_last_checkin = mach_absolute_time(); + + checkin_mask_t old_mask, new_mask; + os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, { + if (old_mask == 0) { + new_mask = old_mask | cpu_expected_bit(cpuid); + } else { + new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid); + } + }); +} + +/* + * Called when a processor no longer wants to participate in the counter, + * i.e. when a processor is on its way to idle or shutdown. + * + * Called with interrupts disabled. + * + * The processor needs to remove itself from the expected mask, to allow the + * algorithm to continue ticking without its participation. + * However, it needs to ensure that anything it has done since the last time + * it checked in has been published before the next tick is allowed to commit. + */ +void +cpu_quiescent_counter_leave(uint64_t ctime) +{ + processor_t processor = current_processor(); + int cpuid = processor->cpu_id; + + assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED || + processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN); + + /* We no longer need the cpu_quiescent_counter_ast callback to be armed */ + ast_off(AST_UNQUIESCE); + + if (processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN) { + /* We never actually joined, so we don't have to do the work to leave. 
*/ + processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT; + return; + } + + /* Leaving can't be deferred, even if we're within the min interval */ + processor->cpu_quiesce_last_checkin = ctime; + + checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid); + + checkin_mask_t orig_state = os_atomic_and_orig(&cpu_quiescing_checkin_state, + ~mask, acq_rel); + + assert((orig_state & cpu_expected_bit(cpuid))); + + processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT; + + if (cpu_quiescent_counter_needs_commit(orig_state)) { + /* + * the old state indicates someone else was already doing a commit + * but hadn't finished yet. We successfully inserted the acq_rel + * before they finished the commit by resetting the bitfield, + * so we're done here. + */ + return; + } + + checkin_mask_t new_state = orig_state & ~mask; + + if (cpu_quiescent_counter_needs_commit(new_state)) { + cpu_quiescent_counter_commit(ctime); + } +} + +/* + * Called when a processor wants to check in to the counter + * If it hasn't yet fully joined, it doesn't need to check in. + * + * Called with interrupts disabled. + */ +void +cpu_quiescent_counter_checkin(uint64_t ctime) +{ + processor_t processor = current_processor(); + int cpuid = processor->cpu_id; + + assert(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_NONE); + + /* If we're not joined yet, we don't need to check in */ + if (__probable(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_JOINED)) + return; + + /* If we've checked in recently, we don't need to check in yet. */ + if (__probable((ctime - processor->cpu_quiesce_last_checkin) <= cpu_checkin_min_interval)) + return; + + processor->cpu_quiesce_last_checkin = ctime; + + checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed); + + assert((state & cpu_expected_bit(cpuid))); + + if (__probable((state & cpu_checked_in_bit(cpuid)))) { + /* + * Processor has already checked in for this round, no need to + * acquire the cacheline exclusive. 
+ */ + return; + } + + checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state, + cpu_checked_in_bit(cpuid), acq_rel); + + checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid); + + if (cpu_quiescent_counter_needs_commit(new_state)) { + assertf(!cpu_quiescent_counter_needs_commit(orig_state), + "old: 0x%lx, new: 0x%lx", orig_state, new_state); + cpu_quiescent_counter_commit(ctime); + } +} + +#if MACH_ASSERT +/* + * Called on all AST exits to userspace to assert this processor actually joined + * + * Called with interrupts disabled after the AST should have been handled + */ +void +cpu_quiescent_counter_assert_ast(void) +{ + processor_t processor = current_processor(); + int cpuid = processor->cpu_id; + + assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED); + + checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed); + assert((state & cpu_expected_bit(cpuid))); +} +#endif /* MACH_ASSERT */ + diff --git a/osfmk/kern/cpu_quiesce.h b/osfmk/kern/cpu_quiesce.h new file mode 100644 index 000000000..324a2b038 --- /dev/null +++ b/osfmk/kern/cpu_quiesce.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_CPU_QUIESCE_H_ +#define _KERN_CPU_QUIESCE_H_ + +#ifdef XNU_KERNEL_PRIVATE + +#include + +/* State field kept on each CPU to track participation */ +typedef enum { + /* Processor that never participated */ + CPU_QUIESCE_COUNTER_NONE = 0, + /* Processor ready to participate, waiting for return to user */ + CPU_QUIESCE_COUNTER_PENDING_JOIN = 1, + /* Processor currently participating in counter */ + CPU_QUIESCE_COUNTER_JOINED = 2, + /* Processor currently not participating in counter */ + CPU_QUIESCE_COUNTER_LEFT = 3, +} cpu_quiescent_state_t; + +#if CONFIG_QUIESCE_COUNTER + +extern void cpu_quiescent_counter_join(uint64_t ctime); +extern void cpu_quiescent_counter_leave(uint64_t ctime); +extern void cpu_quiescent_counter_checkin(uint64_t ctime); +extern void cpu_quiescent_counter_ast(void); +extern void cpu_quiescent_counter_init(void); + +/* use of these is guarded by the config */ +extern uint32_t cpu_checkin_min_interval_us; +extern void cpu_quiescent_counter_set_min_interval_us(uint32_t new_value); + +#else /* CONFIG_QUIESCE_COUNTER */ + +/* stub routines for platforms without the counter */ + +static inline void cpu_quiescent_counter_join(__unused uint64_t ctime) { } +static inline void cpu_quiescent_counter_leave(__unused uint64_t ctime) { } +static inline void cpu_quiescent_counter_checkin(__unused uint64_t ctime) { } +static inline void cpu_quiescent_counter_ast(void) { } +static inline void cpu_quiescent_counter_init(void) { 
} + +#endif /* CONFIG_QUIESCE_COUNTER */ + +#if MACH_ASSERT && CONFIG_QUIESCE_COUNTER +extern void cpu_quiescent_counter_assert_ast(void); +#else +static inline void cpu_quiescent_counter_assert_ast(void) { } +#endif + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _KERN_CPU_QUIESCE_H_ */ + diff --git a/osfmk/kern/cs_blobs.h b/osfmk/kern/cs_blobs.h index e8007e9f2..cafafcaff 100644 --- a/osfmk/kern/cs_blobs.h +++ b/osfmk/kern/cs_blobs.h @@ -30,38 +30,45 @@ #define _KERN_CODESIGN_H_ /* code signing attributes of a process */ -#define CS_VALID 0x0000001 /* dynamically valid */ -#define CS_ADHOC 0x0000002 /* ad hoc signed */ -#define CS_GET_TASK_ALLOW 0x0000004 /* has get-task-allow entitlement */ -#define CS_INSTALLER 0x0000008 /* has installer entitlement */ - -#define CS_HARD 0x0000100 /* don't load invalid pages */ -#define CS_KILL 0x0000200 /* kill process if it becomes invalid */ -#define CS_CHECK_EXPIRATION 0x0000400 /* force expiration checking */ -#define CS_RESTRICT 0x0000800 /* tell dyld to treat restricted */ -#define CS_ENFORCEMENT 0x0001000 /* require enforcement */ -#define CS_REQUIRE_LV 0x0002000 /* require library validation */ -#define CS_ENTITLEMENTS_VALIDATED 0x0004000 /* code signature permits restricted entitlements */ -#define CS_NVRAM_UNRESTRICTED 0x0008000 /* has com.apple.rootless.restricted-nvram-variables.heritable entitlement */ - -#define CS_ALLOWED_MACHO (CS_ADHOC | CS_HARD | CS_KILL | CS_CHECK_EXPIRATION | \ - CS_RESTRICT | CS_ENFORCEMENT | CS_REQUIRE_LV) - -#define CS_EXEC_SET_HARD 0x0100000 /* set CS_HARD on any exec'ed process */ -#define CS_EXEC_SET_KILL 0x0200000 /* set CS_KILL on any exec'ed process */ -#define CS_EXEC_SET_ENFORCEMENT 0x0400000 /* set CS_ENFORCEMENT on any exec'ed process */ -#define CS_EXEC_INHERIT_SIP 0x0800000 /* set CS_INSTALLER on any exec'ed process */ - -#define CS_KILLED 0x1000000 /* was killed by kernel for invalidity */ -#define CS_DYLD_PLATFORM 0x2000000 /* dyld used to load this is a platform binary */ 
-#define CS_PLATFORM_BINARY 0x4000000 /* this is a platform binary */ -#define CS_PLATFORM_PATH 0x8000000 /* platform binary by the fact of path (osx only) */ -#define CS_DEBUGGED 0x10000000 /* process is currently or has previously been debugged and allowed to run with invalid pages */ -#define CS_SIGNED 0x20000000 /* process has a signature (may have gone invalid) */ -#define CS_DEV_CODE 0x40000000 /* code is dev signed, cannot be loaded into prod signed code (will go away with rdar://problem/28322552) */ -#define CS_DATAVAULT_CONTROLLER 0x80000000 /* has Data Vault controller entitlement */ - -#define CS_ENTITLEMENT_FLAGS (CS_GET_TASK_ALLOW | CS_INSTALLER | CS_DATAVAULT_CONTROLLER | CS_NVRAM_UNRESTRICTED) +#define CS_VALID 0x00000001 /* dynamically valid */ +#define CS_ADHOC 0x00000002 /* ad hoc signed */ +#define CS_GET_TASK_ALLOW 0x00000004 /* has get-task-allow entitlement */ +#define CS_INSTALLER 0x00000008 /* has installer entitlement */ + +#define CS_FORCED_LV 0x00000010 /* Library Validation required by Hardened System Policy */ +#define CS_INVALID_ALLOWED 0x00000020 /* (macOS Only) Page invalidation allowed by task port policy */ + +#define CS_HARD 0x00000100 /* don't load invalid pages */ +#define CS_KILL 0x00000200 /* kill process if it becomes invalid */ +#define CS_CHECK_EXPIRATION 0x00000400 /* force expiration checking */ +#define CS_RESTRICT 0x00000800 /* tell dyld to treat restricted */ + +#define CS_ENFORCEMENT 0x00001000 /* require enforcement */ +#define CS_REQUIRE_LV 0x00002000 /* require library validation */ +#define CS_ENTITLEMENTS_VALIDATED 0x00004000 /* code signature permits restricted entitlements */ +#define CS_NVRAM_UNRESTRICTED 0x00008000 /* has com.apple.rootless.restricted-nvram-variables.heritable entitlement */ + +#define CS_RUNTIME 0x00010000 /* Apply hardened runtime policies */ + +#define CS_ALLOWED_MACHO (CS_ADHOC | CS_HARD | CS_KILL | CS_CHECK_EXPIRATION | \ + CS_RESTRICT | CS_ENFORCEMENT | CS_REQUIRE_LV | CS_RUNTIME) + 
+#define CS_EXEC_SET_HARD 0x00100000 /* set CS_HARD on any exec'ed process */ +#define CS_EXEC_SET_KILL 0x00200000 /* set CS_KILL on any exec'ed process */ +#define CS_EXEC_SET_ENFORCEMENT 0x00400000 /* set CS_ENFORCEMENT on any exec'ed process */ +#define CS_EXEC_INHERIT_SIP 0x00800000 /* set CS_INSTALLER on any exec'ed process */ + +#define CS_KILLED 0x01000000 /* was killed by kernel for invalidity */ +#define CS_DYLD_PLATFORM 0x02000000 /* dyld used to load this is a platform binary */ +#define CS_PLATFORM_BINARY 0x04000000 /* this is a platform binary */ +#define CS_PLATFORM_PATH 0x08000000 /* platform binary by the fact of path (osx only) */ + +#define CS_DEBUGGED 0x10000000 /* process is currently or has previously been debugged and allowed to run with invalid pages */ +#define CS_SIGNED 0x20000000 /* process has a signature (may have gone invalid) */ +#define CS_DEV_CODE 0x40000000 /* code is dev signed, cannot be loaded into prod signed code (will go away with rdar://problem/28322552) */ +#define CS_DATAVAULT_CONTROLLER 0x80000000 /* has Data Vault controller entitlement */ + +#define CS_ENTITLEMENT_FLAGS (CS_GET_TASK_ALLOW | CS_INSTALLER | CS_DATAVAULT_CONTROLLER | CS_NVRAM_UNRESTRICTED) /* executable segment flags */ @@ -69,7 +76,7 @@ #define CS_EXECSEG_ALLOW_UNSIGNED 0x10 /* allow unsigned pages (for debugging) */ #define CS_EXECSEG_DEBUGGER 0x20 /* main binary is debugger */ #define CS_EXECSEG_JIT 0x40 /* JIT enabled */ -#define CS_EXECSEG_SKIP_LV 0x80 /* skip library validation */ +#define CS_EXECSEG_SKIP_LV 0x80 /* OBSOLETE: skip library validation */ #define CS_EXECSEG_CAN_LOAD_CDHASH 0x100 /* can bless cdhash for execution */ #define CS_EXECSEG_CAN_EXEC_CDHASH 0x200 /* can execute blessed cdhash */ @@ -103,6 +110,8 @@ enum { CSSLOT_ALTERNATE_CODEDIRECTORY_LIMIT = CSSLOT_ALTERNATE_CODEDIRECTORIES + CSSLOT_ALTERNATE_CODEDIRECTORY_MAX, /* one past the last */ CSSLOT_SIGNATURESLOT = 0x10000, /* CMS Signature */ + CSSLOT_IDENTIFICATIONSLOT = 0x10001, + 
CSSLOT_TICKETSLOT = 0x10002, CSTYPE_INDEX_REQUIREMENTS = 0x00000002, /* compat with amfi */ CSTYPE_INDEX_ENTITLEMENTS = 0x00000005, /* compat with amfi */ diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 313c979b5..0d15f8f8e 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -109,6 +109,10 @@ extern volatile struct xnu_hw_shmem_dbg_command_info *hwsd_info; #endif +#if CONFIG_XNUPOST +#include +extern int vsnprintf(char *, size_t, const char *, va_list); +#endif unsigned int halt_in_debugger = 0; unsigned int current_debugger = 0; @@ -142,6 +146,7 @@ unsigned int kernel_debugger_entry_count = 0; #define CPUPANICSTR PROCESSOR_DATA(current_processor(), debugger_state).db_panic_str #define CPUPANICARGS PROCESSOR_DATA(current_processor(), debugger_state).db_panic_args #define CPUPANICOPTS PROCESSOR_DATA(current_processor(), debugger_state).db_panic_options +#define CPUPANICDATAPTR PROCESSOR_DATA(current_processor(), debugger_state).db_panic_data_ptr #define CPUDEBUGGERSYNC PROCESSOR_DATA(current_processor(), debugger_state).db_proceed_on_sync_failure #define CPUDEBUGGERCOUNT PROCESSOR_DATA(current_processor(), debugger_state).db_entry_count #define CPUDEBUGGERRET PROCESSOR_DATA(current_processor(), debugger_state).db_op_return @@ -160,11 +165,13 @@ MACRO_END debugger_op debugger_current_op = DBOP_NONE; const char *debugger_panic_str = NULL; va_list *debugger_panic_args = NULL; +void *debugger_panic_data = NULL; uint64_t debugger_panic_options = 0; const char *debugger_message = NULL; unsigned long debugger_panic_caller = 0; -void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx, uint64_t panic_options_mask, unsigned long panic_caller); +void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx, + uint64_t panic_options_mask, void *panic_data, unsigned long panic_caller); static void kdp_machine_reboot_type(unsigned int type); 
__attribute__((noreturn)) void panic_spin_forever(void); extern kern_return_t do_stackshot(void); @@ -215,6 +222,10 @@ unsigned int debugger_context = 0; static char model_name[64]; unsigned char *kernel_uuid; +boolean_t kernelcache_uuid_valid = FALSE; +uuid_t kernelcache_uuid; +uuid_string_t kernelcache_uuid_string; + /* * By default we treat Debugger() the same as calls to panic(), unless * we have debug boot-args present and the DB_KERN_DUMP_ON_NMI *NOT* set. @@ -231,6 +242,7 @@ boolean_t debug_boot_arg_inited = FALSE; SECURITY_READ_ONLY_LATE(unsigned int) debug_boot_arg; char kernel_uuid_string[37]; /* uuid_string_t */ +char kernelcache_uuid_string[37]; /* uuid_string_t */ char panic_disk_error_description[512]; size_t panic_disk_error_description_size = sizeof(panic_disk_error_description); @@ -412,7 +424,7 @@ DebuggerResumeOtherCores() static void DebuggerSaveState(debugger_op db_op, const char *db_message, const char *db_panic_str, - va_list *db_panic_args, uint64_t db_panic_options, + va_list *db_panic_args, uint64_t db_panic_options, void *db_panic_data_ptr, boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller) { CPUDEBUGGEROP = db_op; @@ -422,6 +434,7 @@ DebuggerSaveState(debugger_op db_op, const char *db_message, const char *db_pani CPUDEBUGGERMSG = db_message; CPUPANICSTR = db_panic_str; CPUPANICARGS = db_panic_args; + CPUPANICDATAPTR = db_panic_data_ptr; CPUPANICCALLER = db_panic_caller; } else if (CPUDEBUGGERCOUNT > 1 && db_panic_str != NULL) { kprintf("Nested panic detected:"); @@ -444,21 +457,21 @@ DebuggerSaveState(debugger_op db_op, const char *db_message, const char *db_pani */ kern_return_t DebuggerTrapWithState(debugger_op db_op, const char *db_message, const char *db_panic_str, - va_list *db_panic_args, uint64_t db_panic_options, + va_list *db_panic_args, uint64_t db_panic_options, void *db_panic_data_ptr, boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller) { kern_return_t ret; assert(ml_get_interrupts_enabled() 
== FALSE); - DebuggerSaveState(db_op, db_message, db_panic_str, - db_panic_args, db_panic_options, db_proceed_on_sync_failure, - db_panic_caller); + DebuggerSaveState(db_op, db_message, db_panic_str, db_panic_args, + db_panic_options, db_panic_data_ptr, + db_proceed_on_sync_failure, db_panic_caller); TRAP_DEBUGGER; ret = CPUDEBUGGERRET; - DebuggerSaveState(DBOP_NONE, NULL, NULL, NULL, 0, FALSE, 0); + DebuggerSaveState(DBOP_NONE, NULL, NULL, NULL, 0, NULL, FALSE, 0); return ret; } @@ -525,13 +538,13 @@ DebuggerWithContext(unsigned int reason, void *ctx, const char *message, if (ctx != NULL) { DebuggerSaveState(DBOP_DEBUGGER, message, - NULL, NULL, debugger_options_mask, TRUE, 0); + NULL, NULL, debugger_options_mask, NULL, TRUE, 0); handle_debugger_trap(reason, 0, 0, ctx); DebuggerSaveState(DBOP_NONE, NULL, NULL, - NULL, 0, FALSE, 0); + NULL, 0, NULL, FALSE, 0); } else { DebuggerTrapWithState(DBOP_DEBUGGER, message, - NULL, NULL, debugger_options_mask, TRUE, 0); + NULL, NULL, debugger_options_mask, NULL, TRUE, 0); } CPUDEBUGGERCOUNT--; @@ -604,7 +617,7 @@ panic(const char *str, ...) 
va_list panic_str_args; va_start(panic_str_args, str); - panic_trap_to_debugger(str, &panic_str_args, 0, NULL, 0, (unsigned long)(char *)__builtin_return_address(0)); + panic_trap_to_debugger(str, &panic_str_args, 0, NULL, 0, NULL, (unsigned long)(char *)__builtin_return_address(0)); va_end(panic_str_args); } @@ -614,25 +627,47 @@ panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mas va_list panic_str_args; va_start(panic_str_args, str); - panic_trap_to_debugger(str, &panic_str_args, reason, ctx, debugger_options_mask, (unsigned long)(char *)__builtin_return_address(0)); + panic_trap_to_debugger(str, &panic_str_args, reason, ctx, (debugger_options_mask & ~DEBUGGER_INTERNAL_OPTIONS_MASK), + NULL, (unsigned long)(char *)__builtin_return_address(0)); va_end(panic_str_args); } +#if defined (__x86_64__) +/* + * panic_with_thread_context() is used on x86 platforms to specify a different thread that should be backtraced in the paniclog. + * We don't generally need this functionality on embedded platforms because embedded platforms include a panic time stackshot + * from customer devices. We plumb the thread pointer via the debugger trap mechanism and backtrace the kernel stack from the + * thread when writing the panic log. + * + * NOTE: panic_with_thread_context() should be called with an explicit thread reference held on the passed thread. + */ void -panic_context(unsigned int reason, void *ctx, const char *str, ...) +panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, thread_t thread, const char *str, ...) 
{ va_list panic_str_args; + __assert_only uint32_t th_ref_count; + + assert_thread_magic(thread); + th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire); + assertf(th_ref_count > 0, "panic_with_thread_context called with invalid thread %p with refcount %u", thread, th_ref_count); + + /* Take a reference on the thread so it doesn't disappear by the time we try to backtrace it */ + thread_reference(thread); va_start(panic_str_args, str); - panic_trap_to_debugger(str, &panic_str_args, reason, ctx, 0, (unsigned long)(char *)__builtin_return_address(0)); + panic_trap_to_debugger(str, &panic_str_args, reason, ctx, ((debugger_options_mask & ~DEBUGGER_INTERNAL_OPTIONS_MASK) | DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE), + thread, (unsigned long)(char *)__builtin_return_address(0)); + va_end(panic_str_args); + } +#endif /* defined (__x86_64__) */ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wmissing-noreturn" void -panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void - *ctx, uint64_t panic_options_mask, unsigned long panic_caller) +panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx, + uint64_t panic_options_mask, void *panic_data_ptr, unsigned long panic_caller) { #pragma clang diagnostic pop @@ -706,7 +741,7 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign */ DebuggerSaveState(DBOP_PANIC, "panic", panic_format_str, panic_args, - panic_options_mask, TRUE, panic_caller); + panic_options_mask, panic_data_ptr, TRUE, panic_caller); handle_debugger_trap(reason, 0, 0, ctx); } @@ -718,7 +753,7 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign #endif /* defined(__arm64__) */ DebuggerTrapWithState(DBOP_PANIC, "panic", panic_format_str, - panic_args, panic_options_mask, TRUE, panic_caller); + panic_args, panic_options_mask, panic_data_ptr, TRUE, panic_caller); /* * Not 
reached. @@ -840,7 +875,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned * TODO: Need to clear panic log when return from debugger * hooked up for embedded */ - SavePanicInfo(debugger_message, debugger_panic_options); + SavePanicInfo(debugger_message, debugger_panic_data, debugger_panic_options); #if DEVELOPMENT || DEBUG DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((debugger_panic_options & DEBUGGER_OPTION_RECURPANIC_POSTLOG)); @@ -866,31 +901,51 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned * Consider generating a local corefile if the infrastructure is configured * and we haven't disabled on-device coredumps. */ - if (kdp_has_polled_corefile() && !(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) { - int ret = -1; + if (!(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) { + if (!kdp_has_polled_corefile()) { + if (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI)) { + paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)", + kdp_polled_corefile_error()); +#if CONFIG_EMBEDDED + panic_info->eph_panic_flags |= EMBEDDED_PANIC_HEADER_FLAG_COREDUMP_FAILED; + paniclog_flush(); +#else /* CONFIG_EMBEDDED */ + if (panic_info->mph_panic_log_offset != 0) { + panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_COREDUMP_FAILED; + paniclog_flush(); + } +#endif /* CONFIG_EMBEDDED */ + } + } else { + int ret = -1; #if defined (__x86_64__) - /* On x86 we don't do a coredump on Debugger unless the DB_KERN_DUMP_ON_NMI boot-arg is specified. */ - if (debugger_current_op != DBOP_DEBUGGER || (debug_boot_arg & DB_KERN_DUMP_ON_NMI)) + /* On x86 we don't do a coredump on Debugger unless the DB_KERN_DUMP_ON_NMI boot-arg is specified. */ + if (debugger_current_op != DBOP_DEBUGGER || (debug_boot_arg & DB_KERN_DUMP_ON_NMI)) #endif - { - /* - * Doing an on-device coredump leaves the disk driver in a state - * that can not be resumed. 
- */ - debugger_safe_to_return = FALSE; - begin_panic_transfer(); - ret = kern_dump(KERN_DUMP_DISK); - abort_panic_transfer(); + { + /* + * Doing an on-device coredump leaves the disk driver in a state + * that can not be resumed. + */ + debugger_safe_to_return = FALSE; + begin_panic_transfer(); + ret = kern_dump(KERN_DUMP_DISK); + abort_panic_transfer(); #if DEVELOPMENT || DEBUG - DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((debugger_panic_options & DEBUGGER_OPTION_RECURPANIC_POSTCORE)); + DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((debugger_panic_options & DEBUGGER_OPTION_RECURPANIC_POSTCORE)); #endif - } + } - /* If we wrote a corefile and DB_REBOOT_POST_CORE is set, reboot */ - if (ret == 0 && (debug_boot_arg & DB_REBOOT_POST_CORE)) { - kdp_machine_reboot_type(kPEPanicRestartCPU); + /* + * If DB_REBOOT_POST_CORE is set, then reboot if coredump is successfully saved + * or if option to ignore failures is set. + */ + if ((debug_boot_arg & DB_REBOOT_POST_CORE) && + ((ret == 0) || (debugger_panic_options & DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT))) { + kdp_machine_reboot_type(kPEPanicRestartCPU); + } } } @@ -984,6 +1039,7 @@ handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int sub if (debugger_panic_str == NULL) { debugger_panic_str = CPUPANICSTR; debugger_panic_args = CPUPANICARGS; + debugger_panic_data = CPUPANICDATAPTR; debugger_message = CPUDEBUGGERMSG; debugger_panic_caller = CPUPANICCALLER; } @@ -1026,6 +1082,7 @@ handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int sub if (debugger_current_op != DBOP_BREAKPOINT) { debugger_panic_str = NULL; debugger_panic_args = NULL; + debugger_panic_data = NULL; debugger_panic_options = 0; debugger_message = NULL; } diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index 860824c04..1ad189d54 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -156,7 +156,9 @@ struct micro_snapshot { } __attribute__ ((packed)); - +/* + * mirrors the dyld_cache_header struct 
defined in dyld_cache_format.h from dyld source code + */ struct _dyld_cache_header { char magic[16]; // e.g. "dyld_v0 i386" @@ -172,14 +174,47 @@ struct _dyld_cache_header uint64_t localSymbolsOffset; // file offset of where local symbols are stored uint64_t localSymbolsSize; // size of local symbols information uint8_t uuid[16]; // unique value for each shared cache file + uint64_t cacheType; // 0 for development, 1 for production + uint32_t branchPoolsOffset; // file offset to table of uint64_t pool addresses + uint32_t branchPoolsCount; // number of uint64_t entries + uint64_t accelerateInfoAddr; // (unslid) address of optimization info + uint64_t accelerateInfoSize; // size of optimization info + uint64_t imagesTextOffset; // file offset to first dyld_cache_image_text_info + uint64_t imagesTextCount; // number of dyld_cache_image_text_info entries + uint64_t dylibsImageGroupAddr; // (unslid) address of ImageGroup for dylibs in this cache + uint64_t dylibsImageGroupSize; // size of ImageGroup for dylibs in this cache + uint64_t otherImageGroupAddr; // (unslid) address of ImageGroup for other OS dylibs + uint64_t otherImageGroupSize; // size of ImageGroup for other OS dylibs + uint64_t progClosuresAddr; // (unslid) address of list of program launch closures + uint64_t progClosuresSize; // size of list of program launch closures + uint64_t progClosuresTrieAddr; // (unslid) address of trie of indexes into program launch closures + uint64_t progClosuresTrieSize; // size of trie of indexes into program launch closures + uint32_t platform; // platform number (macOS=1, etc) + uint32_t formatVersion : 8, // dyld3::closure::kFormatVersion + dylibsExpectedOnDisk : 1, // dyld should expect the dylib exists on disk and to compare inode/mtime to see if cache is valid + simulator : 1, // for simulator of specified platform + locallyBuiltCache : 1, // 0 for B&I built cache, 1 for locally built cache + padding : 21; // TBD +}; + +/* + * mirrors the dyld_cache_image_text_info 
struct defined in dyld_cache_format.h from dyld source code + */ +struct _dyld_cache_image_text_info +{ + uuid_t uuid; + uint64_t loadAddress; // unslid address of start of __TEXT + uint32_t textSegmentSize; + uint32_t pathOffset; // offset from start of cache file }; enum micro_snapshot_flags { kInterruptRecord = 0x1, kTimerArmingRecord = 0x2, - kUserMode = 0x4, /* interrupted usermode, or armed by usermode */ - kIORecord = 0x8, + kUserMode = 0x4, /* interrupted usermode, or armed by usermode */ + kIORecord = 0x8, + kPMIRecord = 0x10, }; /* @@ -209,25 +244,8 @@ enum { STACKSHOT_KCDATA_FORMAT = 0x10000, STACKSHOT_ENABLE_BT_FAULTING = 0x20000, STACKSHOT_COLLECT_DELTA_SNAPSHOT = 0x40000, - /* - * STACKSHOT_TAILSPIN flips on several features aimed at minimizing the size - * of stackshots. It is meant to be used only by the tailspin daemon. Its - * behavior may be changed at any time to suit the needs of the tailspin - * daemon. Seriously, if you are not the tailspin daemon, don't use this - * flag. If you need these features, ask us to add a stable SPI for what - * you need. That being said, the features it turns on are: - * - * minimize_uuids: If the set of loaded dylibs or kexts has not changed in - * the delta period, do then not report them. - * - * iostats: do not include io statistics. - * - * trace_fp: do not include the frame pointers in stack traces. - * - * minimize_nonrunnables: Do not report detailed information about threads - * which were not runnable in the delta period. - */ - STACKSHOT_TAILSPIN = 0x80000, + /* Include the layout of the system shared cache */ + STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT = 0x80000, /* * Kernel consumers of stackshot (via stack_snapshot_from_kernel) can ask * that we try to take the stackshot lock, and fail if we don't get it. 
@@ -241,6 +259,8 @@ enum { STACKSHOT_THREAD_GROUP = 0x2000000, STACKSHOT_SAVE_JETSAM_COALITIONS = 0x4000000, STACKSHOT_INSTRS_CYCLES = 0x8000000, + STACKSHOT_ASID = 0x10000000, + STACKSHOT_PAGE_TABLES = 0x20000000, }; #define STACKSHOT_THREAD_SNAPSHOT_MAGIC 0xfeedface @@ -429,15 +449,19 @@ enum { /* * Values for a 64-bit mask that's passed to the debugger. */ -#define DEBUGGER_OPTION_NONE 0x0ULL -#define DEBUGGER_OPTION_PANICLOGANDREBOOT 0x1ULL /* capture a panic log and then reboot immediately */ -#define DEBUGGER_OPTION_RECURPANIC_ENTRY 0x2ULL -#define DEBUGGER_OPTION_RECURPANIC_PRELOG 0x4ULL -#define DEBUGGER_OPTION_RECURPANIC_POSTLOG 0x8ULL -#define DEBUGGER_OPTION_RECURPANIC_POSTCORE 0x10ULL -#define DEBUGGER_OPTION_INITPROC_PANIC 0x20ULL -#define DEBUGGER_OPTION_COPROC_INITIATED_PANIC 0x40ULL /* panic initiated by a co-processor */ -#define DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP 0x80ULL /* don't try to save local coredumps for this panic */ +#define DEBUGGER_OPTION_NONE 0x0ULL +#define DEBUGGER_OPTION_PANICLOGANDREBOOT 0x1ULL /* capture a panic log and then reboot immediately */ +#define DEBUGGER_OPTION_RECURPANIC_ENTRY 0x2ULL +#define DEBUGGER_OPTION_RECURPANIC_PRELOG 0x4ULL +#define DEBUGGER_OPTION_RECURPANIC_POSTLOG 0x8ULL +#define DEBUGGER_OPTION_RECURPANIC_POSTCORE 0x10ULL +#define DEBUGGER_OPTION_INITPROC_PANIC 0x20ULL +#define DEBUGGER_OPTION_COPROC_INITIATED_PANIC 0x40ULL /* panic initiated by a co-processor */ +#define DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP 0x80ULL /* don't try to save local coredumps for this panic */ +#define DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT 0x100ULL /* attempt to save coredump. always reboot */ +#define DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE 0x200ULL /* backtrace the specified thread in the paniclog (x86 only) */ + +#define DEBUGGER_INTERNAL_OPTIONS_MASK (DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE) __BEGIN_DECLS @@ -453,7 +477,6 @@ __BEGIN_DECLS #define panic(ex, ...) 
(panic)(# ex "@" PANIC_LOCATION, ## __VA_ARGS__) #endif -void panic_context(unsigned int reason, void *ctx, const char *string, ...); void panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mask, const char *str, ...); void Debugger(const char * message); void populate_model_name(char *); @@ -466,6 +489,12 @@ __END_DECLS #if XNU_KERNEL_PRIVATE +#if defined (__x86_64__) +struct thread; + +void panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, struct thread* th, const char *str, ...); +#endif + boolean_t oslog_is_safe(void); boolean_t debug_mode_active(void); boolean_t stackshot_active(void); @@ -512,6 +541,10 @@ extern unsigned int debug_boot_arg; extern boolean_t debug_boot_arg_inited; #endif +extern boolean_t kernelcache_uuid_valid; +extern uuid_t kernelcache_uuid; +extern uuid_string_t kernelcache_uuid_string; + #ifdef __cplusplus extern "C" { #endif @@ -557,7 +590,7 @@ extern size_t panic_stackshot_len; #endif /* DEVELOPMENT || DEBUG */ #endif /* defined (__x86_64__) */ -void SavePanicInfo(const char *message, uint64_t panic_options); +void SavePanicInfo(const char *message, void *panic_data, uint64_t panic_options); void paniclog_flush(void); void panic_display_system_configuration(boolean_t launchd_exit); void panic_display_zprint(void); @@ -592,7 +625,7 @@ typedef enum { } debugger_op; kern_return_t DebuggerTrapWithState(debugger_op db_op, const char *db_message, const char *db_panic_str, va_list *db_panic_args, - uint64_t db_panic_options, boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller); + uint64_t db_panic_options, void *db_panic_data_ptr, boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller); void handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int subcode, void *state); void DebuggerWithContext(unsigned int reason, void *ctx, const char *message, uint64_t debugger_options_mask); diff --git a/osfmk/kern/kern_ecc.c 
b/osfmk/kern/ecc_logging.c similarity index 100% rename from osfmk/kern/kern_ecc.c rename to osfmk/kern/ecc_logging.c diff --git a/osfmk/kern/exc_guard.h b/osfmk/kern/exc_guard.h index 8486ec569..dbdb3f193 100644 --- a/osfmk/kern/exc_guard.h +++ b/osfmk/kern/exc_guard.h @@ -64,6 +64,8 @@ /* EXC_GUARD types */ +#define GUARD_TYPE_NONE 0x0 + /* * Mach port guards use the exception codes like this: * @@ -128,6 +130,22 @@ #define GUARD_TYPE_VN 0x4 /* guarded vnode */ +/* + * VM guards use the exception codes like this: + * + * code: + * +-------------------------------+----------------+-----------------+ + * |[63:61] GUARD_TYPE_VIRT_MEMORY | [60:32] flavor | [31:0] unused | + * +-------------------------------+----------------+-----------------+ + * + * subcode: + * +----------------------------------------------------------------+ + * |[63:0] offset | + * +----------------------------------------------------------------+ + */ + +#define GUARD_TYPE_VIRT_MEMORY 0x5 /* VM operation violating guard */ + #ifdef KERNEL #define EXC_GUARD_ENCODE_TYPE(code, type) \ diff --git a/osfmk/kern/exc_resource.h b/osfmk/kern/exc_resource.h index c90fafc61..21d0d0b6d 100644 --- a/osfmk/kern/exc_resource.h +++ b/osfmk/kern/exc_resource.h @@ -62,6 +62,7 @@ #define RESOURCE_TYPE_WAKEUPS 2 #define RESOURCE_TYPE_MEMORY 3 #define RESOURCE_TYPE_IO 4 +#define RESOURCE_TYPE_THREADS 5 /* RESOURCE_TYPE_CPU flavors */ #define FLAVOR_CPU_MONITOR 1 @@ -195,6 +196,19 @@ ((subcode) & 0x7FFFULL) +/* + * RESOURCE_TYPE_THREADS exception code & subcode + * + * This is sent by the kernel when a task crosses its + * thread limit. 
+ */ + +#define EXC_RESOURCE_THREADS_DECODE_THREADS(code) \ + ((code) & 0x7FFFULL) + +/* RESOURCE_TYPE_THREADS flavors */ +#define FLAVOR_THREADS_HIGH_WATERMARK 1 + #ifdef KERNEL /* EXC_RESOURCE type and flavor encoding macros */ @@ -229,6 +243,10 @@ #define EXC_RESOURCE_IO_ENCODE_OBSERVED(subcode, num) \ ((subcode) |= (((uint64_t)(num) & 0x7FFFULL))) +/* RESOURCE_TYPE_THREADS specific encoding macros */ +#define EXC_RESOURCE_THREADS_ENCODE_THREADS(code, threads) \ + ((code) |= (((uint64_t)(threads) & 0x7FFFULL))) + #endif /* KERNEL */ diff --git a/osfmk/kern/exception.c b/osfmk/kern/exception.c index 4042b91b3..9a67b727b 100644 --- a/osfmk/kern/exception.c +++ b/osfmk/kern/exception.c @@ -67,6 +67,7 @@ #include #include #include + #include #include #include @@ -74,6 +75,7 @@ #include #include #include + #include #include #include @@ -83,8 +85,11 @@ #include #include #include +#include + #include #include + #include extern int panic_on_exception_triage; @@ -212,7 +217,10 @@ exception_deliver( * As with other failures, exception_triage_thread will go on * to the next level. */ - if (mac_exc_action_check_exception_send(task, excp) != 0) { + + /* The global exception-to-signal translation port is safe to be an exception handler. 
*/ + if (is_ux_handler_port(exc_port) == FALSE && + mac_exc_action_check_exception_send(task, excp) != 0) { kr = KERN_FAILURE; goto out_release_right; } @@ -241,7 +249,7 @@ exception_deliver( c_thr_exc_raise_state++; state_cnt = _MachineStateCount[flavor]; - kr = thread_getstatus(thread, flavor, + kr = thread_getstatus_to_user(thread, flavor, (thread_state_t)state, &state_cnt); if (kr == KERN_SUCCESS) { @@ -263,7 +271,7 @@ exception_deliver( } if (kr == KERN_SUCCESS) { if (exception != EXC_CORPSE_NOTIFY) - kr = thread_setstatus(thread, flavor, + kr = thread_setstatus_from_user(thread, flavor, (thread_state_t)state, state_cnt); goto out_release_right; @@ -300,7 +308,7 @@ exception_deliver( c_thr_exc_raise_state_id++; state_cnt = _MachineStateCount[flavor]; - kr = thread_getstatus(thread, flavor, + kr = thread_getstatus_to_user(thread, flavor, (thread_state_t)state, &state_cnt); if (kr == KERN_SUCCESS) { @@ -329,7 +337,7 @@ exception_deliver( if (kr == KERN_SUCCESS) { if (exception != EXC_CORPSE_NOTIFY) - kr = thread_setstatus(thread, flavor, + kr = thread_setstatus_from_user(thread, flavor, (thread_state_t)state, state_cnt); goto out_release_right; diff --git a/osfmk/kern/gzalloc.c b/osfmk/kern/gzalloc.c index 9bd11481f..64d0ba9eb 100644 --- a/osfmk/kern/gzalloc.c +++ b/osfmk/kern/gzalloc.c @@ -562,26 +562,43 @@ boolean_t gzalloc_element_size(void *gzaddr, zone_t *z, vm_size_t *gzsz) { uintptr_t a = (uintptr_t)gzaddr; if (__improbable(gzalloc_mode && (a >= gzalloc_map_min) && (a < gzalloc_map_max))) { gzhdr_t *gzh; + boolean_t vmef; + vm_map_entry_t gzvme = NULL; + vm_map_lock_read(gzalloc_map); + vmef = vm_map_lookup_entry(gzalloc_map, (vm_map_offset_t)a, &gzvme); + vm_map_unlock(gzalloc_map); + if (vmef == FALSE) { + panic("GZALLOC: unable to locate map entry for %p\n", (void *)a); + } + assertf(gzvme->vme_atomic != 0, "GZALLOC: VM map entry inconsistency, vme: %p, start: %llu end: %llu", gzvme, gzvme->vme_start, gzvme->vme_end); /* Locate the gzalloc metadata 
adjoining the element */ if (gzalloc_uf_mode == TRUE) { - boolean_t vmef; - vm_map_entry_t gzvme = NULL; /* In underflow detection mode, locate the map entry describing * the element, and then locate the copy of the gzalloc * header at the trailing edge of the range. */ - vm_map_lock_read(gzalloc_map); - vmef = vm_map_lookup_entry(gzalloc_map, (vm_map_offset_t)a, &gzvme); - vm_map_unlock(gzalloc_map); - if (vmef == FALSE) { - panic("GZALLOC: unable to locate map entry for %p\n", (void *)a); - } - assertf(gzvme->vme_atomic != 0, "GZALLOC: VM map entry inconsistency, vme: %p, start: %llu end: %llu", gzvme, gzvme->vme_start, gzvme->vme_end); gzh = (gzhdr_t *)(gzvme->vme_end - GZHEADER_SIZE); } else { - gzh = (gzhdr_t *)(a - GZHEADER_SIZE); + /* In overflow detection mode, scan forward from + * the base of the map entry to locate the + * gzalloc header. + */ + uint32_t *p = (uint32_t*) gzvme->vme_start; + while (p < (uint32_t *) gzvme->vme_end) { + if (*p == GZALLOC_SIGNATURE) + break; + else { + p++; + } + } + if (p >= (uint32_t *) gzvme->vme_end) { + panic("GZALLOC signature missing addr %p, zone %p", gzaddr, z); + } + p++; + uintptr_t q = (uintptr_t) p; + gzh = (gzhdr_t *) (q - sizeof(gzhdr_t)); } if (gzh->gzsig != GZALLOC_SIGNATURE) { diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index 47bab64b1..0f4fe2fb9 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -307,6 +307,31 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num #endif } + case HOST_PREFERRED_USER_ARCH: { + host_preferred_user_arch_t user_arch_info; + + /* + * Basic information about this host. 
+ */ + if (*count < HOST_PREFERRED_USER_ARCH_COUNT) + return (KERN_FAILURE); + + user_arch_info = (host_preferred_user_arch_t)info; + +#if defined(PREFERRED_USER_CPU_TYPE) && defined(PREFERRED_USER_CPU_SUBTYPE) + user_arch_info->cpu_type = PREFERRED_USER_CPU_TYPE; + user_arch_info->cpu_subtype = PREFERRED_USER_CPU_SUBTYPE; +#else + int master_id = master_processor->cpu_id; + user_arch_info->cpu_type = slot_type(master_id); + user_arch_info->cpu_subtype = slot_subtype(master_id); +#endif + + *count = HOST_PREFERRED_USER_ARCH_COUNT; + + return (KERN_SUCCESS); + } + default: return (KERN_INVALID_ARGUMENT); } } @@ -939,6 +964,27 @@ set_sched_stats_active(boolean_t active) return (KERN_SUCCESS); } + +uint64_t +get_pages_grabbed_count(void) +{ + processor_t processor; + uint64_t pages_grabbed_count = 0; + + simple_lock(&processor_list_lock); + + processor = processor_list; + + while (processor) { + pages_grabbed_count += PROCESSOR_DATA(processor, page_grab_count); + processor = processor->processor_list; + } + simple_unlock(&processor_list_lock); + + return(pages_grabbed_count); +} + + kern_return_t get_sched_statistics(struct _processor_statistics_np * out, uint32_t * count) { @@ -1150,6 +1196,14 @@ host_processor_info(host_t host, return (KERN_SUCCESS); } +static bool +is_valid_host_special_port(int id) +{ + return (id <= HOST_MAX_SPECIAL_PORT) && + (id >= HOST_MIN_SPECIAL_PORT) && + ((id <= HOST_LAST_SPECIAL_KERNEL_PORT) || (id > HOST_MAX_SPECIAL_KERNEL_PORT)); +} + /* * Kernel interface for setting a special port. 
*/ @@ -1158,9 +1212,12 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port) { ipc_port_t old_port; + if (!is_valid_host_special_port(id)) + panic("attempted to set invalid special port %d", id); + #if !MACH_FLIPC - if (id == HOST_NODE_PORT) - return (KERN_NOT_SUPPORTED); + if (id == HOST_NODE_PORT) + return (KERN_NOT_SUPPORTED); #endif host_lock(host_priv); @@ -1169,7 +1226,7 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port) host_unlock(host_priv); #if MACH_FLIPC - if (id == HOST_NODE_PORT) + if (id == HOST_NODE_PORT) mach_node_port_changed(); #endif @@ -1184,10 +1241,13 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port) kern_return_t kernel_get_special_port(host_priv_t host_priv, int id, ipc_port_t * portp) { - host_lock(host_priv); - *portp = host_priv->special[id]; - host_unlock(host_priv); - return (KERN_SUCCESS); + if (!is_valid_host_special_port(id)) + panic("attempted to get invalid special port %d", id); + + host_lock(host_priv); + *portp = host_priv->special[id]; + host_unlock(host_priv); + return (KERN_SUCCESS); } /* @@ -1227,7 +1287,7 @@ host_get_special_port(host_priv_t host_priv, __unused int node, int id, ipc_port { ipc_port_t port; - if (host_priv == HOST_PRIV_NULL || id == HOST_SECURITY_PORT || id > HOST_MAX_SPECIAL_PORT || id < 0) + if (host_priv == HOST_PRIV_NULL || id == HOST_SECURITY_PORT || id > HOST_MAX_SPECIAL_PORT || id < HOST_MIN_SPECIAL_PORT) return (KERN_INVALID_ARGUMENT); host_lock(host_priv); diff --git a/osfmk/kern/host_statistics.h b/osfmk/kern/host_statistics.h index c67af697e..cbccf8c22 100644 --- a/osfmk/kern/host_statistics.h +++ b/osfmk/kern/host_statistics.h @@ -42,6 +42,8 @@ #include #include +extern +uint64_t get_pages_grabbed_count(void); #define VM_STAT_INCR(event) \ MACRO_BEGIN \ diff --git a/osfmk/kern/ipc_kobject.c b/osfmk/kern/ipc_kobject.c index 181bb0383..2a216b2fb 100644 --- a/osfmk/kern/ipc_kobject.c +++ b/osfmk/kern/ipc_kobject.c @@ -89,6 +89,7 
@@ #include #include #include +#include #include #include #include @@ -101,6 +102,9 @@ #endif #include +#include +#include + #include #include @@ -190,6 +194,7 @@ const struct mig_subsystem *mig_e[] = { (const struct mig_subsystem *)&UNDReply_subsystem, (const struct mig_subsystem *)&mach_voucher_subsystem, (const struct mig_subsystem *)&mach_voucher_attr_control_subsystem, + (const struct mig_subsystem *)&memory_entry_subsystem, #if XK_PROXY (const struct mig_subsystem *)&do_uproxy_xk_uproxy_subsystem, @@ -200,6 +205,9 @@ const struct mig_subsystem *mig_e[] = { #if MCMSG && iPSC860 (const struct mig_subsystem *)&mcmsg_info_subsystem, #endif /* MCMSG && iPSC860 */ + (const struct mig_subsystem *)&catch_exc_subsystem, + (const struct mig_subsystem *)&catch_mach_exc_subsystem, + }; void @@ -269,20 +277,20 @@ ipc_kobject_server( task_t task = TASK_NULL; uint32_t exec_token; boolean_t exec_token_changed = FALSE; + int request_msgh_id = request->ikm_header->msgh_id; /* * Find out corresponding mig_hash entry if any */ { - int key = request->ikm_header->msgh_id; - unsigned int i = (unsigned int)MIG_HASH(key); + unsigned int i = (unsigned int)MIG_HASH(request_msgh_id); int max_iter = mig_table_max_displ; do { ptr = &mig_buckets[i++ % MAX_MIG_ENTRIES]; - } while (key != ptr->num && ptr->num && --max_iter); + } while (request_msgh_id != ptr->num && ptr->num && --max_iter); - if (!ptr->routine || key != ptr->num) { + if (!ptr->routine || request_msgh_id != ptr->num) { ptr = (mig_hash_t *)0; reply_size = mig_reply_size; } else { @@ -466,8 +474,7 @@ ipc_kobject_server( */ #if DEVELOPMENT || DEBUG printf("%s: refusing to send reply to kobject %d port (id:%d)\n", - __func__, ip_kotype(replyp), - request->ikm_header->msgh_id); + __func__, ip_kotype(replyp), request_msgh_id); #endif /* DEVELOPMENT || DEBUG */ ipc_kmsg_destroy(reply); return IKM_NULL; diff --git a/osfmk/kern/ipc_kobject.h b/osfmk/kern/ipc_kobject.h index 52431b60e..28db4e47d 100644 --- a/osfmk/kern/ipc_kobject.h 
+++ b/osfmk/kern/ipc_kobject.h @@ -129,12 +129,13 @@ typedef natural_t ipc_kobject_type_t; #define IKOT_VOUCHER 37 #define IKOT_VOUCHER_ATTR_CONTROL 38 #define IKOT_WORK_INTERVAL 39 +#define IKOT_UX_HANDLER 40 /* * Add new entries here and adjust IKOT_UNKNOWN. * Please keep ipc/ipc_object.c:ikot_print_array up to date. */ -#define IKOT_UNKNOWN 40 /* magic catchall */ +#define IKOT_UNKNOWN 41 /* magic catchall */ #define IKOT_MAX_TYPE (IKOT_UNKNOWN+1) /* # of IKOT_ types */ diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index 8114708a1..ddbfa0e5a 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -84,6 +84,9 @@ #include +void +mach_msg_receive_results_complete(ipc_object_t object); + /* * Routine: mach_msg_send_from_kernel * Purpose: @@ -410,6 +413,7 @@ mach_msg_rpc_from_kernel_body( for (;;) { ipc_mqueue_t mqueue; + ipc_object_t object; assert(reply->ip_in_pset == 0); assert(ip_active(reply)); @@ -434,6 +438,9 @@ mach_msg_rpc_from_kernel_body( kmsg = self->ith_kmsg; seqno = self->ith_seqno; + __IGNORE_WCASTALIGN(object = (ipc_object_t) reply); + mach_msg_receive_results_complete(object); + if (mr == MACH_MSG_SUCCESS) { break; @@ -598,6 +605,7 @@ mach_msg_overwrite( &mqueue, &object); if (mr != MACH_MSG_SUCCESS) return mr; + /* hold ref for object */ self->ith_continuation = (void (*)(mach_msg_return_t))0; @@ -610,6 +618,7 @@ mach_msg_overwrite( kmsg = self->ith_kmsg; seqno = self->ith_seqno; + mach_msg_receive_results_complete(object); io_release(object); } while (mr == MACH_RCV_INTERRUPTED); diff --git a/osfmk/kern/ipc_tt.c b/osfmk/kern/ipc_tt.c index 920ac8fc5..6fcef9f17 100644 --- a/osfmk/kern/ipc_tt.c +++ b/osfmk/kern/ipc_tt.c @@ -867,6 +867,7 @@ mach_reply_port( * Conditions: * Nothing locked. * Returns: + * mach_port_name_t: send right & receive right for special reply port. * MACH_PORT_NULL if there are any resource failures * or other errors. 
*/ @@ -877,6 +878,7 @@ thread_get_special_reply_port( { ipc_port_t port; mach_port_name_t name; + mach_port_name_t send_name; kern_return_t kr; thread_t thread = current_thread(); @@ -891,7 +893,22 @@ thread_get_special_reply_port( kr = ipc_port_alloc(current_task()->itk_space, &name, &port); if (kr == KERN_SUCCESS) { ipc_port_bind_special_reply_port_locked(port); + + /* Make a send right and insert it in the space at specified name */ + ipc_port_make_send_locked(port); ip_unlock(port); + send_name = ipc_port_copyout_name_send(port, current_task()->itk_space, name); + /* + * If insertion of send right failed, userland is doing something bad, error out. + * The space was marked inactive or the receive right just inserted above at the + * given name was moved, in either case do not try to deallocate the receive right. + */ + if (send_name == MACH_PORT_NULL || send_name == MACH_PORT_DEAD) { + if (IP_VALID(thread->ith_special_reply_port)) { + ipc_port_unbind_special_reply_port(thread, TRUE); + } + name = MACH_PORT_NULL; + } } else { name = MACH_PORT_NULL; } @@ -918,14 +935,17 @@ ipc_port_bind_special_reply_port_locked( ip_reference(port); thread->ith_special_reply_port = port; port->ip_specialreply = 1; - port->ip_link_sync_qos = 1; + port->ip_sync_link_state = PORT_SYNC_LINK_ANY; + + reset_ip_srp_bits(port); } /* * Routine: ipc_port_unbind_special_reply_port * Purpose: * Unbind the thread's special reply port. - * If the special port is linked to a port, adjust it's sync qos delta`. + * If the special port has threads waiting on turnstile, + * update it's inheritor. * Condition: * Nothing locked. 
* Returns: @@ -947,8 +967,8 @@ ipc_port_unbind_special_reply_port( } thread->ith_special_reply_port = NULL; - ipc_port_unlink_special_reply_port_locked(special_reply_port, NULL, - IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY); + ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL, + IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY, FALSE); /* port unlocked */ ip_release(special_reply_port); @@ -1365,7 +1385,7 @@ task_conversion_eval(task_t caller, task_t victim) * Only the kernel can can resolve the kernel's task port. We've established * by this point that the caller is not kernel_task. */ - if (victim == kernel_task) { + if (victim == TASK_NULL || victim == kernel_task) { return KERN_INVALID_SECURITY; } @@ -1751,12 +1771,13 @@ convert_port_to_thread( if (IP_VALID(port)) { ip_lock(port); - if ( ip_active(port) && - ip_kotype(port) == IKOT_THREAD ) { + if (ip_active(port) && + ip_kotype(port) == IKOT_THREAD) { thread = (thread_t)port->ip_kobject; assert(thread != THREAD_NULL); - if (thread->task && thread->task == kernel_task && - current_task() != kernel_task) { + + /* Use task conversion rules for thread control conversions */ + if (task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) { ip_unlock(port); return THREAD_NULL; } diff --git a/osfmk/kern/kalloc.c b/osfmk/kern/kalloc.c index 65e9df392..6527654f2 100644 --- a/osfmk/kern/kalloc.c +++ b/osfmk/kern/kalloc.c @@ -716,8 +716,6 @@ kalloc_external( return( kalloc_tag_bt(size, VM_KERN_MEMORY_KALLOC) ); } -volatile SInt32 kfree_nop_count = 0; - void kfree( void *data, @@ -751,28 +749,7 @@ kfree( if ((((vm_offset_t) data) >= kalloc_map_min) && (((vm_offset_t) data) <= kalloc_map_max)) alloc_map = kalloc_map; if (size > kalloc_largest_allocated) { - /* - * work around double FREEs of small MALLOCs - * this used to end up being a nop - * since the pointer being freed from an - * alloc backed by the zalloc world could - * never show up in the kalloc_map... 
however, - * the kernel_map is a different issue... since it - * was released back into the zalloc pool, a pointer - * would have gotten written over the 'size' that - * the MALLOC was retaining in the first 4 bytes of - * the underlying allocation... that pointer ends up - * looking like a really big size on the 2nd FREE and - * pushes the kfree into the kernel_map... we - * end up removing a ton of virtual space before we panic - * this check causes us to ignore the kfree for a size - * that must be 'bogus'... note that it might not be due - * to the above scenario, but it would still be wrong and - * cause serious damage. - */ - - OSAddAtomic(1, &kfree_nop_count); - return; + panic("kfree: size %lu > kalloc_largest_allocated %lu", (unsigned long)size, (unsigned long)kalloc_largest_allocated); } kmem_free(alloc_map, (vm_offset_t)data, size); kalloc_spin_lock(); @@ -797,7 +774,9 @@ kfree( z, z->zone_name, (unsigned long)size); #endif assert(size <= z->elem_size); +#if !KASAN_KALLOC DTRACE_VM3(kfree, vm_size_t, size, vm_size_t, z->elem_size, void*, data); +#endif zfree(z, data); } diff --git a/osfmk/kern/kcdata.h b/osfmk/kern/kcdata.h index 702bfacbc..e36c55352 100644 --- a/osfmk/kern/kcdata.h +++ b/osfmk/kern/kcdata.h @@ -436,47 +436,49 @@ struct kcdata_type_definition { * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes * in STACKSHOT_KCTYPE_* types. 
*/ -#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ -#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ +#define STACKSHOT_KCTYPE_IOSTATS 0x901u /* io_stats_snapshot */ +#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */ #define STACKSHOT_KCCONTAINER_TASK 0x903u #define STACKSHOT_KCCONTAINER_THREAD 0x904u -#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ -#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ -#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ -#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ -#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ -#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ -#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ -#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ -#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ +#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u /* task_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u /* thread_snapshot_v2, thread_snapshot_v3 */ +#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u /* int[] */ +#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u /* char[] */ +#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au /* struct stack_snapshot_frame32 */ +#define 
STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu /* struct stack_snapshot_frame32 */ +#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du /* struct stack_snapshot_frame64 */ +#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu /* boot args string */ +#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu /* os version string */ +#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u /* kernel page size in uint32_t */ +#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u /* jetsam level in uint32_t */ +#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */ +#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ +#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ +#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ +#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ +#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */ +#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ +#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ +#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ +#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ +#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ +#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* uint64_t */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ +#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ +#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ +#define STACKSHOT_KCTYPE_INSTRS_CYCLES 
0x923u /* struct instrs_cycles_snapshot */ +#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u /* struct stack_snapshot_stacktop */ +#define STACKSHOT_KCTYPE_ASID 0x925u /* uint32_t */ +#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u /* uint64_t */ +#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u /* task_delta_snapshot_v2 */ #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */ -#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u /* uint32_t */ -#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u /* uint64_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u /* uint32_t */ -#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u /* uint64_t */ -#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u /* uint64_t */ -#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u /* struct stackshot_cpu_times */ -#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au /* struct stackshot_duration */ -#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */ -#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO 0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */ -#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du /* struct stackshot_thread_waitinfo */ -#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */ -#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu /* uint64_t */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */ -#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u /* uint64_t */ -#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u /* struct instrs_cycles_snapshot */ - -#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */ - struct stack_snapshot_frame32 { uint32_t lr; uint32_t sp; @@ -537,6 +539,10 @@ enum task_snapshot_flags { 
kTaskUUIDInfoMissing = 0x200000, /* some UUID info was paged out */ kTaskUUIDInfoTriedFault = 0x400000, /* tried to fault in UUID info */ kTaskSharedRegionInfoUnavailable = 0x800000, /* shared region info unavailable */ + kTaskTALEngaged = 0x1000000, + /* 0x2000000 unused */ + kTaskIsDirtyTracked = 0x4000000, + kTaskAllowIdleExit = 0x8000000, }; enum thread_snapshot_flags { @@ -785,6 +791,12 @@ struct stackshot_cpu_times { uint64_t system_usec; } __attribute__((packed)); +struct stackshot_cpu_times_v2 { + uint64_t user_usec; + uint64_t system_usec; + uint64_t runnable_usec; +} __attribute__((packed)); + struct stackshot_duration { uint64_t stackshot_duration; uint64_t stackshot_duration_outer; @@ -813,6 +825,12 @@ typedef struct stackshot_thread_waitinfo { #define STACKSHOT_WAITOWNER_SUSPENDED (UINT64_MAX - 7) /* workloop is suspended */ +struct stack_snapshot_stacktop { + uint64_t sp; + uint8_t stack_contents[8]; +}; + + /**************** definitions for crashinfo *********************/ /* @@ -866,6 +884,22 @@ struct crashinfo_proc_uniqidentifierinfo { #define TASK_CRASHINFO_UDATA_PTRS 0x81C /* uint64_t */ #define TASK_CRASHINFO_MEMORY_LIMIT 0x81D /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_INTERNAL 0x81E /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_INTERNAL_COMPRESSED 0x81F /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_IOKIT_MAPPED 0x820 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING 0x821 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING_COMPRESSED 0x822 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE 0x823 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE_COMPRESSED 0x824 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PAGE_TABLE 0x825 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT 0x826 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT_LIFETIME_MAX 0x827 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE 0x828 /* uint64_t */ +#define 
TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED 0x829 /* uint64_t */ +#define TASK_CRASHINFO_LEDGER_WIRED_MEM 0x82A /* uint64_t */ + + + #define TASK_CRASHINFO_END KCDATA_TYPE_BUFFER_END /**************** definitions for os reasons *********************/ @@ -963,7 +997,7 @@ kcdata_iter_type(kcdata_iter_t iter) static inline uint32_t kcdata_calc_padding(uint32_t size) { - /* calculate number of bits to add to size to get something divisible by 16 */ + /* calculate number of bytes to add to size to get something divisible by 16 */ return (-size) & 0xf; } diff --git a/osfmk/kern/kern_cdata.c b/osfmk/kern/kern_cdata.c index 97bee226c..71a2368f6 100644 --- a/osfmk/kern/kern_cdata.c +++ b/osfmk/kern/kern_cdata.c @@ -185,11 +185,13 @@ static kern_return_t kcdata_get_memory_addr_with_flavor( uint64_t flags, mach_vm_address_t *user_addr) { + kern_return_t kr; struct kcdata_item info; uint32_t orig_size = size; /* make sure 16 byte aligned */ - size += kcdata_calc_padding(size); + uint32_t padding = kcdata_calc_padding(size); + size += padding; uint32_t total_size = size + sizeof(info); if (user_addr == NULL || data == NULL || total_size + sizeof(info) < orig_size) { @@ -207,14 +209,18 @@ static kern_return_t kcdata_get_memory_addr_with_flavor( return KERN_RESOURCE_SHORTAGE; } - if (data->kcd_flags & KCFLAG_USE_COPYOUT) { - if (copyout(&info, data->kcd_addr_end, sizeof(info))) - return KERN_NO_ACCESS; - } else { - memcpy((void *)data->kcd_addr_end, &info, sizeof(info)); - } + kr = kcdata_memcpy(data, data->kcd_addr_end, &info, sizeof(info)); + if (kr) + return kr; data->kcd_addr_end += sizeof(info); + + if (padding) { + kr = kcdata_bzero(data, data->kcd_addr_end + size - padding, padding); + if (kr) + return kr; + } + *user_addr = data->kcd_addr_end; data->kcd_addr_end += size; @@ -317,7 +323,7 @@ kcdata_undo_add_container_begin(kcdata_descriptor_t data) * returns: KERN_NO_ACCESS if copyout fails. 
*/ -kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, void *src_addr, uint32_t size) +kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, const void *src_addr, uint32_t size) { if (data->kcd_flags & KCFLAG_USE_COPYOUT) { if (copyout(src_addr, dst_addr, size)) @@ -328,6 +334,33 @@ kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr return KERN_SUCCESS; } +/* + * Routine: kcdata_bzero + * Desc: zero out a portion of a kcdata buffer. + * returns: KERN_NO_ACCESS if copyout fails. + */ +kern_return_t +kcdata_bzero(kcdata_descriptor_t data, mach_vm_address_t dst_addr, uint32_t size) +{ + kern_return_t kr = KERN_SUCCESS; + if (data->kcd_flags & KCFLAG_USE_COPYOUT) { + uint8_t zeros[16] = {}; + while (size) { + uint32_t block_size = MIN(size, 16); + kr = copyout(&zeros, dst_addr, block_size); + if (kr) + return KERN_NO_ACCESS; + /* step past the chunk just zeroed; otherwise every pass rewrites the first 16 bytes */ + dst_addr += block_size; + size -= block_size; + } + return KERN_SUCCESS; + } else { + bzero((void*)dst_addr, size); + return KERN_SUCCESS; + } +} + /* * Routine: kcdata_add_type_definition * Desc: add type definition to kcdata buffer. 
diff --git a/osfmk/kern/kern_cdata.h b/osfmk/kern/kern_cdata.h index ce49bf679..39739d76e 100644 --- a/osfmk/kern/kern_cdata.h +++ b/osfmk/kern/kern_cdata.h @@ -101,7 +101,8 @@ typedef void * kcdata_descriptor_t; uint32_t kcdata_estimate_required_buffer_size(uint32_t num_items, uint32_t payload_size); uint64_t kcdata_memory_get_used_bytes(kcdata_descriptor_t kcd); -kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, void * src_addr, uint32_t size); +kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, const void * src_addr, uint32_t size); +kern_return_t kcdata_bzero(kcdata_descriptor_t data, mach_vm_address_t dst_addr, uint32_t size); kern_return_t kcdata_get_memory_addr(kcdata_descriptor_t data, uint32_t type, uint32_t size, mach_vm_address_t * user_addr); kern_return_t kcdata_get_memory_addr_for_array( kcdata_descriptor_t data, uint32_t type_of_element, uint32_t size_of_element, uint32_t count, mach_vm_address_t * user_addr); diff --git a/osfmk/kern/kern_monotonic.c b/osfmk/kern/kern_monotonic.c index 92bacff03..0c9d825e9 100644 --- a/osfmk/kern/kern_monotonic.c +++ b/osfmk/kern/kern_monotonic.c @@ -61,6 +61,12 @@ _Atomic uint64_t mt_retrograde = 0; #define MAXSPINS 100 #define MAXRETRIES 10 +/* + * Write the fixed counter values for the thread `thread` into `counts_out`. + * + * This function does not include the accumulated counter values since the + * thread's last context switch or quantum expiration. + */ int mt_fixed_thread_counts(thread_t thread, uint64_t *counts_out) { @@ -521,3 +527,54 @@ mt_stackshot_task(task_t task, uint64_t *instrs, uint64_t *cycles) *cycles = task->task_monotonic.mtk_counts[MT_CORE_CYCLES]; } + +/* + * Maintain reset values for the fixed instruction and cycle counters so + * clients can be notified after a given number of those events occur. This is + * only used by microstackshot. 
+ */ + +bool mt_microstackshots = false; +unsigned int mt_microstackshot_ctr = 0; +mt_pmi_fn mt_microstackshot_pmi_handler = NULL; +void *mt_microstackshot_ctx = NULL; +uint64_t mt_core_reset_values[MT_CORE_NFIXED] = { 0 }; + +#define MT_MIN_FIXED_PERIOD (10 * 1000 * 1000) + +int +mt_microstackshot_start(unsigned int ctr, uint64_t period, mt_pmi_fn handler, + void *ctx) +{ + assert(ctr < MT_CORE_NFIXED); + + if (period < MT_MIN_FIXED_PERIOD) { + return EINVAL; + } + if (mt_microstackshots) { + return EBUSY; + } + + mt_microstackshot_ctr = ctr; + mt_microstackshot_pmi_handler = handler; + mt_microstackshot_ctx = ctx; + + int error = mt_microstackshot_start_arch(period); + if (error) { + return error; + } + + mt_microstackshots = true; + + return 0; +} + +int +mt_microstackshot_stop(void) +{ + mt_microstackshots = false; + memset(mt_core_reset_values, 0, sizeof(mt_core_reset_values)); + + return 0; +} + diff --git a/osfmk/kern/kern_stackshot.c b/osfmk/kern/kern_stackshot.c index 0c1e07bf1..28d6270fa 100644 --- a/osfmk/kern/kern_stackshot.c +++ b/osfmk/kern/kern_stackshot.c @@ -45,6 +45,7 @@ #include #include +#include #include /* bcopy */ @@ -63,6 +64,11 @@ #include #include +#if defined(__x86_64__) +#include +#include +#endif + #if CONFIG_EMBEDDED #include /* For gPanicBase/gPanicBase */ #endif @@ -75,15 +81,11 @@ extern unsigned int not_in_kdp; -#if CONFIG_EMBEDDED -uuid_t kernelcache_uuid; -#endif /* indicate to the compiler that some accesses are unaligned */ typedef uint64_t unaligned_u64 __attribute__((aligned(1))); extern addr64_t kdp_vtophys(pmap_t pmap, addr64_t va); -extern void * proc_get_uthread_uu_threadlist(void * uthread_v); int kdp_snapshot = 0; static kern_return_t stack_snapshot_ret = 0; @@ -134,18 +136,20 @@ static void stackshot_coalition_jetsam_snapshot(void *arg, int i, coalition_t c #endif /* CONFIG_COALITIONS */ -extern uint32_t workqueue_get_pwq_state_kdp(void *proc); +extern uint32_t workqueue_get_pwq_state_kdp(void *proc); extern int 
proc_pid(void *p); extern uint64_t proc_uniqueid(void *p); extern uint64_t proc_was_throttled(void *p); extern uint64_t proc_did_throttle(void *p); -static uint64_t proc_did_throttle_from_task(task_t task); -extern void proc_name_kdp(task_t task, char * buf, int size); -extern int proc_threadname_kdp(void * uth, char * buf, size_t size); -extern void proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime); +extern int proc_exiting(void *p); +extern int proc_in_teardown(void *p); +static uint64_t proc_did_throttle_from_task(task_t task); +extern void proc_name_kdp(task_t task, char * buf, int size); +extern int proc_threadname_kdp(void * uth, char * buf, size_t size); +extern void proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime); extern int memorystatus_get_pressure_status_kdp(void); -extern boolean_t memorystatus_proc_is_dirty_unsafe(void * v); +extern void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */ extern void bcopy_phys(addr64_t, addr64_t, vm_size_t); @@ -217,6 +221,8 @@ static lck_mtx_t stackshot_subsys_mutex; #define SANE_BOOTPROFILE_TRACEBUF_SIZE (64 * 1024 * 1024) #define SANE_TRACEBUF_SIZE (8 * 1024 * 1024) +SECURITY_READ_ONLY_LATE(static uint32_t) max_tracebuf_size = SANE_TRACEBUF_SIZE; + /* * We currently set a ceiling of 3 milliseconds spent in the kdp fault path * for non-panic stackshots where faulting is requested. 
@@ -245,6 +251,8 @@ stackshot_init( void ) clock_timebase_info(&timebase); fault_stats.sfs_system_max_fault_time = ((KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS * timebase.denom)/ timebase.numer); + + PE_parse_boot_argn("stackshot_maxsz", &max_tracebuf_size, sizeof(max_tracebuf_size)); } /* @@ -275,7 +283,38 @@ static uint64_t safe_grab_timer_value(struct timer *t) static kern_return_t stackshot_trap() { - return DebuggerTrapWithState(DBOP_STACKSHOT, NULL, NULL, NULL, 0, FALSE, 0); + kern_return_t rv; + +#if defined(__x86_64__) + /* + * Since mp_rendezvous and stackshot both attempt to capture cpus then perform an + * operation, it's essential to apply mutual exclusion to the other when one + * mechanism is in operation, lest there be a deadlock as the mechanisms race to + * capture CPUs. + * + * Further, we assert that invoking stackshot from mp_rendezvous*() is not + * allowed, so we check to ensure that there is no rendezvous in progress before + * trying to grab the lock (if there is, a deadlock will occur when we try to + * grab the lock). This is accomplished by setting cpu_rendezvous_in_progress to + * TRUE in the mp rendezvous action function. If stackshot_trap() is called by + * a subordinate of the call chain within the mp rendezvous action, this flag will + * be set and can be used to detect the inevitable deadlock that would occur + * if this thread tried to grab the rendezvous lock. 
+ */ + + if (current_cpu_datap()->cpu_rendezvous_in_progress == TRUE) { + panic("Calling stackshot from a rendezvous is not allowed!"); + } + + mp_rendezvous_lock(); +#endif + + rv = DebuggerTrapWithState(DBOP_STACKSHOT, NULL, NULL, NULL, 0, NULL, FALSE, 0); + +#if defined(__x86_64__) + mp_rendezvous_unlock(); +#endif + return (rv); } @@ -295,9 +334,9 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, ui return KERN_INVALID_ARGUMENT; } - /* cap in individual stackshot to SANE_TRACEBUF_SIZE */ - if (size > SANE_TRACEBUF_SIZE) { - size = SANE_TRACEBUF_SIZE; + /* cap in individual stackshot to max_tracebuf_size */ + if (size > max_tracebuf_size) { + size = max_tracebuf_size; } /* Serialize tracing */ @@ -375,7 +414,7 @@ stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flag STACKSHOT_SUBSYS_LOCK(); if (flags & STACKSHOT_GET_MICROSTACKSHOT) { - if (tracebuf_size > SANE_TRACEBUF_SIZE) { + if (tracebuf_size > max_tracebuf_size) { error = KERN_INVALID_ARGUMENT; goto unlock_exit; } @@ -517,7 +556,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi pid = config->sc_pid; flags = config->sc_flags; since_timestamp = config->sc_delta_timestamp; - if (config->sc_size <= SANE_TRACEBUF_SIZE) { + if (config->sc_size <= max_tracebuf_size) { size_hint = config->sc_size; } break; @@ -610,7 +649,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi stackshotbuf_size = get_stackshot_estsize(size_hint); - for (; stackshotbuf_size <= SANE_TRACEBUF_SIZE; stackshotbuf_size <<= 1) { + for (; stackshotbuf_size <= max_tracebuf_size; stackshotbuf_size <<= 1) { if (kmem_alloc(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) { error = KERN_RESOURCE_SHORTAGE; goto error_exit; @@ -699,7 +738,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi goto error_exit; } - if (stackshotbuf_size > 
SANE_TRACEBUF_SIZE) { + if (stackshotbuf_size > max_tracebuf_size) { error = KERN_RESOURCE_SHORTAGE; } @@ -794,11 +833,11 @@ static uint64_t kcdata_get_task_ss_flags(task_t task) { uint64_t ss_flags = 0; - boolean_t task64 = task_has_64BitAddr(task); + boolean_t task_64bit_addr = task_has_64Bit_addr(task); - if (task64) + if (task_64bit_addr) ss_flags |= kUser64_p; - if (!task->active || task_is_a_corpse(task)) + if (!task->active || task_is_a_corpse(task) || proc_exiting(task->bsd_info)) ss_flags |= kTerminatedSnapshot; if (task->pidsuspended) ss_flags |= kPidSuspended; @@ -813,9 +852,19 @@ kcdata_get_task_ss_flags(task_t task) if (task->effective_policy.tep_sup_active == 1) ss_flags |= kTaskIsSuppressed; #if CONFIG_MEMORYSTATUS - if (memorystatus_proc_is_dirty_unsafe(task->bsd_info)) + + boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE; + memorystatus_proc_flags_unsafe(task->bsd_info, &dirty, &dirty_tracked, &allow_idle_exit); + if (dirty) ss_flags |= kTaskIsDirty; + if (dirty_tracked) + ss_flags |= kTaskIsDirtyTracked; + if (allow_idle_exit) + ss_flags |= kTaskAllowIdleExit; + #endif + if (task->effective_policy.tep_tal_engaged) + ss_flags |= kTaskTALEngaged; ss_flags |= (0x7 & workqueue_get_pwq_state_kdp(task->bsd_info)) << 17; @@ -827,19 +876,17 @@ kcdata_get_task_ss_flags(task_t task) ss_flags |= kTaskIsLiveImpDonor; } #endif - return ss_flags; } static kern_return_t -kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, struct dyld_uuid_info_64_v2 *sys_shared_cache_loadinfo, unaligned_u64 *task_snap_ss_flags) +kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_u64 *task_snap_ss_flags) { kern_return_t error = KERN_SUCCESS; mach_vm_address_t out_addr = 0; uint64_t shared_cache_slide = 0; uint64_t shared_cache_base_address = 0; - int task_pid = pid_from_task(task); uint32_t kdp_fault_results = 0; assert(task_snap_ss_flags != NULL); @@ -863,22 +910,9 @@ 
kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, struct dyl */ shared_cache_slide = task->shared_region->sr_slide_info.slide; - if (sys_shared_cache_loadinfo) { - if (task_pid == 1) { - /* save launchd's shared cache info as system level */ - stackshot_memcpy(sys_shared_cache_loadinfo->imageUUID, &task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid)); - sys_shared_cache_loadinfo->imageLoadAddress = shared_cache_slide; - sys_shared_cache_loadinfo->imageSlidBaseAddress = shared_cache_slide + task->shared_region->sr_base_address; - - goto error_exit; - } else { - if (shared_cache_slide == sys_shared_cache_loadinfo->imageLoadAddress && - 0 == memcmp(&task->shared_region->sr_uuid, sys_shared_cache_loadinfo->imageUUID, - sizeof(task->shared_region->sr_uuid))) { - /* skip adding shared cache info. its same as system level one */ - goto error_exit; - } - } + if (task->shared_region == init_task_shared_region) { + /* skip adding shared cache info -- it's the same as the system level one */ + goto error_exit; } kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64_v2), &out_addr)); @@ -908,8 +942,6 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla { boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0); boolean_t save_kextloadinfo_p = ((trace_flags & STACKSHOT_SAVE_KEXT_LOADINFO) != 0); - boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); - boolean_t minimize_uuids = collect_delta_stackshot && ((trace_flags & STACKSHOT_TAILSPIN) != 0); boolean_t should_fault = (trace_flags & STACKSHOT_ENABLE_UUID_FAULTING); kern_return_t error = KERN_SUCCESS; @@ -923,11 +955,11 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla assert(task_snap_ss_flags != NULL); int task_pid = pid_from_task(task); - boolean_t task64 = task_has_64BitAddr(task); + boolean_t 
task_64bit_addr = task_has_64Bit_addr(task); if (save_loadinfo_p && have_pmap && task->active && task_pid > 0) { /* Read the dyld_all_image_infos struct from the task memory to get UUID array count and location */ - if (task64) { + if (task_64bit_addr) { struct user64_dyld_all_image_infos task_image_infos; if (kdp_copyin(task->map, task->all_image_info_addr, &task_image_infos, sizeof(struct user64_dyld_all_image_infos), should_fault, &kdp_fault_results)) { @@ -968,13 +1000,10 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla } if (task_pid > 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { - if (minimize_uuids && uuid_info_timestamp != 0 && uuid_info_timestamp < stack_snapshot_delta_since_timestamp) - goto error_exit; - - uint32_t uuid_info_size = (uint32_t)(task64 ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); + uint32_t uuid_info_size = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; - kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task64 ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO), + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? 
KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO), uuid_info_size, uuid_info_count, &out_addr)); /* Copy in the UUID info array @@ -985,15 +1014,12 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla } } else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) { - if (minimize_uuids && gLoadedKextSummaries != 0 && gLoadedKextSummariesTimestamp < stack_snapshot_delta_since_timestamp) - goto error_exit; - uintptr_t image_load_address; do { #if CONFIG_EMBEDDED - if (!save_kextloadinfo_p) { + if (kernelcache_uuid_valid && !save_kextloadinfo_p) { kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64), &out_addr)); struct dyld_uuid_info_64 *kc_uuid = (struct dyld_uuid_info_64 *)out_addr; kc_uuid->imageLoadAddress = VM_MIN_KERNEL_AND_KEXT_ADDRESS; @@ -1102,10 +1128,15 @@ static kern_return_t kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace_flags, boolean_t have_pmap, unaligned_u64 **task_snap_ss_flags) { boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); - boolean_t collect_iostats = !collect_delta_stackshot && !(trace_flags & STACKSHOT_TAILSPIN) && !(trace_flags & STACKSHOT_NO_IO_STATS); + boolean_t collect_iostats = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS); #if MONOTONIC boolean_t collect_instrs_cycles = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0); #endif /* MONOTONIC */ +#if __arm__ || __arm64__ + boolean_t collect_asid = ((trace_flags & STACKSHOT_ASID) != 0); +#endif + boolean_t collect_pagetables = ((trace_flags & STACKSHOT_PAGE_TABLES) != 0); + kern_return_t error = KERN_SUCCESS; mach_vm_address_t out_addr = 0; @@ -1118,8 +1149,8 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace uint64_t proc_starttime_secs = 0; kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_SNAPSHOT, 
sizeof(struct task_snapshot_v2), &out_addr)); - cur_tsnap = (struct task_snapshot_v2 *)out_addr; + bzero(cur_tsnap, sizeof(*cur_tsnap)); cur_tsnap->ts_unique_pid = task_uniqueid; cur_tsnap->ts_ss_flags = kcdata_get_task_ss_flags(task); @@ -1129,28 +1160,49 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace proc_starttime_kdp(task->bsd_info, &proc_starttime_secs, NULL, NULL); cur_tsnap->ts_p_start_sec = proc_starttime_secs; - -#if CONFIG_EMBEDDED cur_tsnap->ts_task_size = have_pmap ? get_task_phys_footprint(task) : 0; -#else - cur_tsnap->ts_task_size = have_pmap ? (pmap_resident_count(task->map->pmap) * PAGE_SIZE) : 0; -#endif cur_tsnap->ts_max_resident_size = get_task_resident_max(task); + cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task); + cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task); + cur_tsnap->ts_suspend_count = task->suspend_count; cur_tsnap->ts_faults = task->faults; cur_tsnap->ts_pageins = task->pageins; cur_tsnap->ts_cow_faults = task->cow_faults; - cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task); - cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task); cur_tsnap->ts_latency_qos = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ? 
LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | task->effective_policy.tep_latency_qos); cur_tsnap->ts_pid = task_pid; +#if __arm__ || __arm64__ + if (collect_asid && have_pmap) { + uint32_t asid = task->map->pmap->asid; + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr)); + stackshot_memcpy((void*)out_addr, &asid, sizeof(asid)); + } +#endif + if (collect_pagetables && have_pmap) { +#if INTERRUPT_MASKED_DEBUG + // pagetable dumps can be large; reset the interrupt timeout to avoid a panic + ml_spin_debug_clear_self(); +#endif + size_t bytes_dumped = pmap_dump_page_tables(task->map->pmap, kcd_end_address(kcd), kcd_max_address(kcd)); + if (bytes_dumped == 0) { + error = KERN_INSUFFICIENT_BUFFER_SIZE; + goto error_exit; + } else if (bytes_dumped == (size_t)-1) { + error = KERN_NOT_SUPPORTED; + goto error_exit; + } else { + kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, STACKSHOT_KCTYPE_PAGE_TABLES, + sizeof(uint64_t), (uint32_t)(bytes_dumped / sizeof(uint64_t)), &out_addr)); + } + } + /* Add the BSD process identifiers */ if (task_pid != -1 && task->bsd_info != NULL) { proc_name_kdp(task, cur_tsnap->ts_p_comm, sizeof(cur_tsnap->ts_p_comm)); #if CONFIG_COALITIONS - if (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) { + if ((trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) && (task->coalition[COALITION_TYPE_JETSAM] != NULL)) { uint64_t jetsam_coal_id = coalition_id(task->coalition[COALITION_TYPE_JETSAM]); kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_JETSAM_COALITION, sizeof(jetsam_coal_id), &out_addr)); stackshot_memcpy((void*)out_addr, &jetsam_coal_id, sizeof(jetsam_coal_id)); @@ -1162,7 +1214,7 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace #if IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) if (task->task_imp_base != NULL) { stackshot_strlcpy(cur_tsnap->ts_p_comm, &task->task_imp_base->iit_procname[0], - MIN((int)sizeof(task->task_imp_base->iit_procname), 
(int)sizeof(cur_tsnap->ts_p_comm))); + MIN((int)sizeof(task->task_imp_base->iit_procname), (int)sizeof(cur_tsnap->ts_p_comm))); } #endif /* IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) */ } @@ -1184,13 +1236,18 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace static kern_return_t kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace_flags, boolean_t have_pmap, unaligned_u64 **task_snap_ss_flags) { +#if !MONOTONIC +#pragma unused(trace_flags) +#endif /* !MONOTONIC */ kern_return_t error = KERN_SUCCESS; struct task_delta_snapshot_v2 * cur_tsnap = NULL; mach_vm_address_t out_addr = 0; + (void) trace_flags; +#if __arm__ || __arm64__ + boolean_t collect_asid = ((trace_flags & STACKSHOT_ASID) != 0); +#endif #if MONOTONIC boolean_t collect_instrs_cycles = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0); -#else - (void)trace_flags; #endif /* MONOTONIC */ uint64_t task_uniqueid = get_task_uniqueid(task); @@ -1207,11 +1264,7 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t cur_tsnap->tds_user_time_in_terminated_threads = task->total_user_time; cur_tsnap->tds_system_time_in_terminated_threads = task->total_system_time; -#if CONFIG_EMBEDDED cur_tsnap->tds_task_size = have_pmap ? get_task_phys_footprint(task) : 0; -#else - cur_tsnap->tds_task_size = have_pmap ? (pmap_resident_count(task->map->pmap) * PAGE_SIZE) : 0; -#endif cur_tsnap->tds_max_resident_size = get_task_resident_max(task); cur_tsnap->tds_suspend_count = task->suspend_count; @@ -1221,8 +1274,16 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t cur_tsnap->tds_was_throttled = (uint32_t)proc_was_throttled_from_task(task); cur_tsnap->tds_did_throttle = (uint32_t)proc_did_throttle_from_task(task); cur_tsnap->tds_latency_qos = (task-> effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) - ? 
LATENCY_QOS_TIER_UNSPECIFIED - : ((0xFF << 16) | task-> effective_policy.tep_latency_qos); + ? LATENCY_QOS_TIER_UNSPECIFIED + : ((0xFF << 16) | task-> effective_policy.tep_latency_qos); + +#if __arm__ || __arm64__ + if (collect_asid && have_pmap) { + uint32_t asid = task->map->pmap->asid; + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr)); + stackshot_memcpy((void*)out_addr, &asid, sizeof(asid)); + } +#endif #if MONOTONIC if (collect_instrs_cycles) { @@ -1275,7 +1336,7 @@ kcdata_record_thread_snapshot( boolean_t active_kthreads_only_p = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0); boolean_t trace_fp_p = false; boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); - boolean_t collect_iostats = !collect_delta_stackshot && !(trace_flags & STACKSHOT_TAILSPIN) && !(trace_flags & STACKSHOT_NO_IO_STATS); + boolean_t collect_iostats = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS); #if MONOTONIC boolean_t collect_instrs_cycles = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0); #endif /* MONOTONIC */ @@ -1287,7 +1348,7 @@ kcdata_record_thread_snapshot( struct thread_snapshot_v4 * cur_thread_snap = NULL; char cur_thread_name[STACKSHOT_MAX_THREAD_NAME_SIZE]; uint64_t tval = 0; - boolean_t task64 = task_has_64BitAddr(task); + const boolean_t is_64bit_data = task_has_64Bit_data(task); kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_SNAPSHOT, sizeof(struct thread_snapshot_v4), &out_addr)); cur_thread_snap = (struct thread_snapshot_v4 *)out_addr; @@ -1365,7 +1426,8 @@ kcdata_record_thread_snapshot( cur_thread_snap->ths_sched_priority = thread->sched_pri; cur_thread_snap->ths_eqos = thread->effective_policy.thep_qos; cur_thread_snap->ths_rqos = thread->requested_policy.thrp_qos; - cur_thread_snap->ths_rqos_override = thread->requested_policy.thrp_qos_override; + cur_thread_snap->ths_rqos_override = 
MAX(thread->requested_policy.thrp_qos_override, + thread->requested_policy.thrp_qos_workq_override); cur_thread_snap->ths_io_tier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO); cur_thread_snap->ths_thread_t = VM_KERNEL_UNSLIDE_OR_PERM(thread); @@ -1382,30 +1444,47 @@ kcdata_record_thread_snapshot( stackshot_memcpy((void *)out_addr, (void *)cur_thread_name, sizeof(cur_thread_name)); } - /* record system and user cpu times */ - time_value_t user_time; - time_value_t system_time; - thread_read_times(thread, &user_time, &system_time); - kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_CPU_TIMES, sizeof(struct stackshot_cpu_times), &out_addr)); - struct stackshot_cpu_times * stackshot_cpu_times = (struct stackshot_cpu_times *)out_addr; - stackshot_cpu_times->user_usec = ((uint64_t)user_time.seconds) * USEC_PER_SEC + user_time.microseconds; - stackshot_cpu_times->system_usec = ((uint64_t)system_time.seconds) * USEC_PER_SEC + system_time.microseconds; + /* record system, user, and runnable times */ + time_value_t user_time, system_time, runnable_time; + thread_read_times(thread, &user_time, &system_time, &runnable_time); + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_CPU_TIMES, sizeof(struct stackshot_cpu_times_v2), &out_addr)); + struct stackshot_cpu_times_v2 *stackshot_cpu_times = (struct stackshot_cpu_times_v2 *)out_addr; + *stackshot_cpu_times = (struct stackshot_cpu_times_v2){ + .user_usec = (uint64_t)user_time.seconds * USEC_PER_SEC + user_time.microseconds, + .system_usec = (uint64_t)system_time.seconds * USEC_PER_SEC + system_time.microseconds, + .runnable_usec = (uint64_t)runnable_time.seconds * USEC_PER_SEC + runnable_time.microseconds, + }; /* Trace user stack, if any */ if (!active_kthreads_only_p && task->active && thread->task->map != kernel_map) { uint32_t thread_snapshot_flags = 0; - /* 64-bit task? */ - if (task64) { + + /* Uses 64-bit machine state? 
*/ + if (is_64bit_data) { + uint64_t sp = 0; out_addr = (mach_vm_address_t)kcd_end_address(kcd); saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, TRUE, - trace_fp_p, &thread_snapshot_flags); + trace_fp_p, &thread_snapshot_flags, &sp); if (saved_count > 0) { int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame64) : sizeof(uint64_t); kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_USER_STACKFRAME64 - : STACKSHOT_KCTYPE_USER_STACKLR64, + : STACKSHOT_KCTYPE_USER_STACKLR64, frame_size, saved_count / frame_size, &out_addr)); cur_thread_snap->ths_ss_flags |= kUser64_p; } +#if __x86_64__ + if (sp) { + // I'm using 8 here and not sizeof(stack_contents) because this + // code would not work if you just made stack_contents bigger. + vm_offset_t kern_virt_addr = machine_trace_thread_get_kva(sp, thread->task->map, &thread_snapshot_flags); + if (kern_virt_addr && (kern_virt_addr % 8) == 0) { + kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_USER_STACKTOP, sizeof(struct stack_snapshot_stacktop), &out_addr)); + struct stack_snapshot_stacktop *stacktop = (struct stack_snapshot_stacktop *)out_addr; + stacktop->sp = sp; + memcpy(stacktop->stack_contents, (void*) kern_virt_addr, 8); + } + } +#endif } else { out_addr = (mach_vm_address_t)kcd_end_address(kcd); saved_count = machine_trace_thread(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, TRUE, trace_fp_p, @@ -1413,7 +1492,7 @@ kcdata_record_thread_snapshot( if (saved_count > 0) { int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame32) : sizeof(uint32_t); kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? 
STACKSHOT_KCTYPE_USER_STACKFRAME - : STACKSHOT_KCTYPE_USER_STACKLR, + : STACKSHOT_KCTYPE_USER_STACKLR, frame_size, saved_count / frame_size, &out_addr)); } } @@ -1431,12 +1510,12 @@ kcdata_record_thread_snapshot( #if defined(__LP64__) out_addr = (mach_vm_address_t)kcd_end_address(kcd); saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, FALSE, trace_fp_p, - &thread_snapshot_flags); + &thread_snapshot_flags, NULL); if (saved_count > 0) { int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame64) : sizeof(uint64_t); cur_thread_snap->ths_ss_flags |= kKernel64_p; kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_KERN_STACKFRAME64 - : STACKSHOT_KCTYPE_KERN_STACKLR64, + : STACKSHOT_KCTYPE_KERN_STACKLR64, frame_size, saved_count / frame_size, &out_addr)); } #else @@ -1506,7 +1585,8 @@ kcdata_record_thread_delta_snapshot(struct thread_delta_snapshot_v3 * cur_thread cur_thread_snap->tds_sched_priority = thread->sched_pri; cur_thread_snap->tds_eqos = thread->effective_policy.thep_qos; cur_thread_snap->tds_rqos = thread->requested_policy.thrp_qos; - cur_thread_snap->tds_rqos_override = thread->requested_policy.thrp_qos_override; + cur_thread_snap->tds_rqos_override = MAX(thread->requested_policy.thrp_qos_override, + thread->requested_policy.thrp_qos_workq_override); cur_thread_snap->tds_io_tier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO); static_assert(sizeof(thread->effective_policy) == sizeof(uint64_t)); @@ -1529,45 +1609,15 @@ struct saved_uniqueids { unsigned count; }; -static kern_return_t -flush_nonrunnable_tasks(struct saved_uniqueids * ids) -{ - if (ids->count == 0) - return KERN_SUCCESS; - mach_vm_address_t out_addr = 0; - kern_return_t ret = kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_NONRUNNABLE_TASKS, sizeof(uint64_t), - ids->count, &out_addr); - if (ret != KERN_SUCCESS) { - return ret; - } - stackshot_memcpy((void *)out_addr, 
ids->ids, sizeof(uint64_t) * ids->count); - ids->count = 0; - return ret; -} - -static kern_return_t -handle_nonrunnable_task(struct saved_uniqueids * ids, uint64_t pid) -{ - kern_return_t ret = KERN_SUCCESS; - ids->ids[ids->count] = pid; - ids->count++; - assert(ids->count <= UNIQUEIDSPERFLUSH); - if (ids->count == UNIQUEIDSPERFLUSH) - ret = flush_nonrunnable_tasks(ids); - return ret; -} - enum thread_classification { tc_full_snapshot, /* take a full snapshot */ tc_delta_snapshot, /* take a delta snapshot */ - tc_nonrunnable, /* only report id */ }; static enum thread_classification classify_thread(thread_t thread, boolean_t * thread_on_core_p, uint32_t trace_flags) { boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); - boolean_t minimize_nonrunnables = ((trace_flags & STACKSHOT_TAILSPIN) != 0); processor_t last_processor = thread->last_processor; @@ -1581,14 +1631,230 @@ classify_thread(thread_t thread, boolean_t * thread_on_core_p, uint32_t trace_fl if (!collect_delta_stackshot || thread_on_core || (thread->last_run_time > stack_snapshot_delta_since_timestamp)) { return tc_full_snapshot; } else { - if (minimize_nonrunnables && !(thread->state & TH_RUN)) { - return tc_nonrunnable; + return tc_delta_snapshot; + } +} + +struct stackshot_context +{ + int pid; + uint32_t trace_flags; +}; + +static kern_return_t +kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task) +{ + boolean_t active_kthreads_only_p = ((ctx->trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0); + boolean_t save_donating_pids_p = ((ctx->trace_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0); + boolean_t collect_delta_stackshot = ((ctx->trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); + boolean_t save_owner_info = ((ctx->trace_flags & STACKSHOT_THREAD_WAITINFO) != 0); + + + kern_return_t error = KERN_SUCCESS; + mach_vm_address_t out_addr = 0; + int saved_count = 0; + + int task_pid = 0; + uint64_t task_uniqueid = 0; + int 
num_delta_thread_snapshots = 0; + int num_nonrunnable_threads = 0; + int num_waitinfo_threads = 0; + + uint64_t task_start_abstime = 0; + boolean_t task_delta_stackshot = FALSE; + boolean_t have_map = FALSE, have_pmap = FALSE; + boolean_t some_thread_ran = FALSE; + unaligned_u64 *task_snap_ss_flags = NULL; + + if ((task == NULL) || !ml_validate_nofault((vm_offset_t)task, sizeof(struct task))) { + error = KERN_FAILURE; + goto error_exit; + } + + have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map))); + have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap))); + + task_pid = pid_from_task(task); + task_uniqueid = get_task_uniqueid(task); + + if (!task->active || task_is_a_corpse(task)) { + /* + * Not interested in terminated tasks without threads, and + * at the moment, stackshot can't handle a task without a name. + */ + if (queue_empty(&task->threads) || task_pid == -1) { + return KERN_SUCCESS; + } + } + + if (collect_delta_stackshot) { + proc_starttime_kdp(task->bsd_info, NULL, NULL, &task_start_abstime); + } + + /* Trace everything, unless a process was specified */ + if ((ctx->pid == -1) || (ctx->pid == task_pid)) { + + /* add task snapshot marker */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, + STACKSHOT_KCCONTAINER_TASK, task_uniqueid)); + + if (!collect_delta_stackshot || (task_start_abstime == 0) || + (task_start_abstime > stack_snapshot_delta_since_timestamp)) { + kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, &task_snap_ss_flags)); } else { - return tc_delta_snapshot; + task_delta_stackshot = TRUE; + kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, &task_snap_ss_flags)); } + + /* Iterate over task threads */ + thread_t thread = THREAD_NULL; + queue_iterate(&task->threads, 
thread, thread_t, task_threads) + { + uint64_t thread_uniqueid; + + if ((thread == NULL) || !ml_validate_nofault((vm_offset_t)thread, sizeof(struct thread))) { + error = KERN_FAILURE; + goto error_exit; + } + + if (active_kthreads_only_p && thread->kernel_stack == 0) + continue; + + thread_uniqueid = thread_tid(thread); + + boolean_t thread_on_core; + enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, ctx->trace_flags); + + switch (thread_classification) { + case tc_full_snapshot: + /* add thread marker */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, + STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); + kcd_exit_on_error( + kcdata_record_thread_snapshot(stackshot_kcdata_p, thread, task, ctx->trace_flags, have_pmap, thread_on_core)); + + /* mark end of thread snapshot data */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, + STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); + + some_thread_ran = TRUE; + break; + + case tc_delta_snapshot: + num_delta_thread_snapshots++; + break; + } + + /* We want to report owner information regardless of whether a thread + * has changed since the last delta, whether it's a normal stackshot, + * or whether it's nonrunnable */ + if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) + num_waitinfo_threads++; + } + + struct thread_delta_snapshot_v3 * delta_snapshots = NULL; + int current_delta_snapshot_index = 0; + + if (num_delta_thread_snapshots > 0) { + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT, + sizeof(struct thread_delta_snapshot_v3), + num_delta_thread_snapshots, &out_addr)); + delta_snapshots = (struct thread_delta_snapshot_v3 *)out_addr; + } + + uint64_t * nonrunnable_tids = NULL; + + if (num_nonrunnable_threads > 0) { + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, 
STACKSHOT_KCTYPE_NONRUNNABLE_TIDS, + sizeof(uint64_t), num_nonrunnable_threads, &out_addr)); + nonrunnable_tids = (uint64_t *)out_addr; + } + + thread_waitinfo_t *thread_waitinfo = NULL; + int current_waitinfo_index = 0; + + if (num_waitinfo_threads > 0) { + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO, + sizeof(thread_waitinfo_t), num_waitinfo_threads, &out_addr)); + thread_waitinfo = (thread_waitinfo_t *)out_addr; + } + + if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 || num_waitinfo_threads > 0) { + queue_iterate(&task->threads, thread, thread_t, task_threads) + { + if (active_kthreads_only_p && thread->kernel_stack == 0) + continue; + + /* If we want owner info, we should capture it regardless of its classification */ + if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) { + stackshot_thread_wait_owner_info( + thread, + &thread_waitinfo[current_waitinfo_index++]); + } + + boolean_t thread_on_core; + enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, ctx->trace_flags); + + switch (thread_classification) { + case tc_full_snapshot: + /* full thread snapshot captured above */ + continue; + + case tc_delta_snapshot: + kcd_exit_on_error(kcdata_record_thread_delta_snapshot(&delta_snapshots[current_delta_snapshot_index++], + thread, thread_on_core)); + break; + } + } + +#if DEBUG || DEVELOPMENT + if (current_delta_snapshot_index != num_delta_thread_snapshots) { + panic("delta thread snapshot count mismatch while capturing snapshots for task %p. expected %d, found %d", task, + num_delta_thread_snapshots, current_delta_snapshot_index); + } + if (current_waitinfo_index != num_waitinfo_threads) { + panic("thread wait info count mismatch while capturing snapshots for task %p. 
expected %d, found %d", task, + num_waitinfo_threads, current_waitinfo_index); + } +#endif + } + +#if IMPORTANCE_INHERITANCE + if (save_donating_pids_p) { + kcd_exit_on_error( + ((((mach_vm_address_t)kcd_end_address(stackshot_kcdata_p) + (TASK_IMP_WALK_LIMIT * sizeof(int32_t))) < + (mach_vm_address_t)kcd_max_address(stackshot_kcdata_p)) + ? KERN_SUCCESS + : KERN_RESOURCE_SHORTAGE)); + saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS, + (void *)kcd_end_address(stackshot_kcdata_p), TASK_IMP_WALK_LIMIT); + if (saved_count > 0) + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS, + sizeof(int32_t), saved_count, &out_addr)); + } +#endif + + if (!collect_delta_stackshot || (num_delta_thread_snapshots != task->thread_count) || !task_delta_stackshot) { + /* + * Collect shared cache info and UUID info in these scenarios + * 1) a full stackshot + * 2) a delta stackshot where the task started after the previous full stackshot OR + * any thread from the task has run since the previous full stackshot + */ + + kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, task_snap_ss_flags)); + kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, task_snap_ss_flags)); + } + /* mark end of task snapshot data */ + kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, + task_uniqueid)); } + +error_exit: + return error; } + static kern_return_t kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTraced) { @@ -1597,21 +1863,14 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac uint64_t abs_time = 0, abs_time_end = 0; uint64_t *abs_time_addr = NULL; uint64_t system_state_flags = 0; - int saved_count = 0; task_t task = TASK_NULL; - thread_t thread = THREAD_NULL; mach_timebase_info_data_t timebase = {0, 0}; uint32_t length_to_copy = 
0, tmp32 = 0; - abs_time = mach_absolute_time(); /* process the flags */ - boolean_t active_kthreads_only_p = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0); - boolean_t save_donating_pids_p = ((trace_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0); boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0); - boolean_t minimize_nonrunnables = ((trace_flags & STACKSHOT_TAILSPIN) != 0); boolean_t use_fault_path = ((trace_flags & (STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING)) != 0); - boolean_t save_owner_info = ((trace_flags & STACKSHOT_THREAD_WAITINFO) != 0); stack_enable_faulting = (trace_flags & (STACKSHOT_ENABLE_BT_FAULTING)); #if CONFIG_EMBEDDED @@ -1619,7 +1878,9 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac trace_flags &= ~(STACKSHOT_SAVE_KEXT_LOADINFO); #endif - struct saved_uniqueids saved_uniqueids = {.count = 0}; + struct stackshot_context ctx = {}; + ctx.trace_flags = trace_flags; + ctx.pid = pid; if (use_fault_path) { fault_stats.sfs_pages_faulted_in = 0; @@ -1682,13 +1943,33 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &out_addr)); stackshot_memcpy((void *)out_addr, &stackshot_microsecs, sizeof(uint64_t)); - /* reserve space of system level shared cache load info */ - struct dyld_uuid_info_64_v2 * sys_shared_cache_loadinfo = NULL; - if (!collect_delta_stackshot) { + /* record system level shared cache load info (if available) */ + if (!collect_delta_stackshot && init_task_shared_region && + ml_validate_nofault((vm_offset_t)init_task_shared_region, sizeof(struct vm_shared_region))) { + struct dyld_uuid_info_64_v2 *sys_shared_cache_info = NULL; kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64_v2), &out_addr)); - 
sys_shared_cache_loadinfo = (struct dyld_uuid_info_64_v2 *)out_addr; - bzero((void *)sys_shared_cache_loadinfo, sizeof(struct dyld_uuid_info_64_v2)); + sys_shared_cache_info = (struct dyld_uuid_info_64_v2 *)out_addr; + + stackshot_memcpy(sys_shared_cache_info->imageUUID, &init_task_shared_region->sr_uuid, sizeof(init_task_shared_region->sr_uuid)); + sys_shared_cache_info->imageLoadAddress = init_task_shared_region->sr_slide_info.slide; + sys_shared_cache_info->imageSlidBaseAddress = init_task_shared_region->sr_slide_info.slide + init_task_shared_region->sr_base_address; + + if (trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) { + /* + * Include a map of the system shared cache layout if it has been populated + * (which is only when the system is using a custom shared cache). + */ + if (init_task_shared_region->sr_images && ml_validate_nofault((vm_offset_t)init_task_shared_region->sr_images, + (init_task_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64)))) { + assert(init_task_shared_region->sr_images_count != 0); + kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, + sizeof(struct dyld_uuid_info_64), + init_task_shared_region->sr_images_count, &out_addr)); + stackshot_memcpy((void*)out_addr, init_task_shared_region->sr_images, + (init_task_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64))); + } + } } /* Add requested information first */ @@ -1724,252 +2005,28 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac trace_flags &= ~(STACKSHOT_THREAD_GROUP); + /* Iterate over tasks */ - queue_head_t *task_list = &tasks; - queue_iterate(task_list, task, task_t, tasks) { - int task_pid = 0; - uint64_t task_uniqueid = 0; - int num_delta_thread_snapshots = 0; - int num_nonrunnable_threads = 0; - int num_waitinfo_threads = 0; - - uint64_t task_start_abstime = 0; - boolean_t task_delta_stackshot = FALSE; - boolean_t task64 = FALSE, have_map = FALSE, 
have_pmap = FALSE; - boolean_t some_thread_ran = FALSE; - unaligned_u64 *task_snap_ss_flags = NULL; - - if ((task == NULL) || !ml_validate_nofault((vm_offset_t)task, sizeof(struct task))) { - error = KERN_FAILURE; + queue_iterate(&tasks, task, task_t, tasks) + { + error = kdp_stackshot_record_task(&ctx, task); + if (error) goto error_exit; - } - - have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map))); - have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap))); - - task_pid = pid_from_task(task); - task_uniqueid = get_task_uniqueid(task); - task64 = task_has_64BitAddr(task); - - if (!task->active || task_is_a_corpse(task)) { - /* - * Not interested in terminated tasks without threads, and - * at the moment, stackshot can't handle a task without a name. - */ - if (queue_empty(&task->threads) || task_pid == -1) { - continue; - } - } - - if (collect_delta_stackshot) { - proc_starttime_kdp(task->bsd_info, NULL, NULL, &task_start_abstime); - } - - /* Trace everything, unless a process was specified */ - if ((pid == -1) || (pid == task_pid)) { -#if DEBUG || DEVELOPMENT - /* we might want to call kcdata_undo_add_container_begin(), which is - * only safe if we call it after kcdata_add_container_marker() but - * before adding any other kcdata items. 
In development kernels, - * we'll remember where the buffer end was and confirm after calling - * kcdata_undo_add_container_begin() that it's in exactly the same - * place.*/ - mach_vm_address_t revert_addr = stackshot_kcdata_p->kcd_addr_end; -#endif - - /* add task snapshot marker */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, - STACKSHOT_KCCONTAINER_TASK, task_uniqueid)); - - if (!collect_delta_stackshot || (task_start_abstime == 0) || - (task_start_abstime > stack_snapshot_delta_since_timestamp)) { - kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, trace_flags, have_pmap, &task_snap_ss_flags)); - } else { - task_delta_stackshot = TRUE; - if (minimize_nonrunnables) { - // delay taking the task snapshot. If there are no runnable threads we'll skip it. - } else { - kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, trace_flags, have_pmap, &task_snap_ss_flags)); - } - } - - /* Iterate over task threads */ - queue_iterate(&task->threads, thread, thread_t, task_threads) - { - uint64_t thread_uniqueid; - - if ((thread == NULL) || !ml_validate_nofault((vm_offset_t)thread, sizeof(struct thread))) { - error = KERN_FAILURE; - goto error_exit; - } - - if (active_kthreads_only_p && thread->kernel_stack == 0) - continue; - - thread_uniqueid = thread_tid(thread); - - boolean_t thread_on_core; - enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, trace_flags); - - switch (thread_classification) { - case tc_full_snapshot: - /* add thread marker */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN, - STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); - kcd_exit_on_error( - kcdata_record_thread_snapshot(stackshot_kcdata_p, thread, task, trace_flags, have_pmap, thread_on_core)); - - /* mark end of thread snapshot data */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, 
KCDATA_TYPE_CONTAINER_END, - STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid)); - - some_thread_ran = TRUE; - break; - - case tc_delta_snapshot: - num_delta_thread_snapshots++; - break; - - case tc_nonrunnable: - num_nonrunnable_threads++; - break; - } - - /* We want to report owner information regardless of whether a thread - * has changed since the last delta, whether it's a normal stackshot, - * or whether it's nonrunnable */ - if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) - num_waitinfo_threads++; - } - - if (task_delta_stackshot && minimize_nonrunnables) { - if (some_thread_ran || num_delta_thread_snapshots > 0) { - kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, trace_flags, have_pmap, &task_snap_ss_flags)); - } else { - kcd_exit_on_error(kcdata_undo_add_container_begin(stackshot_kcdata_p)); - -#if DEBUG || DEVELOPMENT - mach_vm_address_t undo_addr = stackshot_kcdata_p->kcd_addr_end; - if (revert_addr != undo_addr) { - panic("tried to revert a container begin but we already moved past it. 
revert=%p undo=%p", - (void *)revert_addr, (void *)undo_addr); - } -#endif - kcd_exit_on_error(handle_nonrunnable_task(&saved_uniqueids, task_uniqueid)); - continue; - } - } - - struct thread_delta_snapshot_v3 * delta_snapshots = NULL; - int current_delta_snapshot_index = 0; - - if (num_delta_thread_snapshots > 0) { - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT, - sizeof(struct thread_delta_snapshot_v3), - num_delta_thread_snapshots, &out_addr)); - delta_snapshots = (struct thread_delta_snapshot_v3 *)out_addr; - } - - uint64_t * nonrunnable_tids = NULL; - int current_nonrunnable_index = 0; - - if (num_nonrunnable_threads > 0) { - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_NONRUNNABLE_TIDS, - sizeof(uint64_t), num_nonrunnable_threads, &out_addr)); - nonrunnable_tids = (uint64_t *)out_addr; - } - - thread_waitinfo_t *thread_waitinfo = NULL; - int current_waitinfo_index = 0; - - if (num_waitinfo_threads > 0) { - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO, - sizeof(thread_waitinfo_t), num_waitinfo_threads, &out_addr)); - thread_waitinfo = (thread_waitinfo_t *)out_addr; - } - - if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 || num_waitinfo_threads > 0) { - queue_iterate(&task->threads, thread, thread_t, task_threads) - { - if (active_kthreads_only_p && thread->kernel_stack == 0) - continue; - - /* If we want owner info, we should capture it regardless of its classification */ - if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) { - stackshot_thread_wait_owner_info( - thread, - &thread_waitinfo[current_waitinfo_index++]); - } - - boolean_t thread_on_core; - enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, trace_flags); - - switch (thread_classification) { - case tc_full_snapshot: - /* full thread snapshot captured above */ - 
continue; - - case tc_delta_snapshot: - kcd_exit_on_error(kcdata_record_thread_delta_snapshot(&delta_snapshots[current_delta_snapshot_index++], - thread, thread_on_core)); - break; - - case tc_nonrunnable: - nonrunnable_tids[current_nonrunnable_index++] = thread_tid(thread); - continue; - } - } - -#if DEBUG || DEVELOPMENT - if (current_delta_snapshot_index != num_delta_thread_snapshots) { - panic("delta thread snapshot count mismatch while capturing snapshots for task %p. expected %d, found %d", task, - num_delta_thread_snapshots, current_delta_snapshot_index); - } - if (current_nonrunnable_index != num_nonrunnable_threads) { - panic("nonrunnable thread count mismatch while capturing snapshots for task %p. expected %d, found %d", task, - num_nonrunnable_threads, current_nonrunnable_index); - } - if (current_waitinfo_index != num_waitinfo_threads) { - panic("thread wait info count mismatch while capturing snapshots for task %p. expected %d, found %d", task, - num_waitinfo_threads, current_waitinfo_index); - } -#endif - } - -#if IMPORTANCE_INHERITANCE - if (save_donating_pids_p) { - kcd_exit_on_error( - ((((mach_vm_address_t)kcd_end_address(stackshot_kcdata_p) + (TASK_IMP_WALK_LIMIT * sizeof(int32_t))) < - (mach_vm_address_t)kcd_max_address(stackshot_kcdata_p)) - ? 
KERN_SUCCESS - : KERN_RESOURCE_SHORTAGE)); - saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS, - (void *)kcd_end_address(stackshot_kcdata_p), TASK_IMP_WALK_LIMIT); - if (saved_count > 0) - kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS, - sizeof(int32_t), saved_count, &out_addr)); - } -#endif - - if (!collect_delta_stackshot || (num_delta_thread_snapshots != task->thread_count) || !task_delta_stackshot) { - /* - * Collect shared cache info and UUID info in these scenarios - * 1) a full stackshot - * 2) a delta stackshot where the task started after the previous full stackshot OR - * any thread from the task has run since the previous full stackshot - */ - - kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, sys_shared_cache_loadinfo, task_snap_ss_flags)); - kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, trace_flags, have_pmap, task_snap_ss_flags)); - } - /* mark end of task snapshot data */ - kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK, - task_uniqueid)); - } } - - if (minimize_nonrunnables) { - flush_nonrunnable_tasks(&saved_uniqueids); + /* + * Iterate over the tasks in the terminated tasks list. We only inspect + * tasks that have a valid bsd_info pointer where P_LPEXIT is NOT set. + * We're only interested in tasks that have remaining threads (which + * could be involved in a deadlock, etc), and the last thread that tears + * itself down during exit sets P_LPEXIT during proc_exit(). 
+ */ + queue_iterate(&terminated_tasks, task, task_t, tasks) + { + if (task->bsd_info && !proc_in_teardown(task->bsd_info)) { + error = kdp_stackshot_record_task(&ctx, task); + if (error) + goto error_exit; + } } if (use_fault_path) { @@ -2192,7 +2249,7 @@ boolean_t kdp_copyin_word( task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results) { - if (task_has_64BitAddr(task)) { + if (task_has_64Bit_data(task)) { return kdp_copyin(task->map, addr, result, sizeof(uint64_t), try_fault, kdp_fault_results); } else { uint32_t buf; @@ -2453,6 +2510,9 @@ stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t *waitinfo) case kThreadWaitWorkloopSyncWait: kdp_workloop_sync_wait_find_owner(thread, thread->wait_event, waitinfo); break; + case kThreadWaitOnProcess: + kdp_wait4_find_process(thread, thread->wait_event, waitinfo); + break; default: waitinfo->owner = 0; waitinfo->context = 0; diff --git a/osfmk/kern/kern_types.h b/osfmk/kern/kern_types.h index 727712ec9..8308df68d 100644 --- a/osfmk/kern/kern_types.h +++ b/osfmk/kern/kern_types.h @@ -116,12 +116,25 @@ typedef void (*thread_continue_t)(void *, wait_result_t); * You must provide this value for any unbounded wait - otherwise you will * pend user signals forever. * + * THREAD_WAIT_NOREPORT: + * The scheduler has a callback (sched_call) that some subsystems use to + * decide whether more threads should be thrown at a given problem by trying + * to maintain a good level of concurrency. + * + * When the wait will not be helped by adding more threads (e.g. lock + * contention), using this flag as an argument to assert_wait* (or any of its + * wrappers) will prevent the next wait/block to cause thread creation. + * + * This comes in two flavors: THREAD_WAIT_NOREPORT_KERNEL, and + * THREAD_WAIT_NOREPORT_USER to prevent reporting about the wait for kernel + * and user threads respectively. 
+ * * Thread interrupt mask: * - * The current maximum interruptible state for the thread, as set by - * thread_interrupt_level(), will limit the conditions that will cause a wake. - * This is useful for code that can't be interrupted to set before calling code - * that doesn't know that. + * The current maximum interruptible state for the thread, as set by + * thread_interrupt_level(), will limit the conditions that will cause a wake. + * This is useful for code that can't be interrupted to set before calling code + * that doesn't know that. * * Thread termination vs safe abort: * @@ -152,9 +165,12 @@ typedef void (*thread_continue_t)(void *, wait_result_t); * call will always either return or call the passed in continuation. */ typedef int wait_interrupt_t; -#define THREAD_UNINT 0 /* not interruptible */ -#define THREAD_INTERRUPTIBLE 1 /* may not be restartable */ -#define THREAD_ABORTSAFE 2 /* abortable safely */ +#define THREAD_UNINT 0x00000000 /* not interruptible */ +#define THREAD_INTERRUPTIBLE 0x00000001 /* may not be restartable */ +#define THREAD_ABORTSAFE 0x00000002 /* abortable safely */ +#define THREAD_WAIT_NOREPORT_KERNEL 0x80000000 +#define THREAD_WAIT_NOREPORT_USER 0x40000000 +#define THREAD_WAIT_NOREPORT (THREAD_WAIT_NOREPORT_KERNEL | THREAD_WAIT_NOREPORT_USER) typedef int wait_timeout_urgency_t; #define TIMEOUT_URGENCY_SYS_NORMAL 0x00 /* use default leeway thresholds for system */ diff --git a/osfmk/kern/kext_alloc.c b/osfmk/kern/kext_alloc.c index 479d114e3..02ef41fba 100644 --- a/osfmk/kern/kext_alloc.c +++ b/osfmk/kern/kext_alloc.c @@ -156,6 +156,7 @@ kext_alloc(vm_offset_t *_addr, vm_size_t size, boolean_t fixed) size, 0, flags, + VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_KEXT, MACH_PORT_NULL, 0, diff --git a/osfmk/kern/kpc.h b/osfmk/kern/kpc.h index aa2db20a1..3af897184 100644 --- a/osfmk/kern/kpc.h +++ b/osfmk/kern/kpc.h @@ -91,6 +91,8 @@ struct cpu_data; extern boolean_t kpc_register_cpu(struct cpu_data *cpu_data); extern void 
kpc_unregister_cpu(struct cpu_data *cpu_data); +extern bool kpc_supported; + /* bootstrap */ extern void kpc_init(void); @@ -155,6 +157,7 @@ extern void kpc_thread_destroy(thread_t thread); /* allocate a buffer big enough for all counters */ extern uint64_t *kpc_counterbuf_alloc(void); extern void kpc_counterbuf_free(uint64_t*); +extern uint32_t kpc_get_counterbuf_size(void); /* whether we're currently accounting into threads */ extern int kpc_threads_counting; diff --git a/osfmk/kern/kpc_common.c b/osfmk/kern/kpc_common.c index 96455de01..53f382ec4 100644 --- a/osfmk/kern/kpc_common.c +++ b/osfmk/kern/kpc_common.c @@ -68,8 +68,8 @@ static bool kpc_calling_pm = false; #endif /* MACH_ASSERT */ boolean_t kpc_context_switch_active = FALSE; +bool kpc_supported = true; -void kpc_common_init(void); void kpc_common_init(void) { @@ -503,13 +503,19 @@ kpc_set_config(uint32_t classes, kpc_config_t *configv) return ret; } +uint32_t +kpc_get_counterbuf_size(void) +{ + return COUNTERBUF_SIZE; +} + /* allocate a buffer large enough for all possible counters */ uint64_t * kpc_counterbuf_alloc(void) { uint64_t *buf = NULL; - buf = kalloc(COUNTERBUF_SIZE); + buf = kalloc_tag(COUNTERBUF_SIZE, VM_KERN_MEMORY_DIAG); if (buf) { bzero(buf, COUNTERBUF_SIZE); } @@ -529,16 +535,19 @@ void kpc_sample_kperf(uint32_t actionid) { struct kperf_sample sbuf; - struct kperf_context ctx; BUF_DATA(PERF_KPC_HNDLR | DBG_FUNC_START); - ctx.cur_pid = 0; - ctx.cur_thread = current_thread(); - ctx.cur_pid = task_pid(current_task()); + thread_t thread = current_thread(); + task_t task = get_threadtask(thread); - ctx.trigger_type = TRIGGER_TYPE_PMI; - ctx.trigger_id = 0; + struct kperf_context ctx = { + .cur_thread = thread, + .cur_task = task, + .cur_pid = task_pid(task), + .trigger_type = TRIGGER_TYPE_PMI, + .trigger_id = 0, + }; int r = kperf_sample(&sbuf, &ctx, actionid, SAMPLE_FLAG_PEND_USER); diff --git a/osfmk/kern/ledger.c b/osfmk/kern/ledger.c index ec35a3a52..001cad83b 100644 --- 
a/osfmk/kern/ledger.c +++ b/osfmk/kern/ledger.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,6 +46,8 @@ #include #include +#include + /* * Ledger entry flags. Bits in second nibble (masked by 0xF0) are used for * ledger actions (LEDGER_ACTION_BLOCK, etc). @@ -113,6 +115,7 @@ struct ledger_template { volatile uint32_t lt_inuse; lck_mtx_t lt_lock; zone_t lt_zone; + bool lt_initialized; struct entry_template *lt_entries; }; @@ -130,47 +133,6 @@ struct ledger_template { splx(s); \ } -/* - * Use NTOCKS "tocks" to track the rolling maximum balance of a ledger entry. - */ -#define NTOCKS 1 -/* - * The explicit alignment is to ensure that atomic operations don't panic - * on ARM. - */ -struct ledger_entry { - volatile uint32_t le_flags; - ledger_amount_t le_limit; - ledger_amount_t le_warn_level; - volatile ledger_amount_t le_credit __attribute__((aligned(8))); - volatile ledger_amount_t le_debit __attribute__((aligned(8))); - union { - struct { - /* - * XXX - the following two fields can go away if we move all of - * the refill logic into process policy - */ - uint64_t le_refill_period; - uint64_t le_last_refill; - } le_refill; - struct _le_maxtracking { - struct _le_peak { - uint32_t le_max; /* Lower 32-bits of observed max balance */ - uint32_t le_time; /* time when this peak was observed */ - } le_peaks[NTOCKS]; - ledger_amount_t le_lifetime_max; /* greatest peak ever observed */ - } le_maxtracking; - } _le; -} __attribute__((aligned(8))); - -struct ledger { - uint64_t l_id; - int32_t l_refs; - int32_t l_size; - struct ledger_template *l_template; - struct ledger_entry l_entries[0] __attribute__((aligned(8))); -}; - static int ledger_cnt = 0; /* ledger ast helper functions */ static uint32_t ledger_check_needblock(ledger_t l, uint64_t now); @@ -366,6 +328,22 @@ ledger_template_complete(ledger_template_t 
template) template->lt_zone = zinit(ledger_size, CONFIG_TASK_MAX * ledger_size, ledger_size, template->lt_name); + template->lt_initialized = true; +} + +/* + * Like ledger_template_complete, except we'll ask + * the pmap layer to manage allocations for us. + * Meant for ledgers that should be owned by the + * pmap layer. + */ +void +ledger_template_complete_secure_alloc(ledger_template_t template) +{ + size_t ledger_size; + ledger_size = sizeof(struct ledger) + (template->lt_cnt * sizeof(struct ledger_entry)); + pmap_ledger_alloc_init(ledger_size); + template->lt_initialized = true; } /* @@ -385,10 +363,14 @@ ledger_instantiate(ledger_template_t template, int entry_type) template_lock(template); template->lt_refs++; cnt = template->lt_cnt; - assert(template->lt_zone); template_unlock(template); - ledger = (ledger_t)zalloc(template->lt_zone); + if (template->lt_zone) { + ledger = (ledger_t)zalloc(template->lt_zone); + } else { + ledger = pmap_ledger_alloc(); + } + if (ledger == NULL) { ledger_template_dereference(template); return LEDGER_NULL; @@ -477,7 +459,11 @@ ledger_dereference(ledger_t ledger) /* Just released the last reference. Free it. */ if (v == 1) { - zfree(ledger->l_template->lt_zone, ledger); + if (ledger->l_template->lt_zone) { + zfree(ledger->l_template->lt_zone, ledger); + } else { + pmap_ledger_free(ledger); + } } return (KERN_SUCCESS); @@ -657,74 +643,22 @@ ledger_refill(uint64_t now, ledger_t ledger, int entry) ledger_limit_entry_wakeup(le); } -/* - * In tenths of a second, the length of one lookback period (a "tock") for - * ledger rolling maximum calculations. The effective lookback window will be this times - * NTOCKS. - * - * Use a tock length of 2.5 seconds to get a total lookback period of 5 seconds. - * - * XXX Could make this caller-definable, at the point that rolling max tracking - * is enabled for the entry. - */ -#define TOCKLEN 25 - -/* - * How many sched_tick's are there in one tock (one of our lookback periods)? 
- * - * X sched_ticks 2.5 sec N sched_ticks - * --------------- = ---------- * ------------- - * tock tock sec - * - * where N sched_ticks/sec is calculated via 1 << SCHED_TICK_SHIFT (see sched_prim.h) - * - * This should give us 20 sched_tick's in one 2.5 second-long tock. - */ -#define SCHED_TICKS_PER_TOCK ((TOCKLEN * (1 << SCHED_TICK_SHIFT)) / 10) - -/* - * Rolling max timestamps use their own unit (let's call this a "tock"). One tock is the - * length of one lookback period that we use for our rolling max calculation. - * - * Calculate the current time in tocks from sched_tick (which runs at a some - * fixed rate). - */ -#define CURRENT_TOCKSTAMP() (sched_tick / SCHED_TICKS_PER_TOCK) - -/* - * Does the given tockstamp fall in either the current or the previous tocks? - */ -#define TOCKSTAMP_IS_STALE(now, tock) ((((now) - (tock)) < NTOCKS) ? FALSE : TRUE) - void ledger_entry_check_new_balance(thread_t thread, ledger_t ledger, int entry, struct ledger_entry *le) { - ledger_amount_t credit, debit; - if (le->le_flags & LF_TRACKING_MAX) { ledger_amount_t balance = le->le_credit - le->le_debit; - uint32_t now = CURRENT_TOCKSTAMP(); - struct _le_peak *p = &le->_le.le_maxtracking.le_peaks[now % NTOCKS]; - if (!TOCKSTAMP_IS_STALE(now, p->le_time) || (balance > p->le_max)) { - /* - * The current balance is greater than the previously - * observed peak for the current time block, *or* we - * haven't yet recorded a peak for the current time block -- - * so this is our new peak. - * - * (We only track the lower 32-bits of a balance for rolling - * max purposes.) 
- */ - p->le_max = (uint32_t)balance; - p->le_time = now; + if (balance > le->_le._le_max.le_lifetime_max){ + le->_le._le_max.le_lifetime_max = balance; } - struct _le_maxtracking *m = &le->_le.le_maxtracking; - if(balance > m->le_lifetime_max){ - m->le_lifetime_max = balance; +#if CONFIG_LEDGER_INTERVAL_MAX + if (balance > le->_le._le_max.le_interval_max) { + le->_le._le_max.le_interval_max = balance; } +#endif /* CONFIG_LEDGER_INTERVAL_MAX */ } /* Check to see whether we're due a refill */ @@ -799,16 +733,13 @@ ledger_entry_check_new_balance(thread_t thread, ledger_t ledger, } } - credit = le->le_credit; - debit = le->le_debit; if ((le->le_flags & LF_PANIC_ON_NEGATIVE) && - ((credit < debit) || - (le->le_credit < le->le_debit))) { - panic("ledger_entry_check_new_balance(%p,%d): negative ledger %p credit:%lld/%lld debit:%lld/%lld balance:%lld/%lld\n", + (le->le_credit < le->le_debit)) { + panic("ledger_entry_check_new_balance(%p,%d): negative ledger %p credit:%lld debit:%lld balance:%lld\n", ledger, entry, le, - credit, le->le_credit, - debit, le->le_debit, - credit - debit, le->le_credit - le->le_debit); + le->le_credit, + le->le_debit, + le->le_credit - le->le_debit); } } @@ -842,7 +773,9 @@ ledger_credit_thread(thread_t thread, ledger_t ledger, int entry, ledger_amount_ new = old + amount; lprintf(("%p Credit %lld->%lld\n", thread, old, new)); - ledger_entry_check_new_balance(thread, ledger, entry, le); + if (thread) { + ledger_entry_check_new_balance(thread, ledger, entry, le); + } return (KERN_SUCCESS); } @@ -856,6 +789,15 @@ ledger_credit(ledger_t ledger, int entry, ledger_amount_t amount) return ledger_credit_thread(current_thread(), ledger, entry, amount); } +/* + * Add value to an entry in a ledger; do not check balance after update. + */ +kern_return_t +ledger_credit_nocheck(ledger_t ledger, int entry, ledger_amount_t amount) +{ + return ledger_credit_thread(NULL, ledger, entry, amount); +} + /* Add all of one ledger's values into another. 
* They must have been created from the same template. * This is not done atomically. Another thread (if not otherwise synchronized) @@ -1004,41 +946,29 @@ ledger_set_limit(ledger_t ledger, int entry, ledger_amount_t limit, return (KERN_SUCCESS); } +#if CONFIG_LEDGER_INTERVAL_MAX kern_return_t -ledger_get_recent_max(ledger_t ledger, int entry, - ledger_amount_t *max_observed_balance) +ledger_get_interval_max(ledger_t ledger, int entry, + ledger_amount_t *max_interval_balance, int reset) { - struct ledger_entry *le; - uint32_t now = CURRENT_TOCKSTAMP(); - int i; - + struct ledger_entry *le; le = &ledger->l_entries[entry]; if (!ENTRY_VALID(ledger, entry) || !(le->le_flags & LF_TRACKING_MAX)) { return (KERN_INVALID_VALUE); } - /* - * Start with the current balance; if neither of the recorded peaks are - * within recent history, we use this. - */ - *max_observed_balance = le->le_credit - le->le_debit; - - for (i = 0; i < NTOCKS; i++) { - if (!TOCKSTAMP_IS_STALE(now, le->_le.le_maxtracking.le_peaks[i].le_time) && - (le->_le.le_maxtracking.le_peaks[i].le_max > *max_observed_balance)) { - /* - * The peak for this time block isn't stale, and it - * is greater than the current balance -- so use it. - */ - *max_observed_balance = le->_le.le_maxtracking.le_peaks[i].le_max; - } - } + *max_interval_balance = le->_le._le_max.le_interval_max; + lprintf(("ledger_get_interval_max: %lld%s\n", *max_interval_balance, + (reset) ? 
" --> 0" : "")); - lprintf(("ledger_get_maximum: %lld\n", *max_observed_balance)); + if (reset) { + le->_le._le_max.le_interval_max = 0; + } return (KERN_SUCCESS); } +#endif /* CONFIG_LEDGER_INTERVAL_MAX */ kern_return_t ledger_get_lifetime_max(ledger_t ledger, int entry, @@ -1051,7 +981,7 @@ ledger_get_lifetime_max(ledger_t ledger, int entry, return (KERN_INVALID_VALUE); } - *max_lifetime_balance = le->_le.le_maxtracking.le_lifetime_max; + *max_lifetime_balance = le->_le._le_max.le_lifetime_max; lprintf(("ledger_get_lifetime_max: %lld\n", *max_lifetime_balance)); return (KERN_SUCCESS); @@ -1318,7 +1248,9 @@ ledger_debit_thread(thread_t thread, ledger_t ledger, int entry, ledger_amount_t } lprintf(("%p Debit %lld->%lld\n", thread, old, new)); - ledger_entry_check_new_balance(thread, ledger, entry, le); + if (thread) { + ledger_entry_check_new_balance(thread, ledger, entry, le); + } return (KERN_SUCCESS); } @@ -1329,6 +1261,12 @@ ledger_debit(ledger_t ledger, int entry, ledger_amount_t amount) return ledger_debit_thread(current_thread(), ledger, entry, amount); } +kern_return_t +ledger_debit_nocheck(ledger_t ledger, int entry, ledger_amount_t amount) +{ + return ledger_debit_thread(NULL, ledger, entry, amount); +} + void ledger_ast(thread_t thread) { @@ -1523,7 +1461,7 @@ ledger_perform_blocking(ledger_t l) assert(!(le->le_flags & LF_TRACKING_MAX)); /* Prepare to sleep until the resource is refilled */ - ret = assert_wait_deadline(le, TRUE, + ret = assert_wait_deadline(le, THREAD_INTERRUPTIBLE, le->_le.le_refill.le_last_refill + le->_le.le_refill.le_refill_period); if (ret != THREAD_WAITING) return(KERN_SUCCESS); @@ -1595,6 +1533,25 @@ ledger_disable_panic_on_negative(ledger_t ledger, int entry) return (KERN_SUCCESS); } +kern_return_t +ledger_get_panic_on_negative(ledger_t ledger, int entry, int *panic_on_negative) +{ + struct ledger_entry *le; + + if (!ENTRY_VALID(ledger, entry)) + return (KERN_INVALID_ARGUMENT); + + le = &ledger->l_entries[entry]; + + if 
(le->le_flags & LF_PANIC_ON_NEGATIVE) { + *panic_on_negative = TRUE; + } else { + *panic_on_negative = FALSE; + } + + return (KERN_SUCCESS); +} + kern_return_t ledger_get_balance(ledger_t ledger, int entry, ledger_amount_t *balance) { diff --git a/osfmk/kern/ledger.h b/osfmk/kern/ledger.h index d6b27ce11..78eb4f848 100644 --- a/osfmk/kern/ledger.h +++ b/osfmk/kern/ledger.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -55,6 +55,50 @@ struct ledger_template_info { char lti_units[LEDGER_NAME_MAX]; }; +#ifdef MACH_KERNEL_PRIVATE +/* + * These definitions are only here to allow pmap.c to determine the expected + * size of a ledger at build time. Direct access to ledger fields or to + * ledger entries is prohibited. + */ + +/* + * The explicit alignment is to ensure that atomic operations don't panic + * on ARM. + */ +struct ledger_entry { + volatile uint32_t le_flags; + ledger_amount_t le_limit; + ledger_amount_t le_warn_level; + volatile ledger_amount_t le_credit __attribute__((aligned(8))); + volatile ledger_amount_t le_debit __attribute__((aligned(8))); + union { + struct { + /* + * XXX - the following two fields can go away if we move all of + * the refill logic into process policy + */ + uint64_t le_refill_period; + uint64_t le_last_refill; + } le_refill; + struct { + ledger_amount_t le_lifetime_max; /* Process lifetime peak */ +#if CONFIG_LEDGER_INTERVAL_MAX + ledger_amount_t le_interval_max; /* Interval peak XXX better name needed */ +#endif + } _le_max; + } _le; +} __attribute__((aligned(8))); + +struct ledger { + uint64_t l_id; + int32_t l_refs; + int32_t l_size; + struct ledger_template *l_template; + struct ledger_entry l_entries[0] __attribute__((aligned(8))); +}; +#endif /* MACH_KERNEL_PRIVATE */ + struct ledger_entry_info { int64_t lei_balance; int64_t lei_credit; @@ -111,14 +155,17 @@ extern int 
ledger_key_lookup(ledger_template_t template, const char *key); #define LEDGER_CREATE_INACTIVE_ENTRIES 1 extern ledger_t ledger_instantiate(ledger_template_t template, int entry_type); extern void ledger_template_complete(ledger_template_t template); +extern void ledger_template_complete_secure_alloc(ledger_template_t template); extern kern_return_t ledger_disable_callback(ledger_t ledger, int entry); extern kern_return_t ledger_enable_callback(ledger_t ledger, int entry); extern kern_return_t ledger_get_limit(ledger_t ledger, int entry, ledger_amount_t *limit); extern kern_return_t ledger_set_limit(ledger_t ledger, int entry, ledger_amount_t limit, uint8_t warn_level_percentage); -extern kern_return_t ledger_get_recent_max(ledger_t ledger, int entry, - ledger_amount_t *max_observed_balance); +#if CONFIG_LEDGER_INTERVAL_MAX +extern kern_return_t ledger_get_interval_max(ledger_t ledger, int entry, + ledger_amount_t *max_interval_balance, int reset); +#endif /* CONFIG_LEDGER_INTERVAL_MAX */ extern kern_return_t ledger_get_lifetime_max(ledger_t ledger, int entry, ledger_amount_t *max_lifetime_balance); extern kern_return_t ledger_get_actions(ledger_t ledger, int entry, int *actions); @@ -132,8 +179,12 @@ extern kern_return_t ledger_entry_setactive(ledger_t ledger, int entry); extern void ledger_check_new_balance(thread_t thread, ledger_t ledger, int entry); extern kern_return_t ledger_credit(ledger_t ledger, int entry, ledger_amount_t amount); +extern kern_return_t ledger_credit_nocheck(ledger_t ledger, int entry, + ledger_amount_t amount); extern kern_return_t ledger_debit(ledger_t ledger, int entry, ledger_amount_t amount); +extern kern_return_t ledger_debit_nocheck(ledger_t ledger, int entry, + ledger_amount_t amount); extern kern_return_t ledger_credit_thread(thread_t thread, ledger_t ledger, int entry, ledger_amount_t amount); extern kern_return_t ledger_debit_thread(thread_t thread, ledger_t ledger, @@ -145,6 +196,7 @@ extern kern_return_t 
ledger_get_balance(ledger_t ledger, int entry, ledger_amount_t *balance); extern kern_return_t ledger_reset_callback_state(ledger_t ledger, int entry); extern kern_return_t ledger_disable_panic_on_negative(ledger_t ledger, int entry); +extern kern_return_t ledger_get_panic_on_negative(ledger_t ledger, int entry, int *panic_on_negative); extern kern_return_t ledger_rollup(ledger_t to_ledger, ledger_t from_ledger); extern kern_return_t ledger_rollup_entry(ledger_t to_ledger, ledger_t from_ledger, int entry); @@ -155,10 +207,6 @@ extern int ledger_reference_count(ledger_t ledger); extern kern_return_t ledger_reference(ledger_t ledger); extern kern_return_t ledger_dereference(ledger_t ledger); -/* Per-pmap ledger operations */ -#define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a) -#define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a) - /* Support for ledger() syscall */ #ifdef LEDGER_DEBUG extern int ledger_limit(task_t task, struct ledger_limit_args *args); diff --git a/osfmk/kern/locks.c b/osfmk/kern/locks.c index 25641b8be..04b5dd239 100644 --- a/osfmk/kern/locks.c +++ b/osfmk/kern/locks.c @@ -71,11 +71,11 @@ #include #include #include +#include #include #include #include - #include #if CONFIG_DTRACE @@ -120,6 +120,8 @@ static unsigned int lck_grp_cnt; decl_lck_mtx_data(static,lck_grp_lock) static lck_mtx_ext_t lck_grp_lock_ext; +SECURITY_READ_ONLY_LATE(boolean_t) spinlock_timeout_panic = TRUE; + lck_grp_attr_t LockDefaultGroupAttr; lck_grp_t LockCompatGroup; lck_attr_t LockDefaultLckAttr; @@ -132,6 +134,14 @@ uint64_t dtrace_spin_threshold = LOCK_PANIC_TIMEOUT / 1000000; // 500ns #endif #endif +uintptr_t +unslide_for_kdebug(void* object) { + if (__improbable(kdebug_enable)) + return VM_KERNEL_UNSLIDE_OR_PERM(object); + else + return 0; +} + /* * Routine: lck_mod_init */ @@ -535,20 +545,11 @@ hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean } #endif // __SMP__ -/* - * Routine: hw_lock_lock - * - * Acquire 
lock, spinning until it becomes available, - * return with preemption disabled. - */ -void -hw_lock_lock(hw_lock_t lock) +static inline void +hw_lock_lock_internal(hw_lock_t lock, thread_t thread) { - thread_t thread; uintptr_t state; - thread = current_thread(); - disable_preemption_for_thread(thread); state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK; #if __SMP__ @@ -563,7 +564,7 @@ hw_lock_lock(hw_lock_t lock) #if LOCK_PRETEST contended: #endif // LOCK_PRETEST - hw_lock_lock_contended(lock, state, 0, TRUE); + hw_lock_lock_contended(lock, state, 0, spinlock_timeout_panic); end: #else // __SMP__ if (lock->lock_data) @@ -576,6 +577,34 @@ hw_lock_lock(hw_lock_t lock) return; } +/* + * Routine: hw_lock_lock + * + * Acquire lock, spinning until it becomes available, + * return with preemption disabled. + */ +void +hw_lock_lock(hw_lock_t lock) +{ + thread_t thread = current_thread(); + disable_preemption_for_thread(thread); + hw_lock_lock_internal(lock, thread); +} + +/* + * Routine: hw_lock_lock_nopreempt + * + * Acquire lock, spinning until it becomes available. + */ +void +hw_lock_lock_nopreempt(hw_lock_t lock) +{ + thread_t thread = current_thread(); + if (__improbable(!preemption_disabled_for_thread(thread))) + panic("Attempt to take no-preempt spinlock %p in preemptible context", lock); + hw_lock_lock_internal(lock, thread); +} + /* * Routine: hw_lock_to * @@ -628,18 +657,10 @@ hw_lock_to(hw_lock_t lock, uint64_t timeout) * * returns with preemption disabled on success. 
*/ -unsigned int -hw_lock_try(hw_lock_t lock) +static inline unsigned int +hw_lock_try_internal(hw_lock_t lock, thread_t thread) { - thread_t thread = current_thread(); int success = 0; -#if LOCK_TRY_DISABLE_INT - long intmask; - - intmask = disable_interrupts(); -#else - disable_preemption_for_thread(thread); -#endif // LOCK_TRY_DISABLE_INT #if __SMP__ #if LOCK_PRETEST @@ -655,20 +676,9 @@ hw_lock_try(hw_lock_t lock) } #endif // __SMP__ -#if LOCK_TRY_DISABLE_INT - if (success) - disable_preemption_for_thread(thread); -#if LOCK_PRETEST -failed: -#endif // LOCK_PRETEST - restore_interrupts(intmask); -#else #if LOCK_PRETEST failed: #endif // LOCK_PRETEST - if (!success) - enable_preemption(); -#endif // LOCK_TRY_DISABLE_INT #if CONFIG_DTRACE if (success) LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0); @@ -676,13 +686,33 @@ hw_lock_try(hw_lock_t lock) return success; } +unsigned int +hw_lock_try(hw_lock_t lock) +{ + thread_t thread = current_thread(); + disable_preemption_for_thread(thread); + unsigned int success = hw_lock_try_internal(lock, thread); + if (!success) + enable_preemption(); + return success; +} + +unsigned int +hw_lock_try_nopreempt(hw_lock_t lock) +{ + thread_t thread = current_thread(); + if (__improbable(!preemption_disabled_for_thread(thread))) + panic("Attempt to test no-preempt spinlock %p in preemptible context", lock); + return hw_lock_try_internal(lock, thread); +} + /* * Routine: hw_lock_unlock * * Unconditionally release lock, release preemption level. 
*/ -void -hw_lock_unlock(hw_lock_t lock) +static inline void +hw_lock_unlock_internal(hw_lock_t lock) { __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp); #if __arm__ || __arm64__ @@ -692,9 +722,23 @@ hw_lock_unlock(hw_lock_t lock) #if CONFIG_DTRACE LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, 0); #endif /* CONFIG_DTRACE */ +} + +void +hw_lock_unlock(hw_lock_t lock) +{ + hw_lock_unlock_internal(lock); enable_preemption(); } +void +hw_lock_unlock_nopreempt(hw_lock_t lock) +{ + if (__improbable(!preemption_disabled_for_thread(current_thread()))) + panic("Attempt to release no-preempt spinlock %p in preemptible context", lock); + hw_lock_unlock_internal(lock); +} + /* * Routine hw_lock_held, doesn't change preemption state. * N.B. Racy, of course. @@ -765,40 +809,6 @@ lck_spin_sleep_deadline( return res; } - -/* - * Routine: lck_mtx_clear_promoted - * - * Handle clearing of TH_SFLAG_PROMOTED, - * adjusting thread priority as needed. - * - * Called with thread lock held - */ -static void -lck_mtx_clear_promoted ( - thread_t thread, - __kdebug_only uintptr_t trace_lck) -{ - thread->sched_flags &= ~TH_SFLAG_PROMOTED; - - if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { - /* Thread still has a RW lock promotion */ - } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0); - set_sched_pri(thread, DEPRESSPRI); - } else { - if (thread->base_pri < thread->sched_pri) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE, - thread->sched_pri, thread->base_pri, 0, trace_lck, 0); - } - thread_recompute_sched_pri(thread, FALSE); - } -} - - /* * Routine: lck_mtx_sleep */ @@ -848,7 +858,7 @@ lck_mtx_sleep( if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags 
checked without lock, but will be rechecked while clearing */ - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, unslide_for_kdebug(event)); } } @@ -903,7 +913,7 @@ lck_mtx_sleep_deadline( if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) { if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { /* sched_flags checked without lock, but will be rechecked while clearing */ - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, unslide_for_kdebug(event)); } } @@ -913,12 +923,58 @@ lck_mtx_sleep_deadline( } /* - * Routine: lck_mtx_lock_wait + * Lock Boosting Invariants: + * + * The lock owner is always promoted to the max priority of all its waiters. + * Max priority is capped at MAXPRI_PROMOTE. + * + * lck_mtx_pri being set implies that the lock owner is promoted to at least lck_mtx_pri + * This prevents the thread from dropping in priority while holding a mutex + * (note: Intel locks currently don't do this, to avoid thread lock churn) + * + * thread->promotions has a +1 for every mutex currently promoting the thread + * and 1 for was_promoted_on_wakeup being set. + * TH_SFLAG_PROMOTED is set on a thread whenever it has any promotions + * from any mutex (i.e. thread->promotions != 0) + * + * was_promoted_on_wakeup is set on a thread which is woken up by a mutex when + * it raises the priority of the woken thread to match lck_mtx_pri. + * It can be set for multiple iterations of wait, fail to acquire, re-wait, etc + * was_promoted_on_wakeup being set always implies a +1 promotions count. + * + * The last waiter is not given a promotion when it wakes up or acquires the lock. + * When the last waiter is waking up, a new contender can always come in and + * steal the lock without having to wait for the last waiter to make forward progress. + * + * lck_mtx_waiters has a +1 for every waiter currently between wait and acquire + * This prevents us from asserting that every wakeup wakes up a thread. 
+ * This also causes excess thread_wakeup calls in the unlock path. + * It can only be fooled into thinking there are more waiters than are + * actually blocked, not less. + * It does allow us to reduce the complexity of the lock state. + * + * This also means that a starved bg thread as the last waiter could end up + * keeping the lock in the contended state for a long period of time, which + * may keep lck_mtx_pri artificially high for a very long time even though + * it is not participating or blocking anyone else. + * Intel locks don't have this problem because they can go uncontended + * as soon as there are no blocked threads involved. + */ + +/* + * Routine: lck_mtx_lock_wait * * Invoked in order to wait on contention. * * Called with the interlock locked and * returns it unlocked. + * + * Always aggressively sets the owning thread to promoted, + * even if it's the same or higher priority + * This prevents it from lowering its own priority while holding a lock + * + * TODO: Come up with a more efficient way to handle same-priority promotions + * ARM mutex contention logic could avoid taking the thread lock */ void lck_mtx_lock_wait ( @@ -927,10 +983,8 @@ lck_mtx_lock_wait ( { thread_t self = current_thread(); lck_mtx_t *mutex; - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); - __kdebug_only uintptr_t trace_holder = VM_KERNEL_UNSLIDE_OR_PERM(holder); - integer_t priority; - spl_t s = splsched(); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + #if CONFIG_DTRACE uint64_t sleep_start = 0; @@ -944,50 +998,65 @@ lck_mtx_lock_wait ( else mutex = &lck->lck_mtx_ptr->lck_mtx; - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, trace_holder, 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, + trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0); + + spl_t s = splsched(); + thread_lock(holder); + + assert_promotions_invariant(holder); - priority 
= self->sched_pri; - if (priority < self->base_pri) - priority = self->base_pri; - if (priority < BASEPRI_DEFAULT) - priority = BASEPRI_DEFAULT; + if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0) + assert(holder->sched_pri >= mutex->lck_mtx_pri); - /* Do not promote past promotion ceiling */ + integer_t priority = self->sched_pri; + priority = MAX(priority, self->base_pri); + priority = MAX(priority, BASEPRI_DEFAULT); priority = MIN(priority, MAXPRI_PROMOTE); - thread_lock(holder); if (mutex->lck_mtx_pri == 0) { - holder->promotions++; - holder->sched_flags |= TH_SFLAG_PROMOTED; + /* This is the first promotion for this mutex */ + if (holder->promotions++ == 0) { + /* This is the first promotion for holder */ + sched_thread_promote_to_pri(holder, priority, trace_lck); + } else { + /* Holder was previously promoted due to a different mutex, raise to match this one */ + sched_thread_update_promotion_to_pri(holder, priority, trace_lck); + } + } else { + /* Holder was previously promoted due to this mutex, check if the pri needs to go up */ + sched_thread_update_promotion_to_pri(holder, priority, trace_lck); } - if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, - holder->sched_pri, priority, trace_holder, trace_lck, 0); - set_sched_pri(holder, priority); - } + assert(holder->promotions > 0); + assert(holder->promotion_priority >= priority); + + if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0) + assert(holder->sched_pri >= mutex->lck_mtx_pri); + + assert_promotions_invariant(holder); + thread_unlock(holder); splx(s); if (mutex->lck_mtx_pri < priority) mutex->lck_mtx_pri = priority; - if (self->pending_promoter[self->pending_promoter_index] == NULL) { - self->pending_promoter[self->pending_promoter_index] = mutex; - mutex->lck_mtx_waiters++; - } - else - if (self->pending_promoter[self->pending_promoter_index] != mutex) { - 
self->pending_promoter[++self->pending_promoter_index] = mutex; + + if (self->waiting_for_mutex == NULL) { + self->waiting_for_mutex = mutex; mutex->lck_mtx_waiters++; } + assert(self->waiting_for_mutex == mutex); + thread_set_pending_block_hint(self, kThreadWaitKernelMutex); - assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT); + assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER); lck_mtx_ilk_unlock(mutex); thread_block(THREAD_CONTINUE_NULL); + assert(mutex->lck_mtx_waiters > 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); #if CONFIG_DTRACE /* @@ -1023,50 +1092,80 @@ lck_mtx_lock_acquire( thread_t thread = current_thread(); lck_mtx_t *mutex; integer_t priority; - spl_t s; - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; else mutex = &lck->lck_mtx_ptr->lck_mtx; - if (thread->pending_promoter[thread->pending_promoter_index] == mutex) { - thread->pending_promoter[thread->pending_promoter_index] = NULL; - if (thread->pending_promoter_index > 0) - thread->pending_promoter_index--; + /* + * If waiting_for_mutex is set, then this thread was previously blocked waiting on this lock + * If it's un-set, then this thread stole the lock from another waiter. 
+ */ + if (thread->waiting_for_mutex == mutex) { + assert(mutex->lck_mtx_waiters > 0); + + thread->waiting_for_mutex = NULL; mutex->lck_mtx_waiters--; } - if (mutex->lck_mtx_waiters) + assert(thread->waiting_for_mutex == NULL); + + if (mutex->lck_mtx_waiters > 0) { priority = mutex->lck_mtx_pri; - else { + } else { + /* I was the last waiter, so the mutex is no longer promoted or contended */ mutex->lck_mtx_pri = 0; priority = 0; } if (priority || thread->was_promoted_on_wakeup) { - s = splsched(); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); + + /* + * Note: was_promoted_on_wakeup can happen for multiple wakeups in a row without + * an intervening acquire if a thread keeps failing to acquire the lock + * + * If priority is true but not promoted on wakeup, + * then this is a lock steal of a promoted mutex, so it needs a ++ of promotions. + * + * If promoted on wakeup is true, but priority is not, + * then this is the last owner, and the last owner does not need a promotion. 
+ */ + + spl_t s = splsched(); thread_lock(thread); + assert_promotions_invariant(thread); + + if (thread->was_promoted_on_wakeup) + assert(thread->promotions > 0); + if (priority) { - thread->promotions++; - thread->sched_flags |= TH_SFLAG_PROMOTED; - if (thread->sched_pri < priority) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE, - thread->sched_pri, priority, 0, trace_lck, 0); - /* Do not promote past promotion ceiling */ - assert(priority <= MAXPRI_PROMOTE); - set_sched_pri(thread, priority); + if (thread->promotions++ == 0) { + /* This is the first promotion for holder */ + sched_thread_promote_to_pri(thread, priority, trace_lck); + } else { + /* + * Holder was previously promoted due to a different mutex, raise to match this one + * Or, this thread was promoted on wakeup but someone else later contended on mutex + * at higher priority before we got here + */ + sched_thread_update_promotion_to_pri(thread, priority, trace_lck); } } + if (thread->was_promoted_on_wakeup) { thread->was_promoted_on_wakeup = 0; - if (thread->promotions == 0) - lck_mtx_clear_promoted(thread, trace_lck); + if (--thread->promotions == 0) + sched_thread_unpromote(thread, trace_lck); } + assert_promotions_invariant(thread); + + if (priority && (thread->sched_flags & TH_SFLAG_DEPRESS) == 0) + assert(thread->sched_pri >= priority); + thread_unlock(thread); splx(s); } @@ -1089,6 +1188,10 @@ lck_mtx_lock_acquire( * Invoked on unlock when there is contention. * * Called with the interlock locked. + * + * TODO: the 'waiters' flag does not indicate waiters exist on the waitqueue, + * it indicates waiters exist between wait and acquire. + * This means that here we may do extra unneeded wakeups. 
*/ void lck_mtx_unlock_wakeup ( @@ -1097,7 +1200,7 @@ lck_mtx_unlock_wakeup ( { thread_t thread = current_thread(); lck_mtx_t *mutex; - __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck); + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck); if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT) mutex = lck; @@ -1107,20 +1210,36 @@ lck_mtx_unlock_wakeup ( if (thread != holder) panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder), 0, 0, 0); + KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, + trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0); assert(mutex->lck_mtx_waiters > 0); + assert(thread->was_promoted_on_wakeup == 0); + assert(thread->waiting_for_mutex == NULL); + + /* + * The waiters count does not precisely match the number of threads on the waitqueue, + * therefore we cannot assert that we actually wake up a thread here + */ if (mutex->lck_mtx_waiters > 1) thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri); else thread_wakeup_one(LCK_MTX_EVENT(lck)); - if (thread->promotions > 0) { - spl_t s = splsched(); - + /* When mutex->lck_mtx_pri is set, it means I as the owner have a promotion. 
*/ + if (mutex->lck_mtx_pri) { + spl_t s = splsched(); thread_lock(thread); - if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) - lck_mtx_clear_promoted(thread, trace_lck); + + assert(thread->promotions > 0); + + assert_promotions_invariant(thread); + + if (--thread->promotions == 0) + sched_thread_unpromote(thread, trace_lck); + + assert_promotions_invariant(thread); + thread_unlock(thread); splx(s); } @@ -1128,21 +1247,50 @@ lck_mtx_unlock_wakeup ( KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0); } +/* + * Callout from the waitqueue code from inside thread_wakeup_one_with_pri + * At splsched, thread is pulled from waitq, still locked, not on runqueue yet + * + * We always make sure to set the promotion flag, even if the thread is already at this priority, + * so that it doesn't go down. + */ void -lck_mtx_unlockspin_wakeup ( - lck_mtx_t *lck) +lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority) { - assert(lck->lck_mtx_waiters > 0); - thread_wakeup_one(LCK_MTX_EVENT(lck)); + assert(priority <= MAXPRI_PROMOTE); + assert(thread->waiting_for_mutex != NULL); - KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(lck), 0, 0, 1, 0); -#if CONFIG_DTRACE - /* - * When there are waiters, we skip the hot-patch spot in the - * fastpath, so we record it here. 
- */ - LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0); -#endif + __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(thread->waiting_for_mutex); + + assert_promotions_invariant(thread); + + if (thread->was_promoted_on_wakeup) { + /* Thread was previously promoted, but contended again */ + sched_thread_update_promotion_to_pri(thread, priority, trace_lck); + return; + } + + if (thread->promotions > 0 && priority <= thread->promotion_priority) { + /* + * Thread is already promoted to the right level, no need to do more + * I can draft off of another promotion here, which is OK + * because I know the thread will soon run acquire to get its own promotion + */ + assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); + return; + } + + thread->was_promoted_on_wakeup = 1; + + if (thread->promotions++ == 0) { + /* This is the first promotion for this thread */ + sched_thread_promote_to_pri(thread, priority, trace_lck); + } else { + /* Holder was previously promoted due to a different mutex, raise to match this one */ + sched_thread_update_promotion_to_pri(thread, priority, trace_lck); + } + + assert_promotions_invariant(thread); } @@ -1265,7 +1413,7 @@ lck_rw_sleep( /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */ assert(lck_sleep_action & LCK_SLEEP_UNLOCK); - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, unslide_for_kdebug(event)); } } @@ -1319,7 +1467,7 @@ lck_rw_sleep_deadline( /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */ assert(lck_sleep_action & LCK_SLEEP_UNLOCK); - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, unslide_for_kdebug(event)); } } @@ -1331,11 +1479,11 @@ lck_rw_sleep_deadline( * * We support a limited form of reader-writer * lock promotion whose effects are: - * + * * * Qualifying threads have decay disabled * * Scheduler priority is reset to a floor of * of their statically assigned priority - * or BASEPRI_BACKGROUND + * or 
MINPRI_RWLOCK * * The rationale is that lck_rw_ts do not have * a single owner, so we cannot apply a directed @@ -1381,32 +1529,16 @@ lck_rw_sleep_deadline( * lck_rw_clear_promotion: Undo priority promotions when the last RW * lock is released by a thread (if a promotion was active) */ -void lck_rw_clear_promotion(thread_t thread) +void lck_rw_clear_promotion(thread_t thread, uintptr_t trace_obj) { assert(thread->rwlock_count == 0); /* Cancel any promotions if the thread had actually blocked while holding a RW lock */ spl_t s = splsched(); - thread_lock(thread); - if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) { - thread->sched_flags &= ~TH_SFLAG_RW_PROMOTED; - - if (thread->sched_flags & TH_SFLAG_PROMOTED) { - /* Thread still has a mutex promotion */ - } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, DEPRESSPRI, 0, 0); - - set_sched_pri(thread, DEPRESSPRI); - } else { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, 0, 0); - - thread_recompute_sched_pri(thread, FALSE); - } - } + if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) + sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj); thread_unlock(thread); splx(s); @@ -1424,27 +1556,10 @@ lck_rw_set_promotion_locked(thread_t thread) if (LcksOpts & disLkRWPrio) return; - integer_t priority; + assert(thread->rwlock_count > 0); - priority = thread->sched_pri; - - if (priority < thread->base_pri) - priority = thread->base_pri; - if (priority < BASEPRI_BACKGROUND) - priority = BASEPRI_BACKGROUND; - - if ((thread->sched_pri < priority) || - !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { - KERNEL_DEBUG_CONSTANT( - MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->sched_pri, - thread->base_pri, 
priority, 0); - - thread->sched_flags |= TH_SFLAG_RW_PROMOTED; - - if (thread->sched_pri < priority) - set_sched_pri(thread, priority); - } + if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) + sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0); } kern_return_t diff --git a/osfmk/kern/locks.h b/osfmk/kern/locks.h index 99017d2cc..4db3c40f5 100644 --- a/osfmk/kern/locks.h +++ b/osfmk/kern/locks.h @@ -116,6 +116,12 @@ typedef struct _lck_grp_ { lck_grp_stat_t lck_grp_stat; } lck_grp_t; +#define lck_grp_miss lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_miss_cnt +#define lck_grp_held lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cnt +#define lck_grp_util lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_util_cnt +#define lck_grp_wait lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cnt +#define lck_grp_direct_wait lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cnt + #define LCK_GRP_NULL (lck_grp_t *)0 #else @@ -265,8 +271,14 @@ extern wait_result_t lck_spin_sleep_deadline( #ifdef KERNEL_PRIVATE +extern void lck_spin_lock_nopreempt( lck_spin_t *lck); + +extern void lck_spin_unlock_nopreempt( lck_spin_t *lck); + extern boolean_t lck_spin_try_lock( lck_spin_t *lck); +extern boolean_t lck_spin_try_lock_nopreempt( lck_spin_t *lck); + /* NOT SAFE: To be used only by kernel debugger to avoid deadlock. 
*/ extern boolean_t kdp_lck_spin_is_acquired( lck_spin_t *lck); @@ -313,6 +325,17 @@ extern wait_result_t lck_mtx_sleep_deadline( event_t event, wait_interrupt_t interruptible, uint64_t deadline); +#if DEVELOPMENT || DEBUG +extern void erase_all_test_mtx_stats(void); +extern int get_test_mtx_stats_string(char* buffer, int buffer_size); +extern void lck_mtx_test_init(void); +extern void lck_mtx_test_lock(void); +extern void lck_mtx_test_unlock(void); +extern int lck_mtx_test_mtx_uncontended(int iter, char* buffer, int buffer_size); +extern int lck_mtx_test_mtx_contended(int iter, char* buffer, int buffer_size); +extern int lck_mtx_test_mtx_uncontended_loop_time(int iter, char* buffer, int buffer_size); +extern int lck_mtx_test_mtx_contended_loop_time(int iter, char* buffer, int buffer_size); +#endif #ifdef KERNEL_PRIVATE @@ -396,8 +419,6 @@ extern int lck_mtx_lock_acquire( extern void lck_mtx_unlock_wakeup( lck_mtx_t *lck, thread_t holder); -extern void lck_mtx_unlockspin_wakeup( - lck_mtx_t *lck); extern boolean_t lck_mtx_ilk_unlock( lck_mtx_t *lck); @@ -405,6 +426,8 @@ extern boolean_t lck_mtx_ilk_unlock( extern boolean_t lck_mtx_ilk_try_lock( lck_mtx_t *lck); +extern void lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority); + #endif #define decl_lck_rw_data(class,name) class lck_rw_t name; @@ -466,9 +489,10 @@ extern void lck_rw_assert( lck_rw_t *lck, unsigned int type); -extern void lck_rw_clear_promotion( - thread_t thread); +extern void lck_rw_clear_promotion(thread_t thread, uintptr_t trace_obj); extern void lck_rw_set_promotion_locked(thread_t thread); + +uintptr_t unslide_for_kdebug(void* object); #endif #ifdef KERNEL_PRIVATE diff --git a/osfmk/kern/ltable.c b/osfmk/kern/ltable.c index 073f27c9a..43eb19510 100644 --- a/osfmk/kern/ltable.c +++ b/osfmk/kern/ltable.c @@ -405,6 +405,22 @@ void ltable_grow(struct link_table *table, uint32_t min_free) return; } +#if DEVELOPMENT || DEBUG + +int +ltable_nelem(struct link_table *table) +{ + int nelem = 0; 
+ + lck_mtx_lock(&table->lock); + + nelem = table->used_elem; + + lck_mtx_unlock(&table->lock); + + return nelem; +} +#endif /** * ltable_alloc_elem: allocate one or more elements from a given table diff --git a/osfmk/kern/ltable.h b/osfmk/kern/ltable.h index aa62edfb9..c95743f10 100644 --- a/osfmk/kern/ltable.h +++ b/osfmk/kern/ltable.h @@ -208,6 +208,15 @@ struct lt_elem *ltable_alloc_elem(struct link_table *table, int type, int nelem, int nattempts); +#if DEVELOPMENT || DEBUG +/** + * ltable_nelem: returns how many elements are used in this + * table. + */ +extern +int ltable_nelem(struct link_table *table); +#endif + /** * ltable_realloc_elem: convert a reserved element to a particular type * diff --git a/osfmk/kern/mach_node.c b/osfmk/kern/mach_node.c index 4a0d96dc8..d342f3b48 100644 --- a/osfmk/kern/mach_node.c +++ b/osfmk/kern/mach_node.c @@ -318,6 +318,7 @@ mach_node_register(mach_node_t node) goto out; } + waitq_set_lazy_init_link(pp_set); /* Add the bootstrap port to the proxy port set */ uint64_t wq_link_id = waitq_link_reserve(NULL); uint64_t wq_reserved_prepost = waitq_prepost_reserve(NULL, 10, diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index cc0290ee1..43d69835c 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -126,10 +127,7 @@ processor_up( pset = processor->processor_set; pset_lock(pset); ++pset->online_processor_count; - enqueue_tail(&pset->active_queue, (queue_entry_t)processor); - processor->state = PROCESSOR_RUNNING; - pset->active_processor_count++; - sched_update_pset_load_average(pset); + pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); (void)hw_atomic_add(&processor_avail_count, 1); commpage_update_active_cpus(); pset_unlock(pset); @@ -230,15 +228,7 @@ processor_shutdown( return (KERN_SUCCESS); } - if (processor->state == PROCESSOR_IDLE) { - remqueue((queue_entry_t)processor); - } else if (processor->state == 
PROCESSOR_RUNNING) { - remqueue((queue_entry_t)processor); - pset->active_processor_count--; - sched_update_pset_load_average(pset); - } - - processor->state = PROCESSOR_SHUTDOWN; + pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN); pset_unlock(pset); @@ -285,7 +275,7 @@ processor_doshutdown( pset = processor->processor_set; pset_lock(pset); - processor->state = PROCESSOR_OFF_LINE; + pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE); --pset->online_processor_count; (void)hw_atomic_sub(&processor_avail_count, 1); commpage_update_active_cpus(); @@ -331,6 +321,12 @@ processor_offline( thread_t old_thread = processor->active_thread; thread_t new_thread = processor->idle_thread; + if (!new_thread->kernel_stack) { + /* the idle thread has a reserved stack, so this will never fail */ + if (!stack_alloc_try(new_thread)) + panic("processor_offline"); + } + processor->active_thread = new_thread; processor_state_update_idle(processor); processor->starting_pri = IDLEPRI; @@ -343,7 +339,7 @@ processor_offline( old_thread->last_run_time = ctime; /* Update processor->thread_timer and ->kernel_timer to point to the new thread */ - thread_timer_event(ctime, &new_thread->system_timer); + processor_timer_switch_thread(ctime, &new_thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &new_thread->system_timer; timer_stop(PROCESSOR_DATA(processor, current_state), ctime); @@ -356,6 +352,8 @@ processor_offline( thread_dispatch(old_thread, new_thread); + cpu_quiescent_counter_leave(processor->last_dispatch); + PMAP_DEACTIVATE_KERNEL(processor->cpu_id); cpu_sleep(); diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index 63eb58c09..74a0a7d63 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -46,6 +46,7 @@ #include #include #include +#include #ifndef MIN #define MIN(a,b) (((a)<(b))?(a):(b)) @@ -85,12 +86,6 @@ extern int testbit( int which, int *bitmap); -/* Move arbitrarily-aligned data from a user space to kernel 
space */ -extern int copyin( - const user_addr_t user_addr, - char *kernel_addr, - vm_size_t nbytes); - /* Move an aligned 32 or 64-bit word from user space to kernel space * using a single read instruction * @@ -116,12 +111,6 @@ extern int copyinmsg( char *kernel_addr, mach_msg_size_t nbytes); -/* Move arbitrarily-aligned data from a kernel space to user space */ -extern int copyout( - const void *kernel_addr, - user_addr_t user_addr, - vm_size_t nbytes); - /* Move arbitrarily-aligned data from a kernel space to user space */ extern int copyoutmsg( const char *kernel_addr, diff --git a/osfmk/kern/monotonic.h b/osfmk/kern/monotonic.h index a082a3535..65fd27140 100644 --- a/osfmk/kern/monotonic.h +++ b/osfmk/kern/monotonic.h @@ -30,6 +30,9 @@ #include #include +#include + +__BEGIN_DECLS extern bool mt_debug; extern _Atomic uint64_t mt_pmis; @@ -43,12 +46,16 @@ uint64_t mt_cur_cpu_cycles(void); uint64_t mt_cur_thread_instrs(void); uint64_t mt_cur_thread_cycles(void); +__END_DECLS + #if MACH_KERNEL_PRIVATE #include #include #include +__BEGIN_DECLS + #if defined(__arm__) || defined(__arm64__) #include #elif defined(__x86_64__) @@ -57,7 +64,6 @@ uint64_t mt_cur_thread_cycles(void); #error unsupported architecture #endif /* !defined(__arm__) && !defined(__arm64__) && !defined(__x86_64__) */ -void mt_init(void); void mt_update_fixed_counts(void); void mt_update_task(task_t task, thread_t thread); bool mt_update_thread(thread_t thread); @@ -65,22 +71,17 @@ int mt_fixed_thread_counts(thread_t thread, uint64_t *counts_out); int mt_fixed_task_counts(task_t task, uint64_t *counts_out); /* - * Called when a thread is switching off-core or expires its quantum. - */ -void mt_sched_update(thread_t thread); - -/* - * Called when a thread is terminating to save its counters into the task. The - * task lock must be held and the thread should be removed from the task's - * thread list in that same critical section. + * Private API for the platform layers. 
*/ -void mt_terminate_update(task_t task, thread_t thread); /* - * Called when a core receives a PMI. + * Called once early in boot, before CPU initialization occurs (where + * `mt_cpu_up` is called). + * + * This allows monotonic to detect if the hardware supports performance counters + * and install the global PMI handler. */ -void mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmsr); -uint64_t mt_cpu_update_count(cpu_data_t *cpu, unsigned int ctr); +void mt_early_init(void); /* * Called when a core is idling and exiting from idle. @@ -95,10 +96,42 @@ void mt_cpu_down(cpu_data_t *cpu); void mt_cpu_up(cpu_data_t *cpu); /* - * Called while single-threaded when the system is going to sleep and waking up. + * Called while single-threaded when the system is going to sleep. */ void mt_sleep(void); -void mt_wake(void); + +/* + * Called on each CPU as the system is waking from sleep. + */ +void mt_wake_per_core(void); + +#if __ARM_CLUSTER_COUNT__ +/* + * Called when a cluster is initialized. + */ +void mt_cluster_init(void); +#endif /* __ARM_CLUSTER_COUNT__ */ + +/* + * "Up-call" to the Mach layer to update counters from a PMI. + */ +uint64_t mt_cpu_update_count(cpu_data_t *cpu, unsigned int ctr); + +/* + * Private API for the scheduler. + */ + +/* + * Called when a thread is switching off-core or expires its quantum. + */ +void mt_sched_update(thread_t thread); + +/* + * Called when a thread is terminating to save its counters into the task. The + * task lock must be held and the thread should be removed from the task's + * thread list in that same critical section. + */ +void mt_terminate_update(task_t task, thread_t thread); /* * Private API for the performance controller callout. @@ -111,6 +144,16 @@ void mt_perfcontrol(uint64_t *instrs, uint64_t *cycles); void mt_stackshot_thread(thread_t thread, uint64_t *instrs, uint64_t *cycles); void mt_stackshot_task(task_t task, uint64_t *instrs, uint64_t *cycles); +/* + * Private API for microstackshot. 
+ */ +typedef void (*mt_pmi_fn)(bool user_mode, void *ctx); +int mt_microstackshot_start(unsigned int ctr, uint64_t period, mt_pmi_fn fn, + void *ctx); +int mt_microstackshot_stop(void); + +__END_DECLS + #endif /* MACH_KERNEL_PRIVATE */ #endif /* !defined(KERN_MONOTONIC_H) */ diff --git a/osfmk/kern/policy_internal.h b/osfmk/kern/policy_internal.h index 91e4c0d12..3e2814408 100644 --- a/osfmk/kern/policy_internal.h +++ b/osfmk/kern/policy_internal.h @@ -106,9 +106,10 @@ extern kern_return_t task_importance(task_t task, integer_t importance); #define TASK_POLICY_QOS 0x35 #define TASK_POLICY_QOS_OVERRIDE 0x36 #define TASK_POLICY_QOS_AND_RELPRIO 0x38 /* QoS as value1, relative priority as value2 */ +#define TASK_POLICY_QOS_WORKQ_OVERRIDE 0x3B #define TASK_POLICY_QOS_PROMOTE 0x3C #define TASK_POLICY_QOS_IPC_OVERRIDE 0x3D -#define TASK_POLICY_QOS_SYNC_IPC_OVERRIDE 0x3E +// was TASK_POLICY_QOS_SYNC_IPC_OVERRIDE 0x3E #define TASK_POLICY_MAX 0x3F @@ -152,13 +153,8 @@ extern void proc_inherit_task_role(task_t new_task, task_t old_task); #define THROTTLE_LEVEL_COMPRESSOR_TIER1 THROTTLE_LEVEL_TIER1 #define THROTTLE_LEVEL_COMPRESSOR_TIER2 THROTTLE_LEVEL_TIER2 -#if CONFIG_EMBEDDED -#define THROTTLE_LEVEL_PAGEOUT_THROTTLED THROTTLE_LEVEL_TIER3 -#define THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED THROTTLE_LEVEL_TIER1 -#else #define THROTTLE_LEVEL_PAGEOUT_THROTTLED THROTTLE_LEVEL_TIER2 #define THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED THROTTLE_LEVEL_TIER1 -#endif #if CONFIG_IOSCHED #define IOSCHED_METADATA_TIER THROTTLE_LEVEL_TIER1 @@ -172,22 +168,17 @@ extern void proc_apply_task_networkbg(void * bsd_info, thread_t thread); #endif /* MACH_BSD */ /* Functions used by pthread_shims.c */ -extern boolean_t proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, +extern int proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, int resource_type); -extern int 
proc_thread_qos_add_override_check_owner(thread_t thread, int override_qos, - boolean_t first_override_for_resource, user_addr_t resource, int resource_type, - user_addr_t user_lock_addr, mach_port_name_t user_lock_owner); -extern boolean_t proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid, +extern int proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid, user_addr_t resource, int resource_type); -extern boolean_t proc_thread_qos_reset_override(task_t task, thread_t thread, uint64_t tid, - user_addr_t resource, int resource_type); -extern int proc_thread_qos_squash_override(thread_t thread, user_addr_t resource, int resource_type); -extern kern_return_t -thread_set_workq_qos(thread_t thread, int qos_tier, int relprio); -extern kern_return_t -thread_set_workq_pri(thread_t thread, integer_t priority, integer_t policy); +extern void thread_reset_workq_qos(thread_t thread, uint32_t qos); +extern void thread_set_workq_override(thread_t thread, uint32_t qos); +extern void thread_set_workq_pri(thread_t thread, thread_qos_t qos, integer_t priority, integer_t policy); +extern uint8_t thread_workq_pri_for_qos(thread_qos_t qos) __pure2; +extern thread_qos_t thread_workq_qos_for_pri(int priority); extern int task_get_default_manager_qos(task_t task); @@ -204,6 +195,7 @@ extern int proc_lf_pidbind(task_t curtask, uint64_t tid, task_t target_task, int /* Importance inheritance functions not under IMPORTANCE_INHERITANCE */ extern void task_importance_mark_donor(task_t task, boolean_t donating); extern void task_importance_reset(task_t task); +extern void task_importance_init_from_parent(task_t new_task, task_t parent_task); #if IMPORTANCE_INHERITANCE extern boolean_t task_is_importance_donor(task_t task); @@ -252,19 +244,8 @@ extern int task_importance_estimate(task_t task); extern kern_return_t thread_policy_set_internal(thread_t thread, thread_policy_flavor_t flavor, thread_policy_t policy_info, mach_msg_type_number_t count); 
-struct promote_token { - uint16_t pt_basepri; - uint16_t pt_qos; -}; - -#define PROMOTE_TOKEN_INIT ((struct promote_token){.pt_basepri = 0, .pt_qos = 0}) - -extern void thread_user_promotion_add(thread_t thread, thread_t promoter, struct promote_token* promote_token); -extern void thread_user_promotion_update(thread_t thread, thread_t promoter, struct promote_token* promote_token); -extern void thread_user_promotion_drop(thread_t thread); - -/* for thread exec promotion */ -#define EXEC_BOOST_PRIORITY 31 +extern boolean_t thread_recompute_user_promotion_locked(thread_t thread); +extern thread_qos_t thread_user_promotion_qos_for_pri(int priority); extern void thread_set_exec_promotion(thread_t thread); extern void thread_clear_exec_promotion(thread_t thread); @@ -273,9 +254,9 @@ extern void thread_clear_exec_promotion(thread_t thread); extern void thread_add_ipc_override(thread_t thread, uint32_t qos_override); extern void thread_update_ipc_override(thread_t thread, uint32_t qos_override); extern void thread_drop_ipc_override(thread_t thread); -extern void thread_add_sync_ipc_override(thread_t thread); -extern void thread_drop_sync_ipc_override(thread_t thread); -extern uint32_t thread_get_ipc_override(thread_t thread); + +/* for ipc_pset.c */ +extern thread_qos_t thread_get_requested_qos(thread_t thread, int *relpri); /* ****************************** diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index 54a3220d3..c8abeb624 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -179,6 +179,7 @@ #include #endif + #define isdigit(d) ((d) >= '0' && (d) <= '9') #define Ctod(c) ((c) - '0') @@ -585,6 +586,7 @@ __doprnt( const char* strp = str; int strl = sizeof(str) - 1; + if (u >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && u <= VM_MAX_KERNEL_ADDRESS) { while(*strp != '\0') { (*putc)(*strp, arg); diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index 29ee2506f..40eb17242 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -117,12 +117,14 @@ 
thread_quantum_expire( /* * We bill CPU time to both the individual thread and its task. * - * Because this balance adjustment could potentially attempt to wake this very - * thread, we must credit the ledger before taking the thread lock. The ledger - * pointers are only manipulated by the thread itself at the ast boundary. + * Because this balance adjustment could potentially attempt to wake this + * very thread, we must credit the ledger before taking the thread lock. + * The ledger pointers are only manipulated by the thread itself at the ast + * boundary. * - * TODO: This fails to account for the time between when the timer was armed and when it fired. - * It should be based on the system_timer and running a thread_timer_event operation here. + * TODO: This fails to account for the time between when the timer was + * armed and when it fired. It should be based on the system_timer and + * running a timer_update operation here. */ ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->quantum_remaining); ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->quantum_remaining); @@ -154,14 +156,15 @@ thread_quantum_expire( /* * Check for fail-safe trip. 
*/ - if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) && - !(thread->sched_flags & TH_SFLAG_PROMOTED_MASK) && - !(thread->options & TH_OPT_SYSTEM_CRITICAL)) { - uint64_t new_computation; - - new_computation = ctime - thread->computation_epoch; - new_computation += thread->computation_metered; - if (new_computation > max_unsafe_computation) { + if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) && + !(thread->sched_flags & TH_SFLAG_PROMOTED) && + !(thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) && + !(thread->options & TH_OPT_SYSTEM_CRITICAL)) { + uint64_t new_computation; + + new_computation = ctime - thread->computation_epoch; + new_computation += thread->computation_metered; + if (new_computation > max_unsafe_computation) { KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE)|DBG_FUNC_NONE, (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0); @@ -199,12 +202,9 @@ thread_quantum_expire( * during privilege transitions, synthesize an event now. */ if (!thread->precise_user_kernel_time) { - timer_switch(PROCESSOR_DATA(processor, current_state), - ctime, - PROCESSOR_DATA(processor, current_state)); - timer_switch(PROCESSOR_DATA(processor, thread_timer), - ctime, - PROCESSOR_DATA(processor, thread_timer)); + timer_update(PROCESSOR_DATA(processor, current_state), ctime); + timer_update(PROCESSOR_DATA(processor, thread_timer), ctime); + timer_update(&thread->runnable_timer, ctime); } @@ -301,7 +301,7 @@ sched_set_thread_base_priority(thread_t thread, int priority) } sched_update_thread_bucket(thread); - thread_recompute_sched_pri(thread, FALSE); + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); } /* @@ -311,28 +311,54 @@ sched_set_thread_base_priority(thread_t thread, int priority) * according to its base priority if the * thread has not been promoted or depressed. 
* - * This is the standard way to push base_pri changes into sched_pri, - * or to recalculate the appropriate sched_pri after clearing + * This is the only way to push base_pri changes into sched_pri, + * or to recalculate the appropriate sched_pri after changing * a promotion or depression. * * Called at splsched with the thread locked. + * + * TODO: Add an 'update urgency' flag to avoid urgency callouts on every rwlock operation */ void -thread_recompute_sched_pri( - thread_t thread, - boolean_t override_depress) +thread_recompute_sched_pri(thread_t thread, set_sched_pri_options_t options) { - int priority; + uint32_t sched_flags = thread->sched_flags; + sched_mode_t sched_mode = thread->sched_mode; - if (thread->sched_mode == TH_MODE_TIMESHARE) + int priority = thread->base_pri; + + if (sched_mode == TH_MODE_TIMESHARE) priority = SCHED(compute_timeshare_priority)(thread); - else - priority = thread->base_pri; - if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK) || (priority > thread->sched_pri)) && - (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || override_depress)) { - set_sched_pri(thread, priority); + if (sched_flags & TH_SFLAG_DEPRESS) { + /* thread_yield_internal overrides kernel mutex promotion */ + priority = DEPRESSPRI; + } else { + /* poll-depress is overridden by mutex promotion and promote-reasons */ + if ((sched_flags & TH_SFLAG_POLLDEPRESS)) { + priority = DEPRESSPRI; + } + + if (sched_flags & TH_SFLAG_PROMOTED) { + priority = MAX(priority, thread->promotion_priority); + + if (sched_mode != TH_MODE_REALTIME) + priority = MIN(priority, MAXPRI_PROMOTE); + } + + if (sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) { + if (sched_flags & TH_SFLAG_RW_PROMOTED) + priority = MAX(priority, MINPRI_RWLOCK); + + if (sched_flags & TH_SFLAG_WAITQ_PROMOTED) + priority = MAX(priority, MINPRI_WAITQ); + + if (sched_flags & TH_SFLAG_EXEC_PROMOTED) + priority = MAX(priority, MINPRI_EXEC); + } } + + set_sched_pri(thread, priority, options); } void @@ -380,23 +406,8 @@ 
lightweight_update_priority(thread_t thread) priority = sched_compute_timeshare_priority(thread); - /* - * Adjust the scheduled priority like thread_recompute_sched_pri, - * except with the benefit of knowing the thread is on this core. - */ - if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK) || (priority > thread->sched_pri)) && - (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) && - priority != thread->sched_pri) { - - thread->sched_pri = priority; - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), - (uintptr_t)thread_tid(thread), - thread->base_pri, - thread->sched_pri, - thread->sched_usage, - 0); - } + if (priority != thread->sched_pri) + thread_recompute_sched_pri(thread, SETPRI_LAZY); } } @@ -512,8 +523,6 @@ update_priority( thread->sched_stamp += ticks; - thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket]; - /* If requested, accelerate aging of sched_usage */ if (sched_decay_usage_age_factor > 1) ticks *= sched_decay_usage_age_factor; @@ -524,9 +533,9 @@ update_priority( thread_timer_delta(thread, delta); if (ticks < SCHED_DECAY_TICKS) { /* - * Accumulate timesharing usage only - * during contention for processor - * resources. + * Accumulate timesharing usage only during contention for processor + * resources. Use the pri_shift from the previous tick window to + * determine if the system was in a contended state. */ if (thread->pri_shift < INT8_MAX) thread->sched_usage += delta; @@ -561,36 +570,17 @@ update_priority( } /* - * Recompute scheduled priority if appropriate. + * Now that the thread's CPU usage has been accumulated and aged + * based on contention of the previous tick window, update the + * pri_shift of the thread to match the current global load/shift + * values. The updated pri_shift would be used to calculate the + * new priority of the thread. 
*/ - if (thread->sched_mode == TH_MODE_TIMESHARE) { - int priority = sched_compute_timeshare_priority(thread); - - /* - * Adjust the scheduled priority like thread_recompute_sched_pri, - * except without setting an AST. - */ - if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK) || (priority > thread->sched_pri)) && - (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) && - priority != thread->sched_pri) { - - boolean_t removed = thread_run_queue_remove(thread); - - thread->sched_pri = priority; - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), - (uintptr_t)thread_tid(thread), - thread->base_pri, - thread->sched_pri, - thread->sched_usage, - 0); - - if (removed) - thread_run_queue_reinsert(thread, SCHED_TAILQ); - } - } + thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket]; - return; + /* Recompute scheduled priority if appropriate. */ + if (thread->sched_mode == TH_MODE_TIMESHARE) + thread_recompute_sched_pri(thread, SETPRI_LAZY); } #endif /* CONFIG_SCHED_TIMESHARE_CORE */ @@ -662,8 +652,10 @@ sched_update_thread_bucket(thread_t thread) break; case TH_MODE_TIMESHARE: - if (thread->base_pri > BASEPRI_UTILITY) + if (thread->base_pri > BASEPRI_DEFAULT) new_bucket = TH_BUCKET_SHARE_FG; + else if (thread->base_pri > BASEPRI_UTILITY) + new_bucket = TH_BUCKET_SHARE_DF; else if (thread->base_pri > MAXPRI_THROTTLE) new_bucket = TH_BUCKET_SHARE_UT; else @@ -779,4 +771,168 @@ sched_thread_mode_undemote(thread_t thread, uint32_t reason) thread_run_queue_reinsert(thread, SCHED_TAILQ); } +/* + * Promote thread to a specific priority + * + * Promotion must not last past syscall boundary + * Clients must always pair promote and unpromote 1:1 + * + * Called at splsched with thread locked + */ +void +sched_thread_promote_to_pri(thread_t thread, + int priority, + __kdebug_only uintptr_t trace_obj /* already unslid */) +{ + assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED); + assert(thread->promotion_priority == 0); + 
assert(priority <= MAXPRI_PROMOTE); + assert(priority > 0); + + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED), + thread_tid(thread), trace_obj, priority); + + thread->sched_flags |= TH_SFLAG_PROMOTED; + thread->promotion_priority = priority; + + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); +} + + +/* + * Update a pre-existing priority promotion to have a higher priority floor + * Priority can only go up from the previous value + * Update must occur while a promotion is active + * + * Called at splsched with thread locked + */ +void +sched_thread_update_promotion_to_pri(thread_t thread, + int priority, + __kdebug_only uintptr_t trace_obj /* already unslid */) +{ + assert(thread->promotions > 0); + assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); + assert(thread->promotion_priority > 0); + assert(priority <= MAXPRI_PROMOTE); + + if (thread->promotion_priority < priority) { + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED_UPDATE), + thread_tid(thread), trace_obj, priority); + + thread->promotion_priority = priority; + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); + } +} + +/* + * End a priority promotion + * Demotes a thread back to its expected priority without the promotion in place + * + * Called at splsched with thread locked + */ +void +sched_thread_unpromote(thread_t thread, + __kdebug_only uintptr_t trace_obj /* already unslid */) +{ + assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); + assert(thread->promotion_priority > 0); + + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UNPROMOTED), + thread_tid(thread), trace_obj, 0); + + thread->sched_flags &= ~TH_SFLAG_PROMOTED; + thread->promotion_priority = 0; + + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); +} + +/* called with thread locked */ +void +assert_promotions_invariant(thread_t thread) +{ + if (thread->promotions > 0) + assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED); + + if (thread->promotions == 0) + 
assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED); +} + +/* + * Promote thread to have a sched pri floor for a specific reason + * + * Promotion must not last past syscall boundary + * Clients must always pair promote and demote 1:1, + * Handling nesting of the same promote reason is the client's responsibility + * + * Called at splsched with thread locked + */ +void +sched_thread_promote_reason(thread_t thread, + uint32_t reason, + __kdebug_only uintptr_t trace_obj /* already unslid */) +{ + assert(reason & TH_SFLAG_PROMOTE_REASON_MASK); + assert((thread->sched_flags & reason) != reason); + + switch (reason) { + case TH_SFLAG_RW_PROMOTED: + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE), + thread_tid(thread), thread->sched_pri, + thread->base_pri, trace_obj); + break; + case TH_SFLAG_WAITQ_PROMOTED: + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE), + thread_tid(thread), thread->sched_pri, + thread->base_pri, trace_obj); + break; + case TH_SFLAG_EXEC_PROMOTED: + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_PROMOTE), + thread_tid(thread), thread->sched_pri, + thread->base_pri, trace_obj); + break; + } + + thread->sched_flags |= reason; + + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); +} + +/* + * End a specific promotion reason + * Demotes a thread back to its expected priority without the promotion in place + * + * Called at splsched with thread locked + */ +void +sched_thread_unpromote_reason(thread_t thread, + uint32_t reason, + __kdebug_only uintptr_t trace_obj /* already unslid */) +{ + assert(reason & TH_SFLAG_PROMOTE_REASON_MASK); + assert((thread->sched_flags & reason) == reason); + + switch (reason) { + case TH_SFLAG_RW_PROMOTED: + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE), + thread_tid(thread), thread->sched_pri, + thread->base_pri, trace_obj); + break; + case TH_SFLAG_WAITQ_PROMOTED: + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE), + thread_tid(thread), thread->sched_pri, + thread->base_pri, trace_obj); 
+ break; + case TH_SFLAG_EXEC_PROMOTED: + KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE), + thread_tid(thread), thread->sched_pri, + thread->base_pri, trace_obj); + break; + } + + thread->sched_flags &= ~reason; + + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); +} + diff --git a/osfmk/kern/priority_queue.c b/osfmk/kern/priority_queue.c new file mode 100644 index 000000000..5314d6034 --- /dev/null +++ b/osfmk/kern/priority_queue.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include
+#include
+
+#ifdef __LP64__
+static_assert(PRIORITY_QUEUE_ENTRY_CHILD_BITS >= VM_KERNEL_POINTER_SIGNIFICANT_BITS,
+	"Priority Queue child pointer packing failed");
+#endif
+
+priority_queue_entry_t
+pqueue_pair_meld(priority_queue_entry_t elt, priority_queue_compare_fn_t cmp_fn)
+{
+	priority_queue_entry_t pq_meld_result = NULL;
+	priority_queue_entry_t pair_list = NULL;
+
+	assert(elt); // caller needs to check this.
+
+	/* Phase 1: */
+	/* Split the sibling list into pairs going front to back and merge each pair. */
+	/* Hook the merged pairs onto an intermediary list, chained through ->prev, in reverse order of traversal. */
+
+	do {
+		/* Consider two elements at a time for pairing */
+		priority_queue_entry_t pair_item_a = elt;
+		priority_queue_entry_t pair_item_b = elt->next;
+		if (pair_item_b == NULL) {
+			/* Odd number of elements in the list; link the odd element */
+			/* as it is on the intermediate list. */
+			pair_item_a->prev = pair_list;
+			pair_list = pair_item_a;
+			break;
+		}
+		/* Found two elements to pair up; remember the rest of the list before the merge rewrites ->next */
+		elt = pair_item_b->next;
+		priority_queue_entry_t pair = pqueue_merge(pair_item_a, pair_item_b, cmp_fn);
+		/* Link the pair onto the intermediary list */
+		pair->prev = pair_list;
+		pair_list = pair;
+	} while (elt != NULL);
+
+	/* Phase 2: Merge all the pairs in the pair_list, walking it through ->prev */
+	do {
+		elt = pair_list->prev;
+		pq_meld_result = pqueue_merge(pq_meld_result, pair_list, cmp_fn);
+		pair_list = elt;
+	} while (pair_list != NULL);
+
+	return pq_meld_result;
+}
+
+void
+pqueue_destroy(struct priority_queue *q, size_t offset,
+	void (^callback)(void *e))
+{
+	assert(callback != NULL);
+	priority_queue_entry_t head = pqueue_unpack_root(q);
+	priority_queue_entry_t tail = head;
+
+	while (head != NULL) {
+		priority_queue_entry_t child_list = pqueue_entry_unpack_child(head);
+		if (child_list) {
+			tail->next = child_list; /* append head's children to the end of the work list */
+			while (tail->next) tail = tail->next;
+		}
+
+		priority_queue_entry_t elt = head;
+		head = head->next; /* advance first: elt is not touched again after the callback */
+		callback((void *)elt - offset); /* pass the enclosing element, not the linkage */
+	}
+
+	/* poison the queue now that it's destroyed */
+	q->pq_root_packed = ~0UL;
+}
diff --git a/osfmk/kern/priority_queue.h b/osfmk/kern/priority_queue.h
new file mode 100644
index 000000000..ff9836b0c
--- /dev/null
+++ b/osfmk/kern/priority_queue.h
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_PRIORITY_QUEUE_H_
+#define _KERN_PRIORITY_QUEUE_H_
+
+#include
+#include
+#include
+
+#include
+
+__BEGIN_DECLS
+
+/*
+ * A generic priority ordered queue implementation based on pairing heaps.
+ * + * Reference Papers: + * - A Back-to-Basics Empirical Study of Priority Queues (https://arxiv.org/abs/1403.0252) + * - The Pairing Heap: A New Form of Self-Adjusting Heap (https://www.cs.cmu.edu/~sleator/papers/pairing-heaps.pdf) + * + * The XNU implementation is a basic version of the pairing heap. It allows for O(1) insertion and amortized O(log n) + * deletion. It is not a stable data structure since adding stability would need more pointers and hence more memory. + * + * The implementation supports two types of key storage: + * + * Type 1: PRIORITY_QUEUE_GENERIC_KEY + * + * This flag is useful when the priorities are either larger than 8 bits or the node comparison needs + * extra information other than the priority. The nodes do not store the priorities themselves and, on + * comparison, call out to the comparator (of type priority_queue_compare_fn_t) passed to each + * queue operation. + * + * Sample Initialization: + * + * { + * static struct priority_queue pq; + * priority_queue_init(&pq, PRIORITY_QUEUE_MAX_HEAP | PRIORITY_QUEUE_GENERIC_KEY); + * } + * + * For this type, all insertions, priority_increase, priority_decrease must pass PRIORITY_QUEUE_KEY_NONE + * as the priority key field. + * + * Type 2: PRIORITY_QUEUE_BUILTIN_KEY + * + * This type is useful when the priorities need to be stored within the data structure itself. + * Each node in the priority queue maintains an 8-bit priority key. + * + * Sample Initialization: + * { + * static struct priority_queue pq; + * priority_queue_init(&pq, PRIORITY_QUEUE_MAX_HEAP | PRIORITY_QUEUE_BUILTIN_KEY); + * } + * + * + * Min / Max Heap: + * + * The semantics of Min/Max heap are not used by the implementation; it assumes that the comparison block + * that is passed to the insertion / removal / ... macros provides the right ordering. 
+ * + * However for human readability purposes, whether this heap is a MIN or MAX heap is passed + * at initialization time, and will influence whether accessors like priority_queue_min + * or priority_queue_max can be used. + * + * + * Element Linkage: + * + * Both types use a common queue head and linkage pattern. + * The head of a priority queue is declared as: + * + * struct priority_queue pq_head; + * + * Elements in this queue are linked together using struct priority_queue_entry objects embedded within a structure: + * struct some_data { + * int field1; + * int field2; + * ... + * struct priority_queue_entry link; + * ... + * int last_field; + * }; + * struct some_data is referred to as the queue "element" + * + * This method uses the next, prev and child pointers of the struct priority_queue_entry linkage object embedded in a + * queue element to point to other elements in the queue. The head of the priority queue (the priority_queue + * object) will point to the root of the pairing heap (NULL if heap is empty). This method allows multiple chains + * through a given object, by embedding multiple priority_queue_entry objects in the structure, while simultaneously + * providing fast removal and insertion into the heap using only priority_queue_entry object pointers. + */ + + +/* + * Priority keys maintained by the data structure. + * Since the priority is packed in the node itself, it restricts keys to be 8-bits only. 
+ */
+#define PRIORITY_QUEUE_KEY_NONE 0
+typedef uint8_t priority_queue_key_t;
+
+/*
+ * Flags passed to priority_queue_init()
+ *
+ * One key type must be picked (default is BUILTIN_KEY)
+ * Min or Max heap must be picked (default is MAX_HEAP)
+ */
+typedef enum priority_queue_flags {
+	PRIORITY_QUEUE_BUILTIN_KEY = 0x0, /* key type, bit 0 clear: key kept in each entry */
+	PRIORITY_QUEUE_GENERIC_KEY = 0x1, /* key type, bit 0 set: ordering comes from the comparator only */
+	PRIORITY_QUEUE_MAX_HEAP = 0x0, /* heap type, bit 1 clear */
+	PRIORITY_QUEUE_MIN_HEAP = 0x2, /* heap type, bit 1 set */
+#define PRIORITY_QUEUE_BUILTIN_MAX_HEAP (PRIORITY_QUEUE_MAX_HEAP | PRIORITY_QUEUE_BUILTIN_KEY)
+} priority_queue_flags_t;
+
+#ifdef __LP64__
+
+/*
+ * For 64-bit platforms, pack the priority key into the child pointer.
+ * The packing/unpacking is done using a compiler trick to sign extend long.
+ * This avoids additional NULL checks which are needed in typical packing
+ * implementations. The idea is to define the packed location as a long and,
+ * for unpacking, simply cast it to a full pointer, which sign extends it.
+ */
+#define PRIORITY_QUEUE_ENTRY_CHILD_BITS 56
+#define PRIORITY_QUEUE_ENTRY_KEY_BITS 8
+
+typedef struct priority_queue_entry {
+	struct priority_queue_entry *next; /* next sibling in the list at this level */
+	struct priority_queue_entry *prev; /* previous sibling, or the parent for a first child */
+	long key: PRIORITY_QUEUE_ENTRY_KEY_BITS; /* signed bit-field: read it back through priority_queue_key_t */
+	long child: PRIORITY_QUEUE_ENTRY_CHILD_BITS; /* packed pointer to the first child */
+} *priority_queue_entry_t;
+
+#else /* __LP64__ */
+
+/*
+ * For 32-bit platforms, use an extra field to store the key since child pointer packing
+ * is not an option. The child is maintained as a long to use the same packing/unpacking
+ * routines that work for 64-bit platforms.
+ */
+typedef struct priority_queue_entry {
+	struct priority_queue_entry *next;
+	struct priority_queue_entry *prev;
+	long child;
+	priority_queue_key_t key;
+} *priority_queue_entry_t;
+
+#endif /* __LP64__ */
+
+/*
+ * Comparator block prototype
+ * Args:
+ *	- elements to compare
+ * Return:
+ *	comparison result to indicate relative ordering of elements according to the heap type
+ */
+typedef int (^priority_queue_compare_fn_t)(struct priority_queue_entry *e1,
+	struct priority_queue_entry *e2);
+
+/*
+ * Standard comparison routine for max and min heaps; use with PRIORITY_QUEUE_BUILTIN_KEY only.
+ * Keys go through priority_queue_key_t first so that both entry layouts compare them as unsigned.
+ */
+static inline int
+priority_queue_element_builtin_key_compare(priority_queue_entry_t e1, priority_queue_entry_t e2)
+{
+	return (int)(priority_queue_key_t)e2->key - (int)(priority_queue_key_t)e1->key;
+}
+
+#define priority_heap_make_comparator(name1, name2, type, field, ...) \
+	(^int(priority_queue_entry_t __e1, priority_queue_entry_t __e2){ \
+		type *name1 = pqe_element_fast(__e1, type, field); \
+		type *name2 = pqe_element_fast(__e2, type, field); \
+		__VA_ARGS__; \
+	})
+
+#define PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE \
+	(^int(priority_queue_entry_t e1, priority_queue_entry_t e2){ \
+		return -priority_queue_element_builtin_key_compare(e1, e2); \
+	})
+
+#define PRIORITY_QUEUE_SCHED_PRI_MIN_HEAP_COMPARE \
+	(^int(priority_queue_entry_t e1, priority_queue_entry_t e2){ \
+		return priority_queue_element_builtin_key_compare(e1, e2); \
+	})
+
+/*
+ * Helper routines for packing/unpacking the child pointer in heap nodes.
+ * On 64-bit platforms, these routines rely on the fact that the sign extension
+ * for the lower 56 bits of a kernel pointer results in the real pointer. The trick
+ * works for NULL pointers as well.
+ */
+#define pqueue_entry_pack_child(qe, child_ptr) ((qe)->child = (long)(child_ptr))
+#define pqueue_entry_unpack_child(qe) ((struct priority_queue_entry *)((qe)->child))
+
+/*
+ * Priority queue head structure.
+ * Stores the root pointer of the heap using pointer packing. The two low bits are used
+ * for the flags (key type and heap type) of the queue.
+ */
+struct priority_queue {
+/*
+ * we pack priority_queue_flags_t in the least significant two bits
+ * of the root pointer.
+ */
+#define PRIORITY_QUEUE_ROOT_FLAGS_MASK (3ul)
+#define PRIORITY_QUEUE_ROOT_POINTER_MASK (~PRIORITY_QUEUE_ROOT_FLAGS_MASK)
+	unsigned long pq_root_packed;
+};
+
+/*
+ * Macro: pqe_element_fast
+ * Function:
+ *	Convert a priority_queue_entry_t to a queue element pointer.
+ *	Get a pointer to the user-defined element containing
+ *	a given priority_queue_entry_t
+ *
+ *	The fast variant assumes that `qe` is not NULL
+ * Header:
+ *	pqe_element_fast(qe, type, field)
+ *		qe    - priority_queue_entry_t to convert
+ *		type  - type of element in priority queue
+ *		field - chain field in (*type)
+ * Returns:
+ *	(type *) element containing qe
+ */
+#define pqe_element_fast(qe, type, field) __container_of(qe, type, field)
+
+/*
+ * Macro: pqe_element
+ * Function:
+ *	Convert a priority_queue_entry_t to a queue element pointer.
+ *	Get a pointer to the user-defined element containing
+ *	a given priority_queue_entry_t
+ *
+ *	The non fast variant handles NULL `qe`
+ * Header:
+ *	pqe_element(qe, type, field)
+ *		qe    - priority_queue_entry_t to convert; evaluated once, may be NULL
+ *		type  - type of element in priority queue
+ *		field - chain field in (*type)
+ * Returns:
+ *	(type *) element containing qe, or NULL when qe is NULL
+ */
+#define pqe_element(qe, type, field) ({ \
+	priority_queue_entry_t _tmp_entry = (qe); \
+	_tmp_entry ? pqe_element_fast(_tmp_entry, type, field) : ((type *)NULL); \
+})
+
+#define pqueue_has_generic_keys(p) \
+	(((p)->pq_root_packed & PRIORITY_QUEUE_GENERIC_KEY) != 0)
+
+#define pqueue_has_builtin_keys(p) \
+	(((p)->pq_root_packed & PRIORITY_QUEUE_GENERIC_KEY) == 0)
+
+#define pqueue_is_min_heap(p) \
+	(((p)->pq_root_packed & PRIORITY_QUEUE_MIN_HEAP) != 0)
+
+#define pqueue_is_max_heap(p) \
+	(((p)->pq_root_packed & PRIORITY_QUEUE_MIN_HEAP) == 0)
+
+/*
+ * Macro: pqueue_pack_root
+ * Function:
+ *	Pack the root pointer of the head.
+ * Header:
+ *	pqueue_pack_root(q, root_ptr)
+ *		q        - struct priority_queue * whose root is replaced; its flag bits are kept
+ *		root_ptr - priority_queue_entry_t to store as the new root (may be NULL)
+ */
+#define pqueue_pack_root(q, root_ptr) \
+MACRO_BEGIN \
+	uintptr_t __flags = (q)->pq_root_packed & PRIORITY_QUEUE_ROOT_FLAGS_MASK; \
+	(q)->pq_root_packed = (uintptr_t)(root_ptr) | __flags; \
+MACRO_END
+
+/*
+ * Macro: pqueue_unpack_root
+ * Function:
+ *	Unpack the root pointer from the head of the priority queue.
+ * Header:
+ *	pqueue_unpack_root(q)
+ *		q - struct priority_queue * to read
+ * Returns:
+ *	priority_queue_entry_t root of the heap, with the flag bits masked off
+ */
+#define pqueue_unpack_root(q) \
+	((priority_queue_entry_t)((q)->pq_root_packed & PRIORITY_QUEUE_ROOT_POINTER_MASK))
+
+/*
+ * Macro: pqueue_list_remove
+ * Function:
+ *	Helper routine to remove an element from the list at its level
+ * Header:
+ *	pqueue_list_remove(elt)
+ *		elt - priority_queue_entry_t to unhook; elt->prev must not be NULL
+ * Returns:
+ *	None
+ */
+static inline void
+pqueue_list_remove(priority_queue_entry_t elt)
+{
+	assert(elt->prev != NULL);
+	/* Check if elt is head of list at its level (its prev is then its parent); */
+	/* If yes, make the next node the head at that level */
+	/* Else, remove elt from the list at that level */
+	if (pqueue_entry_unpack_child(elt->prev) == elt) {
+		pqueue_entry_pack_child(elt->prev, elt->next);
+	} else {
+		elt->prev->next = elt->next;
+	}
+	/* Update prev for next element in list */
+	if (elt->next != NULL)
+		elt->next->prev = elt->prev;
+}
+
+/*
+ * Macro: pqueue_merge
+ * Function:
+ *	Helper routine to merge two subtrees of the heap to form a single tree and
+ *	maintain the parent > child invariant. If the two keys are equal, the current
+ *	implementation makes the first subtree the parent and the second one the child.
+ * Header:
+ *	pqueue_merge(subtree_a, subtree_b, cmp_fn)
+ *		subtree_a - priority_queue_entry_t root of the first subtree (may be NULL)
+ *		subtree_b - priority_queue_entry_t root of the second subtree (may be NULL)
+ *		cmp_fn    - comparator function
+ * Returns:
+ *	priority_queue_entry_t pointing to root of the merged tree
+ */
+static inline priority_queue_entry_t
+pqueue_merge(priority_queue_entry_t subtree_a, priority_queue_entry_t subtree_b,
+	priority_queue_compare_fn_t cmp_fn)
+{
+	priority_queue_entry_t merge_result = NULL;
+	if (subtree_a == NULL) {
+		merge_result = subtree_b;
+	} else if (subtree_b == NULL || (subtree_a == subtree_b)) {
+		merge_result = subtree_a;
+	} else {
+		priority_queue_entry_t parent = subtree_a;
+		priority_queue_entry_t child = subtree_b;
+		if (cmp_fn(subtree_a, subtree_b) < 0) { /* negative result: subtree_b becomes the parent */
+			parent = subtree_b;
+			child = subtree_a;
+		}
+		/* Insert the child as the first element in the parent's child list */
+		child->next = pqueue_entry_unpack_child(parent);
+		child->prev = parent;
+		if (pqueue_entry_unpack_child(parent) != NULL)
+			pqueue_entry_unpack_child(parent)->prev = child;
+		/* Create the parent child relationship */
+		pqueue_entry_pack_child(parent, child);
+		parent->next = NULL;
+		parent->prev = NULL;
+		merge_result = parent;
+	}
+	return merge_result;
+}
+
+/*
+ * Macro: pqueue_pair_meld
+ * Function:
+ *	Helper routine to splitwise pair a set of subtrees on a list at a given level and then
+ *	meld them together to form a new tree while maintaining the invariant parent > child.
+ *
+ *	The caller must check the element is non NULL.
+ *
+ * Header:
+ *	pqueue_pair_meld(elt, cmp_fn)
+ *		elt    - priority_queue_entry_t first subtree of the list to meld
+ *		cmp_fn - comparator function
+ * Returns:
+ *	priority_queue_entry_t pointing to root of the melded tree
+ */
+priority_queue_entry_t
+pqueue_pair_meld(priority_queue_entry_t e, priority_queue_compare_fn_t cmp_fn);
+
+/*
+ * Macro: pqueue_update_key
+ * Function:
+ *	Helper routine to update the key for a node in the heap. Note that the priority keys are only
+ *	maintained for the PRIORITY_QUEUE_BUILTIN_KEY type of priority queue. For PRIORITY_QUEUE_GENERIC_KEY,
+ *	this routine does nothing.
+ * Header:
+ *	pqueue_update_key(que, elt, new_key)
+ *		que     - struct priority_queue * the element belongs to
+ *		elt     - priority_queue_entry_t whose key is updated
+ *		new_key - priority_queue_key_t; must be PRIORITY_QUEUE_KEY_NONE on a generic-key queue
+ * Returns:
+ *	None
+ */
+static inline void
+pqueue_update_key(struct priority_queue *que, priority_queue_entry_t elt,
+	priority_queue_key_t new_key)
+{
+	if (pqueue_has_builtin_keys(que)) {
+		assert(new_key <= UINT8_MAX); /* always true while priority_queue_key_t is uint8_t */
+		elt->key = new_key;
+	} else {
+		assert(new_key == PRIORITY_QUEUE_KEY_NONE);
+	}
+}
+
+/*
+ * Macro: pqueue_remove_root
+ * Function:
+ *	Helper routine to remove the root element in a priority queue.
+ * Header:
+ *	pqueue_remove_root(que, old_root, cmp_fn)
+ *		que      - struct priority_queue * to update
+ *		old_root - priority_queue_entry_t being removed; dereferenced, so must not be NULL
+ *		cmp_fn   - comparator function
+ * Returns:
+ *	old_root
+ */
+static inline priority_queue_entry_t
+pqueue_remove_root(struct priority_queue *que, priority_queue_entry_t old_root,
+	priority_queue_compare_fn_t cmp_fn)
+{
+	priority_queue_entry_t new_root = pqueue_entry_unpack_child(old_root);
+	if (new_root) new_root = pqueue_pair_meld(new_root, cmp_fn);
+	pqueue_pack_root(que, new_root);
+	return old_root;
+}
+
+/*
+ * Macro: pqueue_remove_non_root
+ * Function:
+ *	Helper routine to remove a non root element in a priority queue.
+ * Header:
+ *	pqueue_remove_non_root(que, elt, cmp_fn)
+ *		que    - struct priority_queue * to update
+ *		elt    - priority_queue_entry_t to remove; must not be the root
+ *		cmp_fn - comparator function
+ * Returns:
+ *	elt
+ */
+static inline priority_queue_entry_t
+pqueue_remove_non_root(struct priority_queue *que, priority_queue_entry_t elt,
+	priority_queue_compare_fn_t cmp_fn)
+{
+	priority_queue_entry_t child, new_root;
+
+	/* To remove a non-root element with children levels, */
+	/* - Remove element from its current level list */
+	/* - Pairwise split all the elements in the child level list */
+	/* - Meld all these splits (right-to-left) to form new subtree */
+	/* - Merge the root subtree with the newly formed subtree */
+	pqueue_list_remove(elt);
+
+	child = pqueue_entry_unpack_child(elt);
+	if (child) {
+		child = pqueue_pair_meld(child, cmp_fn);
+		new_root = pqueue_merge(pqueue_unpack_root(que), child, cmp_fn);
+		pqueue_pack_root(que, new_root);
+	}
+
+	return elt;
+}
+
+/*
+ * Macro: pqueue_destroy
+ * Function:
+ *	Destroy a priority queue safely. This routine accepts a callback
+ *	to handle any cleanup for elements in the priority queue. The queue does
+ *	not maintain its invariants while getting destroyed. The priority queue and
+ *	the linkage nodes need to be re-initialized before re-using them.
+ *
+ *	Note: the offset is the offset to the linkage inside the elements
+ *	that are linked inside the priority heap, because pqueue_destroy
+ *	can't use pqe_element.
+ * Header:
+ *	pqueue_destroy(q, offset, callback)
+ *		q        - struct priority_queue * to destroy
+ *		offset   - size_t offset of the linkage inside each element
+ *		callback - block invoked with a pointer to each element
+ *
+ * Returns:
+ *	None
+ */
+void
+pqueue_destroy(struct priority_queue *q, size_t offset,
+	void (^callback)(void *e));
+
+/*
+ * Priority Queue functionality routines
+ */
+
+/*
+ * Macro: priority_queue_empty
+ * Function:
+ *	Tests whether a priority queue is empty.
+ * Header:
+ *	boolean_t priority_queue_empty(q)
+ *		q - struct priority_queue * to test
+ */
+#define priority_queue_empty(q) (pqueue_unpack_root((q)) == NULL)
+
+/*
+ * Macro: priority_queue_entry_key
+ * Function:
+ *	Returns the priority queue entry key for an element on a PRIORITY_QUEUE_BUILTIN_KEY
+ *	queue. It should not be called for an element on a PRIORITY_QUEUE_GENERIC_KEY queue.
+ * Header:
+ *	priority_queue_key_t priority_queue_entry_key(q, elt)
+ *		q   - struct priority_queue * the element belongs to
+ *		elt - priority_queue_entry_t to read; dereferenced, so must not be NULL
+ */
+#define priority_queue_entry_key(q, elt) ({ \
+	assert(pqueue_has_builtin_keys(q)); \
+	(priority_queue_key_t)((elt)->key); \
+})
+
+/*
+ * Macro: priority_queue_init
+ * Function:
+ *	Initialize a struct priority_queue: store the flags, then pack a NULL root on top of
+ *	them, so the head is never read before it has been written. Valid flags are:
+ *	- PRIORITY_QUEUE_BUILTIN_KEY or PRIORITY_QUEUE_GENERIC_KEY
+ *	- PRIORITY_QUEUE_MAX_HEAP or PRIORITY_QUEUE_MIN_HEAP
+ * Header:
+ *	priority_queue_init(q, flags)
+ *		q     - struct priority_queue * to initialize
+ *		flags - priority_queue_flags_t for the queue; only the two flag bits are kept
+ * Returns:
+ *	None
+ */
+#define priority_queue_init(q, flags) \
+MACRO_BEGIN \
+	(q)->pq_root_packed = (flags); \
+	pqueue_pack_root((q), NULL); \
+MACRO_END
+
+/*
+ * Macro: priority_queue_entry_init
+ * Function:
+ *	Initialize a priority_queue_entry_t
+ * Header:
+ *	priority_queue_entry_init(qe)
+ *		qe - priority_queue_entry_t to reset (links cleared, key set to PRIORITY_QUEUE_KEY_NONE)
+ * Returns:
+ *	None
+ */
+#define priority_queue_entry_init(qe) \
+MACRO_BEGIN \
+	(qe)->next = NULL; \
+	(qe)->prev = NULL; \
+	pqueue_entry_pack_child((qe), NULL); \
+	(qe)->key = PRIORITY_QUEUE_KEY_NONE; \
+MACRO_END
+
+/*
+ * Macro: priority_queue_insert
+ * Function:
+ *	Insert an element into the priority queue
+ * Header:
+ *	priority_queue_insert(que, elt, new_key, cmp_fn)
+ *		que     - struct priority_queue * to insert into
+ *		elt     - priority_queue_entry_t to insert
+ *		new_key - priority_queue_key_t (PRIORITY_QUEUE_KEY_NONE on a generic-key queue)
+ *		cmp_fn  - comparator function
+ * Returns:
+ *	Whether the inserted element became the new root
+ */
+static inline boolean_t
+priority_queue_insert(struct priority_queue *que, priority_queue_entry_t elt,
+	priority_queue_key_t new_key, priority_queue_compare_fn_t cmp_fn)
+{
+	priority_queue_entry_t new_root;
+
+	pqueue_update_key(que, elt, new_key);
+	new_root = pqueue_merge(pqueue_unpack_root(que), elt, cmp_fn);
+	pqueue_pack_root(que, new_root);
+	return new_root == elt;
+}
+
+/*
+ * Macro: priority_queue_remove
+ * Function:
+ *	Removes an element from the priority queue
+ * Header:
+ *	priority_queue_remove(que, elt, cmp_fn)
+ *		que    - struct priority_queue * to remove from
+ *		elt    - priority_queue_entry_t to remove; re-initialized before returning
+ *		cmp_fn - comparator function
+ * Returns:
+ *	Whether the removed element was the root
+ */
+static inline boolean_t
+priority_queue_remove(struct priority_queue *que, priority_queue_entry_t elt,
+	priority_queue_compare_fn_t cmp_fn)
+{
+	if (elt == pqueue_unpack_root(que)) {
+		pqueue_remove_root(que, elt, cmp_fn);
+		priority_queue_entry_init(elt);
+		return TRUE;
+	} else {
+		pqueue_remove_non_root(que, elt, cmp_fn);
+		priority_queue_entry_init(elt);
+		return FALSE;
+	}
+}
+
+/*
+ * Macro: priority_queue_entry_decrease
+ *
+ * WARNING:
+ *	This function is badly named for a min-heap, as it means the element
+ *	moves toward the root, which happens if the key value became smaller.
+ *
+ * Function:
+ *	Decrease the priority of an element in the priority queue. Since the heap invariant is to always
+ *	have the maximum element at the root, the most efficient way to implement this is to remove
+ *	the element and re-insert it into the heap.
+ *
+ *	For PRIORITY_QUEUE_BUILTIN_KEY, the new_key is passed into this routine since the priority is
+ *	maintained by the data structure. For PRIORITY_QUEUE_GENERIC_KEY, the caller must update the priority
+ *	in the element and then call this routine. For the new_key field, it must pass PRIORITY_QUEUE_KEY_NONE.
+ * Header:
+ *	priority_queue_entry_decrease(que, elt, new_key, cmp_fn)
+ *		que     - struct priority_queue * the element belongs to
+ *		elt     - priority_queue_entry_t whose priority decreased
+ *		new_key - priority_queue_key_t (PRIORITY_QUEUE_KEY_NONE on a generic-key queue)
+ *		cmp_fn  - comparator function
+ * Returns:
+ *	Whether the update caused the root or its key to change.
+ */
+static inline boolean_t
+priority_queue_entry_decrease(struct priority_queue *que, priority_queue_entry_t elt,
+	priority_queue_key_t new_key, priority_queue_compare_fn_t cmp_fn)
+{
+	boolean_t was_root = priority_queue_remove(que, elt, cmp_fn);
+	/* Insert it back in the heap; insertion also causes the priority update in the element */
+	priority_queue_insert(que, elt, new_key, cmp_fn);
+	return was_root;
+}
+
+/*
+ * Macro: priority_queue_entry_increase
+ *
+ * WARNING:
+ *	This function is badly named for a min-heap, as it means the element
+ *	moves away from the root, which happens if the key value became larger.
+ *
+ * Function:
+ *	Increase the priority of an element in the priority queue. If the root is being increased, no change
+ *	to the data structure is needed. For elements at any other level, unhook it from that level and
+ *	re-merge it.
+ *
+ *	For PRIORITY_QUEUE_BUILTIN_KEY, the new_key is passed into this routine since the priority is
+ *	maintained by the data structure. For PRIORITY_QUEUE_GENERIC_KEY, the caller must update the priority
+ *	in the element and then call this routine. For the new_key field, it must pass PRIORITY_QUEUE_KEY_NONE.
+ * Header:
+ *	priority_queue_entry_increase(que, elt, new_key, cmp_fn)
+ *		que     - struct priority_queue * the element belongs to
+ *		elt     - priority_queue_entry_t whose priority increased
+ *		new_key - priority_queue_key_t (PRIORITY_QUEUE_KEY_NONE on a generic-key queue)
+ *		cmp_fn  - comparator function
+ * Returns:
+ *	Whether the update caused the root or its key to change.
+ */
+static inline boolean_t
+priority_queue_entry_increase(struct priority_queue *que, priority_queue_entry_t elt,
+	priority_queue_key_t new_key, priority_queue_compare_fn_t cmp_fn)
+{
+	if (elt == pqueue_unpack_root(que)) {
+		pqueue_update_key(que, elt, new_key);
+		return TRUE;
+	}
+
+	/* Remove the element from its current level list */
+	pqueue_list_remove(elt);
+	/* Re-insert the element into the heap with a merge */
+	return priority_queue_insert(que, elt, new_key, cmp_fn);
+}
+
+/*
+ * Min/Max nodes lookup and removal routines
+ * Since the data structure is unaware of the type of heap being constructed, it provides both the min
+ * and max variants of the lookup and removal routines. Both variants do the exact same operation and
+ * it is up to the callers to call the right variant which makes semantic sense for the type of heap.
+ */
+
+/*
+ * Macro: priority_queue_max
+ * Function:
+ *	Lookup the max element in a priority queue. It simply returns the root of the
+ *	priority queue.
+ * Header:
+ *	priority_queue_max(q, type, field)
+ *		q     - struct priority_queue *; asserted to be a max heap
+ *		type  - type of element in priority queue
+ *		field - chain field in (*type)
+ * Returns:
+ *	(type *) max element, or NULL if the queue is empty
+ */
+#define priority_queue_max(q, type, field) ({ \
+	assert(pqueue_is_max_heap(q)); \
+	pqe_element(pqueue_unpack_root(q), type, field); \
+})
+
+/*
+ * Macro: priority_queue_min
+ * Function:
+ *	Lookup the min element in a priority queue. It simply returns the root of the
+ *	priority queue.
+ * Header:
+ *	priority_queue_min(q, type, field)
+ *		q     - struct priority_queue *; asserted to be a min heap
+ *		type  - type of element in priority queue
+ *		field - chain field in (*type)
+ * Returns:
+ *	(type *) min element, or NULL if the queue is empty
+ */
+#define priority_queue_min(q, type, field) ({ \
+	assert(pqueue_is_min_heap(q)); \
+	pqe_element(pqueue_unpack_root(q), type, field); \
+})
+
+/*
+ * Macro: priority_queue_max_key
+ * Function:
+ *	Lookup the max key in a priority queue.
+ * Header:
+ *	priority_queue_max_key(q)
+ *		q - struct priority_queue *; asserted to be a max heap with builtin keys, must not be empty
+ * Returns:
+ *	max key
+ */
+#define priority_queue_max_key(q) ({ \
+	assert(pqueue_is_max_heap(q)); \
+	priority_queue_entry_key(q, pqueue_unpack_root(q)); \
+})
+
+/*
+ * Macro: priority_queue_min_key
+ * Function:
+ *	Lookup the min key in a priority queue.
+ * Header:
+ *	priority_queue_min_key(q)
+ *		q - struct priority_queue *; asserted to be a min heap with builtin keys, must not be empty
+ * Returns:
+ *	min key
+ */
+#define priority_queue_min_key(q) ({ \
+	assert(pqueue_is_min_heap(q)); \
+	priority_queue_entry_key(q, pqueue_unpack_root(q)); \
+})
+
+/*
+ * Macro: priority_queue_remove_max
+ * Function:
+ *	Remove the max element in a priority queue.
+ *	Uses pqueue_remove_root() to do the removal; the removed entry's linkage is not re-initialized.
+ * Header:
+ *	priority_queue_remove_max(q, type, field, cmp_fn)
+ *		q     - struct priority_queue *; asserted to be a max heap, must not be empty
+ *		type  - type of element in priority queue
+ *		field - chain field in (*type); cmp_fn - comparator function
+ * Returns:
+ *	(type *) max element
+ */
+#define priority_queue_remove_max(q, type, field, cmp_fn) ({ \
+	assert(pqueue_is_max_heap(q)); \
+	pqe_element(pqueue_remove_root(q, pqueue_unpack_root(q), cmp_fn), type, field); \
+})
+
+/*
+ * Macro: priority_queue_remove_min
+ * Function:
+ *	Remove the min element in a priority queue.
+ *	Uses pqueue_remove_root() to do the removal; the removed entry's linkage is not re-initialized.
+ * Header:
+ *	priority_queue_remove_min(q, type, field, cmp_fn)
+ *		q     - struct priority_queue *; asserted to be a min heap, must not be empty
+ *		type  - type of element in priority queue
+ *		field - chain field in (*type); cmp_fn - comparator function
+ * Returns:
+ *	(type *) min element
+ */
+#define priority_queue_remove_min(q, type, field, cmp_fn) ({ \
+	assert(pqueue_is_min_heap(q)); \
+	pqe_element(pqueue_remove_root(q, pqueue_unpack_root(q), cmp_fn), type, field); \
+})
+
+/*
+ * Macro: priority_queue_destroy
+ * Function:
+ *	Destroy a priority queue safely. This routine accepts a callback
+ *	to handle any cleanup for elements in the priority queue. The queue does
+ *	not maintain its invariants while getting destroyed. The priority queue and
+ *	the linkage nodes need to be re-initialized before re-using them.
+ * Header: + * priority_queue_destroy(q, type, field, callback) + * q + * type of element in priority queue + * chain field in (*) + * callback for each element + * + * Returns: + * None + */ +#define priority_queue_destroy(q, type, field, callback, ...) \ + pqueue_destroy(q, offsetof(type, field), callback, ##__VA_ARGS__) + +__END_DECLS + +#endif /* _KERN_PRIORITY_QUEUE_H_ */ diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 5aad73e37..479094c30 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -80,6 +80,11 @@ #include +#if defined(CONFIG_XNUPOST) + +#include + +#endif /* CONFIG_XNUPOST */ /* * Exported interface @@ -114,6 +119,36 @@ boolean_t sched_stats_active = FALSE; processor_t processor_array[MAX_SCHED_CPUS] = { 0 }; +#if defined(CONFIG_XNUPOST) +kern_return_t ipi_test(void); +extern void arm64_ipi_test(void); + +kern_return_t +ipi_test() +{ +#if __arm64__ + processor_t p; + + for (p = processor_list; p != NULL; p = p->processor_list) { + thread_bind(p); + thread_block(THREAD_CONTINUE_NULL); + kprintf("Running IPI test on cpu %d\n", p->cpu_id); + arm64_ipi_test(); + } + + /* unbind thread from specific cpu */ + thread_bind(PROCESSOR_NULL); + thread_block(THREAD_CONTINUE_NULL); + + T_PASS("Done running IPI tests"); +#else + T_PASS("Unsupported platform. 
Not running IPI tests"); + +#endif /* __arm64__ */ + + return KERN_SUCCESS; +} +#endif /* defined(CONFIG_XNUPOST) */ void @@ -154,6 +189,8 @@ processor_init( SCHED(processor_init)(processor); } + assert(cpu_id < MAX_SCHED_CPUS); + processor->state = PROCESSOR_OFF_LINE; processor->active_thread = processor->next_thread = processor->idle_thread = THREAD_NULL; processor->processor_set = pset; @@ -171,6 +208,8 @@ processor_init( processor->processor_self = IP_NULL; processor_data_init(processor); processor->processor_list = NULL; + processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_NONE; + processor->cpu_quiesce_last_checkin = 0; s = splsched(); pset_lock(pset); @@ -191,7 +230,6 @@ processor_init( processor_list_tail->processor_list = processor; processor_list_tail = processor; processor_count++; - assert(cpu_id < MAX_SCHED_CPUS); processor_array[cpu_id] = processor; simple_unlock(&processor_list_lock); } @@ -216,6 +254,9 @@ processor_set_primary( /* Mark both processors as SMT siblings */ primary->is_SMT = TRUE; processor->is_SMT = TRUE; + + processor_set_t pset = processor->processor_set; + atomic_bit_clear(&pset->primary_map, processor->cpu_id, memory_order_relaxed); } } @@ -328,17 +369,18 @@ pset_init( SCHED(rt_init)(pset); } - queue_init(&pset->active_queue); - queue_init(&pset->idle_queue); - queue_init(&pset->idle_secondary_queue); - queue_init(&pset->unused_queue); pset->online_processor_count = 0; - pset->active_processor_count = 0; pset->load_average = 0; pset->cpu_set_low = pset->cpu_set_hi = 0; pset->cpu_set_count = 0; + pset->last_chosen = -1; pset->cpu_bitmask = 0; pset->recommended_bitmask = ~0ULL; + pset->primary_map = ~0ULL; + pset->cpu_state_map[PROCESSOR_OFF_LINE] = ~0ULL; + for (uint i = PROCESSOR_SHUTDOWN; i < PROCESSOR_STATE_LEN; i++) { + pset->cpu_state_map[i] = 0; + } pset->pending_AST_cpu_mask = 0; #if defined(CONFIG_SCHED_DEFERRED_AST) pset->pending_deferred_AST_cpu_mask = 0; @@ -540,7 +582,7 @@ processor_start( return (KERN_FAILURE); } - 
processor->state = PROCESSOR_START; + pset_update_processor_state(pset, processor, PROCESSOR_START); pset_unlock(pset); splx(s); @@ -552,7 +594,7 @@ processor_start( if (result != KERN_SUCCESS) { s = splsched(); pset_lock(pset); - processor->state = PROCESSOR_OFF_LINE; + pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE); pset_unlock(pset); splx(s); @@ -571,7 +613,7 @@ processor_start( if (result != KERN_SUCCESS) { s = splsched(); pset_lock(pset); - processor->state = PROCESSOR_OFF_LINE; + pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE); pset_unlock(pset); splx(s); @@ -597,7 +639,7 @@ processor_start( if (result != KERN_SUCCESS) { s = splsched(); pset_lock(pset); - processor->state = PROCESSOR_OFF_LINE; + pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE); pset_unlock(pset); splx(s); diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index 09caf6a7f..646ea801c 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -81,25 +81,81 @@ #include #include #include +#include + +/* + * Processor state is accessed by locking the scheduling lock + * for the assigned processor set. + * + * -------------------- SHUTDOWN + * / ^ ^ + * _/ | \ + * OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING + * \_________________^ ^ ^______/ / + * \__________________/ + * + * Most of these state transitions are externally driven as a + * a directive (for instance telling an IDLE processor to start + * coming out of the idle state to run a thread). However these + * are typically paired with a handshake by the processor itself + * to indicate that it has completed a transition of indeterminate + * length (for example, the DISPATCHING->RUNNING or START->RUNNING + * transitions must occur on the processor itself). + * + * The boot processor has some special cases, and skips the START state, + * since it has already bootstrapped and is ready to context switch threads. 
+ * + * When a processor is in DISPATCHING or RUNNING state, the current_pri, + * current_thmode, and deadline fields should be set, so that other + * processors can evaluate if it is an appropriate candidate for preemption. + */ +#if defined(CONFIG_SCHED_DEFERRED_AST) +/* + * -------------------- SHUTDOWN + * / ^ ^ + * _/ | \ + * OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING + * \_________________^ ^ ^______/ ^_____ / / + * \__________________/ + * + * A DISPATCHING processor may be put back into IDLE, if another + * processor determines that the target processor will have nothing to do + * upon reaching the RUNNING state. This is racy, but if the target + * responds and becomes RUNNING, it will not break the processor state + * machine. + * + * This change allows us to cancel an outstanding signal/AST on a processor + * (if such an operation is supported through hardware or software), and + * push the processor back into the IDLE state as a power optimization. + */ +#endif + +#define PROCESSOR_OFF_LINE 0 /* Not available */ +#define PROCESSOR_SHUTDOWN 1 /* Going off-line */ +#define PROCESSOR_START 2 /* Being started */ +/* 3 Formerly Inactive (unavailable) */ +#define PROCESSOR_IDLE 4 /* Idle (available) */ +#define PROCESSOR_DISPATCHING 5 /* Dispatching (idle -> active) */ +#define PROCESSOR_RUNNING 6 /* Normal execution */ +#define PROCESSOR_STATE_LEN (PROCESSOR_RUNNING+1) typedef enum { PSET_SMP, } pset_cluster_type_t; -struct processor_set { - queue_head_t active_queue; /* active processors */ - queue_head_t idle_queue; /* idle processors */ - queue_head_t idle_secondary_queue; /* idle secondary processors */ - queue_head_t unused_queue; /* processors not recommended by CLPC */ +typedef bitmap_t cpumap_t; - int online_processor_count; - int active_processor_count; - int load_average; +struct processor_set { + int online_processor_count; + int load_average; - int cpu_set_low, cpu_set_hi; - int cpu_set_count; - uint64_t cpu_bitmask; - uint64_t 
recommended_bitmask; + int cpu_set_low, cpu_set_hi; + int cpu_set_count; + int last_chosen; + cpumap_t cpu_bitmask; + cpumap_t recommended_bitmask; + cpumap_t cpu_state_map[PROCESSOR_STATE_LEN]; + cpumap_t primary_map; #if __SMP__ decl_simple_lock_data(,sched_lock) /* lock for above */ @@ -116,7 +172,7 @@ struct processor_set { #endif /* CPUs that have been sent an unacknowledged remote AST for scheduling purposes */ - uint64_t pending_AST_cpu_mask; + cpumap_t pending_AST_cpu_mask; #if defined(CONFIG_SCHED_DEFERRED_AST) /* * A separate mask, for ASTs that we may be able to cancel. This is dependent on @@ -129,9 +185,9 @@ struct processor_set { * of spurious ASTs in the system, and let processors spend longer periods in * IDLE. */ - uint64_t pending_deferred_AST_cpu_mask; + cpumap_t pending_deferred_AST_cpu_mask; #endif - uint64_t pending_spill_cpu_mask; + cpumap_t pending_spill_cpu_mask; struct ipc_port * pset_self; /* port for operations */ struct ipc_port * pset_name_self; /* port for information */ @@ -161,15 +217,12 @@ decl_lck_mtx_data(extern,tasks_threads_lock) decl_lck_mtx_data(extern,tasks_corpse_lock) struct processor { - queue_chain_t processor_queue;/* idle/active queue link, - * MUST remain the first element */ - int state; /* See below */ - boolean_t is_SMT; - boolean_t is_recommended; - struct thread - *active_thread, /* thread running on processor */ - *next_thread, /* next thread when dispatched */ - *idle_thread; /* this processor's idle thread. */ + int state; /* See above */ + bool is_SMT; + bool is_recommended; + struct thread *active_thread; /* thread running on processor */ + struct thread *next_thread; /* next thread when dispatched */ + struct thread *idle_thread; /* this processor's idle thread. 
*/ processor_set_t processor_set; /* assigned set */ @@ -179,13 +232,17 @@ struct processor { int starting_pri; /* priority of current thread as it was when scheduled */ pset_cluster_type_t current_recommended_pset_type; /* Cluster type recommended for current thread */ int cpu_id; /* platform numeric id */ + cpu_quiescent_state_t cpu_quiesce_state; + uint64_t cpu_quiesce_last_checkin; timer_call_data_t quantum_timer; /* timer for quantum expiration */ uint64_t quantum_end; /* time when current quantum ends */ uint64_t last_dispatch; /* time of last dispatch */ + uint64_t kperf_last_sample_time; /* time of last kperf sample */ + uint64_t deadline; /* current deadline */ - boolean_t first_timeslice; /* has the quantum expired since context switch */ + bool first_timeslice; /* has the quantum expired since context switch */ #if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_MULTIQ) struct run_queue runq; /* runq for this processor */ @@ -220,61 +277,6 @@ extern processor_t master_processor; extern boolean_t sched_stats_active; -/* - * Processor state is accessed by locking the scheduling lock - * for the assigned processor set. - * - * -------------------- SHUTDOWN - * / ^ ^ - * _/ | \ - * OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING - * \_________________^ ^ ^______/ / - * \__________________/ - * - * Most of these state transitions are externally driven as a - * a directive (for instance telling an IDLE processor to start - * coming out of the idle state to run a thread). However these - * are typically paired with a handshake by the processor itself - * to indicate that it has completed a transition of indeterminate - * length (for example, the DISPATCHING->RUNNING or START->RUNNING - * transitions must occur on the processor itself). - * - * The boot processor has some special cases, and skips the START state, - * since it has already bootstrapped and is ready to context switch threads. 
- * - * When a processor is in DISPATCHING or RUNNING state, the current_pri, - * current_thmode, and deadline fields should be set, so that other - * processors can evaluate if it is an appropriate candidate for preemption. - */ -#if defined(CONFIG_SCHED_DEFERRED_AST) -/* - * -------------------- SHUTDOWN - * / ^ ^ - * _/ | \ - * OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING - * \_________________^ ^ ^______/ ^_____ / / - * \__________________/ - * - * A DISPATCHING processor may be put back into IDLE, if another - * processor determines that the target processor will have nothing to do - * upon reaching the RUNNING state. This is racy, but if the target - * responds and becomes RUNNING, it will not break the processor state - * machine. - * - * This change allows us to cancel an outstanding signal/AST on a processor - * (if such an operation is supported through hardware or software), and - * push the processor back into the IDLE state as a power optimization. - */ -#endif - -#define PROCESSOR_OFF_LINE 0 /* Not available */ -#define PROCESSOR_SHUTDOWN 1 /* Going off-line */ -#define PROCESSOR_START 2 /* Being started */ -/* 3 Formerly Inactive (unavailable) */ -#define PROCESSOR_IDLE 4 /* Idle (available) */ -#define PROCESSOR_DISPATCHING 5 /* Dispatching (idle -> active) */ -#define PROCESSOR_RUNNING 6 /* Normal execution */ - extern processor_t current_processor(void); /* Lock macros, always acquired and released with interrupts disabled (splsched()) */ @@ -283,6 +285,12 @@ extern processor_t current_processor(void); #define pset_lock(p) simple_lock(&(p)->sched_lock) #define pset_unlock(p) simple_unlock(&(p)->sched_lock) #define pset_lock_init(p) simple_lock_init(&(p)->sched_lock, 0) +#if defined(__arm__) || defined(__arm64__) +#define pset_assert_locked(p) LCK_SPIN_ASSERT(&(p)->sched_lock, LCK_ASSERT_OWNED) +#else +/* See pset_lock() should be converted to use lck_spin_lock() instead of simple_lock() */ +#define pset_assert_locked(p) do { 
(void)p; } while(0) +#endif #define rt_lock_lock(p) simple_lock(&SCHED(rt_runq)(p)->rt_lock) #define rt_lock_unlock(p) simple_unlock(&SCHED(rt_runq)(p)->rt_lock) @@ -291,6 +299,7 @@ extern processor_t current_processor(void); #define pset_lock(p) do { (void)p; } while(0) #define pset_unlock(p) do { (void)p; } while(0) #define pset_lock_init(p) do { (void)p; } while(0) +#define pset_assert_locked(p) do { (void)p; } while(0) #define rt_lock_lock(p) do { (void)p; } while(0) #define rt_lock_unlock(p) do { (void)p; } while(0) @@ -369,6 +378,40 @@ extern void processor_state_update_explicit(processor_t processor, int pri, sfi_class_id_t sfi_class, pset_cluster_type_t pset_type, perfcontrol_class_t perfctl_class); +#define PSET_LOAD_NUMERATOR_SHIFT 16 +#define PSET_LOAD_FRACTIONAL_SHIFT 4 + +inline static int +sched_get_pset_load_average(processor_set_t pset) +{ + return pset->load_average >> (PSET_LOAD_NUMERATOR_SHIFT - PSET_LOAD_FRACTIONAL_SHIFT); +} +extern void sched_update_pset_load_average(processor_set_t pset); + +inline static void +pset_update_processor_state(processor_set_t pset, processor_t processor, uint new_state) +{ + pset_assert_locked(pset); + + uint old_state = processor->state; + uint cpuid = processor->cpu_id; + + assert(processor->processor_set == pset); + assert(bit_test(pset->cpu_bitmask, cpuid)); + + assert(old_state < PROCESSOR_STATE_LEN); + assert(new_state < PROCESSOR_STATE_LEN); + + processor->state = new_state; + + bit_clear(pset->cpu_state_map[old_state], cpuid); + bit_set(pset->cpu_state_map[new_state], cpuid); + + if ((old_state == PROCESSOR_RUNNING) || (new_state == PROCESSOR_RUNNING)) { + sched_update_pset_load_average(pset); + } +} + #else /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS diff --git a/osfmk/kern/processor_data.h b/osfmk/kern/processor_data.h index 3f563c08f..8e70723bc 100644 --- a/osfmk/kern/processor_data.h +++ b/osfmk/kern/processor_data.h @@ -76,13 +76,6 @@ struct processor_data { /* VM event counters */ vm_statistics64_data_t 
vm_stat; - /* IPC free message cache */ - struct ikm_cache { -#define IKM_STASH 16 - ipc_kmsg_t entries[IKM_STASH]; - unsigned int avail; - } ikm_cache; - /* waitq prepost cache */ #define WQP_CACHE_MAX 50 struct wqp_cache { @@ -104,6 +97,7 @@ struct processor_data { const char *db_panic_str; va_list *db_panic_args; uint64_t db_panic_options; + void *db_panic_data_ptr; boolean_t db_proceed_on_sync_failure; uint32_t db_entry_count; /* incremented whenever we panic or call Debugger (current CPU panic level) */ kern_return_t db_op_return; diff --git a/osfmk/kern/queue.h b/osfmk/kern/queue.h index 1cdcd4f1b..ee2f141c6 100644 --- a/osfmk/kern/queue.h +++ b/osfmk/kern/queue.h @@ -226,7 +226,6 @@ typedef struct queue_entry *queue_entry_t; #ifdef XNU_KERNEL_PRIVATE #include -#include static inline void __QUEUE_ELT_VALIDATE(queue_entry_t elt) { queue_entry_t elt_next, elt_prev; @@ -730,12 +729,24 @@ movqueue(queue_t _old, queue_t _new) * is the chain field in (*) * Note: * This should only be used with Method 2 queue iteration (element chains) + * + * We insert a compiler barrier after setting the fields in the element + * to ensure that the element is updated before being added to the queue, + * which is especially important because stackshot, which operates from + * debugger context, iterates several queues that use this macro (the tasks + * lists and threads lists) without locks. Without this barrier, the + * compiler may re-order the instructions for this macro in a way that + * could cause stackshot to trip over an inconsistent queue during + * iteration. 
*/ #define queue_enter(head, elt, type, field) \ MACRO_BEGIN \ queue_entry_t __prev; \ \ __prev = (head)->prev; \ + (elt)->field.prev = __prev; \ + (elt)->field.next = head; \ + __compiler_barrier(); \ if ((head) == __prev) { \ (head)->next = (queue_entry_t) (elt); \ } \ @@ -743,8 +754,6 @@ MACRO_BEGIN \ ((type)(void *)__prev)->field.next = \ (queue_entry_t)(elt); \ } \ - (elt)->field.prev = __prev; \ - (elt)->field.next = head; \ (head)->prev = (queue_entry_t) elt; \ MACRO_END diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index d8fe8ee36..43211cb9a 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -167,8 +167,13 @@ #define MAXPRI_THROTTLE (MINPRI + 4) /* 4 */ #define MINPRI_USER MINPRI /* 0 */ -#define DEPRESSPRI MINPRI /* depress priority */ -#define MAXPRI_PROMOTE (MAXPRI_KERNEL) /* ceiling for mutex promotion */ +#define DEPRESSPRI (MINPRI) /* depress priority */ + +#define MAXPRI_PROMOTE (MAXPRI_KERNEL) /* ceiling for mutex promotion */ +#define MINPRI_RWLOCK (BASEPRI_BACKGROUND) /* floor when holding rwlock count */ +#define MINPRI_EXEC (BASEPRI_DEFAULT) /* floor when in exec state */ +#define MINPRI_WAITQ (BASEPRI_DEFAULT) /* floor when in waitq handover state */ + /* Type used for thread->sched_mode and saved_mode */ typedef enum { @@ -182,7 +187,8 @@ typedef enum { typedef enum { TH_BUCKET_RUN = 0, /* All runnable threads */ TH_BUCKET_FIXPRI, /* Fixed-priority */ - TH_BUCKET_SHARE_FG, /* Timeshare thread above BASEPRI_UTILITY */ + TH_BUCKET_SHARE_FG, /* Timeshare thread above BASEPRI_DEFAULT */ + TH_BUCKET_SHARE_DF, /* Timeshare thread between BASEPRI_DEFAULT and BASEPRI_UTILITY */ TH_BUCKET_SHARE_UT, /* Timeshare thread between BASEPRI_UTILITY and MAXPRI_THROTTLE */ TH_BUCKET_SHARE_BG, /* Timeshare thread between MAXPRI_THROTTLE and MINPRI */ TH_BUCKET_MAX, @@ -306,6 +312,8 @@ extern void thread_quantum_expire( extern ast_t csw_check(processor_t processor, ast_t check_reason); +extern void sched_update_generation_count(void); + #if 
defined(CONFIG_SCHED_TIMESHARE_CORE) extern uint32_t std_quantum, min_std_quantum; extern uint32_t std_quantum_us; @@ -338,6 +346,8 @@ extern uint32_t sched_tick_interval; extern uint64_t sched_one_second_interval; /* Periodic computation of various averages */ +extern void compute_sched_load(void); + extern void compute_averages(uint64_t); extern void compute_averunnable( @@ -346,9 +356,6 @@ extern void compute_averunnable( extern void compute_stack_target( void *arg); -extern void compute_memory_pressure( - void *arg); - extern void compute_pageout_gc_throttle( void *arg); diff --git a/osfmk/kern/sched_average.c b/osfmk/kern/sched_average.c index cf9520915..e7b24bb0d 100644 --- a/osfmk/kern/sched_average.c +++ b/osfmk/kern/sched_average.c @@ -110,7 +110,6 @@ static struct sched_average { } sched_average[] = { { compute_averunnable, &sched_nrun, 5, 0 }, { compute_stack_target, NULL, 5, 1 }, - { compute_memory_pressure, NULL, 1, 0 }, { compute_pageout_gc_throttle, NULL, 1, 0 }, { compute_pmap_gc_throttle, NULL, 60, 0 }, #if CONFIG_TELEMETRY @@ -121,15 +120,63 @@ static struct sched_average { typedef struct sched_average *sched_average_t; -uint32_t load_now[TH_BUCKET_MAX]; +/* + * Scheduler load calculation algorithm + * + * The scheduler load values provide an estimate of the number of runnable + * timeshare threads in the system at various priority bands. The load + * ultimately affects the priority shifts applied to all threads in a band + * causing them to timeshare with other threads in the system. The load is + * maintained in buckets, with each bucket corresponding to a priority band. + * + * Each runnable thread on the system contributes its load to its priority + * band and to the bands above it. The contribution of a thread to the bands + * above it is not strictly 1:1 and is weighted based on the priority band + * of the thread. 
The rules of thread load contribution to each of its higher + * bands are as follows: + * + * - DF threads: Up to (2 * NCPUs) threads + * - UT threads: Up to NCPUs threads + * - BG threads: Up to 1 thread + * + * To calculate the load values, the various run buckets are sampled (every + * sched_load_compute_interval_abs) and the weighted contributions of the + * lower bucket threads are added. The resultant value is plugged into an + * exponentially weighted moving average formula: + * new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count + * (where, alpha < 1) + * The calculations for the scheduler load are done using fixpoint math with + * a scale factor of 16 to avoid expensive divides and floating point + * operations. The final load values are a smooth curve representative of + * the actual number of runnable threads in a priority band. + */ + +/* Maintains the current (scaled for fixpoint) load in various buckets */ +uint32_t sched_load[TH_BUCKET_MAX]; -/* The "stdelta" parameter represents the number of scheduler maintenance - * "ticks" that have elapsed since the last invocation, subject to - * integer division imprecision. +/* + * Alpha factor for the EWMA algorithm. The current values are chosen as + * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast + * enough to changing system load but does not see too many spikes from bursty + * activity. The current values ensure that the scheduler would converge + * to the latest load in 2-3 sched_load_compute_interval_abs intervals + * (which amounts to ~30-45ms with current values). 
*/ +#define SCHED_LOAD_EWMA_ALPHA_OLD 6 +#define SCHED_LOAD_EWMA_ALPHA_NEW 10 +#define SCHED_LOAD_EWMA_ALPHA_SHIFT 4 +static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT)); + +/* For fixpoint EWMA, roundup the load to make it converge */ +#define SCHED_LOAD_EWMA_ROUNDUP(load) (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0) +/* Macro to convert scaled sched load to a real load value */ +#define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load)) + +/* + * Routine to capture the latest runnable counts and update sched_load */ void -compute_averages(uint64_t stdelta) +compute_sched_load(void) { /* * Retrieve a snapshot of the current run counts. @@ -138,56 +185,65 @@ compute_averages(uint64_t stdelta) * not byte-by-byte copy. */ uint32_t ncpus = processor_avail_count; + uint32_t load_now[TH_BUCKET_MAX]; load_now[TH_BUCKET_RUN] = sched_run_buckets[TH_BUCKET_RUN]; load_now[TH_BUCKET_FIXPRI] = sched_run_buckets[TH_BUCKET_FIXPRI]; load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG]; + load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF]; load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT]; load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG]; assert(load_now[TH_BUCKET_RUN] >= 0); assert(load_now[TH_BUCKET_FIXPRI] >= 0); - /* Ignore the current thread, which is a running fixpri thread */ - - uint32_t nthreads = load_now[TH_BUCKET_RUN] - 1; - uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI] - 1; + uint32_t nthreads = load_now[TH_BUCKET_RUN]; + uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI]; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE, - load_now[TH_BUCKET_FIXPRI] - 1, load_now[TH_BUCKET_SHARE_FG], + load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]), load_now[TH_BUCKET_SHARE_BG], 
load_now[TH_BUCKET_SHARE_UT], 0); /* * Compute the timeshare priority conversion factor based on loading. * Because our counters may be incremented and accessed * concurrently with respect to each other, we may have - * windows where the invariant (nthreads - nfixpri) == (fg + bg + ut) + * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut) * is broken, so truncate values in these cases. */ - uint32_t timeshare_threads = (nthreads - nfixpri); - for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG ; i++) { if (load_now[i] > timeshare_threads) load_now[i] = timeshare_threads; } + /* + * Default threads contribute up to (NCPUS * 2) of load to FG threads + */ + if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) { + load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF]; + } else { + load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2); + } + /* - * Utility threads contribute up to NCPUS of load to FG threads + * Utility threads contribute up to NCPUS of load to FG & DF threads */ if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) { load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT]; + load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT]; } else { load_now[TH_BUCKET_SHARE_FG] += ncpus; + load_now[TH_BUCKET_SHARE_DF] += ncpus; } /* - * FG and UT should notice there's one thread of competition from BG, - * but no more. 
+ * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads */ if (load_now[TH_BUCKET_SHARE_BG] > 0) { load_now[TH_BUCKET_SHARE_FG] += 1; + load_now[TH_BUCKET_SHARE_DF] += 1; load_now[TH_BUCKET_SHARE_UT] += 1; } @@ -203,6 +259,7 @@ compute_averages(uint64_t stdelta) uint32_t bucket_load = 0; if (load_now[i] > ncpus) { + /* Normalize the load to number of CPUs */ if (ncpus > 1) bucket_load = load_now[i] / ncpus; else @@ -211,7 +268,27 @@ compute_averages(uint64_t stdelta) if (bucket_load > MAX_LOAD) bucket_load = MAX_LOAD; } + /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */ + sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW); + sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT; + } + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE, + SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]), + SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0); +} + +void +compute_averages(uint64_t stdelta) +{ + + uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1; + uint32_t ncpus = processor_avail_count; + + /* Update the global pri_shifts based on the latest values */ + for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG ; i++) { + uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]); sched_pri_shifts[i] = sched_fixed_shift - sched_load_shifts[bucket_load]; } diff --git a/osfmk/kern/sched_dualq.c b/osfmk/kern/sched_dualq.c index f465d803e..855d75853 100644 --- a/osfmk/kern/sched_dualq.c +++ b/osfmk/kern/sched_dualq.c @@ -94,6 +94,9 @@ sched_dualq_processor_queue_shutdown(processor_t processor); static sched_mode_t sched_dualq_initial_thread_sched_mode(task_t parent_task); +static bool 
+sched_dualq_thread_avoid_processor(processor_t processor, thread_t thread); + const struct sched_dispatch_table sched_dualq_dispatch = { .sched_name = "dualq", .init = sched_dualq_init, @@ -126,8 +129,8 @@ const struct sched_dispatch_table sched_dualq_dispatch = { .direct_dispatch_to_idle_processors = FALSE, .multiple_psets_enabled = TRUE, .sched_groups_enabled = FALSE, - .avoid_processor_enabled = FALSE, - .thread_avoid_processor = NULL, + .avoid_processor_enabled = TRUE, + .thread_avoid_processor = sched_dualq_thread_avoid_processor, .processor_balance = sched_SMT_balance, .rt_runq = sched_rtglobal_runq, @@ -251,6 +254,10 @@ sched_dualq_processor_csw_check(processor_t processor) boolean_t has_higher; int pri; + if (sched_dualq_thread_avoid_processor(processor, current_thread())) { + return (AST_PREEMPT | AST_URGENT); + } + run_queue_t main_runq = dualq_main_runq(processor); run_queue_t bound_runq = dualq_bound_runq(processor); @@ -476,4 +483,21 @@ sched_dualq_thread_update_scan(sched_update_scan_context_t scan_context) } while (restart_needed); } +extern int sched_allow_rt_smt; + +/* Return true if this thread should not continue running on this processor */ +static bool +sched_dualq_thread_avoid_processor(processor_t processor, thread_t thread) +{ + if (processor->processor_primary != processor) { + /* + * This is a secondary SMT processor. If the primary is running + * a realtime thread, only allow realtime threads on the secondary. 
+ */ + if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) { + return true; + } + } + return false; +} diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index 508c83bdb..751b57417 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -78,9 +78,7 @@ #include #include -#ifdef CONFIG_MACH_APPROXIMATE_TIME #include -#endif #include #include @@ -108,6 +106,7 @@ #include #include #include +#include #include #include @@ -181,8 +180,13 @@ uint32_t min_rt_quantum; #if defined(CONFIG_SCHED_TIMESHARE_CORE) -unsigned sched_tick; -uint32_t sched_tick_interval; +unsigned sched_tick; +uint32_t sched_tick_interval; + +/* Timeshare load calculation interval (15ms) */ +uint32_t sched_load_compute_interval_us = 15000; +uint64_t sched_load_compute_interval_abs; +static _Atomic uint64_t sched_load_compute_deadline; uint32_t sched_pri_shifts[TH_BUCKET_MAX]; uint32_t sched_fixed_shift; @@ -341,7 +345,7 @@ sched_init_override(void) kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name)); } else { #if defined(CONFIG_SCHED_MULTIQ) - sched_current_dispatch = &sched_multiq_dispatch; + sched_current_dispatch = &sched_dualq_dispatch; #elif defined(CONFIG_SCHED_TRADITIONAL) sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch; #else @@ -379,6 +383,8 @@ sched_init(void) } strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string)); + cpu_quiescent_counter_init(); + SCHED(init)(); SCHED(rt_init)(&pset0); sched_timer_deadline_tracking_init(); @@ -454,6 +460,10 @@ sched_timeshare_timebase_init(void) assert((abstime >> 32) == 0 && (uint32_t)abstime != 0); sched_tick_interval = (uint32_t)abstime; + /* timeshare load calculation interval & deadline initialization */ + clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs); + sched_load_compute_deadline = sched_load_compute_interval_abs; + /* * 
Compute conversion factor from usage to * timesharing priorities with 5/8 ** n aging. @@ -662,6 +672,7 @@ thread_unblock( boolean_t ready_for_runq = FALSE; thread_t cthread = current_thread(); uint32_t new_run_count; + int old_thread_state; /* * Set wait_result. @@ -681,15 +692,20 @@ thread_unblock( * Update scheduling state: not waiting, * set running. */ - thread->state &= ~(TH_WAIT|TH_UNINT); + old_thread_state = thread->state; + thread->state = (old_thread_state | TH_RUN) & + ~(TH_WAIT|TH_UNINT|TH_WAIT_REPORT); - if (!(thread->state & TH_RUN)) { - thread->state |= TH_RUN; - thread->last_made_runnable_time = thread->last_basepri_change_time = mach_approximate_time(); + if ((old_thread_state & TH_RUN) == 0) { + uint64_t ctime = mach_approximate_time(); + thread->last_made_runnable_time = thread->last_basepri_change_time = ctime; + timer_start(&thread->runnable_timer, ctime); ready_for_runq = TRUE; - (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + if (old_thread_state & TH_WAIT_REPORT) { + (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + } /* Update the runnable thread count */ new_run_count = sched_run_incr(thread); @@ -786,6 +802,12 @@ thread_unblock( thread->callout_woke_thread = FALSE; } +#if KPERF + if (ready_for_runq) { + kperf_make_runnable(thread, aticontext); + } +#endif /* KPERF */ + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result, @@ -848,11 +870,12 @@ __private_extern__ wait_result_t thread_mark_wait_locked( thread_t thread, - wait_interrupt_t interruptible) + wait_interrupt_t interruptible_orig) { - boolean_t at_safe_point; + boolean_t at_safe_point; + wait_interrupt_t interruptible = interruptible_orig; - assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2))); + assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2|TH_WAIT_REPORT))); /* * The thread may have certain types of interrupts/aborts 
masked @@ -860,6 +883,7 @@ thread_mark_wait_locked( * are OK, we have to honor mask settings (outer-scoped code may * not be able to handle aborts at the moment). */ + interruptible &= TH_OPT_INTMASK; if (interruptible > (thread->options & TH_OPT_INTMASK)) interruptible = thread->options & TH_OPT_INTMASK; @@ -873,7 +897,20 @@ thread_mark_wait_locked( if ( !(thread->state & TH_TERMINATE)) DTRACE_SCHED(sleep); - thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT); + int state_bits = TH_WAIT; + if (!interruptible) { + state_bits |= TH_UNINT; + } + if (thread->sched_call) { + wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER; + if (is_kerneltask(thread->task)) { + mask = THREAD_WAIT_NOREPORT_KERNEL; + } + if ((interruptible_orig & mask) == 0) { + state_bits |= TH_WAIT_REPORT; + } + } + thread->state |= state_bits; thread->at_safe_point = at_safe_point; /* TODO: pass this through assert_wait instead, have @@ -883,10 +920,10 @@ thread_mark_wait_locked( thread->pending_block_hint = kThreadWaitNone; return (thread->wait_result = THREAD_WAITING); + } else { + if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) + thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; } - else - if (thread->sched_flags & TH_SFLAG_ABORTSAFELY) - thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK; thread->pending_block_hint = kThreadWaitNone; return (thread->wait_result = THREAD_INTERRUPTED); @@ -1745,18 +1782,19 @@ sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) { processor_t sprocessor; sched_ipi_type_t ipi_type = SCHED_IPI_NONE; - qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) { - if ((sprocessor->state == PROCESSOR_RUNNING) && - (sprocessor->processor_primary != sprocessor) && - (sprocessor->processor_primary->state == PROCESSOR_RUNNING) && + uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] & + ~cpset->primary_map); + for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) { 
+ sprocessor = processor_array[cpuid]; + if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) && (sprocessor->current_pri < BASEPRI_RTQUEUES)) { - ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL); - if (ipi_type != SCHED_IPI_NONE) { - assert(sprocessor != cprocessor); - ast_processor = sprocessor; - break; - } + ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL); + if (ipi_type != SCHED_IPI_NONE) { + assert(sprocessor != cprocessor); + ast_processor = sprocessor; + break; + } } } @@ -1830,7 +1868,7 @@ thread_select(thread_t thread, * choose_processor(), so in those cases we should continue trying to dequeue work. */ if (!SCHED(processor_bound_count)(processor)) { - if (!queue_empty(&pset->idle_queue)) { + if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) { goto idle; } @@ -2036,18 +2074,7 @@ thread_select(thread_t thread, * was running. */ if (processor->state == PROCESSOR_RUNNING) { - processor->state = PROCESSOR_IDLE; - - if (!processor->is_recommended) { - re_queue_head(&pset->unused_queue, &processor->processor_queue); - } else if (processor->processor_primary == processor) { - re_queue_head(&pset->idle_queue, &processor->processor_queue); - } else { - re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue); - } - - pset->active_processor_count--; - sched_update_pset_load_average(pset); + pset_update_processor_state(pset, processor, PROCESSOR_IDLE); } #if __SMP__ @@ -2127,7 +2154,8 @@ thread_select_idle( #endif thread->last_run_time = processor->last_dispatch; - thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer); + processor_timer_switch_thread(processor->last_dispatch, + &processor->idle_thread->system_timer); PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer; @@ -2137,7 +2165,9 @@ thread_select_idle( timer_call_quantum_timer_cancel(&processor->quantum_timer); 
processor->first_timeslice = FALSE; - (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + if (thread->sched_call) { + (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + } thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL); @@ -2150,7 +2180,9 @@ thread_select_idle( /* * Return at splsched. */ - (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + if (thread->sched_call) { + (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread); + } thread_lock(thread); @@ -2159,16 +2191,17 @@ thread_select_idle( * Otherwise skip; we will context switch to another thread or return here. */ if (!(thread->state & TH_WAIT)) { - processor->last_dispatch = mach_absolute_time(); - thread_timer_event(processor->last_dispatch, &thread->system_timer); + uint64_t time_now = processor->last_dispatch = mach_absolute_time(); + processor_timer_switch_thread(time_now, &thread->system_timer); + timer_update(&thread->runnable_timer, time_now); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; thread_quantum_init(thread); - processor->quantum_end = processor->last_dispatch + thread->quantum_remaining; + processor->quantum_end = time_now + thread->quantum_remaining; timer_call_quantum_timer_enter(&processor->quantum_timer, - thread, processor->quantum_end, processor->last_dispatch); + thread, processor->quantum_end, time_now); processor->first_timeslice = TRUE; - thread->computation_epoch = processor->last_dispatch; + thread->computation_epoch = time_now; } thread->state &= ~TH_IDLE; @@ -2262,7 +2295,7 @@ thread_invoke( /* Prepare for spin debugging */ #if INTERRUPT_MASKED_DEBUG - ml_spin_debug_clear(thread); + ml_spin_debug_clear(thread); #endif if (continuation != NULL) { @@ -2300,7 +2333,8 @@ thread_invoke( processor->last_dispatch = ctime; self->last_run_time = ctime; - thread_timer_event(ctime, &thread->system_timer); + processor_timer_switch_thread(ctime, &thread->system_timer); + timer_update(&thread->runnable_timer, ctime); PROCESSOR_DATA(processor, kernel_timer) = 
&thread->system_timer; /* @@ -2308,11 +2342,9 @@ thread_invoke( * during privilege transitions, synthesize an event now. */ if (!thread->precise_user_kernel_time) { - timer_switch(PROCESSOR_DATA(processor, current_state), - ctime, - PROCESSOR_DATA(processor, current_state)); + timer_update(PROCESSOR_DATA(processor, current_state), ctime); } - + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE, self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); @@ -2326,11 +2358,15 @@ thread_invoke( SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); +#if KPERF + kperf_off_cpu(self); +#endif /* KPERF */ + TLOG(1, "thread_invoke: calling stack_handoff\n"); stack_handoff(self, thread); /* 'self' is now off core */ - assert(thread == current_thread()); + assert(thread == current_thread_volatile()); DTRACE_SCHED(on__cpu); @@ -2338,21 +2374,20 @@ thread_invoke( kperf_on_cpu(thread, continuation, NULL); #endif /* KPERF */ + thread_dispatch(self, thread); + #if KASAN - kasan_unpoison_fakestack(self); + /* Old thread's stack has been moved to the new thread, so explicitly + * unpoison it. 
*/ kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); #endif - thread_dispatch(self, thread); - thread->continuation = thread->parameter = NULL; counter(c_thread_invoke_hits++); - (void) spllo(); - assert(continuation); - call_continuation(continuation, parameter, thread->wait_result); + call_continuation(continuation, parameter, thread->wait_result, TRUE); /*NOTREACHED*/ } else if (thread == self) { @@ -2371,15 +2406,16 @@ thread_invoke( self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0); #if KASAN - kasan_unpoison_fakestack(self); + /* stack handoff to self - no thread_dispatch(), so clear the stack + * and free the fakestack directly */ + kasan_fakestack_drop(self); + kasan_fakestack_gc(self); kasan_unpoison_stack(self->kernel_stack, kernel_stack_size); #endif self->continuation = self->parameter = NULL; - (void) spllo(); - - call_continuation(continuation, parameter, self->wait_result); + call_continuation(continuation, parameter, self->wait_result, TRUE); /*NOTREACHED*/ } } else { @@ -2431,7 +2467,8 @@ thread_invoke( processor->last_dispatch = ctime; self->last_run_time = ctime; - thread_timer_event(ctime, &thread->system_timer); + processor_timer_switch_thread(ctime, &thread->system_timer); + timer_update(&thread->runnable_timer, ctime); PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer; /* @@ -2439,9 +2476,7 @@ thread_invoke( * during privilege transitions, synthesize an event now. */ if (!thread->precise_user_kernel_time) { - timer_switch(PROCESSOR_DATA(processor, current_state), - ctime, - PROCESSOR_DATA(processor, current_state)); + timer_update(PROCESSOR_DATA(processor, current_state), ctime); } KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, @@ -2457,6 +2492,10 @@ thread_invoke( SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri); +#if KPERF + kperf_off_cpu(self); +#endif /* KPERF */ + /* * This is where we actually switch register context, * and address space if required. 
We will next run @@ -2474,7 +2513,7 @@ thread_invoke( */ assert(continuation == self->continuation); thread = machine_switch_context(self, continuation, thread); - assert(self == current_thread()); + assert(self == current_thread_volatile()); TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread); DTRACE_SCHED(on__cpu); @@ -2491,9 +2530,7 @@ thread_invoke( if (continuation) { self->continuation = self->parameter = NULL; - (void) spllo(); - - call_continuation(continuation, parameter, self->wait_result); + call_continuation(continuation, parameter, self->wait_result, TRUE); /*NOTREACHED*/ } @@ -2537,11 +2574,14 @@ pset_cancel_deferred_dispatch( * correct (we won't accidentally have a runnable thread that hasn't been * dispatched to an idle processor), if not ideal (we may be restarting the * dispatch process, which could have some overhead). - * */ - if ((sampled_sched_run_count == 1) && - (pset->pending_deferred_AST_cpu_mask)) { - qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) { + + if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) { + uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] & + pset->pending_deferred_AST_cpu_mask & + ~pset->pending_AST_cpu_mask); + for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) { + active_processor = processor_array[cpuid]; /* * If a processor is DISPATCHING, it could be because of * a cancelable signal. @@ -2563,35 +2603,16 @@ pset_cancel_deferred_dispatch( * should be no different than if the core took some * interrupt while IDLE. 
*/ - if ((active_processor->state == PROCESSOR_DISPATCHING) && - (bit_test(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id)) && - (!bit_test(pset->pending_AST_cpu_mask, active_processor->cpu_id)) && - (active_processor != processor)) { + if (active_processor != processor) { /* * Squash all of the processor state back to some * reasonable facsimile of PROCESSOR_IDLE. - * - * TODO: What queue policy do we actually want here? - * We want to promote selection of a good processor - * to run on. Do we want to enqueue at the head? - * The tail? At the (relative) old position in the - * queue? Or something else entirely? */ - if (!active_processor->is_recommended) { - re_queue_head(&pset->unused_queue, &active_processor->processor_queue); - } else if (active_processor->processor_primary == active_processor) { - re_queue_head(&pset->idle_queue, &active_processor->processor_queue); - } else { - re_queue_head(&pset->idle_secondary_queue, &active_processor->processor_queue); - } - - pset->active_processor_count--; - sched_update_pset_load_average(pset); assert(active_processor->next_thread == THREAD_NULL); processor_state_update_idle(active_processor); active_processor->deadline = UINT64_MAX; - active_processor->state = PROCESSOR_IDLE; + pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE); bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id); machine_signal_idle_cancel(active_processor); } @@ -2639,7 +2660,7 @@ thread_dispatch( processor_t processor = self->last_processor; assert(processor == current_processor()); - assert(self == current_thread()); + assert(self == current_thread_volatile()); assert(thread != self); if (thread != THREAD_NULL) { @@ -2656,7 +2677,25 @@ thread_dispatch( * - We do not want to callout if "thread" is NULL. */ thread_csw_callout(thread, self, processor->last_dispatch); - + +#if KASAN + if (thread->continuation != NULL) { + /* + * Thread has a continuation and the normal stack is going away. 
+ * Unpoison the stack and mark all fakestack objects as unused. + */ + kasan_fakestack_drop(thread); + if (thread->kernel_stack) { + kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size); + } + } + + /* + * Free all unused fakestack objects. + */ + kasan_fakestack_gc(thread); +#endif + /* * If blocked at a continuation, discard * the stack. @@ -2785,8 +2824,9 @@ thread_dispatch( if (reason & AST_REBALANCE) { options |= SCHED_REBALANCE; if (reason & AST_QUANTUM) { - /* Having gone to the trouble of forcing this thread off a less preferred core, - * we should force the preferable core to reschedule immediatey to give this + /* + * Having gone to the trouble of forcing this thread off a less preferred core, + * we should force the preferable core to reschedule immediately to give this * thread a chance to run instead of just sitting on the run queue where * it may just be stolen back by the idle core we just forced it off. * But only do this at the end of a quantum to prevent cascading effects. 
@@ -2825,27 +2865,30 @@ thread_dispatch( */ boolean_t should_terminate = FALSE; uint32_t new_run_count; + int thread_state = thread->state; /* Only the first call to thread_dispatch * after explicit termination should add * the thread to the termination queue */ - if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) { + if ((thread_state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) { should_terminate = TRUE; - thread->state |= TH_TERMINATE2; + thread_state |= TH_TERMINATE2; } - thread->state &= ~TH_RUN; + timer_stop(&thread->runnable_timer, processor->last_dispatch); + + thread_state &= ~TH_RUN; + thread->state = thread_state; + thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE; thread->chosen_processor = PROCESSOR_NULL; new_run_count = sched_run_decr(thread); #if CONFIG_SCHED_SFI - if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) { - if (thread->reason & AST_SFI) { - thread->wait_sfi_begin_time = processor->last_dispatch; - } + if (thread->reason & AST_SFI) { + thread->wait_sfi_begin_time = processor->last_dispatch; } #endif @@ -2853,10 +2896,12 @@ thread_dispatch( KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), thread->reason, thread->state, + (uintptr_t)thread_tid(thread), thread->reason, thread_state, new_run_count, 0); - (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + if (thread_state & TH_WAIT_REPORT) { + (*thread->sched_call)(SCHED_CALL_BLOCK, thread); + } if (thread->wake_active) { thread->wake_active = FALSE; @@ -2949,7 +2994,6 @@ thread_dispatch( pset_cancel_deferred_dispatch(processor->processor_set, processor); } #endif - } /* @@ -3105,11 +3149,10 @@ thread_continue( ml_spin_debug_clear(self); #endif - if (thread != THREAD_NULL) - (void)spllo(); - - TLOG(1, "thread_continue: calling call_continuation \n"); - call_continuation(continuation, parameter, self->wait_result); + TLOG(1, "thread_continue: 
calling call_continuation\n"); + + boolean_t enable_interrupts = thread != THREAD_NULL; + call_continuation(continuation, parameter, self->wait_result, enable_interrupts); /*NOTREACHED*/ } @@ -3347,8 +3390,9 @@ realtime_setrun( processor_t processor, thread_t thread) { - processor_set_t pset = processor->processor_set; - ast_t preempt; + processor_set_t pset = processor->processor_set; + pset_assert_locked(pset); + ast_t preempt; sched_ipi_type_t ipi_type = SCHED_IPI_NONE; @@ -3362,15 +3406,11 @@ realtime_setrun( */ if ( (thread->bound_processor == processor) && processor->state == PROCESSOR_IDLE) { - re_queue_tail(&pset->active_queue, &processor->processor_queue); - - pset->active_processor_count++; - sched_update_pset_load_average(pset); processor->next_thread = thread; processor_state_update_from_thread(processor, thread); processor->deadline = thread->realtime.deadline; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR); pset_unlock(pset); @@ -3390,15 +3430,10 @@ realtime_setrun( ipi_type = SCHED_IPI_NONE; if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - re_queue_tail(&pset->active_queue, &processor->processor_queue); - - pset->active_processor_count++; - sched_update_pset_load_average(pset); - processor->next_thread = THREAD_NULL; processor_state_update_from_thread(processor, thread); processor->deadline = thread->realtime.deadline; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); if (processor == current_processor()) { ast_on(preempt); } else { @@ -3557,8 +3592,9 @@ processor_setrun( thread_t thread, integer_t options) { - processor_set_t pset = processor->processor_set; - ast_t preempt; + processor_set_t pset = processor->processor_set; + pset_assert_locked(pset); + ast_t preempt; enum { eExitIdle, eInterruptRunning, eDoNothing } 
ipi_action = eDoNothing; sched_ipi_type_t ipi_type = SCHED_IPI_NONE; @@ -3572,15 +3608,10 @@ processor_setrun( thread->bound_processor == processor) && processor->state == PROCESSOR_IDLE) { - re_queue_tail(&pset->active_queue, &processor->processor_queue); - - pset->active_processor_count++; - sched_update_pset_load_average(pset); - processor->next_thread = thread; processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR); pset_unlock(pset); @@ -3607,17 +3638,25 @@ processor_setrun( } else preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; + if ((options & (SCHED_PREEMPT|SCHED_REBALANCE)) == (SCHED_PREEMPT|SCHED_REBALANCE)) { + /* + * Having gone to the trouble of forcing this thread off a less preferred core, + * we should force the preferable core to reschedule immediately to give this + * thread a chance to run instead of just sitting on the run queue where + * it may just be stolen back by the idle core we just forced it off. 
+ */ + preempt |= AST_PREEMPT; + } + SCHED(processor_enqueue)(processor, thread, options); sched_update_pset_load_average(pset); if (preempt != AST_NONE) { if (processor->state == PROCESSOR_IDLE) { - re_queue_tail(&pset->active_queue, &processor->processor_queue); - pset->active_processor_count++; processor->next_thread = THREAD_NULL; processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ipi_action = eExitIdle; } else if ( processor->state == PROCESSOR_DISPATCHING) { if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) { @@ -3638,15 +3677,11 @@ processor_setrun( thread->sched_pri >= processor->current_pri ) { ipi_action = eInterruptRunning; } else if (processor->state == PROCESSOR_IDLE) { - re_queue_tail(&pset->active_queue, &processor->processor_queue); - - pset->active_processor_count++; - // sched_update_pset_load_average(pset); processor->next_thread = THREAD_NULL; processor_state_update_from_thread(processor, thread); processor->deadline = UINT64_MAX; - processor->state = PROCESSOR_DISPATCHING; + pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING); ipi_action = eExitIdle; } @@ -3701,11 +3736,12 @@ choose_next_pset( */ processor_t choose_processor( - processor_set_t pset, - processor_t processor, - thread_t thread) + processor_set_t starting_pset, + processor_t processor, + thread_t thread) { - processor_set_t nset, cset = pset; + processor_set_t pset = starting_pset; + processor_set_t nset; assert(thread->sched_pri <= BASEPRI_RTQUEUES); @@ -3821,16 +3857,19 @@ choose_processor( } do { - /* * Choose an idle processor, in pset traversal order */ - qe_foreach_element(processor, &cset->idle_queue, processor_queue) { - if (bit_test(cset->pending_AST_cpu_mask, processor->cpu_id)) { - continue; - } - if (processor->is_recommended) - return processor; + + uint64_t 
idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & + pset->primary_map & + pset->recommended_bitmask & + ~pset->pending_AST_cpu_mask); + + int cpuid = lsb_first(idle_primary_map); + if (cpuid >= 0) { + processor = processor_array[cpuid]; + return processor; } /* @@ -3838,14 +3877,13 @@ choose_processor( * with lower priority/etc. */ - qe_foreach_element(processor, &cset->active_queue, processor_queue) { - - if (!processor->is_recommended) { - continue; - } - if (bit_test(cset->pending_AST_cpu_mask, processor->cpu_id)) { - continue; - } + uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) & + pset->recommended_bitmask & + ~pset->pending_AST_cpu_mask); + active_map = bit_ror64(active_map, (pset->last_chosen + 1)); + for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) { + cpuid = ((rotid + pset->last_chosen + 1) & 63); + processor = processor_array[cpuid]; integer_t cpri = processor->current_pri; if (processor->processor_primary != processor) { @@ -3876,15 +3914,20 @@ choose_processor( * For SMT configs, these idle secondary processors must have active primary. 
Otherwise * the idle primary would have short-circuited the loop above */ - qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) { + uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] & + ~pset->primary_map & + pset->recommended_bitmask & + ~pset->pending_AST_cpu_mask); - if (!processor->is_recommended) { - continue; - } + for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) { + processor = processor_array[cpuid]; processor_t cprimary = processor->processor_primary; - if (bit_test(cset->pending_AST_cpu_mask, cprimary->cpu_id)) { + if (!cprimary->is_recommended) { + continue; + } + if (bit_test(pset->pending_AST_cpu_mask, cprimary->cpu_id)) { continue; } @@ -3911,16 +3954,15 @@ choose_processor( */ if (thread->sched_pri > lowest_unpaired_primary_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue); + pset->last_chosen = lp_unpaired_primary_processor->cpu_id; return lp_unpaired_primary_processor; } if (thread->sched_pri > lowest_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, &lp_processor->processor_queue); + pset->last_chosen = lp_processor->cpu_id; return lp_processor; } if (sched_allow_rt_smt && (thread->sched_pri > lowest_secondary_priority)) { + pset->last_chosen = lp_paired_secondary_processor->cpu_id; return lp_paired_secondary_processor; } if (thread->realtime.deadline < furthest_deadline) @@ -3935,13 +3977,11 @@ choose_processor( else { if (thread->sched_pri > lowest_unpaired_primary_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue); + pset->last_chosen = lp_unpaired_primary_processor->cpu_id; return lp_unpaired_primary_processor; } if 
(thread->sched_pri > lowest_priority) { - /* Move to end of active queue so that the next thread doesn't also pick it */ - re_queue_tail(&cset->active_queue, &lp_processor->processor_queue); + pset->last_chosen = lp_processor->cpu_id; return lp_processor; } @@ -3955,15 +3995,15 @@ choose_processor( /* * Move onto the next processor set. */ - nset = next_pset(cset); + nset = next_pset(pset); - if (nset != pset) { - pset_unlock(cset); + if (nset != starting_pset) { + pset_unlock(pset); - cset = nset; - pset_lock(cset); + pset = nset; + pset_lock(pset); } - } while (nset != pset); + } while (nset != starting_pset); /* * Make sure that we pick a running processor, @@ -4001,10 +4041,10 @@ choose_processor( * Check that the correct processor set is * returned locked. */ - if (cset != processor->processor_set) { - pset_unlock(cset); - cset = processor->processor_set; - pset_lock(cset); + if (pset != processor->processor_set) { + pset_unlock(pset); + pset = processor->processor_set; + pset_lock(pset); } /* @@ -4019,11 +4059,8 @@ choose_processor( } while (processor == PROCESSOR_NULL); - if (processor->state == PROCESSOR_RUNNING) { - re_queue_tail(&cset->active_queue, &processor->processor_queue); - } - - return (processor); + pset->last_chosen = processor->cpu_id; + return processor; } /* @@ -4208,21 +4245,22 @@ csw_check_locked( } } - result = SCHED(processor_csw_check)(processor); - if (result != AST_NONE) - return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE)); - #if __SMP__ - /* - * If the current thread is running on a processor that is no longer recommended, gently - * (non-urgently) get to a point and then block, and which point thread_select() should + * If the current thread is running on a processor that is no longer recommended, + * urgently preempt it, at which point thread_select() should * try to idle the processor and re-dispatch the thread to a recommended processor. 
*/ if (!processor->is_recommended) { - return (check_reason | AST_PREEMPT); + return (check_reason | AST_PREEMPT | AST_URGENT); } +#endif + result = SCHED(processor_csw_check)(processor); + if (result != AST_NONE) + return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE)); + +#if __SMP__ /* * Same for avoid-processor * @@ -4239,11 +4277,6 @@ csw_check_locked( * TODO: Should this do the same check that thread_select does? i.e. * if no bound threads target this processor, and idle primaries exist, preempt * The case of RT threads existing is already taken care of above - * Consider Capri in this scenario. - * - * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue)) - * - * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine. */ if (processor->current_pri < BASEPRI_RTQUEUES && @@ -4279,7 +4312,8 @@ csw_check_locked( void set_sched_pri( thread_t thread, - int new_priority) + int new_priority, + set_sched_pri_options_t options) { thread_t cthread = current_thread(); boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE; @@ -4287,6 +4321,8 @@ set_sched_pri( uint64_t urgency_param1, urgency_param2; boolean_t removed_from_runq = FALSE; + bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY); + int old_priority = thread->sched_pri; /* If we're already at this priority, no need to mess with the runqueue */ @@ -4337,13 +4373,14 @@ set_sched_pri( * If a thread raises its own priority, don't aggressively rebalance it. 
* */ - if (new_priority < old_priority) { + if (!lazy_update && new_priority < old_priority) { ast_t preempt; if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE) ast_on(preempt); } - } else if (processor != PROCESSOR_NULL && processor->active_thread == thread) { + } else if (!lazy_update && processor != PROCESSOR_NULL && + processor != current_processor() && processor->active_thread == thread) { cause_ast_check(processor); } } @@ -4479,12 +4516,12 @@ thread_run_queue_reinsert(thread_t thread, integer_t options) } void -sys_override_cpu_throttle(int flag) +sys_override_cpu_throttle(boolean_t enable_override) { - if (flag == CPU_THROTTLE_ENABLE) - cpu_throttle_enabled = 1; - if (flag == CPU_THROTTLE_DISABLE) + if (enable_override) cpu_throttle_enabled = 0; + else + cpu_throttle_enabled = 1; } int @@ -4574,10 +4611,13 @@ processor_idle( SCHED_STATS_CPU_IDLE_START(processor); - timer_switch(&PROCESSOR_DATA(processor, system_state), - mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state)); + uint64_t ctime = mach_absolute_time(); + + timer_switch(&PROCESSOR_DATA(processor, system_state), ctime, &PROCESSOR_DATA(processor, idle_state)); PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state); + cpu_quiescent_counter_leave(ctime); + while (1) { /* * Ensure that updates to my processor and pset state, @@ -4622,6 +4662,17 @@ processor_idle( (void)splsched(); + /* + * Check if we should call sched_timeshare_consider_maintenance() here. + * The CPU was woken out of idle due to an interrupt and we should do the + * call only if the processor is still idle. If the processor is non-idle, + * the threads running on the processor would do the call as part of + * context swithing. 
+ */ + if (processor->state == PROCESSOR_IDLE) { + sched_timeshare_consider_maintenance(mach_absolute_time()); + } + IDLE_KERNEL_DEBUG_CONSTANT( MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0); @@ -4634,10 +4685,13 @@ processor_idle( } } - timer_switch(&PROCESSOR_DATA(processor, idle_state), - mach_absolute_time(), &PROCESSOR_DATA(processor, system_state)); + ctime = mach_absolute_time(); + + timer_switch(&PROCESSOR_DATA(processor, idle_state), ctime, &PROCESSOR_DATA(processor, system_state)); PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state); + cpu_quiescent_counter_join(ctime); + pset_lock(pset); /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */ @@ -4653,7 +4707,7 @@ processor_idle( */ new_thread = processor->next_thread; processor->next_thread = THREAD_NULL; - processor->state = PROCESSOR_RUNNING; + pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE) || (rt_runq_count(pset) > 0)) ) { @@ -4686,12 +4740,7 @@ processor_idle( return (new_thread); } else if (state == PROCESSOR_IDLE) { - re_queue_tail(&pset->active_queue, &processor->processor_queue); - - pset->active_processor_count++; - sched_update_pset_load_average(pset); - - processor->state = PROCESSOR_RUNNING; + pset_update_processor_state(pset, processor, PROCESSOR_RUNNING); processor_state_update_idle(processor); processor->deadline = UINT64_MAX; @@ -4935,8 +4984,8 @@ sched_timeshare_maintenance_continue(void) #endif /* DEBUG || DEVELOPMENT */ KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END, - sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG], - sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0); + sched_pri_shifts[TH_BUCKET_SHARE_FG], 
sched_pri_shifts[TH_BUCKET_SHARE_BG], + sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0); assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT); thread_block((thread_continue_t)sched_timeshare_maintenance_continue); @@ -4957,14 +5006,17 @@ static uint64_t sched_maintenance_wakeups; */ void sched_timeshare_consider_maintenance(uint64_t ctime) { - uint64_t ndeadline, deadline = sched_maintenance_deadline; + + cpu_quiescent_counter_checkin(ctime); + + uint64_t deadline = sched_maintenance_deadline; if (__improbable(ctime >= deadline)) { if (__improbable(current_thread() == sched_maintenance_thread)) return; OSMemoryBarrier(); - ndeadline = ctime + sched_tick_interval; + uint64_t ndeadline = ctime + sched_tick_interval; if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) { thread_wakeup((event_t)sched_timeshare_maintenance_continue); @@ -4972,6 +5024,18 @@ sched_timeshare_consider_maintenance(uint64_t ctime) { } } + uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed); + + if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) { + uint64_t new_deadline = 0; + if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline, + memory_order_relaxed, memory_order_relaxed)) { + compute_sched_load(); + new_deadline = ctime + sched_load_compute_interval_abs; + __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed); + } + } + #if __arm64__ uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed); @@ -5241,7 +5305,8 @@ sched_timer_deadline_tracking_init(void) { uint32_t perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED; uint32_t perfcontrol_requested_recommended_core_count = MAX_CPUS; -boolean_t perfcontrol_failsafe_active = FALSE; +bool perfcontrol_failsafe_active = false; +bool 
perfcontrol_sleep_override = false; uint64_t perfcontrol_failsafe_maintenance_runnable_time; uint64_t perfcontrol_failsafe_activation_time; @@ -5279,7 +5344,7 @@ sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores) perfcontrol_requested_recommended_cores = recommended_cores; perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores); - if (perfcontrol_failsafe_active == FALSE) + if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false)) sched_update_recommended_cores(perfcontrol_requested_recommended_cores); else KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, @@ -5291,6 +5356,36 @@ sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores) splx(s); } +void +sched_override_recommended_cores_for_sleep(void) +{ + spl_t s = splsched(); + simple_lock(&sched_recommended_cores_lock); + + if (perfcontrol_sleep_override == false) { + perfcontrol_sleep_override = true; + sched_update_recommended_cores(ALL_CORES_RECOMMENDED); + } + + simple_unlock(&sched_recommended_cores_lock); + splx(s); +} + +void +sched_restore_recommended_cores_after_sleep(void) +{ + spl_t s = splsched(); + simple_lock(&sched_recommended_cores_lock); + + if (perfcontrol_sleep_override == true) { + perfcontrol_sleep_override = false; + sched_update_recommended_cores(perfcontrol_requested_recommended_cores); + } + + simple_unlock(&sched_recommended_cores_lock); + splx(s); +} + /* * Consider whether we need to activate the recommended cores failsafe * @@ -5506,12 +5601,11 @@ sched_update_recommended_cores(uint32_t recommended_cores) processor = processor_list; pset = processor->processor_set; - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START, - recommended_cores, perfcontrol_failsafe_active, 0, 0, 0); + KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START, + recommended_cores, perfcontrol_failsafe_active, 0, 0); if 
(__builtin_popcount(recommended_cores) == 0) { - recommended_cores |= 0x1U; /* add boot processor or we hang */ + bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */ } /* First set recommended cores */ @@ -5525,19 +5619,13 @@ sched_update_recommended_cores(uint32_t recommended_cores) pset_lock(pset); } - pset->recommended_bitmask = recommended_cores; - - if (recommended_cores & (1ULL << processor->cpu_id)) { + if (bit_test(recommended_cores, processor->cpu_id)) { processor->is_recommended = TRUE; + bit_set(pset->recommended_bitmask, processor->cpu_id); if (processor->state == PROCESSOR_IDLE) { - if (processor->processor_primary == processor) { - re_queue_head(&pset->idle_queue, &processor->processor_queue); - } else { - re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue); - } if (processor != current_processor()) { - needs_exit_idle_mask |= (1ULL << processor->cpu_id); + bit_set(needs_exit_idle_mask, processor->cpu_id); } } } @@ -5558,16 +5646,28 @@ sched_update_recommended_cores(uint32_t recommended_cores) pset_lock(pset); } - if (!(recommended_cores & (1ULL << processor->cpu_id))) { + if (!bit_test(recommended_cores, processor->cpu_id)) { + sched_ipi_type_t ipi_type = SCHED_IPI_NONE; + processor->is_recommended = FALSE; - if (processor->state == PROCESSOR_IDLE) { - re_queue_head(&pset->unused_queue, &processor->processor_queue); + bit_clear(pset->recommended_bitmask, processor->cpu_id); + + if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) { + ipi_type = SCHED_IPI_IMMEDIATE; } SCHED(processor_queue_shutdown)(processor); /* pset unlocked */ SCHED(rt_queue_shutdown)(processor); + if (ipi_type != SCHED_IPI_NONE) { + if (processor == current_processor()) { + ast_on(AST_PREEMPT); + } else { + sched_ipi_perform(processor, ipi_type); + } + } + pset_lock(pset); } } while ((processor = processor->processor_list) != NULL); @@ -5579,9 +5679,8 @@ 
sched_update_recommended_cores(uint32_t recommended_cores) machine_signal_idle(processor); } - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END, - needs_exit_idle_mask, 0, 0, 0, 0); + KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END, + needs_exit_idle_mask, 0, 0, 0); } #endif /* __arm__ || __arm64__ */ @@ -5661,25 +5760,10 @@ sched_perfcontrol_update_callback_deadline(uint64_t new_deadline) #endif /* __arm64__ */ -int -sched_get_pset_load_average(processor_set_t pset) -{ - return pset->load_average >> (PSET_LOAD_NUMERATOR_SHIFT - PSET_LOAD_FRACTIONAL_SHIFT); -} - void sched_update_pset_load_average(processor_set_t pset) { -#if DEBUG - queue_entry_t iter; - int count = 0; - qe_foreach(iter, &pset->active_queue) { - count++; - } - assertf(count == pset->active_processor_count, "count %d pset->active_processor_count %d\n", count, pset->active_processor_count); -#endif - - int load = ((pset->active_processor_count + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT); + int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT); int new_load_average = (pset->load_average + load) >> 1; pset->load_average = new_load_average; @@ -5761,7 +5845,22 @@ all_available_primaries_are_running_realtime_threads(processor_set_t pset) continue; } - if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) { + if (processor->state == PROCESSOR_IDLE) { + return false; + } + + if (processor->state == PROCESSOR_DISPATCHING) { + return false; + } + + if (processor->state != PROCESSOR_RUNNING) { + /* + * All other processor states are considered unavailable to run + * realtime threads. 
In particular, we prefer an available secondary + * processor over the risk of leaving a realtime thread on the run queue + * while waiting for a processor in PROCESSOR_START state, + * which should anyway be a rare case. + */ continue; } @@ -5772,3 +5871,5 @@ all_available_primaries_are_running_realtime_threads(processor_set_t pset) return true; } + + diff --git a/osfmk/kern/sched_prim.h b/osfmk/kern/sched_prim.h index c6361a9ca..bd67f6869 100644 --- a/osfmk/kern/sched_prim.h +++ b/osfmk/kern/sched_prim.h @@ -77,8 +77,6 @@ #ifdef MACH_KERNEL_PRIVATE -#include - /* Initialization */ extern void sched_init(void); @@ -144,12 +142,23 @@ extern void thread_continue( extern void call_continuation( thread_continue_t continuation, void *parameter, - wait_result_t wresult); + wait_result_t wresult, + boolean_t enable_interrupts); + +/* + * Flags that can be passed to set_sched_pri + * to skip side effects + */ +typedef enum { + SETPRI_DEFAULT = 0x0, + SETPRI_LAZY = 0x1, /* Avoid setting AST flags or sending IPIs */ +} set_sched_pri_options_t; /* Set the current scheduled priority */ -extern void set_sched_pri( - thread_t thread, - int priority); +extern void set_sched_pri( + thread_t thread, + int priority, + set_sched_pri_options_t options); /* Set base priority of the specified thread */ extern void sched_set_thread_base_priority( @@ -166,16 +175,22 @@ extern void sched_thread_mode_demote(thread_t thread, extern void sched_thread_mode_undemote(thread_t thread, uint32_t reason); +extern void sched_thread_promote_to_pri(thread_t thread, int priority, uintptr_t trace_obj); +extern void sched_thread_update_promotion_to_pri(thread_t thread, int priority, uintptr_t trace_obj); +extern void sched_thread_unpromote(thread_t thread, uintptr_t trace_obj); + +extern void assert_promotions_invariant(thread_t thread); + +extern void sched_thread_promote_reason(thread_t thread, uint32_t reason, uintptr_t trace_obj); +extern void sched_thread_unpromote_reason(thread_t thread, uint32_t 
reason, uintptr_t trace_obj); + /* Re-evaluate base priority of thread (thread locked) */ void thread_recompute_priority(thread_t thread); -/* Re-evaluate base priority of thread (thread unlocked) */ -void thread_recompute_qos(thread_t thread); - -/* Reset scheduled priority of thread */ -extern void thread_recompute_sched_pri( - thread_t thread, - boolean_t override_depress); +/* Re-evaluate scheduled priority of thread (thread locked) */ +extern void thread_recompute_sched_pri( + thread_t thread, + set_sched_pri_options_t options); /* Periodic scheduler activity */ extern void sched_init_thread(void (*)(void)); @@ -435,12 +450,6 @@ extern void active_rt_threads( extern perfcontrol_class_t thread_get_perfcontrol_class( thread_t thread); -#define PSET_LOAD_NUMERATOR_SHIFT 16 -#define PSET_LOAD_FRACTIONAL_SHIFT 4 - -extern int sched_get_pset_load_average(processor_set_t pset); -extern void sched_update_pset_load_average(processor_set_t pset); - /* Generic routine for Non-AMP schedulers to calculate parallelism */ extern uint32_t sched_qos_max_parallelism(int qos, uint64_t options); @@ -451,9 +460,7 @@ __BEGIN_DECLS #ifdef XNU_KERNEL_PRIVATE /* Toggles a global override to turn off CPU Throttling */ -#define CPU_THROTTLE_DISABLE 0 -#define CPU_THROTTLE_ENABLE 1 -extern void sys_override_cpu_throttle(int flag); +extern void sys_override_cpu_throttle(boolean_t enable_override); /* ****************** Only exported until BSD stops using ******************** @@ -479,7 +486,11 @@ extern char sched_string[SCHED_STRING_MAX_LENGTH]; extern thread_t port_name_to_thread_for_ulock(mach_port_name_t thread_name); /* Attempt to context switch to a specific runnable thread */ -extern wait_result_t thread_handoff(thread_t thread); +extern wait_result_t thread_handoff_deallocate(thread_t thread); + +__attribute__((nonnull(1, 2))) +extern void thread_handoff_parameter(thread_t thread, + thread_continue_t continuation, void *parameter) __dead2; extern struct waitq 
*assert_wait_queue(event_t event); @@ -498,9 +509,14 @@ extern void thread_set_pending_block_hint( #define QOS_PARALLELISM_COUNT_LOGICAL 0x1 #define QOS_PARALLELISM_REALTIME 0x2 extern uint32_t qos_max_parallelism(int qos, uint64_t options); - #endif /* KERNEL_PRIVATE */ +#if XNU_KERNEL_PRIVATE +extern void thread_yield_with_continuation( + thread_continue_t continuation, + void *parameter) __dead2; +#endif + /* Context switch */ extern wait_result_t thread_block( thread_continue_t continuation); @@ -582,8 +598,8 @@ extern boolean_t preemption_enabled(void); * For DEV & REL kernels, use a static dispatch table instead of * using the indirect function table. */ -extern const struct sched_dispatch_table sched_multiq_dispatch; -#define SCHED(f) (sched_multiq_dispatch.f) +extern const struct sched_dispatch_table sched_dualq_dispatch; +#define SCHED(f) (sched_dualq_dispatch.f) #endif /* DEBUG */ diff --git a/osfmk/kern/sched_traditional.c b/osfmk/kern/sched_traditional.c index 7bc3d4393..02066c97e 100644 --- a/osfmk/kern/sched_traditional.c +++ b/osfmk/kern/sched_traditional.c @@ -689,21 +689,18 @@ sched_traditional_steal_thread(processor_set_t pset) thread_t thread; do { - processor = (processor_t)(uintptr_t)queue_first(&cset->active_queue); - while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) { + uint64_t active_map = (pset->cpu_state_map[PROCESSOR_RUNNING] | + pset->cpu_state_map[PROCESSOR_DISPATCHING]); + for (int cpuid = lsb_first(active_map); cpuid >= 0; cpuid = lsb_next(active_map, cpuid)) { + processor = processor_array[cpuid]; if (runq_for_processor(processor)->count > 0) { thread = sched_traditional_steal_processor_thread(processor); if (thread != THREAD_NULL) { - remqueue((queue_entry_t)processor); - enqueue_tail(&cset->active_queue, (queue_entry_t)processor); - pset_unlock(cset); return (thread); } } - - processor = (processor_t)(uintptr_t)queue_next((queue_entry_t)processor); } nset = next_pset(cset); diff --git a/osfmk/kern/sfi.c 
b/osfmk/kern/sfi.c index 80fa2c105..139a6798e 100644 --- a/osfmk/kern/sfi.c +++ b/osfmk/kern/sfi.c @@ -44,6 +44,8 @@ #include #include +#include + #include #include @@ -60,10 +62,6 @@ #define dprintf(...) do { } while(0) #endif -#ifdef MACH_BSD -extern sched_call_t workqueue_get_sched_callback(void); -#endif /* MACH_BSD */ - /* * SFI (Selective Forced Idle) operates by enabling a global * timer on the SFI window interval. When it fires, all processors @@ -131,36 +129,43 @@ typedef struct { * 5) Modify thermald to use the SFI class */ -static inline void _sfi_wait_cleanup(sched_call_t callback); - -#define SFI_CLASS_REGISTER(class_id, ledger_name) \ -extern char compile_time_assert_ ## class_id[SFI_CLASS_ ## class_id < MAX_SFI_CLASS_ID ? 1 : -1]; \ -void __attribute__((noinline,noreturn)) SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused); \ -void SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused) \ -{ \ - _sfi_wait_cleanup(callback); \ - thread_exception_return(); \ -} \ - \ -sfi_class_registration_t SFI_ ## class_id ## _registration __attribute__((section("__DATA,__sfi_class_reg"),used)) = { SFI_CLASS_ ## class_id, SFI_ ## class_id ## _THREAD_IS_WAITING, "SFI_CLASS_" # class_id, "SFI_CLASS_" # ledger_name }; +static inline void _sfi_wait_cleanup(void); + +#define SFI_CLASS_REGISTER(clsid, ledger_name) \ +static void __attribute__((noinline, noreturn)) \ +SFI_ ## clsid ## _THREAD_IS_WAITING(void *arg __unused, wait_result_t wret __unused) \ +{ \ + _sfi_wait_cleanup(); \ + thread_exception_return(); \ +} \ + \ +_Static_assert(SFI_CLASS_ ## clsid < MAX_SFI_CLASS_ID, "Invalid ID"); \ + \ +__attribute__((section("__DATA,__sfi_class_reg"), used)) \ +static sfi_class_registration_t SFI_ ## clsid ## _registration = { \ + .class_id = SFI_CLASS_ ## clsid, \ + .class_continuation = SFI_ ## clsid ## _THREAD_IS_WAITING, \ + .class_name = "SFI_CLASS_" # clsid, \ + .class_ledger_name = "SFI_CLASS_" # ledger_name, 
\ +} /* SFI_CLASS_UNSPECIFIED not included here */ -SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE) -SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG) -SFI_CLASS_REGISTER(APP_NAP, APP_NAP) -SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED) -SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED) -SFI_CLASS_REGISTER(UTILITY, UTILITY) -SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT) -SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT) -SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY) -SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY) -SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED) -SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED) -SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE) -SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE) -SFI_CLASS_REGISTER(KERNEL, OPTED_OUT) -SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT) +SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE); +SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG); +SFI_CLASS_REGISTER(APP_NAP, APP_NAP); +SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED); +SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED); +SFI_CLASS_REGISTER(UTILITY, UTILITY); +SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT); +SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT); +SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY); +SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY); +SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED); +SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED); +SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE); +SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE); +SFI_CLASS_REGISTER(KERNEL, OPTED_OUT); +SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT); struct sfi_class_state { uint64_t off_time_usecs; @@ -788,12 +793,15 @@ sfi_class_id_t sfi_thread_classify(thread_t thread) break; case TASK_BACKGROUND_APPLICATION: case TASK_DEFAULT_APPLICATION: - case TASK_THROTTLE_APPLICATION: case TASK_UNSPECIFIED: /* Focal if the task is in a coalition with a FG/focal app */ if (task_coalition_focal_count(thread->task) > 0) focal = TRUE; break; + case 
TASK_THROTTLE_APPLICATION: + case TASK_DARWINBG_APPLICATION: + case TASK_NONUI_APPLICATION: + /* Definitely not focal */ default: break; } @@ -894,29 +902,50 @@ ast_t sfi_processor_needs_ast(processor_t processor) } -static inline void _sfi_wait_cleanup(sched_call_t callback) { +static inline void _sfi_wait_cleanup(void) +{ thread_t self = current_thread(); - sfi_class_id_t current_sfi_wait_class = SFI_CLASS_UNSPECIFIED; - int64_t sfi_wait_time, sfi_wait_begin = 0; spl_t s = splsched(); - thread_lock(self); - if (callback) { - thread_sched_call(self, callback); - } - sfi_wait_begin = self->wait_sfi_begin_time; - thread_unlock(self); - simple_lock(&sfi_lock); - sfi_wait_time = mach_absolute_time() - sfi_wait_begin; - current_sfi_wait_class = self->sfi_wait_class; + + sfi_class_id_t current_sfi_wait_class = self->sfi_wait_class; + + assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) && + (current_sfi_wait_class < MAX_SFI_CLASS_ID)); + self->sfi_wait_class = SFI_CLASS_UNSPECIFIED; + simple_unlock(&sfi_lock); splx(s); - assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) && (current_sfi_wait_class < MAX_SFI_CLASS_ID)); -#if !CONFIG_EMBEDDED - ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class], sfi_wait_time); + + /* + * It's possible for the thread to be woken up due to the SFI period + * ending *before* it finishes blocking. In that case, + * wait_sfi_begin_time won't be set. + * + * Derive the time sacrificed to SFI by looking at when this thread was + * awoken by the on-timer, to avoid counting the time this thread spent + * waiting to get scheduled. + * + * Note that last_made_runnable_time could be reset if this thread + * gets preempted before we read the value. To fix that, we'd need to + * track wait time in a thread timer, sample the timer before blocking, + * pass the value through thread->parameter, and subtract that. 
+ */ + + if (self->wait_sfi_begin_time != 0) { +#if !CONFIG_EMBEDDED + uint64_t made_runnable = os_atomic_load(&self->last_made_runnable_time, relaxed); + int64_t sfi_wait_time = made_runnable - self->wait_sfi_begin_time; + assert(sfi_wait_time >= 0); + + ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class], + sfi_wait_time); #endif /* !CONFIG_EMBEDDED */ + + self->wait_sfi_begin_time = 0; + } } /* @@ -932,9 +961,7 @@ void sfi_ast(thread_t thread) struct sfi_class_state *sfi_class; wait_result_t waitret; boolean_t did_wait = FALSE; - uint64_t tid; thread_continue_t continuation; - sched_call_t workq_callback = workqueue_get_sched_callback(); s = splsched(); @@ -955,7 +982,7 @@ void sfi_ast(thread_t thread) thread_lock(thread); thread->sfi_class = class_id = sfi_thread_classify(thread); - tid = thread_tid(thread); + thread_unlock(thread); /* * Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we @@ -967,23 +994,15 @@ void sfi_ast(thread_t thread) * classification. 
*/ - /* Optimistically clear workq callback while thread is already locked */ - if (workq_callback && (thread->sched_call == workq_callback)) { - thread_sched_call(thread, NULL); - } else { - workq_callback = NULL; - } - thread_unlock(thread); - sfi_class = &sfi_classes[class_id]; if (!sfi_class->class_in_on_phase) { /* Need to block thread in wait queue */ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER), tid, class_id, 0, 0, 0); + KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER), + thread_tid(thread), class_id, 0, 0, 0); waitret = waitq_assert_wait64(&sfi_class->waitq, CAST_EVENT64_T(class_id), - THREAD_INTERRUPTIBLE, - 0); + THREAD_INTERRUPTIBLE | THREAD_WAIT_NOREPORT, 0); if (waitret == THREAD_WAITING) { thread->sfi_wait_class = class_id; did_wait = TRUE; @@ -994,13 +1013,13 @@ void sfi_ast(thread_t thread) } } simple_unlock(&sfi_lock); - + splx(s); if (did_wait) { - thread_block_reason(continuation, workq_callback, AST_SFI); - } else if (workq_callback) { - thread_reenable_sched_call(thread, workq_callback); + assert(thread->wait_sfi_begin_time == 0); + + thread_block_reason(continuation, NULL, AST_SFI); } } diff --git a/osfmk/kern/simple_lock.h b/osfmk/kern/simple_lock.h index 8ef311a88..b66313f7f 100644 --- a/osfmk/kern/simple_lock.h +++ b/osfmk/kern/simple_lock.h @@ -84,9 +84,15 @@ extern void hw_lock_init( extern void hw_lock_lock( hw_lock_t); +extern void hw_lock_lock_nopreempt( + hw_lock_t); + extern void hw_lock_unlock( hw_lock_t); +extern void hw_lock_unlock_nopreempt( + hw_lock_t); + extern unsigned int hw_lock_to( hw_lock_t, uint64_t); @@ -94,6 +100,9 @@ extern unsigned int hw_lock_to( extern unsigned int hw_lock_try( hw_lock_t); +extern unsigned int hw_lock_try_nopreempt( + hw_lock_t); + extern unsigned int hw_lock_held( hw_lock_t); diff --git a/osfmk/kern/stack.c b/osfmk/kern/stack.c index 18db3f24b..c56a597bc 100644 --- a/osfmk/kern/stack.c +++ b/osfmk/kern/stack.c @@ -2,7 +2,7 @@ * Copyright (c) 2003-2007 
Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -100,7 +100,7 @@ log2(vm_offset_t size) static inline vm_offset_t roundup_pow2(vm_offset_t size) { - return 1UL << (log2(size - 1) + 1); + return 1UL << (log2(size - 1) + 1); } static vm_offset_t stack_alloc_internal(void); @@ -110,7 +110,7 @@ void stack_init(void) { simple_lock_init(&stack_lock_data, 0); - + kernel_stack_pages = KERNEL_STACK_SIZE / PAGE_SIZE; kernel_stack_size = KERNEL_STACK_SIZE; kernel_stack_mask = -KERNEL_STACK_SIZE; @@ -127,7 +127,7 @@ stack_init(void) if (kernel_stack_size < round_page(kernel_stack_size)) panic("stack_init: stack size %p not a multiple of page size %d\n", (void *) kernel_stack_size, PAGE_SIZE); - + stack_addr_mask = roundup_pow2(kernel_stack_size) - 1; kernel_stack_mask = ~stack_addr_mask; } @@ -139,7 +139,7 @@ stack_init(void) * block. 
*/ -static vm_offset_t +static vm_offset_t stack_alloc_internal(void) { vm_offset_t stack = 0; @@ -163,7 +163,7 @@ stack_alloc_internal(void) stack_free_delta--; stack_unlock(); splx(s); - + if (stack == 0) { /* @@ -172,7 +172,7 @@ stack_alloc_internal(void) * for these. */ - flags = KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT; + flags = KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT | KMA_ZERO; kr = kernel_memory_allocate(kernel_map, &stack, kernel_stack_size + (2*PAGE_SIZE), stack_addr_mask, @@ -219,11 +219,6 @@ stack_free( { vm_offset_t stack = machine_stack_detach(thread); -#if KASAN - kasan_unpoison_stack(stack, kernel_stack_size); - kasan_unpoison_fakestack(thread); -#endif - assert(stack); if (stack != thread->reserved_stack) { stack_free_stack(stack); @@ -235,9 +230,6 @@ stack_free_reserved( thread_t thread) { if (thread->reserved_stack != thread->kernel_stack) { -#if KASAN - kasan_unpoison_stack(thread->reserved_stack, kernel_stack_size); -#endif stack_free_stack(thread->reserved_stack); } } @@ -249,6 +241,11 @@ stack_free_stack( struct stack_cache *cache; spl_t s; +#if KASAN_DEBUG + /* Sanity check - stack should be unpoisoned by now */ + assert(kasan_check_shadow(stack, kernel_stack_size, 0)); +#endif + s = splsched(); cache = &PROCESSOR_DATA(current_processor(), stack_cache); if (cache->count < STACK_CACHE_SIZE) { @@ -416,7 +413,7 @@ stack_fake_zone_init(int zone_index) } void -stack_fake_zone_info(int *count, +stack_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size, uint64_t *sum_size, int *collectable, int *exhaustable, int *caller_acct) { diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index c9cfd5167..c0693a117 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -82,10 +82,12 @@ #include #include #include +#include #include #include #include #include +#include #if CONFIG_SCHED_SFI #include #endif @@ -117,13 +119,19 @@ #include #include 
#include +#include #include #include #include #include #include +#include +#if CONFIG_XNUPOST +#include +#include +#endif #if CONFIG_ATM #include @@ -182,8 +190,13 @@ extern void cpu_physwindow_init(int); #include #endif +#if CONFIG_DTRACE +extern void dtrace_early_init(void); +extern void sdt_early_init(void); +#endif + // libkern/OSKextLib.cpp -extern void OSKextRemoveKextBootstrap(void); +extern void OSKextRemoveKextBootstrap(void); void scale_setup(void); extern void bsd_scale_setup(int); @@ -207,6 +220,11 @@ unsigned int trace_wrap = 0; boolean_t trace_serial = FALSE; boolean_t early_boot_complete = FALSE; +/* physically contiguous carveouts */ +SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout = 0; +SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout_pa = 0; +SECURITY_READ_ONLY_LATE(size_t) phys_carveout_size = 0; + /* mach leak logging */ int log_leaks = 0; @@ -301,9 +319,14 @@ kernel_bootstrap(void) csr_init(); #endif - if (PE_i_can_has_debugger(NULL) && - PE_parse_boot_argn("-show_pointers", &namep, sizeof (namep))) { - doprnt_hide_pointers = FALSE; + if (PE_i_can_has_debugger(NULL)) { + if (PE_parse_boot_argn("-show_pointers", &namep, sizeof(namep))) { + doprnt_hide_pointers = FALSE; + } + if (PE_parse_boot_argn("-no_slto_panic", &namep, sizeof(namep))) { + extern boolean_t spinlock_timeout_panic; + spinlock_timeout_panic = FALSE; + } } kernel_bootstrap_log("console_init"); @@ -365,6 +388,12 @@ kernel_bootstrap(void) kernel_bootstrap_log("thread_init"); thread_init(); + kernel_bootstrap_log("workq_init"); + workq_init(); + + kernel_bootstrap_log("turnstiles_init"); + turnstiles_init(); + #if CONFIG_ATM /* Initialize the Activity Trace Resource Manager. 
*/ kernel_bootstrap_log("atm_init"); @@ -497,9 +526,25 @@ kernel_bootstrap_thread(void) cpu_physwindow_init(0); #endif + if (PE_i_can_has_debugger(NULL)) { + unsigned int phys_carveout_mb = 0; + if (PE_parse_boot_argn("phys_carveout_mb", &phys_carveout_mb, + sizeof(phys_carveout_mb)) && phys_carveout_mb > 0) { + phys_carveout_size = phys_carveout_mb * 1024 * 1024; + kern_return_t kr = kmem_alloc_contig(kernel_map, + (vm_offset_t *)&phys_carveout, phys_carveout_size, + VM_MAP_PAGE_MASK(kernel_map), 0, 0, KMA_NOPAGEWAIT, + VM_KERN_MEMORY_DIAG); + if (kr != KERN_SUCCESS) { + kprintf("failed to allocate %uMB for phys_carveout_mb: %u\n", + phys_carveout_mb, (unsigned int)kr); + } else { + phys_carveout_pa = kvtophys((vm_offset_t)phys_carveout); + } + } + } - -#if MACH_KDP +#if MACH_KDP kernel_bootstrap_log("kdp_init"); kdp_init(); #endif @@ -534,16 +579,13 @@ kernel_bootstrap_thread(void) kdebug_init(new_nkdbufs, trace_typefilter, trace_wrap); - kernel_bootstrap_log("prng_init"); - prng_cpu_init(master_cpu); - #ifdef MACH_BSD kernel_bootstrap_log("bsd_early_init"); bsd_early_init(); #endif #if defined(__arm64__) - ml_lockdown_init(); + ml_lockdown_init(); #endif #ifdef IOKIT @@ -591,9 +633,22 @@ kernel_bootstrap_thread(void) #endif #endif +#if CONFIG_DTRACE + dtrace_early_init(); + sdt_early_init(); +#endif + + + /* + * Get rid of segments used to bootstrap kext loading. This removes + * the KLD, PRELINK symtab, LINKEDIT, and symtab segments/load commands. + * Must be done prior to lockdown so that we can free (and possibly relocate) + * the static KVA mappings used for the jettisoned bootstrap segments. + */ + OSKextRemoveKextBootstrap(); #if defined(__arm__) || defined(__arm64__) #if CONFIG_KERNEL_INTEGRITY - machine_lockdown_preflight(); + machine_lockdown_preflight(); #endif /* * Finalize protections on statically mapped pages now that comm page mapping is established. 
@@ -627,6 +682,14 @@ kernel_bootstrap_thread(void) vm_set_restrictions(); +#ifdef CONFIG_XNUPOST + kern_return_t result = kernel_list_tests(); + result = kernel_do_post(); + if (result != KERN_SUCCESS) { + panic("kernel_do_post: Tests failed with result = 0x%08x\n", result); + } + kernel_bootstrap_log("kernel_do_post - done"); +#endif /* CONFIG_XNUPOST */ /* @@ -636,11 +699,6 @@ kernel_bootstrap_thread(void) bsd_init(); #endif - /* - * Get rid of segments used to bootstrap kext loading. This removes - * the KLD, PRELINK symtab, LINKEDIT, and symtab segments/load commands. - */ - OSKextRemoveKextBootstrap(); /* * Get rid of pages used for early boot tracing. @@ -772,6 +830,8 @@ load_context( PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state); + cpu_quiescent_counter_join(processor->last_dispatch); + PMAP_ACTIVATE_USER(thread, processor->cpu_id); load_context_kprintf("machine_load_context\n"); diff --git a/osfmk/kern/sync_sema.c b/osfmk/kern/sync_sema.c index 0a6f7b33d..98f33ba8d 100644 --- a/osfmk/kern/sync_sema.c +++ b/osfmk/kern/sync_sema.c @@ -66,6 +66,8 @@ static unsigned int semaphore_event; zone_t semaphore_zone; unsigned int semaphore_max; +os_refgrp_decl(static, sema_refgrp, "semaphore", NULL); + /* Forward declarations */ @@ -184,7 +186,7 @@ semaphore_create( * Initialize the semaphore values. 
*/ s->port = IP_NULL; - s->ref_count = 1; + os_ref_init(&s->ref_count, &sema_refgrp); s->count = value; s->active = TRUE; s->owner = task; @@ -280,11 +282,12 @@ semaphore_destroy( if (semaphore->owner != task) { semaphore_unlock(semaphore); + semaphore_dereference(semaphore); splx(spl_level); task_unlock(task); return KERN_INVALID_ARGUMENT; } - + semaphore_destroy_internal(task, semaphore); /* semaphore unlocked */ @@ -1105,7 +1108,7 @@ void semaphore_reference( semaphore_t semaphore) { - (void)hw_atomic_add(&semaphore->ref_count, 1); + os_ref_retain(&semaphore->ref_count); } /* @@ -1124,8 +1127,9 @@ semaphore_dereference( if (semaphore == NULL) return; - if (hw_atomic_sub(&semaphore->ref_count, 1) != 0) + if (os_ref_release(&semaphore->ref_count) > 0) { return; + } /* * Last ref, clean up the port [if any] diff --git a/osfmk/kern/sync_sema.h b/osfmk/kern/sync_sema.h index 2187c6bae..144e925d3 100644 --- a/osfmk/kern/sync_sema.h +++ b/osfmk/kern/sync_sema.h @@ -46,13 +46,14 @@ #include #include +#include typedef struct semaphore { queue_chain_t task_link; /* chain of semaphores owned by a task */ struct waitq waitq; /* queue of blocked threads & lock */ task_t owner; /* task that owns semaphore */ ipc_port_t port; /* semaphore port */ - uint32_t ref_count; /* reference count */ + os_refcnt_t ref_count; /* reference count */ int count; /* current count value */ boolean_t active; /* active status */ } Semaphore; diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 27961d85c..1732d7ab2 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* @@ -53,8 +53,6 @@ * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ -/* - */ #include #include @@ -78,12 +76,8 @@ #include #include -#ifdef MACH_BSD -extern void workqueue_thread_yielded(void); -extern sched_call_t workqueue_get_sched_callback(void); -#endif /* MACH_BSD */ - -extern wait_result_t thread_handoff_reason(thread_t thread, ast_t reason); +static void thread_depress_abstime(uint64_t interval); +static void thread_depress_ms(mach_msg_timeout_t interval); /* Called from commpage to take a delayed preemption when exiting * the "Preemption Free Zone" (PFZ). 
@@ -125,7 +119,6 @@ swtch( __unused struct swtch_args *args) { processor_t myprocessor; - boolean_t result; disable_preemption(); myprocessor = current_processor(); @@ -138,14 +131,7 @@ swtch( counter(c_swtch_block++); - thread_block_reason((thread_continue_t)swtch_continue, NULL, AST_YIELD); - - disable_preemption(); - myprocessor = current_processor(); - result = SCHED(thread_should_yield)(myprocessor, current_thread()); - enable_preemption(); - - return (result); + thread_yield_with_continuation((thread_continue_t)swtch_continue, NULL); } static void @@ -154,7 +140,7 @@ swtch_pri_continue(void) processor_t myprocessor; boolean_t result; - thread_depress_abort_internal(current_thread()); + thread_depress_abort(current_thread()); disable_preemption(); myprocessor = current_processor(); @@ -170,7 +156,6 @@ swtch_pri( __unused struct swtch_pri_args *args) { processor_t myprocessor; - boolean_t result; disable_preemption(); myprocessor = current_processor(); @@ -185,45 +170,17 @@ __unused struct swtch_pri_args *args) thread_depress_abstime(thread_depress_time); - thread_block_reason((thread_continue_t)swtch_pri_continue, NULL, AST_YIELD); - - thread_depress_abort_internal(current_thread()); - - disable_preemption(); - myprocessor = current_processor(); - result = SCHED(thread_should_yield)(myprocessor, current_thread()); - enable_preemption(); - - return (result); -} - -static boolean_t -thread_switch_disable_workqueue_sched_callback(void) -{ - sched_call_t callback = workqueue_get_sched_callback(); - return thread_disable_sched_call(current_thread(), callback) != NULL; + thread_yield_with_continuation((thread_continue_t)swtch_pri_continue, NULL); } static void -thread_switch_enable_workqueue_sched_callback(void) +thread_switch_continue(void *parameter, __unused int ret) { - sched_call_t callback = workqueue_get_sched_callback(); - thread_reenable_sched_call(current_thread(), callback); -} - -static void -thread_switch_continue(void) -{ - thread_t self = 
current_thread(); - int option = self->saved.swtch.option; - boolean_t reenable_workq_callback = self->saved.swtch.reenable_workq_callback; - + thread_t self = current_thread(); + int option = (int)(intptr_t)parameter; if (option == SWITCH_OPTION_DEPRESS || option == SWITCH_OPTION_OSLOCK_DEPRESS) - thread_depress_abort_internal(self); - - if (reenable_workq_callback) - thread_switch_enable_workqueue_sched_callback(); + thread_depress_abort(self); thread_syscall_return(KERN_SUCCESS); /*NOTREACHED*/ @@ -244,41 +201,34 @@ thread_switch( int option = args->option; mach_msg_timeout_t option_time = args->option_time; uint32_t scale_factor = NSEC_PER_MSEC; - boolean_t reenable_workq_callback = FALSE; boolean_t depress_option = FALSE; boolean_t wait_option = FALSE; + wait_interrupt_t interruptible = THREAD_ABORTSAFE; /* - * Validate and process option. - */ - switch (option) { - + * Validate and process option. + */ + switch (option) { case SWITCH_OPTION_NONE: - workqueue_thread_yielded(); break; case SWITCH_OPTION_WAIT: wait_option = TRUE; - workqueue_thread_yielded(); break; case SWITCH_OPTION_DEPRESS: depress_option = TRUE; - workqueue_thread_yielded(); break; case SWITCH_OPTION_DISPATCH_CONTENTION: scale_factor = NSEC_PER_USEC; wait_option = TRUE; - if (thread_switch_disable_workqueue_sched_callback()) - reenable_workq_callback = TRUE; + interruptible |= THREAD_WAIT_NOREPORT; break; case SWITCH_OPTION_OSLOCK_DEPRESS: depress_option = TRUE; - if (thread_switch_disable_workqueue_sched_callback()) - reenable_workq_callback = TRUE; + interruptible |= THREAD_WAIT_NOREPORT; break; case SWITCH_OPTION_OSLOCK_WAIT: wait_option = TRUE; - if (thread_switch_disable_workqueue_sched_callback()) - reenable_workq_callback = TRUE; + interruptible |= THREAD_WAIT_NOREPORT; break; default: return (KERN_INVALID_ARGUMENT); @@ -350,17 +300,13 @@ thread_switch( thread_deallocate_safe(thread); if (wait_option) - assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE, + 
assert_wait_timeout((event_t)assert_wait_timeout, interruptible, option_time, scale_factor); else if (depress_option) thread_depress_ms(option_time); - self->saved.swtch.option = option; - self->saved.swtch.reenable_workq_callback = reenable_workq_callback; - - thread_run(self, (thread_continue_t)thread_switch_continue, NULL, pulled_thread); - /* NOTREACHED */ - panic("returned from thread_run!"); + thread_run(self, thread_switch_continue, (void *)(intptr_t)option, pulled_thread); + __builtin_unreachable(); } splx(s); @@ -369,24 +315,25 @@ thread_switch( } if (wait_option) - assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE, option_time, scale_factor); + assert_wait_timeout((event_t)assert_wait_timeout, interruptible, option_time, scale_factor); else if (depress_option) thread_depress_ms(option_time); - self->saved.swtch.option = option; - self->saved.swtch.reenable_workq_callback = reenable_workq_callback; - - thread_block_reason((thread_continue_t)thread_switch_continue, NULL, AST_YIELD); - - if (depress_option) - thread_depress_abort_internal(self); - - if (reenable_workq_callback) - thread_switch_enable_workqueue_sched_callback(); + thread_yield_with_continuation(thread_switch_continue, (void *)(intptr_t)option); + __builtin_unreachable(); +} - return (KERN_SUCCESS); +void +thread_yield_with_continuation( + thread_continue_t continuation, + void *parameter) +{ + assert(continuation); + thread_block_reason(continuation, parameter, AST_YIELD); + __builtin_unreachable(); } + /* Returns a +1 thread reference */ thread_t port_name_to_thread_for_ulock(mach_port_name_t thread_name) @@ -425,10 +372,15 @@ port_name_to_thread_for_ulock(mach_port_name_t thread_name) /* This function is called after an assert_wait(), therefore it must not * cause another wait until after the thread_run() or thread_block() * - * Consumes a ref on thread + * + * When called with a NULL continuation, the thread ref is consumed + * (thread_handoff_deallocate calling 
convention) else it is up to the + * continuation to do the cleanup (thread_handoff_parameter calling convention) + * and it instead doesn't return. */ -wait_result_t -thread_handoff(thread_t thread) +static wait_result_t +thread_handoff_internal(thread_t thread, thread_continue_t continuation, + void *parameter) { thread_t deallocate_thread = THREAD_NULL; thread_t self = current_thread(); @@ -446,10 +398,12 @@ thread_handoff(thread_t thread) pulled_thread ? TRUE : FALSE, 0, 0); if (pulled_thread != THREAD_NULL) { - /* We can't be dropping the last ref here */ - thread_deallocate_safe(thread); + if (continuation == NULL) { + /* We can't be dropping the last ref here */ + thread_deallocate_safe(thread); + } - int result = thread_run(self, THREAD_CONTINUE_NULL, NULL, pulled_thread); + int result = thread_run(self, continuation, parameter, pulled_thread); splx(s); return result; @@ -461,7 +415,7 @@ thread_handoff(thread_t thread) thread = THREAD_NULL; } - int result = thread_block(THREAD_CONTINUE_NULL); + int result = thread_block_parameter(continuation, parameter); if (deallocate_thread != THREAD_NULL) { thread_deallocate(deallocate_thread); } @@ -469,54 +423,75 @@ thread_handoff(thread_t thread) return result; } +void +thread_handoff_parameter(thread_t thread, thread_continue_t continuation, + void *parameter) +{ + thread_handoff_internal(thread, continuation, parameter); + panic("NULL continuation passed to %s", __func__); + __builtin_unreachable(); +} + +wait_result_t +thread_handoff_deallocate(thread_t thread) +{ + return thread_handoff_internal(thread, NULL, NULL); +} + +/* + * Thread depression + * + * This mechanism drops a thread to priority 0 in order for it to yield to + * all other runnnable threads on the system. It can be canceled or timed out, + * whereupon the thread goes back to where it was. + * + * Note that TH_SFLAG_DEPRESS and TH_SFLAG_POLLDEPRESS are never set at the + * same time. DEPRESS always defers to POLLDEPRESS. 
+ * + * DEPRESS only lasts across a single thread_block call, and never returns + * to userspace. + * POLLDEPRESS can be active anywhere up until thread termination. + */ + /* * Depress thread's priority to lowest possible for the specified interval, - * with a value of zero resulting in no timeout being scheduled. + * with an interval of zero resulting in no timeout being scheduled. + * + * Must block with AST_YIELD afterwards to take effect */ void -thread_depress_abstime( - uint64_t interval) +thread_depress_abstime(uint64_t interval) { - thread_t self = current_thread(); - uint64_t deadline; - spl_t s; - - s = splsched(); - thread_lock(self); - if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) { - processor_t myprocessor = self->last_processor; - - self->sched_pri = DEPRESSPRI; - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), - (uintptr_t)thread_tid(self), - self->base_pri, - self->sched_pri, - self->sched_usage, - 0); - - myprocessor->current_pri = self->sched_pri; - myprocessor->current_perfctl_class = thread_get_perfcontrol_class(self); + thread_t self = current_thread(); + + spl_t s = splsched(); + thread_lock(self); + + assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0); + + if ((self->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) { self->sched_flags |= TH_SFLAG_DEPRESS; + thread_recompute_sched_pri(self, SETPRI_LAZY); if (interval != 0) { + uint64_t deadline; + clock_absolutetime_interval_to_deadline(interval, &deadline); if (!timer_call_enter(&self->depress_timer, deadline, TIMER_CALL_USER_CRITICAL)) self->depress_timer_active++; } } + thread_unlock(self); - splx(s); + splx(s); } void -thread_depress_ms( - mach_msg_timeout_t interval) +thread_depress_ms(mach_msg_timeout_t interval) { - uint64_t abstime; + uint64_t abstime; - clock_interval_to_absolutetime_interval( - interval, NSEC_PER_MSEC, &abstime); + clock_interval_to_absolutetime_interval(interval, NSEC_PER_MSEC, &abstime); thread_depress_abstime(abstime); } @@ 
-524,111 +499,132 @@ thread_depress_ms( * Priority depression expiration. */ void -thread_depress_expire( - void *p0, - __unused void *p1) +thread_depress_expire(void *p0, + __unused void *p1) { - thread_t thread = p0; - spl_t s; + thread_t thread = (thread_t)p0; + + spl_t s = splsched(); + thread_lock(thread); + + assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK); - s = splsched(); - thread_lock(thread); if (--thread->depress_timer_active == 0) { thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; - thread_recompute_sched_pri(thread, FALSE); + thread_recompute_sched_pri(thread, SETPRI_DEFAULT); } - thread_unlock(thread); - splx(s); + + thread_unlock(thread); + splx(s); } /* - * Prematurely abort priority depression if there is one. + * Prematurely abort priority depression if there is one. */ kern_return_t -thread_depress_abort_internal( - thread_t thread) +thread_depress_abort(thread_t thread) { - kern_return_t result = KERN_NOT_DEPRESSED; - spl_t s; - - s = splsched(); - thread_lock(thread); - if (!(thread->sched_flags & TH_SFLAG_POLLDEPRESS)) { - if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; - thread_recompute_sched_pri(thread, FALSE); - result = KERN_SUCCESS; - } + kern_return_t result = KERN_NOT_DEPRESSED; + + spl_t s = splsched(); + thread_lock(thread); - if (timer_call_cancel(&thread->depress_timer)) - thread->depress_timer_active--; + assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK); + + /* + * User-triggered depress-aborts should not get out + * of the poll-depress, but they should cancel a regular depress. + */ + if ((thread->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) { + result = thread_depress_abort_locked(thread); } + thread_unlock(thread); - splx(s); + splx(s); - return (result); + return result; } -void -thread_poll_yield( - thread_t self) +/* + * Prematurely abort priority depression or poll depression if one is active. 
+ * Called with the thread locked. + */ +kern_return_t +thread_depress_abort_locked(thread_t thread) { - spl_t s; + if ((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0) + return KERN_NOT_DEPRESSED; + + assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK); + thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; + + thread_recompute_sched_pri(thread, SETPRI_LAZY); + + if (timer_call_cancel(&thread->depress_timer)) + thread->depress_timer_active--; + + return KERN_SUCCESS; +} + +/* + * Invoked as part of a polling operation like a no-timeout port receive + * + * Forces a fixpri thread to yield if it is detected polling without blocking for too long. + */ +void +thread_poll_yield(thread_t self) +{ assert(self == current_thread()); + assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0); - s = splsched(); - if (self->sched_mode == TH_MODE_FIXED) { - uint64_t total_computation, abstime; - - abstime = mach_absolute_time(); - total_computation = abstime - self->computation_epoch; - total_computation += self->computation_metered; - if (total_computation >= max_poll_computation) { - processor_t myprocessor = current_processor(); - ast_t preempt; - - thread_lock(self); - if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) { - self->sched_pri = DEPRESSPRI; - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), - (uintptr_t)thread_tid(self), - self->base_pri, - self->sched_pri, - self->sched_usage, - 0); - - myprocessor->current_pri = self->sched_pri; - myprocessor->current_perfctl_class = thread_get_perfcontrol_class(self); - } - self->computation_epoch = abstime; - self->computation_metered = 0; - self->sched_flags |= TH_SFLAG_POLLDEPRESS; + if (self->sched_mode != TH_MODE_FIXED) + return; - abstime += (total_computation >> sched_poll_yield_shift); - if (!timer_call_enter(&self->depress_timer, abstime, TIMER_CALL_USER_CRITICAL)) - self->depress_timer_active++; + spl_t s = splsched(); - if ((preempt = 
csw_check(myprocessor, AST_NONE)) != AST_NONE) - ast_on(preempt); + uint64_t abstime = mach_absolute_time(); + uint64_t total_computation = abstime - + self->computation_epoch + self->computation_metered; - thread_unlock(self); - } + if (total_computation >= max_poll_computation) { + thread_lock(self); + + self->computation_epoch = abstime; + self->computation_metered = 0; + + uint64_t yield_expiration = abstime + + (total_computation >> sched_poll_yield_shift); + + if (!timer_call_enter(&self->depress_timer, yield_expiration, + TIMER_CALL_USER_CRITICAL)) + self->depress_timer_active++; + + self->sched_flags |= TH_SFLAG_POLLDEPRESS; + thread_recompute_sched_pri(self, SETPRI_DEFAULT); + + thread_unlock(self); } splx(s); } - +/* + * Kernel-internal interface to yield for a specified period + * + * WARNING: Will still yield to priority 0 even if the thread is holding a contended lock! + */ void -thread_yield_internal( - mach_msg_timeout_t ms) +thread_yield_internal(mach_msg_timeout_t ms) { + thread_t self = current_thread(); + + assert((self->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK); + processor_t myprocessor; disable_preemption(); myprocessor = current_processor(); - if (!SCHED(thread_should_yield)(myprocessor, current_thread())) { + if (!SCHED(thread_should_yield)(myprocessor, self)) { mp_enable_preemption(); return; @@ -639,7 +635,7 @@ thread_yield_internal( thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD); - thread_depress_abort_internal(current_thread()); + thread_depress_abort(self); } /* diff --git a/osfmk/kern/syscall_subr.h b/osfmk/kern/syscall_subr.h index 5a68b926b..6d0984aec 100644 --- a/osfmk/kern/syscall_subr.h +++ b/osfmk/kern/syscall_subr.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2017 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,59 +22,50 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
- * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ -/* - */ -#ifndef _KERN_SYSCALL_SUBR_H_ +#ifndef _KERN_SYSCALL_SUBR_H_ #define _KERN_SYSCALL_SUBR_H_ #include -extern void thread_depress_abstime( - uint64_t interval); +extern kern_return_t thread_depress_abort(thread_t thread); -extern void thread_depress_ms( - mach_msg_timeout_t interval); +extern kern_return_t thread_depress_abort_locked(thread_t thread); -extern kern_return_t thread_depress_abort_internal( - thread_t thread); +extern void thread_depress_expire(void *thread, void *p1); -extern void thread_depress_expire( - void *thread, - void *p1); +extern void thread_poll_yield(thread_t self); -extern void thread_poll_yield( - thread_t self); +#endif /* _KERN_SYSCALL_SUBR_H_ */ -#endif /* _KERN_SYSCALL_SUBR_H_ */ diff --git a/osfmk/kern/syscall_sw.c b/osfmk/kern/syscall_sw.c index 5c4bd06f3..65e3b2890 100644 --- a/osfmk/kern/syscall_sw.c +++ b/osfmk/kern/syscall_sw.c @@ -142,7 +142,7 @@ const mach_trap_t mach_trap_table[MACH_TRAP_TABLE_COUNT] = { /* 37 */ MACH_TRAP(semaphore_wait_signal_trap, 2, 2, munge_ww), /* 38 */ MACH_TRAP(semaphore_timedwait_trap, 3, 3, munge_www), /* 39 */ MACH_TRAP(semaphore_timedwait_signal_trap, 4, 4, munge_wwww), -/* 40 */ MACH_TRAP(kern_invalid, 0, 0, NULL), +/* 40 */ MACH_TRAP(_kernelrpc_mach_port_get_attributes_trap, 5, 5, munge_wwwww), /* 41 */ MACH_TRAP(_kernelrpc_mach_port_guard_trap, 4, 5, munge_wwlw), /* 42 */ MACH_TRAP(_kernelrpc_mach_port_unguard_trap, 3, 4, munge_wwl), /* 43 */ MACH_TRAP(mach_generate_activity_id, 3, 3, munge_www), diff --git a/osfmk/kern/task.c b/osfmk/kern/task.c index 873779ca9..20eef5136 100644 --- a/osfmk/kern/task.c +++ b/osfmk/kern/task.c 
@@ -161,6 +161,7 @@ #include #include +#include #if CONFIG_ATM #include @@ -184,6 +185,7 @@ lck_grp_attr_t task_lck_grp_attr; extern int exc_via_corpse_forking; extern int corpse_for_fatal_memkill; +extern boolean_t proc_send_synchronous_EXC_RESOURCE(void *p); /* Flag set by core audio when audio is playing. Used to stifle EXC_RESOURCE generation when active. */ int audio_active = 0; @@ -197,15 +199,39 @@ lck_spin_t dead_task_statistics_lock; ledger_template_t task_ledger_template = NULL; -struct _task_ledger_indices task_ledgers __attribute__((used)) = - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__((used)) = +{.cpu_time = -1, + .tkm_private = -1, + .tkm_shared = -1, + .phys_mem = -1, + .wired_mem = -1, + .internal = -1, + .iokit_mapped = -1, + .alternate_accounting = -1, + .alternate_accounting_compressed = -1, + .page_table = -1, + .phys_footprint = -1, + .internal_compressed = -1, + .purgeable_volatile = -1, + .purgeable_nonvolatile = -1, + .purgeable_volatile_compressed = -1, + .purgeable_nonvolatile_compressed = -1, + .network_volatile = -1, + .network_nonvolatile = -1, + .network_volatile_compressed = -1, + .network_nonvolatile_compressed = -1, + .platform_idle_wakeups = -1, + .interrupt_wakeups = -1, #if !CONFIG_EMBEDDED - { 0 /* initialized at runtime */}, + .sfi_wait_times = { 0 /* initialized at runtime */}, #endif /* !CONFIG_EMBEDDED */ - -1, -1, - -1, -1, - -1, -1, - }; + .cpu_time_billed_to_me = -1, + .cpu_time_billed_to_others = -1, + .physical_writes = -1, + .logical_writes = -1, + .energy_billed_to_me = -1, + .energy_billed_to_others = -1 +}; /* System sleep state */ boolean_t tasks_suspend_state; @@ -266,8 +292,11 @@ int64_t io_telemetry_limit; /* Threshold to take a microstackshot (0 indicated int64_t global_logical_writes_count = 0; /* Global count for logical writes */ static boolean_t global_update_logical_writes(int64_t); +#define 
TASK_MAX_THREAD_LIMIT 256 + #if MACH_ASSERT int pmap_ledgers_panic = 1; +int pmap_ledgers_panic_leeway = 3; #endif /* MACH_ASSERT */ int task_max = CONFIG_TASK_MAX; /* Max number of tasks */ @@ -280,9 +309,12 @@ int hwm_user_cores = 0; /* high watermark violations generate user core files */ extern void proc_getexecutableuuid(void *, unsigned char *, unsigned long); extern int proc_pid(struct proc *p); extern int proc_selfpid(void); +extern struct proc *current_proc(void); extern char *proc_name_address(struct proc *p); extern uint64_t get_dispatchqueue_offset_from_proc(void *); extern int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize); +extern void workq_proc_suspended(struct proc *p); +extern void workq_proc_resumed(struct proc *p); #if CONFIG_MEMORYSTATUS extern void proc_memstat_terminated(struct proc* p, boolean_t set); @@ -298,6 +330,17 @@ extern void memorystatus_abort_vm_map_fork(task_t); #endif /* MACH_BSD */ +#if DEVELOPMENT || DEBUG +int exc_resource_threads_enabled; +#endif /* DEVELOPMENT || DEBUG */ + +#if (DEVELOPMENT || DEBUG) && TASK_EXC_GUARD_DELIVER_CORPSE +uint32_t task_exc_guard_default = TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_CORPSE | + TASK_EXC_GUARD_VM_DELIVER | TASK_EXC_GUARD_VM_CORPSE; +#else +uint32_t task_exc_guard_default = 0; +#endif + /* Forwards */ static void task_hold_locked(task_t task); @@ -306,21 +349,12 @@ static void task_release_locked(task_t task); static void task_synchronizer_destroy_all(task_t task); -void -task_backing_store_privileged( - task_t task) -{ - task_lock(task); - task->priv_flags |= VM_BACKING_STORE_PRIV; - task_unlock(task); - return; -} - void task_set_64bit( task_t task, - boolean_t is64bit) + boolean_t is_64bit, + boolean_t is_64bit_data) { #if defined(__i386__) || defined(__x86_64__) || defined(__arm64__) thread_t thread; @@ -328,15 +362,34 @@ task_set_64bit( task_lock(task); - if (is64bit) { - if (task_has_64BitAddr(task)) + /* + * Switching to/from 64-bit address spaces + */ + if 
(is_64bit) { + if (!task_has_64Bit_addr(task)) { + task_set_64Bit_addr(task); + } + } else { + if (task_has_64Bit_addr(task)) { + task_clear_64Bit_addr(task); + } + } + + /* + * Switching to/from 64-bit register state. + */ + if (is_64bit_data) { + if (task_has_64Bit_data(task)) goto out; - task_set_64BitAddr(task); + + task_set_64Bit_data(task); } else { - if ( !task_has_64BitAddr(task)) + if ( !task_has_64Bit_data(task)) goto out; - task_clear_64BitAddr(task); + + task_clear_64Bit_data(task); } + /* FIXME: On x86, the thread save state flavor can diverge from the * task's 64-bit feature flag due to the 32-bit/64-bit register save * state dichotomy. Since we can be pre-empted in this interval, @@ -381,6 +434,12 @@ task_set_64bit( task_unlock(task); } +boolean_t +task_get_64bit_data(task_t task) +{ + return task_has_64Bit_data(task); +} + void task_set_platform_binary( task_t task, @@ -526,6 +585,14 @@ task_wait_to_return(void) task_unlock(task); +#if CONFIG_MACF + /* + * Before jumping to userspace and allowing this process to execute any code, + * notify any interested parties. + */ + mac_proc_notify_exec_complete(current_proc()); +#endif + thread_bootstrap_return(); } @@ -702,10 +769,16 @@ task_init(void) #endif /* CONFIG_MEMORYSTATUS */ } -#if MACH_ASSERT - PE_parse_boot_argn("pmap_ledgers_panic", &pmap_ledgers_panic, - sizeof (pmap_ledgers_panic)); -#endif /* MACH_ASSERT */ +#if DEVELOPMENT || DEBUG + if (!PE_parse_boot_argn("exc_resource_threads", + &exc_resource_threads_enabled, + sizeof(exc_resource_threads_enabled))) { + exc_resource_threads_enabled = 1; + } + PE_parse_boot_argn("task_exc_guard_default", + &task_exc_guard_default, + sizeof(task_exc_guard_default)); +#endif /* DEVELOPMENT || DEBUG */ #if CONFIG_COREDUMP if (!PE_parse_boot_argn("hwm_user_cores", &hwm_user_cores, @@ -766,9 +839,9 @@ task_init(void) * Create the kernel task as the first task. 
*/ #ifdef __LP64__ - if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS) + if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TRUE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS) #else - if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS) + if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, FALSE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS) #endif panic("task_init\n"); @@ -874,8 +947,12 @@ init_task_ledgers(void) assert(kernel_task == TASK_NULL); #if MACH_ASSERT - PE_parse_boot_argn("pmap_ledgers_panic", &pmap_ledgers_panic, + PE_parse_boot_argn("pmap_ledgers_panic", + &pmap_ledgers_panic, sizeof (pmap_ledgers_panic)); + PE_parse_boot_argn("pmap_ledgers_panic_leeway", + &pmap_ledgers_panic_leeway, + sizeof (pmap_ledgers_panic_leeway)); #endif /* MACH_ASSERT */ if ((t = ledger_template_create("Per-task ledger")) == NULL) @@ -908,6 +985,12 @@ init_task_ledgers(void) task_ledgers.purgeable_nonvolatile = ledger_entry_add(t, "purgeable_nonvolatile", "physmem", "bytes"); task_ledgers.purgeable_volatile_compressed = ledger_entry_add(t, "purgeable_volatile_compress", "physmem", "bytes"); task_ledgers.purgeable_nonvolatile_compressed = ledger_entry_add(t, "purgeable_nonvolatile_compress", "physmem", "bytes"); + + task_ledgers.network_volatile = ledger_entry_add(t, "network_volatile", "physmem", "bytes"); + task_ledgers.network_nonvolatile = ledger_entry_add(t, "network_nonvolatile", "physmem", "bytes"); + task_ledgers.network_volatile_compressed = ledger_entry_add(t, "network_volatile_compressed", "physmem", "bytes"); + task_ledgers.network_nonvolatile_compressed = ledger_entry_add(t, "network_nonvolatile_compressed", "physmem", "bytes"); + task_ledgers.platform_idle_wakeups = ledger_entry_add(t, "platform_idle_wakeups", "power", "count"); task_ledgers.interrupt_wakeups = ledger_entry_add(t, "interrupt_wakeups", "power", @@ -965,6 +1048,10 @@ 
init_task_ledgers(void) (task_ledgers.purgeable_nonvolatile < 0) || (task_ledgers.purgeable_volatile_compressed < 0) || (task_ledgers.purgeable_nonvolatile_compressed < 0) || + (task_ledgers.network_volatile < 0) || + (task_ledgers.network_nonvolatile < 0) || + (task_ledgers.network_volatile_compressed < 0) || + (task_ledgers.network_nonvolatile_compressed < 0) || (task_ledgers.platform_idle_wakeups < 0) || (task_ledgers.interrupt_wakeups < 0) || (task_ledgers.cpu_time_billed_to_me < 0) || (task_ledgers.cpu_time_billed_to_others < 0) || @@ -988,6 +1075,11 @@ init_task_ledgers(void) ledger_track_credit_only(t, task_ledgers.purgeable_volatile_compressed); ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile_compressed); + ledger_track_credit_only(t, task_ledgers.network_volatile); + ledger_track_credit_only(t, task_ledgers.network_nonvolatile); + ledger_track_credit_only(t, task_ledgers.network_volatile_compressed); + ledger_track_credit_only(t, task_ledgers.network_nonvolatile_compressed); + ledger_track_maximum(t, task_ledgers.phys_footprint, 60); #if MACH_ASSERT if (pmap_ledgers_panic) { @@ -1002,6 +1094,11 @@ init_task_ledgers(void) ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile); ledger_panic_on_negative(t, task_ledgers.purgeable_volatile_compressed); ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile_compressed); + + ledger_panic_on_negative(t, task_ledgers.network_volatile); + ledger_panic_on_negative(t, task_ledgers.network_nonvolatile); + ledger_panic_on_negative(t, task_ledgers.network_volatile_compressed); + ledger_panic_on_negative(t, task_ledgers.network_nonvolatile_compressed); } #endif /* MACH_ASSERT */ @@ -1024,6 +1121,7 @@ task_create_internal( coalition_t *parent_coalitions __unused, boolean_t inherit_memory, __unused boolean_t is_64bit, + boolean_t is_64bit_data, uint32_t t_flags, uint32_t t_procflags, task_t *child_task) /* OUT */ @@ -1076,7 +1174,6 @@ task_create_internal( new_task->legacy_stop_count = 0; 
new_task->active = TRUE; new_task->halting = FALSE; - new_task->user_data = NULL; new_task->priv_flags = 0; new_task->t_flags = t_flags; new_task->t_procflags = t_procflags; @@ -1084,6 +1181,8 @@ task_create_internal( new_task->crashed_thread_id = 0; new_task->exec_token = 0; + new_task->task_exc_guard = task_exc_guard_default; + #if CONFIG_ATM new_task->atm_context = NULL; #endif @@ -1160,13 +1259,12 @@ task_create_internal( new_task->mem_notify_reserved = 0; new_task->memlimit_attrs_reserved = 0; -#if IMPORTANCE_INHERITANCE - new_task->task_imp_base = NULL; -#endif /* IMPORTANCE_INHERITANCE */ new_task->requested_policy = default_task_requested_policy; new_task->effective_policy = default_task_effective_policy; + task_importance_init_from_parent(new_task, parent_task); + if (parent_task != TASK_NULL) { new_task->sec_token = parent_task->sec_token; new_task->audit_token = parent_task->audit_token; @@ -1175,8 +1273,14 @@ task_create_internal( shared_region = vm_shared_region_get(parent_task); vm_shared_region_set(new_task, shared_region); - if(task_has_64BitAddr(parent_task)) - task_set_64BitAddr(new_task); + if(task_has_64Bit_addr(parent_task)) { + task_set_64Bit_addr(new_task); + } + + if(task_has_64Bit_data(parent_task)) { + task_set_64Bit_data(new_task); + } + new_task->all_image_info_addr = parent_task->all_image_info_addr; new_task->all_image_info_size = parent_task->all_image_info_size; @@ -1185,43 +1289,6 @@ task_create_internal( new_task->pset_hint = parent_task->pset_hint = task_choose_pset(parent_task); -#if IMPORTANCE_INHERITANCE - ipc_importance_task_t new_task_imp = IIT_NULL; - boolean_t inherit_receive = TRUE; - - if (task_is_marked_importance_donor(parent_task)) { - new_task_imp = ipc_importance_for_task(new_task, FALSE); - assert(IIT_NULL != new_task_imp); - ipc_importance_task_mark_donor(new_task_imp, TRUE); - } -#if CONFIG_EMBEDDED - /* Embedded only wants to inherit for exec copy task */ - if ((t_procflags & TPF_EXEC_COPY) == 0) { - 
inherit_receive = FALSE; - } -#endif /* CONFIG_EMBEDDED */ - - if (inherit_receive) { - if (task_is_marked_importance_receiver(parent_task)) { - if (IIT_NULL == new_task_imp) - new_task_imp = ipc_importance_for_task(new_task, FALSE); - assert(IIT_NULL != new_task_imp); - ipc_importance_task_mark_receiver(new_task_imp, TRUE); - } - if (task_is_marked_importance_denap_receiver(parent_task)) { - if (IIT_NULL == new_task_imp) - new_task_imp = ipc_importance_for_task(new_task, FALSE); - assert(IIT_NULL != new_task_imp); - ipc_importance_task_mark_denap_receiver(new_task_imp, TRUE); - } - } - - if (IIT_NULL != new_task_imp) { - assert(new_task->task_imp_base == new_task_imp); - ipc_importance_task_release(new_task_imp); - } -#endif /* IMPORTANCE_INHERITANCE */ - new_task->priority = BASEPRI_DEFAULT; new_task->max_priority = MAXPRI_USER; @@ -1230,9 +1297,15 @@ task_create_internal( new_task->sec_token = KERNEL_SECURITY_TOKEN; new_task->audit_token = KERNEL_AUDIT_TOKEN; #ifdef __LP64__ - if(is_64bit) - task_set_64BitAddr(new_task); + if(is_64bit) { + task_set_64Bit_addr(new_task); + } #endif + + if(is_64bit_data) { + task_set_64Bit_data(new_task); + } + new_task->all_image_info_addr = (mach_vm_address_t)0; new_task->all_image_info_size = (mach_vm_size_t)0; @@ -1269,6 +1342,7 @@ task_create_internal( new_task->total_user_time = 0; new_task->total_system_time = 0; new_task->total_ptime = 0; + new_task->total_runnable_time = 0; new_task->faults = 0; new_task->pageins = 0; new_task->cow_faults = 0; @@ -1341,9 +1415,6 @@ task_create_internal( new_task->dispatchqueue_offset = parent_task->dispatchqueue_offset; } - if (vm_backing_store_low && parent_task != NULL) - new_task->priv_flags |= (parent_task->priv_flags&VM_BACKING_STORE_PRIV); - new_task->task_volatile_objects = 0; new_task->task_nonvolatile_objects = 0; new_task->task_purgeable_disowning = FALSE; @@ -1351,14 +1422,26 @@ task_create_internal( queue_init(&new_task->task_objq); task_objq_lock_init(new_task); +#if 
__arm64__ + new_task->task_legacy_footprint = FALSE; +#endif /* __arm64__ */ new_task->task_region_footprint = FALSE; - + new_task->task_has_crossed_thread_limit = FALSE; + new_task->task_thread_limit = 0; #if CONFIG_SECLUDED_MEMORY new_task->task_can_use_secluded_mem = FALSE; new_task->task_could_use_secluded_mem = FALSE; new_task->task_could_also_use_secluded_mem = FALSE; + new_task->task_suppressed_secluded = FALSE; #endif /* CONFIG_SECLUDED_MEMORY */ + /* + * t_flags is set up above. But since we don't + * support darkwake mode being set that way + * currently, we clear it out here explicitly. + */ + new_task->t_flags &= ~(TF_DARKWAKE_MODE); + queue_init(&new_task->io_user_clients); ipc_task_enable(new_task); @@ -1389,6 +1472,7 @@ task_rollup_accounting_info(task_t to_task, task_t from_task) to_task->total_user_time = from_task->total_user_time; to_task->total_system_time = from_task->total_system_time; to_task->total_ptime = from_task->total_ptime; + to_task->total_runnable_time = from_task->total_runnable_time; to_task->faults = from_task->faults; to_task->pageins = from_task->pageins; to_task->cow_faults = from_task->cow_faults; @@ -2005,7 +2089,7 @@ task_duplicate_map_and_threads( kern_return_t kr = KERN_SUCCESS; int active; thread_t thread, self, thread_return = THREAD_NULL; - thread_t new_thread = THREAD_NULL; + thread_t new_thread = THREAD_NULL, first_thread = THREAD_NULL; thread_t *thread_array; uint32_t active_thread_count = 0, array_count = 0, i; vm_map_t oldmap; @@ -2049,9 +2133,13 @@ task_duplicate_map_and_threads( new_task->map = vm_map_fork(new_task->ledger, task->map, (VM_MAP_FORK_SHARE_IF_INHERIT_NONE | - VM_MAP_FORK_PRESERVE_PURGEABLE)); + VM_MAP_FORK_PRESERVE_PURGEABLE | + VM_MAP_FORK_CORPSE_FOOTPRINT)); vm_map_deallocate(oldmap); + /* copy ledgers that impact the memory footprint */ + vm_map_copy_footprint_ledgers(task, new_task); + /* Get all the udata pointers from kqueue */ est_knotes = kevent_proc_copy_uptrs(p, NULL, 0); if (est_knotes > 
0) { @@ -2104,6 +2192,8 @@ task_duplicate_map_and_threads( if (thread_array[i] == self) { thread_return = new_thread; new_task->crashed_thread_id = thread_tid(new_thread); + } else if (first_thread == NULL) { + first_thread = new_thread; } else { /* drop the extra ref returned by thread_create_with_continuation */ thread_deallocate(new_thread); @@ -2119,9 +2209,19 @@ task_duplicate_map_and_threads( /* Copy thread name */ bsd_copythreadname(new_thread->uthread, thread_array[i]->uthread); + new_thread->thread_tag = thread_array[i]->thread_tag; thread_copy_resource_info(new_thread, thread_array[i]); } + /* return the first thread if we couldn't find the equivalent of current */ + if (thread_return == THREAD_NULL) { + thread_return = first_thread; + } + else if (first_thread != THREAD_NULL) { + /* drop the extra ref returned by thread_create_with_continuation */ + thread_deallocate(first_thread); + } + task_resume_internal(task); for (i = 0; i < array_count; i++) { @@ -2188,6 +2288,10 @@ task_terminate_internal( } task->task_could_use_secluded_mem = FALSE; task->task_could_also_use_secluded_mem = FALSE; + + if (task->task_suppressed_secluded) { + stop_secluded_suppression(task); + } #endif /* CONFIG_SECLUDED_MEMORY */ if (!task->active) { @@ -2339,9 +2443,11 @@ task_terminate_internal( * Final cleanup: * + no unnesting * + remove immutable mappings + * + allow gaps in range */ (VM_MAP_REMOVE_NO_UNNESTING | - VM_MAP_REMOVE_IMMUTABLE)); + VM_MAP_REMOVE_IMMUTABLE | + VM_MAP_REMOVE_GAPS_OK)); /* release our shared region */ vm_shared_region_set(task, NULL); @@ -2530,9 +2636,11 @@ task_complete_halt(task_t task) * Final cleanup: * + no unnesting * + remove immutable mappings + * + allow gaps in the range */ (VM_MAP_REMOVE_NO_UNNESTING | - VM_MAP_REMOVE_IMMUTABLE)); + VM_MAP_REMOVE_IMMUTABLE | + VM_MAP_REMOVE_GAPS_OK)); /* * Kick out any IOKitUser handles to the task. 
At best they're stale, @@ -2548,7 +2656,7 @@ task_complete_halt(task_t task) * This is a recursive-style suspension of the task, a count of * suspends is maintained. * - * CONDITIONS: the task is locked and active. + * CONDITIONS: the task is locked and active. */ void task_hold_locked( @@ -2561,6 +2669,10 @@ task_hold_locked( if (task->suspend_count++ > 0) return; + if (task->bsd_info) { + workq_proc_suspended(task->bsd_info); + } + /* * Iterate through all the threads and hold them. */ @@ -2675,6 +2787,10 @@ task_release_locked( if (--task->suspend_count > 0) return; + if (task->bsd_info) { + workq_proc_resumed(task->bsd_info); + } + queue_iterate(&task->threads, thread, thread_t, task_threads) { thread_mtx_lock(thread); thread_release(thread); @@ -3400,8 +3516,9 @@ task_freeze( uint32_t *clean_count, uint32_t *dirty_count, uint32_t dirty_budget, - boolean_t *shared, - boolean_t walk_only) + uint32_t *shared_count, + int *freezer_error_code, + boolean_t eval_only) { kern_return_t kr = KERN_SUCCESS; @@ -3426,22 +3543,29 @@ task_freeze( task_unlock(task); - if (walk_only) { - panic("task_freeze - walk_only == TRUE"); - } else { - kr = vm_map_freeze(task->map, purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared); - } + kr = vm_map_freeze(task->map, + purgeable_count, + wired_count, + clean_count, + dirty_count, + dirty_budget, + shared_count, + freezer_error_code, + eval_only); task_lock(task); - if (walk_only == FALSE && kr == KERN_SUCCESS) + if ((kr == KERN_SUCCESS) && (eval_only == FALSE)) { task->frozen = TRUE; + } + task->changing_freeze_state = FALSE; thread_wakeup(&task->changing_freeze_state); task_unlock(task); - if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { + if (VM_CONFIG_COMPRESSOR_IS_PRESENT && + (eval_only == FALSE)) { vm_wake_compactor_swapper(); /* * We do an explicit wakeup of the swapout thread here @@ -3807,7 +3931,7 @@ task_info( if (thread->options & TH_OPT_IDLE_THREAD) continue; - thread_read_times(thread, &user_time, 
&system_time); + thread_read_times(thread, &user_time, &system_time, NULL); time_value_add(&times_info->user_time, &user_time); time_value_add(&times_info->system_time, &system_time); @@ -3891,7 +4015,7 @@ task_info( /* only set format on output for those expecting it */ if (*task_info_count >= TASK_DYLD_INFO_COUNT) { - info->all_image_info_format = task_has_64BitAddr(task) ? + info->all_image_info_format = task_has_64Bit_addr(task) ? TASK_DYLD_ALL_IMAGE_INFO_64 : TASK_DYLD_ALL_IMAGE_INFO_32 ; *task_info_count = TASK_DYLD_INFO_COUNT; @@ -4359,7 +4483,7 @@ task_info( flags_info = (task_flags_info_t)task_info_out; /* only publish the 64-bit flag of the task */ - flags_info->flags = task->t_flags & TF_64B_ADDR; + flags_info->flags = task->t_flags & (TF_64B_ADDR | TF_64B_DATA); *task_info_count = TASK_FLAGS_INFO_COUNT; break; @@ -4984,9 +5108,6 @@ PROC_VIOLATED_GUARD__SEND_EXC_GUARD_AND_SUSPEND( return kr; } -extern kern_return_t -task_violated_guard(mach_exception_code_t, mach_exception_subcode_t, void *); - kern_return_t task_violated_guard( mach_exception_code_t code, @@ -5093,6 +5214,7 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, int pid = 0; const char *procname = "unknown"; mach_exception_data_type_t code[EXCEPTION_CODE_MAX]; + boolean_t send_sync_exc_resource = FALSE; #ifdef MACH_BSD pid = proc_selfpid(); @@ -5105,8 +5227,10 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, return; } - if (task->bsd_info != NULL) + if (task->bsd_info != NULL) { procname = proc_name_address(current_task()->bsd_info); + send_sync_exc_resource = proc_send_synchronous_EXC_RESOURCE(current_task()->bsd_info); + } #endif #if CONFIG_COREDUMP if (hwm_user_cores) { @@ -5155,10 +5279,13 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb, EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_HIGH_WATERMARK); EXC_RESOURCE_HWM_ENCODE_LIMIT(code[0], max_footprint_mb); - /* Do not generate a corpse fork if the 
violation is a fatal one */ - if (is_fatal || exc_via_corpse_forking == 0) { - /* Do not send a EXC_RESOURCE is corpse_for_fatal_memkill is set */ - if (corpse_for_fatal_memkill == 0) { + /* + * Do not generate a corpse fork if the violation is a fatal one + * or the process wants synchronous EXC_RESOURCE exceptions. + */ + if (is_fatal || send_sync_exc_resource || exc_via_corpse_forking == 0) { + /* Do not send a EXC_RESOURCE if corpse_for_fatal_memkill is set */ + if (send_sync_exc_resource || corpse_for_fatal_memkill == 0) { /* * Use the _internal_ variant so that no user-space * process can resume our task from under us. @@ -5396,6 +5523,17 @@ task_get_phys_footprint_limit( } #endif /* CONFIG_MEMORYSTATUS */ +void +task_set_thread_limit(task_t task, uint16_t thread_limit) +{ + assert(task != kernel_task); + if (thread_limit <= TASK_MAX_THREAD_LIMIT) { + task_lock(task); + task->task_thread_limit = thread_limit; + task_unlock(task); + } +} + /* * We need to export some functions to other components that * are currently implemented in macros within the osfmk @@ -6153,7 +6291,8 @@ task_set_could_also_use_secluded_mem( boolean_t task_can_use_secluded_mem( - task_t task) + task_t task, + boolean_t is_alloc) { if (task->task_can_use_secluded_mem) { assert(task->task_could_use_secluded_mem); @@ -6165,6 +6304,20 @@ task_can_use_secluded_mem( assert(num_tasks_can_use_secluded_mem > 0); return TRUE; } + + /* + * If a single task is using more than some amount of + * memory, allow it to dip into secluded and also begin + * suppression of secluded memory until the task exits. 
+ */ + if (is_alloc && secluded_shutoff_trigger != 0) { + uint64_t phys_used = get_task_phys_footprint(task); + if (phys_used > secluded_shutoff_trigger) { + start_secluded_suppression(task); + return TRUE; + } + } + return FALSE; } @@ -6219,3 +6372,38 @@ task_self_region_footprint_set( } task_unlock(curtask); } + +void +task_set_darkwake_mode(task_t task, boolean_t set_mode) +{ + assert(task); + + task_lock(task); + + if (set_mode) { + task->t_flags |= TF_DARKWAKE_MODE; + } else { + task->t_flags &= ~(TF_DARKWAKE_MODE); + } + + task_unlock(task); +} + +boolean_t +task_get_darkwake_mode(task_t task) +{ + assert(task); + return ((task->t_flags & TF_DARKWAKE_MODE) != 0); +} + +#if __arm64__ +void +task_set_legacy_footprint( + task_t task, + boolean_t new_val) +{ + task_lock(task); + task->task_legacy_footprint = new_val; + task_unlock(task); +} +#endif /* __arm64__ */ diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h index f0cbdff1d..fe43b2db1 100644 --- a/osfmk/kern/task.h +++ b/osfmk/kern/task.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010, 2015 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,11 +150,12 @@ struct task { _Atomic uint32_t ref_count; /* Number of references to me */ boolean_t active; /* Task has not been terminated */ boolean_t halting; /* Task is being halted */ + /* Virtual timers */ + uint32_t vtimers; /* Miscellaneous */ vm_map_t map; /* Address space description */ queue_chain_t tasks; /* global list of tasks */ - void *user_data; /* Arbitrary data settable via IPC */ #if defined(CONFIG_SCHED_MULTIQ) sched_group_t sched_group; @@ -182,14 +183,12 @@ struct task { /* Task security and audit tokens */ security_token_t sec_token; audit_token_t audit_token; - + /* Statistics */ uint64_t total_user_time; /* terminated threads only */ uint64_t total_system_time; uint64_t total_ptime; - - /* Virtual timers */ - uint32_t vtimers; + uint64_t total_runnable_time; /* IPC structures */ decl_lck_mtx_data(,itk_lock_data) @@ -210,12 +209,11 @@ struct task { struct ipc_space *itk_space; + ledger_t ledger; /* Synchronizer ownership information */ queue_head_t semaphore_list; /* list of owned semaphores */ int semaphores_owned; /* number of semaphores owned */ - ledger_t ledger; - unsigned int priv_flags; /* privilege resource flags */ #define VM_BACKING_STORE_PRIV 0x1 @@ -257,15 +255,27 @@ struct task { #define TF_LRETURNWAITER 0x00000200 /* task is waiting for TF_LRETURNWAIT to get cleared */ #define TF_PLATFORM 0x00000400 /* task is a platform binary */ #define TF_CA_CLIENT_WI 0x00000800 /* task has CA_CLIENT work interval */ +#define TF_DARKWAKE_MODE 0x00001000 /* task is in darkwake mode */ + +/* + * Task is running within a 64-bit address space. 
+ */ +#define task_has_64Bit_addr(task) \ + (((task)->t_flags & TF_64B_ADDR) != 0) +#define task_set_64Bit_addr(task) \ + ((task)->t_flags |= TF_64B_ADDR) +#define task_clear_64Bit_addr(task) \ + ((task)->t_flags &= ~TF_64B_ADDR) -#define task_has_64BitAddr(task) \ - (((task)->t_flags & TF_64B_ADDR) != 0) -#define task_set_64BitAddr(task) \ - ((task)->t_flags |= TF_64B_ADDR) -#define task_clear_64BitAddr(task) \ - ((task)->t_flags &= ~TF_64B_ADDR) -#define task_has_64BitData(task) \ - (((task)->t_flags & TF_64B_DATA) != 0) +/* + * Task is using 64-bit machine state. + */ +#define task_has_64Bit_data(task) \ + (((task)->t_flags & TF_64B_DATA) != 0) +#define task_set_64Bit_data(task) \ + ((task)->t_flags |= TF_64B_DATA) +#define task_clear_64Bit_data(task) \ + ((task)->t_flags &= ~TF_64B_DATA) #define task_is_a_corpse(task) \ (((task)->t_flags & TF_CORPSE) != 0) @@ -316,8 +326,11 @@ struct task { applied_ru_cpu_ext :4; uint8_t rusage_cpu_flags; uint8_t rusage_cpu_percentage; /* Task-wide CPU limit percentage */ - uint64_t rusage_cpu_interval; /* Task-wide CPU limit interval */ uint8_t rusage_cpu_perthr_percentage; /* Per-thread CPU limit percentage */ +#if MACH_ASSERT + int8_t suspends_outstanding; /* suspends this task performed in excess of resumes */ +#endif + uint64_t rusage_cpu_interval; /* Task-wide CPU limit interval */ uint64_t rusage_cpu_perthr_interval; /* Per-thread CPU limit interval */ uint64_t rusage_cpu_deadline; thread_call_t rusage_cpu_callt; @@ -338,10 +351,6 @@ struct task { vm_extmod_statistics_data_t extmod_statistics; -#if MACH_ASSERT - int8_t suspends_outstanding; /* suspends this task performed in excess of resumes */ -#endif - struct task_requested_policy requested_policy; struct task_effective_policy effective_policy; @@ -393,8 +402,13 @@ struct task { queue_head_t task_objq; decl_lck_mtx_data(,task_objq_lock) /* protects "task_objq" */ - boolean_t task_region_footprint; - + unsigned int task_thread_limit:16; +#if __arm64__ + unsigned int 
task_legacy_footprint:1; +#endif /* __arm64__ */ + unsigned int task_region_footprint:1; + unsigned int task_has_crossed_thread_limit:1; + uint32_t exec_token; /* * A task's coalition set is "adopted" in task_create_internal * and unset in task_deallocate_internal, so each array member @@ -416,15 +430,33 @@ struct task { #endif /* HYPERVISOR */ #if CONFIG_SECLUDED_MEMORY - boolean_t task_can_use_secluded_mem; - boolean_t task_could_use_secluded_mem; - boolean_t task_could_also_use_secluded_mem; + uint8_t task_can_use_secluded_mem; + uint8_t task_could_use_secluded_mem; + uint8_t task_could_also_use_secluded_mem; + uint8_t task_suppressed_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ + uint32_t task_exc_guard; + queue_head_t io_user_clients; - uint32_t exec_token; }; +#define TASK_EXC_GUARD_VM_DELIVER 0x01 /* Deliver virtual memory EXC_GUARD exceptions */ +#define TASK_EXC_GUARD_VM_ONCE 0x02 /* Deliver them only once */ +#define TASK_EXC_GUARD_VM_CORPSE 0x04 /* Deliver them via a forked corpse */ +#define TASK_EXC_GUARD_VM_FATAL 0x08 /* Virtual Memory EXC_GUARD delivery is fatal */ +#define TASK_EXC_GUARD_VM_ALL 0x0f + +#define TASK_EXC_GUARD_MP_DELIVER 0x10 /* Deliver mach port EXC_GUARD exceptions */ +#define TASK_EXC_GUARD_MP_ONCE 0x20 /* Deliver them only once */ +#define TASK_EXC_GUARD_MP_CORPSE 0x40 /* Deliver them via a forked corpse */ +#define TASK_EXC_GUARD_MP_FATAL 0x80 /* mach port EXC_GUARD delivery is fatal */ + +extern uint32_t task_exc_guard_default; + +extern kern_return_t +task_violated_guard(mach_exception_code_t, mach_exception_subcode_t, void *); + #define task_lock(task) lck_mtx_lock(&(task)->lock) #define task_lock_assert_owned(task) LCK_MTX_ASSERT(&(task)->lock, LCK_MTX_ASSERT_OWNED) #define task_lock_try(task) lck_mtx_try_lock(&(task)->lock) @@ -552,8 +584,9 @@ extern kern_return_t task_freeze( uint32_t *clean_count, uint32_t *dirty_count, uint32_t dirty_budget, - boolean_t *shared, - boolean_t walk_only); + uint32_t *shared_count, + int 
*freezer_error_code, + boolean_t eval_only); /* Thaw a currently frozen task */ extern kern_return_t task_thaw( @@ -577,6 +610,7 @@ extern kern_return_t task_create_internal( coalition_t *parent_coalitions, boolean_t inherit_memory, boolean_t is_64bit, + boolean_t is_64bit_data, uint32_t flags, uint32_t procflags, task_t *child_task); /* OUT */ @@ -625,7 +659,11 @@ extern void task_vtimer_update( extern void task_set_64bit( task_t task, - boolean_t is64bit); + boolean_t is_64bit, + boolean_t is_64bit_data); + +extern boolean_t task_get_64bit_data( + task_t task); extern void task_set_platform_binary( task_t task, @@ -634,9 +672,6 @@ extern bool task_set_ca_client_wi( task_t task, boolean_t ca_client_wi); -extern void task_backing_store_privileged( - task_t task); - extern void task_set_dyld_info( task_t task, mach_vm_address_t addr, @@ -667,7 +702,9 @@ extern uint64_t get_task_resident_size(task_t); extern uint64_t get_task_compressed(task_t); extern uint64_t get_task_resident_max(task_t); extern uint64_t get_task_phys_footprint(task_t); -extern uint64_t get_task_phys_footprint_recent_max(task_t); +#if CONFIG_LEDGER_INTERVAL_MAX +extern uint64_t get_task_phys_footprint_interval_max(task_t, int reset); +#endif /* CONFIG_LEDGER_INTERVAL_MAX */ extern uint64_t get_task_phys_footprint_lifetime_max(task_t); extern uint64_t get_task_phys_footprint_limit(task_t); extern uint64_t get_task_purgeable_size(task_t); @@ -686,6 +723,9 @@ extern uint64_t get_task_alternate_accounting(task_t); extern uint64_t get_task_alternate_accounting_compressed(task_t); extern uint64_t get_task_memory_region_count(task_t); extern uint64_t get_task_page_table(task_t); +extern uint64_t get_task_network_nonvolatile(task_t); +extern uint64_t get_task_network_nonvolatile_compressed(task_t); +extern uint64_t get_task_wired_mem(task_t); extern kern_return_t task_convert_phys_footprint_limit(int, int *); extern kern_return_t task_set_phys_footprint_limit_internal(task_t, int, int *, boolean_t, 
boolean_t); @@ -699,6 +739,9 @@ extern void task_set_memlimit_is_fatal(task_t task, boolean_t memlimit_is_fatal) extern boolean_t task_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active); extern void task_mark_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active); +extern void task_set_thread_limit(task_t task, uint16_t thread_limit); + + extern boolean_t is_kerneltask(task_t task); extern boolean_t is_corpsetask(task_t task); @@ -735,6 +778,10 @@ struct _task_ledger_indices { int purgeable_nonvolatile; int purgeable_volatile_compressed; int purgeable_nonvolatile_compressed; + int network_volatile; + int network_nonvolatile; + int network_volatile_compressed; + int network_nonvolatile_compressed; int platform_idle_wakeups; int interrupt_wakeups; #if CONFIG_SCHED_SFI @@ -826,10 +873,19 @@ extern void task_set_could_use_secluded_mem( extern void task_set_could_also_use_secluded_mem( task_t task, boolean_t could_also_use_secluded_mem); -extern boolean_t task_can_use_secluded_mem(task_t task); +extern boolean_t task_can_use_secluded_mem( + task_t task, + boolean_t is_allocate); extern boolean_t task_could_use_secluded_mem(task_t task); #endif /* CONFIG_SECLUDED_MEMORY */ +extern void task_set_darkwake_mode(task_t, boolean_t); +extern boolean_t task_get_darkwake_mode(task_t); + +#if __arm64__ +extern void task_set_legacy_footprint(task_t task, boolean_t new_val); +#endif /* __arm64__ */ + #if CONFIG_MACF extern struct label *get_task_crash_label(task_t task); #endif /* CONFIG_MACF */ diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c index ca3e83f18..f44ba4c84 100644 --- a/osfmk/kern/task_policy.c +++ b/osfmk/kern/task_policy.c @@ -248,11 +248,7 @@ int proc_tal_disk_tier = THROTTLE_LEVEL_TIER1; int proc_graphics_timer_qos = (LATENCY_QOS_TIER_0 & 0xFF); -#if CONFIG_EMBEDDED -const int proc_default_bg_iotier = THROTTLE_LEVEL_TIER3; -#else const int proc_default_bg_iotier = THROTTLE_LEVEL_TIER2; -#endif /* Latency/throughput 
QoS fields remain zeroed, i.e. TIER_UNSPECIFIED at creation */ const struct task_requested_policy default_task_requested_policy = { @@ -323,8 +319,12 @@ qos_throughput_policy_package(uint32_t qv) { return (qv == THROUGHPUT_QOS_TIER_UNSPECIFIED) ? THROUGHPUT_QOS_TIER_UNSPECIFIED : ((0xFE << 16) | qv); } +#define TASK_POLICY_SUPPRESSION_DISABLE 0x1 +#define TASK_POLICY_SUPPRESSION_IOTIER2 0x2 +#define TASK_POLICY_SUPPRESSION_NONDONOR 0x4 /* TEMPORARY boot-arg controlling task_policy suppression (App Nap) */ -static boolean_t task_policy_suppression_disable = FALSE; +static boolean_t task_policy_suppression_flags = TASK_POLICY_SUPPRESSION_IOTIER2 | + TASK_POLICY_SUPPRESSION_NONDONOR; kern_return_t task_policy_set( @@ -462,7 +462,8 @@ task_policy_set( return kr; /* TEMPORARY disablement of task suppression */ - if (task_policy_suppression_disable && info->active) + if (info->active && + (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_DISABLE)) return KERN_SUCCESS; struct task_pend_token pend_token = {}; @@ -826,6 +827,11 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t next.tep_qos_ceiling = THREAD_QOS_UTILITY; break; + case TASK_DARWINBG_APPLICATION: + /* i.e. 'DARWIN_BG throttled background application' */ + next.tep_qos_ceiling = THREAD_QOS_BACKGROUND; + break; + case TASK_UNSPECIFIED: default: /* Apps that don't have an application role get @@ -849,16 +855,21 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t * * Backgrounding due to apptype does. 
*/ - if (requested.trp_int_darwinbg || requested.trp_ext_darwinbg) + if (requested.trp_int_darwinbg || requested.trp_ext_darwinbg || + next.tep_role == TASK_DARWINBG_APPLICATION) wants_watchersbg = wants_all_sockets_bg = wants_darwinbg = TRUE; - /* Background TAL apps are throttled when TAL is enabled */ + /* + * Deprecated TAL implementation for TAL apptype + * Background TAL apps are throttled when TAL is enabled + */ if (requested.trp_apptype == TASK_APPTYPE_APP_TAL && requested.trp_role == TASK_BACKGROUND_APPLICATION && requested.trp_tal_enabled == 1) { next.tep_tal_engaged = 1; } + /* New TAL implementation based on TAL role alone, works for all apps */ if ((requested.trp_apptype == TASK_APPTYPE_APP_DEFAULT || requested.trp_apptype == TASK_APPTYPE_APP_TAL) && requested.trp_role == TASK_THROTTLE_APPLICATION) { @@ -941,13 +952,13 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t next.tep_io_passive = 1; /* Calculate suppression-active flag */ - boolean_t memorystatus_appnap_transition = FALSE; + boolean_t appnap_transition = FALSE; if (requested.trp_sup_active && requested.trp_boosted == 0) next.tep_sup_active = 1; if (task->effective_policy.tep_sup_active != next.tep_sup_active) - memorystatus_appnap_transition = TRUE; + appnap_transition = TRUE; /* Calculate timer QOS */ int latency_qos = requested.trp_base_latency_qos; @@ -1001,10 +1012,14 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t switch (requested.trp_apptype) { case TASK_APPTYPE_APP_TAL: case TASK_APPTYPE_APP_DEFAULT: - if (requested.trp_ext_darwinbg == 0) - next.tep_live_donor = 1; - else + if (requested.trp_ext_darwinbg == 1 || + (next.tep_sup_active == 1 && + (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_NONDONOR)) || + next.tep_role == TASK_DARWINBG_APPLICATION) { next.tep_live_donor = 0; + } else { + next.tep_live_donor = 1; + } break; case TASK_APPTYPE_DAEMON_INTERACTIVE: @@ -1193,11 +1208,13 @@ 
task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t /* * Use the app-nap transitions to influence the - * transition of the process within the jetsam band. + * transition of the process within the jetsam band + * [and optionally its live-donor status] * On macOS only. */ - if (memorystatus_appnap_transition == TRUE) { + if (appnap_transition == TRUE) { if (task->effective_policy.tep_sup_active == 1) { + memorystatus_update_priority_for_appnap(((proc_t) task->bsd_info), TRUE); } else { memorystatus_update_priority_for_appnap(((proc_t) task->bsd_info), FALSE); @@ -1761,6 +1778,9 @@ proc_darwin_role_to_task_role(int darwin_role, int* task_role) case PRIO_DARWIN_ROLE_TAL_LAUNCH: role = TASK_THROTTLE_APPLICATION; break; + case PRIO_DARWIN_ROLE_DARWIN_BG: + role = TASK_DARWINBG_APPLICATION; + break; default: return EINVAL; } @@ -1784,6 +1804,8 @@ proc_task_role_to_darwin_role(int task_role) return PRIO_DARWIN_ROLE_UI; case TASK_THROTTLE_APPLICATION: return PRIO_DARWIN_ROLE_TAL_LAUNCH; + case TASK_DARWINBG_APPLICATION: + return PRIO_DARWIN_ROLE_DARWIN_BG; case TASK_UNSPECIFIED: default: return PRIO_DARWIN_ROLE_DEFAULT; @@ -2305,9 +2327,14 @@ proc_init_cpumon_params(void) proc_max_cpumon_interval *= NSEC_PER_SEC; /* TEMPORARY boot arg to control App suppression */ - PE_parse_boot_argn("task_policy_suppression_disable", - &task_policy_suppression_disable, - sizeof(task_policy_suppression_disable)); + PE_parse_boot_argn("task_policy_suppression_flags", + &task_policy_suppression_flags, + sizeof(task_policy_suppression_flags)); + + /* adjust suppression disk policy if called for in boot arg */ + if (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_IOTIER2) { + proc_suppressed_disk_tier = THROTTLE_LEVEL_TIER2; + } } /* @@ -3183,8 +3210,48 @@ task_importance_reset(__imp_only task_t task) #endif /* IMPORTANCE_INHERITANCE */ } +void +task_importance_init_from_parent(__imp_only task_t new_task, __imp_only task_t parent_task) +{ #if 
IMPORTANCE_INHERITANCE + ipc_importance_task_t new_task_imp = IIT_NULL; + + new_task->task_imp_base = NULL; + if (!parent_task) return; + + if (task_is_marked_importance_donor(parent_task)) { + new_task_imp = ipc_importance_for_task(new_task, FALSE); + assert(IIT_NULL != new_task_imp); + ipc_importance_task_mark_donor(new_task_imp, TRUE); + } + if (task_is_marked_live_importance_donor(parent_task)) { + if (IIT_NULL == new_task_imp) + new_task_imp = ipc_importance_for_task(new_task, FALSE); + assert(IIT_NULL != new_task_imp); + ipc_importance_task_mark_live_donor(new_task_imp, TRUE); + } + /* Do not inherit 'receiver' on fork, vfexec or true spawn */ + if (task_is_exec_copy(new_task) && + task_is_marked_importance_receiver(parent_task)) { + if (IIT_NULL == new_task_imp) + new_task_imp = ipc_importance_for_task(new_task, FALSE); + assert(IIT_NULL != new_task_imp); + ipc_importance_task_mark_receiver(new_task_imp, TRUE); + } + if (task_is_marked_importance_denap_receiver(parent_task)) { + if (IIT_NULL == new_task_imp) + new_task_imp = ipc_importance_for_task(new_task, FALSE); + assert(IIT_NULL != new_task_imp); + ipc_importance_task_mark_denap_receiver(new_task_imp, TRUE); + } + if (IIT_NULL != new_task_imp) { + assert(new_task->task_imp_base == new_task_imp); + ipc_importance_task_release(new_task_imp); + } +#endif /* IMPORTANCE_INHERITANCE */ +} +#if IMPORTANCE_INHERITANCE /* * Sets the task boost bit to the provided value. Does NOT run the update function. * diff --git a/osfmk/kern/telemetry.c b/osfmk/kern/telemetry.c index 120885eac..723d48f5b 100644 --- a/osfmk/kern/telemetry.c +++ b/osfmk/kern/telemetry.c @@ -2,7 +2,7 @@ * Copyright (c) 2012-2013 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include @@ -35,9 +35,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -52,6 +52,7 @@ #include #include +#include #include #include @@ -93,10 +94,11 @@ volatile boolean_t telemetry_needs_timer_arming_record = FALSE; * If TRUE, record micro-stackshot samples for all tasks. * If FALSE, only sample tasks which are marked for telemetry. */ -boolean_t telemetry_sample_all_tasks = FALSE; -uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry +boolean_t telemetry_sample_all_tasks = FALSE; +boolean_t telemetry_sample_pmis = FALSE; +uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry -uint32_t telemetry_timestamp = 0; +uint32_t telemetry_timestamp = 0; /* * The telemetry_buffer is responsible @@ -109,12 +111,16 @@ struct micro_snapshot_buffer telemetry_buffer = {0, 0, 0, 0}; int telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked? 
int telemetry_buffer_notify_at = 0; -lck_grp_t telemetry_lck_grp; -lck_mtx_t telemetry_mtx; +lck_grp_t telemetry_lck_grp; +lck_mtx_t telemetry_mtx; +lck_mtx_t telemetry_pmi_mtx; -#define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while(0) +#define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while (0) #define TELEMETRY_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&telemetry_mtx) -#define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while(0) +#define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while (0) + +#define TELEMETRY_PMI_LOCK() do { lck_mtx_lock(&telemetry_pmi_mtx); } while (0) +#define TELEMETRY_PMI_UNLOCK() do { lck_mtx_unlock(&telemetry_pmi_mtx); } while (0) void telemetry_init(void) { @@ -123,6 +129,7 @@ void telemetry_init(void) lck_grp_init(&telemetry_lck_grp, "telemetry group", LCK_GRP_ATTR_NULL); lck_mtx_init(&telemetry_mtx, &telemetry_lck_grp, LCK_ATTR_NULL); + lck_mtx_init(&telemetry_pmi_mtx, &telemetry_lck_grp, LCK_ATTR_NULL); if (!PE_parse_boot_argn("telemetry_buffer_size", &telemetry_buffer.size, sizeof(telemetry_buffer.size))) { telemetry_buffer.size = TELEMETRY_DEFAULT_BUFFER_SIZE; @@ -180,7 +187,7 @@ void telemetry_init(void) * enable_disable == 0: turn it off */ void -telemetry_global_ctl(int enable_disable) +telemetry_global_ctl(int enable_disable) { if (enable_disable == 1) { telemetry_sample_all_tasks = TRUE; @@ -222,9 +229,9 @@ telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable) task->t_flags |= reasons; if ((origflags & TF_TELEMETRY) == 0) { OSIncrementAtomic(&telemetry_active_tasks); -#if TELEMETRY_DEBUG +#if TELEMETRY_DEBUG printf("%s: telemetry OFF -> ON (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks); -#endif +#endif } } else { task->t_flags &= ~reasons; @@ -258,15 +265,15 @@ telemetry_is_active(thread_t thread) return FALSE; } - if (telemetry_sample_all_tasks == TRUE) { - return (TRUE); + if (telemetry_sample_all_tasks || 
telemetry_sample_pmis) { + return TRUE; } if ((telemetry_active_tasks > 0) && ((thread->task->t_flags & TF_TELEMETRY) != 0)) { - return (TRUE); + return TRUE; } - - return (FALSE); + + return FALSE; } /* @@ -284,11 +291,82 @@ int telemetry_timer_event(__unused uint64_t deadline, __unused uint64_t interval return (0); } +#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) +static void +telemetry_pmi_handler(bool user_mode, __unused void *ctx) +{ + telemetry_mark_curthread(user_mode, TRUE); +} +#endif /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */ + +int telemetry_pmi_setup(enum telemetry_pmi pmi_ctr, uint64_t period) +{ +#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) + static boolean_t sample_all_tasks_aside = FALSE; + static uint32_t active_tasks_aside = FALSE; + int error = 0; + const char *name = "?"; + + unsigned int ctr = 0; + + TELEMETRY_PMI_LOCK(); + + switch (pmi_ctr) { + case TELEMETRY_PMI_NONE: + if (!telemetry_sample_pmis) { + error = 1; + goto out; + } + + telemetry_sample_pmis = FALSE; + telemetry_sample_all_tasks = sample_all_tasks_aside; + telemetry_active_tasks = active_tasks_aside; + error = mt_microstackshot_stop(); + if (!error) { + printf("telemetry: disabling ustackshot on PMI\n"); + } + goto out; + + case TELEMETRY_PMI_INSTRS: + ctr = MT_CORE_INSTRS; + name = "instructions"; + break; + + case TELEMETRY_PMI_CYCLES: + ctr = MT_CORE_CYCLES; + name = "cycles"; + break; + + default: + error = 1; + goto out; + } + + telemetry_sample_pmis = TRUE; + sample_all_tasks_aside = telemetry_sample_all_tasks; + active_tasks_aside = telemetry_active_tasks; + telemetry_sample_all_tasks = FALSE; + telemetry_active_tasks = 0; + + error = mt_microstackshot_start(ctr, period, telemetry_pmi_handler, NULL); + if (!error) { + printf("telemetry: ustackshot every %llu %s\n", period, name); + } + +out: + TELEMETRY_PMI_UNLOCK(); + return error; +#else /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */ +#pragma unused(pmi_ctr, period) + return 1; 
+#endif /* !defined(MT_CORE_INSTRS) || !defined(MT_CORE_CYCLES) */ +} + /* * Mark the current thread for an interrupt-based * telemetry record, to be sampled at the next AST boundary. */ -void telemetry_mark_curthread(boolean_t interrupted_userspace) +void telemetry_mark_curthread(boolean_t interrupted_userspace, boolean_t pmi) { uint32_t ast_bits = 0; thread_t thread = current_thread(); @@ -302,6 +380,9 @@ void telemetry_mark_curthread(boolean_t interrupted_userspace) } ast_bits |= (interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL); + if (pmi) { + ast_bits |= AST_TELEMETRY_PMI; + } telemetry_needs_record = FALSE; thread_ast_set(thread, ast_bits); @@ -324,33 +405,33 @@ void compute_telemetry(void *arg __unused) static void telemetry_notify_user(void) { - mach_port_t user_port; - uint32_t flags = 0; - int error; + mach_port_t user_port = MACH_PORT_NULL; - error = host_get_telemetry_port(host_priv_self(), &user_port); - if ((error != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) { + kern_return_t kr = host_get_telemetry_port(host_priv_self(), &user_port); + if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) { return; } - telemetry_notification(user_port, flags); + telemetry_notification(user_port, 0); ipc_port_release_send(user_port); } void telemetry_ast(thread_t thread, ast_t reasons) { - assert((reasons & AST_TELEMETRY_ALL) != AST_TELEMETRY_ALL); /* only one is valid at a time */ - - boolean_t io_telemetry = (reasons & AST_TELEMETRY_IO) ? TRUE : FALSE; - boolean_t interrupted_userspace = (reasons & AST_TELEMETRY_USER) ? TRUE : FALSE; + assert((reasons & AST_TELEMETRY_ALL) != 0); - uint8_t microsnapshot_flags = kInterruptRecord; + uint8_t record_type = 0; + if (reasons & AST_TELEMETRY_IO) { + record_type |= kIORecord; + } + if (reasons & (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL)) { + record_type |= (reasons & AST_TELEMETRY_PMI) ? 
kPMIRecord : + kInterruptRecord; + } - if (io_telemetry == TRUE) - microsnapshot_flags = kIORecord; + uint8_t user_telemetry = (reasons & AST_TELEMETRY_USER) ? kUserMode : 0; - if (interrupted_userspace) - microsnapshot_flags |= kUserMode; + uint8_t microsnapshot_flags = record_type | user_telemetry; telemetry_take_sample(thread, microsnapshot_flags, &telemetry_buffer); } @@ -377,25 +458,10 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct if ((task == TASK_NULL) || (task == kernel_task) || task_did_exec(task) || task_is_exec_copy(task)) return; - /* - * To avoid overloading the system with telemetry requests, make - * sure we don't add more requests while existing ones are - * in-flight. Attempt this by checking if we can grab the lock. - * - * This concerns me a little; this working as intended is - * contingent on the workload being done in the context of the - * telemetry lock being the expensive part of telemetry. This - * includes populating the buffer and the client gathering it, - * but excludes the copyin overhead. 
- */ - if (!TELEMETRY_TRY_SPIN_LOCK()) - return; - - TELEMETRY_UNLOCK(); - /* telemetry_XXX accessed outside of lock for instrumentation only */ - /* TODO */ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START, microsnapshot_flags, telemetry_bytes_since_last_mark, 0, 0, (&telemetry_buffer != current_buffer)); + KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START, + microsnapshot_flags, telemetry_bytes_since_last_mark, 0, + (&telemetry_buffer != current_buffer)); p = get_bsdtask_info(task); @@ -444,7 +510,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct */ uint32_t uuid_info_count = 0; mach_vm_address_t uuid_info_addr = 0; - if (task_has_64BitAddr(task)) { + if (task_has_64Bit_addr(task)) { struct user64_dyld_all_image_infos task_image_infos; if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) { uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount; @@ -475,7 +541,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct uuid_info_count = TELEMETRY_MAX_UUID_COUNT; } - uint32_t uuid_info_size = (uint32_t)(task_has_64BitAddr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); + uint32_t uuid_info_size = (uint32_t)(task_has_64Bit_addr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info)); uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size; char *uuid_info_array = NULL; @@ -505,10 +571,10 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct if (dqkeyaddr != 0) { uint64_t dqaddr = 0; uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task); - if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64BitAddr(task) ? 8 : 4)) == 0) && + if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64Bit_addr(task) ? 
8 : 4)) == 0) && (dqaddr != 0) && (dq_serialno_offset != 0)) { uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset; - if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64BitAddr(task) ? 8 : 4)) == 0) { + if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64Bit_addr(task) ? 8 : 4)) == 0) { dqserialnum_valid = 1; } } @@ -556,7 +622,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct msnap->snapshot_magic = STACKSHOT_MICRO_SNAPSHOT_MAGIC; msnap->ms_flags = microsnapshot_flags; msnap->ms_opaque_flags = 0; /* namespace managed by userspace */ - msnap->ms_cpu = 0; /* XXX - does this field make sense for a micro-stackshot? */ + msnap->ms_cpu = cpu_number(); msnap->ms_time = secs; msnap->ms_time_microsecs = usecs; @@ -580,7 +646,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct tsnap->user_time_in_terminated_threads = task->total_user_time; tsnap->system_time_in_terminated_threads = task->total_system_time; tsnap->suspend_count = task->suspend_count; - tsnap->task_size = pmap_resident_count(task->map->pmap); + tsnap->task_size = (typeof(tsnap->task_size)) (get_task_phys_footprint(task) / PAGE_SIZE); tsnap->faults = task->faults; tsnap->pageins = task->pageins; tsnap->cow_faults = task->cow_faults; @@ -588,12 +654,12 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct * The throttling counters are maintained as 64-bit counters in the proc * structure. However, we reserve 32-bits (each) for them in the task_snapshot * struct to save space and since we do not expect them to overflow 32-bits. 
If we - * find these values overflowing in the future, the fix would be to simply + * find these values overflowing in the future, the fix would be to simply * upgrade these counters to 64-bit in the task_snapshot struct */ tsnap->was_throttled = (uint32_t) proc_was_throttled(p); tsnap->did_throttle = (uint32_t) proc_did_throttle(p); - + if (task->t_flags & TF_TELEMETRY) { tsnap->ss_flags |= kTaskRsrcFlagged; } @@ -619,7 +685,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct tsnap->latency_qos = task_grab_latency_qos(task); strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm)); - if (task_has_64BitAddr(thread->task)) { + if (task_has_64Bit_addr(thread->task)) { tsnap->ss_flags |= kUser64_p; } @@ -660,7 +726,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct if ((current_buffer->size - current_buffer->current_position) < sizeof(struct thread_snapshot)) { /* wrap and overwrite */ - current_buffer->end_point = current_record_start; + current_buffer->end_point = current_record_start; current_buffer->current_position = 0; if (current_record_start == 0) { /* This sample is too large to fit in the buffer even when we started at 0, so skip it */ @@ -681,7 +747,8 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct thsnap->ss_flags |= kStacksPCOnly; thsnap->ts_qos = thread->effective_policy.thep_qos; thsnap->ts_rqos = thread->requested_policy.thrp_qos; - thsnap->ts_rqos_override = thread->requested_policy.thrp_qos_override; + thsnap->ts_rqos_override = MAX(thread->requested_policy.thrp_qos_override, + thread->requested_policy.thrp_qos_workq_override); if (proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG)) { thsnap->ss_flags |= kThreadDarwinBG; @@ -706,7 +773,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct if (dqserialnum_valid) { if ((current_buffer->size - current_buffer->current_position) < sizeof(dqserialnum)) 
{ /* wrap and overwrite */ - current_buffer->end_point = current_record_start; + current_buffer->end_point = current_record_start; current_buffer->current_position = 0; if (current_record_start == 0) { /* This sample is too large to fit in the buffer even when we started at 0, so skip it */ @@ -720,7 +787,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct current_buffer->current_position += sizeof (dqserialnum); } - if (task_has_64BitAddr(task)) { + if (user64) { framesize = 8; thsnap->ss_flags |= kUser64_p; } else { @@ -772,11 +839,11 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct } cancel_sample: - TELEMETRY_UNLOCK(); - /* TODO */ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END, notify, telemetry_bytes_since_last_mark, current_buffer->current_position, current_buffer->end_point, (&telemetry_buffer != current_buffer)); + KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END, + notify, telemetry_bytes_since_last_mark, + current_buffer->current_position, current_buffer->end_point); if (notify) { telemetry_notify_user(); @@ -793,7 +860,7 @@ log_telemetry_output(vm_offset_t buf, uint32_t pos, uint32_t sz) { struct micro_snapshot *p; uint32_t offset; - + printf("Copying out %d bytes of telemetry at offset %d\n", sz, pos); buf += pos; @@ -820,13 +887,14 @@ int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark int result = 0; uint32_t oldest_record_offset; - /* TODO */ - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START, mark, telemetry_bytes_since_last_mark, 0, 0, (&telemetry_buffer != current_buffer)); + KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START, + mark, telemetry_bytes_since_last_mark, 0, + (&telemetry_buffer != current_buffer)); TELEMETRY_LOCK(); if (current_buffer->buffer == 0) { - *length = 0; + *length = 0; goto out; } @@ 
-910,7 +978,9 @@ int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark TELEMETRY_UNLOCK(); - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END, current_buffer->current_position, *length, current_buffer->end_point, 0, (&telemetry_buffer != current_buffer)); + KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END, + current_buffer->current_position, *length, + current_buffer->end_point, (&telemetry_buffer != current_buffer)); return (result); } @@ -1007,7 +1077,7 @@ void bootprofile_init(void) if (0 == strcmp(type, "boot")) { bootprofile_type = kBootProfileStartTimerAtBoot; } else if (0 == strcmp(type, "wake")) { - bootprofile_type = kBootProfileStartTimerAtWake; + bootprofile_type = kBootProfileStartTimerAtWake; } else { bootprofile_type = kBootProfileDisabled; } @@ -1182,7 +1252,7 @@ int bootprofile_gather(user_addr_t buffer, uint32_t *length) BOOTPROFILE_LOCK(); if (bootprofile_buffer == 0) { - *length = 0; + *length = 0; goto out; } diff --git a/osfmk/kern/telemetry.h b/osfmk/kern/telemetry.h index b5e023401..166b31c1a 100644 --- a/osfmk/kern/telemetry.h +++ b/osfmk/kern/telemetry.h @@ -36,6 +36,19 @@ __BEGIN_DECLS +#define TELEMETRY_CMD_TIMER_EVENT 1 +#define TELEMETRY_CMD_VOUCHER_NAME 2 +#define TELEMETRY_CMD_VOUCHER_STAIN TELEMETRY_CMD_VOUCHER_NAME + +enum telemetry_pmi { + TELEMETRY_PMI_NONE, + TELEMETRY_PMI_INSTRS, + TELEMETRY_PMI_CYCLES, +}; +#define TELEMETRY_CMD_PMI_SETUP 3 + +#if XNU_KERNEL_PRIVATE + extern volatile boolean_t telemetry_needs_record; extern void telemetry_init(void); @@ -46,24 +59,23 @@ extern void telemetry_ast(thread_t thread, uint32_t reasons); extern int telemetry_gather(user_addr_t buffer, uint32_t *length, boolean_t mark); -extern void telemetry_mark_curthread(boolean_t interrupted_userspace); +extern void telemetry_mark_curthread(boolean_t interrupted_userspace, + boolean_t pmi); extern void telemetry_task_ctl(task_t task, uint32_t reason, 
int enable_disable); extern void telemetry_task_ctl_locked(task_t task, uint32_t reason, int enable_disable); extern void telemetry_global_ctl(int enable_disable); extern int telemetry_timer_event(uint64_t deadline, uint64_t interval, uint64_t leeway); - -#define TELEMETRY_CMD_TIMER_EVENT 1 -#define TELEMETRY_CMD_VOUCHER_NAME 2 -#define TELEMETRY_CMD_VOUCHER_STAIN TELEMETRY_CMD_VOUCHER_NAME - +extern int telemetry_pmi_setup(enum telemetry_pmi pmi_type, uint64_t interval); extern void bootprofile_init(void); extern void bootprofile_wake_from_sleep(void); extern void bootprofile_get(void **buffer, uint32_t *length); extern int bootprofile_gather(user_addr_t buffer, uint32_t *length); +#endif /* XNU_KERNEL_PRIVATE */ + __END_DECLS #endif /* _KERNEL_TELEMETRY_H_ */ diff --git a/osfmk/kern/test_lock.c b/osfmk/kern/test_lock.c new file mode 100644 index 000000000..08056068a --- /dev/null +++ b/osfmk/kern/test_lock.c @@ -0,0 +1,932 @@ +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static lck_mtx_t test_mtx; +static lck_grp_t test_mtx_grp; +static lck_grp_attr_t test_mtx_grp_attr; +static lck_attr_t test_mtx_attr; + +static lck_grp_t test_mtx_stats_grp; +static lck_grp_attr_t test_mtx_stats_grp_attr; +static lck_attr_t test_mtx_stats_attr; + +struct lck_mtx_test_stats_elem { + lck_spin_t lock; + uint64_t samples; + uint64_t avg; + uint64_t max; + uint64_t min; + uint64_t tot; +}; + +#define TEST_MTX_LOCK_STATS 0 +#define TEST_MTX_TRY_LOCK_STATS 1 +#define TEST_MTX_LOCK_SPIN_STATS 2 +#define TEST_MTX_LOCK_SPIN_ALWAYS_STATS 3 +#define TEST_MTX_TRY_LOCK_SPIN_STATS 4 +#define TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS 5 +#define TEST_MTX_UNLOCK_MTX_STATS 6 +#define TEST_MTX_UNLOCK_SPIN_STATS 7 +#define TEST_MTX_MAX_STATS 8 + +struct lck_mtx_test_stats_elem lck_mtx_test_stats[TEST_MTX_MAX_STATS]; +atomic_bool enabled = TRUE; + +static 
void +init_test_mtx_stats(void) +{ + int i; + + lck_grp_attr_setdefault(&test_mtx_stats_grp_attr); + lck_grp_init(&test_mtx_stats_grp, "testlck_stats_mtx", &test_mtx_stats_grp_attr); + lck_attr_setdefault(&test_mtx_stats_attr); + + atomic_store(&enabled, TRUE); + for(i = 0; i < TEST_MTX_MAX_STATS; i++){ + memset(&lck_mtx_test_stats[i], 0 , sizeof(struct lck_mtx_test_stats_elem)); + lck_mtx_test_stats[i].min = ~0; + lck_spin_init(&lck_mtx_test_stats[i].lock, &test_mtx_stats_grp, &test_mtx_stats_attr); + } +} + +static void +update_test_mtx_stats( + uint64_t start, + uint64_t end, + uint type) +{ + if (atomic_load(&enabled) == TRUE) { + assert(type < TEST_MTX_MAX_STATS); + assert(start <= end); + + uint64_t elapsed = end - start; + struct lck_mtx_test_stats_elem* stat = &lck_mtx_test_stats[type]; + + lck_spin_lock(&stat->lock); + + stat->samples++; + stat->tot += elapsed; + stat->avg = stat->tot / stat->samples; + if (stat->max < elapsed) + stat->max = elapsed; + if (stat->min > elapsed) + stat->min = elapsed; + lck_spin_unlock(&stat->lock); + } +} + +static void +erase_test_mtx_stats( + uint type) +{ + assert(type < TEST_MTX_MAX_STATS); + struct lck_mtx_test_stats_elem* stat = &lck_mtx_test_stats[type]; + + lck_spin_lock(&stat->lock); + + stat->samples = 0; + stat->tot = 0; + stat->avg = 0; + stat->max = 0; + stat->min = ~0; + + lck_spin_unlock(&stat->lock); +} + +void +erase_all_test_mtx_stats(void) +{ + int i; + for (i = 0; i < TEST_MTX_MAX_STATS; i++) { + erase_test_mtx_stats(i); + } +} + +static void +disable_all_test_mtx_stats(void) +{ + atomic_store(&enabled, FALSE); +} + +static void +enable_all_test_mtx_stats(void) +{ + atomic_store(&enabled, TRUE); +} + +static int +print_test_mtx_stats_string_name( + int type_num, + char* buffer, + int size) +{ + char* type = ""; + switch (type_num) { + case TEST_MTX_LOCK_STATS: + type = "TEST_MTX_LOCK_STATS"; + break; + case TEST_MTX_TRY_LOCK_STATS: + type = "TEST_MTX_TRY_LOCK_STATS"; + break; + case 
TEST_MTX_LOCK_SPIN_STATS: + type = "TEST_MTX_LOCK_SPIN_STATS"; + break; + case TEST_MTX_LOCK_SPIN_ALWAYS_STATS: + type = "TEST_MTX_LOCK_SPIN_ALWAYS_STATS"; + break; + case TEST_MTX_TRY_LOCK_SPIN_STATS: + type = "TEST_MTX_TRY_LOCK_SPIN_STATS"; + break; + case TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS: + type = "TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS"; + break; + case TEST_MTX_UNLOCK_MTX_STATS: + type = "TEST_MTX_UNLOCK_MTX_STATS"; + break; + case TEST_MTX_UNLOCK_SPIN_STATS: + type = "TEST_MTX_UNLOCK_SPIN_STATS"; + break; + default: + break; + } + + return snprintf(buffer, size, "%s ", type); +} + +int +get_test_mtx_stats_string( + char* buffer, + int size) +{ + int string_off = 0; + int ret = 0; + + ret = snprintf(&buffer[string_off], size, "\n"); + size -= ret; + string_off += ret; + + int i; + for (i = 0; i < TEST_MTX_MAX_STATS; i++) { + struct lck_mtx_test_stats_elem* stat = &lck_mtx_test_stats[i]; + + ret = snprintf(&buffer[string_off], size, "{ "); + size -= ret; + string_off += ret; + + lck_spin_lock(&stat->lock); + uint64_t time; + + ret = snprintf(&buffer[string_off], size, "samples %llu, ", stat->samples); + size -= ret; + string_off += ret; + + absolutetime_to_nanoseconds(stat->tot, &time); + ret = snprintf(&buffer[string_off], size, "tot %llu ns, ", time); + size -= ret; + string_off += ret; + + absolutetime_to_nanoseconds(stat->avg, &time); + ret = snprintf(&buffer[string_off], size, "avg %llu ns, ", time); + size -= ret; + string_off += ret; + + absolutetime_to_nanoseconds(stat->max, &time); + ret = snprintf(&buffer[string_off], size, "max %llu ns, ", time); + size -= ret; + string_off += ret; + + absolutetime_to_nanoseconds(stat->min, &time); + ret = snprintf(&buffer[string_off], size, "min %llu ns", time); + size -= ret; + string_off += ret; + + lck_spin_unlock(&stat->lock); + + ret = snprintf(&buffer[string_off], size, " } "); + size -= ret; + string_off += ret; + + ret = print_test_mtx_stats_string_name(i, &buffer[string_off], size); + size -= ret; + 
string_off += ret; + + ret = snprintf(&buffer[string_off], size, "\n"); + size -= ret; + string_off += ret; + } + + return string_off; +} + +void +lck_mtx_test_init(void) +{ + static int first = 0; + + /* + * This should be substituted with a version + * of dispatch_once for kernel (rdar:39537874) + */ + if (os_atomic_load(&first, acquire) >= 2) + return; + + if (os_atomic_cmpxchg(&first, 0, 1, relaxed)){ + lck_grp_attr_setdefault(&test_mtx_grp_attr); + lck_grp_init(&test_mtx_grp, "testlck_mtx", &test_mtx_grp_attr); + lck_attr_setdefault(&test_mtx_attr); + lck_mtx_init(&test_mtx, &test_mtx_grp, &test_mtx_attr); + + init_test_mtx_stats(); + + os_atomic_inc(&first, release); + } + + while(os_atomic_load(&first, acquire) < 2); +} + +void +lck_mtx_test_lock(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_lock(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_LOCK_STATS); +} + +static void +lck_mtx_test_try_lock(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_try_lock(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_TRY_LOCK_STATS); +} + +static void +lck_mtx_test_lock_spin(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_lock_spin(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_LOCK_SPIN_STATS); +} + +static void +lck_mtx_test_lock_spin_always(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_lock_spin_always(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_LOCK_SPIN_ALWAYS_STATS); +} + +static void +lck_mtx_test_try_lock_spin(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_try_lock_spin(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_TRY_LOCK_SPIN_STATS); +} + +static void +lck_mtx_test_try_lock_spin_always(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_try_lock_spin_always(&test_mtx); + + 
update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS); +} + +void +lck_mtx_test_unlock(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_unlock(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_UNLOCK_MTX_STATS); +} + +static void +lck_mtx_test_unlock_mtx(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_unlock(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_UNLOCK_MTX_STATS); +} + +static void +lck_mtx_test_unlock_spin(void) +{ + uint64_t start; + + start = mach_absolute_time(); + + lck_mtx_unlock(&test_mtx); + + update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_UNLOCK_SPIN_STATS); +} + +#define WARMUP_ITER 1000 + +int +lck_mtx_test_mtx_uncontended_loop_time( + int iter, char *buffer, int size) +{ + int i; + uint64_t tot_time[TEST_MTX_MAX_STATS]; + uint64_t run_time[TEST_MTX_MAX_STATS]; + uint64_t start; + uint64_t start_run; + + //warming up the test + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_lock(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + start_run = thread_get_runtime_self(); + start = mach_absolute_time(); + + for (i = 0; i < iter; i++) { + lck_mtx_lock(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_LOCK_STATS]); + absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_LOCK_STATS]); + + //warming up the test + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_try_lock(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + start_run = thread_get_runtime_self(); + start = mach_absolute_time(); + + for (i = 0; i < iter; i++) { + lck_mtx_try_lock(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_TRY_LOCK_STATS]); + absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_TRY_LOCK_STATS]); + + //warming up the 
test + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_lock_spin(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + start_run = thread_get_runtime_self(); + start = mach_absolute_time(); + + for (i = 0; i < iter; i++) { + lck_mtx_lock_spin(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_LOCK_SPIN_STATS]); + absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_LOCK_SPIN_STATS]); + + //warming up the test + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_lock_spin_always(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + start_run = thread_get_runtime_self(); + start = mach_absolute_time(); + + for (i = 0; i < iter; i++) { + lck_mtx_lock_spin_always(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_LOCK_SPIN_ALWAYS_STATS]); + absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_LOCK_SPIN_ALWAYS_STATS]); + + //warming up the test + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_try_lock_spin(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + start_run = thread_get_runtime_self(); + start = mach_absolute_time(); + + for (i = 0; i < iter; i++) { + lck_mtx_try_lock_spin(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_TRY_LOCK_SPIN_STATS]); + absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_TRY_LOCK_SPIN_STATS]); + + //warming up the test + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_try_lock_spin_always(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + start_run = thread_get_runtime_self(); + start = mach_absolute_time(); + + for (i = 0; i < iter; i++) { + lck_mtx_try_lock_spin_always(&test_mtx); + lck_mtx_unlock(&test_mtx); + } + + absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS]); + 
absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS]); + + int string_off = 0; + int ret = 0; + + ret = snprintf(&buffer[string_off], size, "\n"); + size -= ret; + string_off += ret; + + for (i = 0; i < TEST_MTX_MAX_STATS - 2; i++) { + + ret = snprintf(&buffer[string_off], size, "total time %llu ns total run time %llu ns ", tot_time[i], run_time[i]); + size -= ret; + string_off += ret; + + ret = print_test_mtx_stats_string_name(i, &buffer[string_off], size); + size -= ret; + string_off += ret; + + ret = snprintf(&buffer[string_off], size, "\n"); + size -= ret; + string_off += ret; + } + + return string_off; +} + +static kern_return_t +lck_mtx_test_mtx_lock_uncontended( + int iter) +{ + int i; + + disable_all_test_mtx_stats(); + + //warming up the test for lock + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_test_lock(); + lck_mtx_test_unlock_mtx(); + } + + enable_all_test_mtx_stats(); + + for (i = 0; i < iter; i++) { + lck_mtx_test_lock(); + lck_mtx_test_unlock_mtx(); + } + + disable_all_test_mtx_stats(); + + //warming up the test for try_lock + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_test_try_lock(); + lck_mtx_test_unlock_mtx(); + } + + enable_all_test_mtx_stats(); + + for (i = 0; i < iter; i++) { + lck_mtx_test_try_lock(); + lck_mtx_test_unlock_mtx(); + } + + return KERN_SUCCESS; +} + +static kern_return_t +lck_mtx_test_mtx_spin_uncontended( + int iter) +{ + int i; + + disable_all_test_mtx_stats(); + + //warming up the test for lock_spin + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_test_lock_spin(); + lck_mtx_test_unlock_spin(); + } + + enable_all_test_mtx_stats(); + + for (i = 0; i < iter; i++) { + lck_mtx_test_lock_spin(); + lck_mtx_test_unlock_spin(); + } + + disable_all_test_mtx_stats(); + + //warming up the test for try_lock_spin + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_test_try_lock_spin(); + lck_mtx_test_unlock_spin(); + } + + enable_all_test_mtx_stats(); + + for (i = 0; i < iter; 
i++) { + lck_mtx_test_try_lock_spin(); + lck_mtx_test_unlock_spin(); + } + + disable_all_test_mtx_stats(); + + //warming up the test for lock_spin_always + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_test_lock_spin_always(); + lck_mtx_test_unlock_spin(); + } + + enable_all_test_mtx_stats(); + + for (i = 0; i < iter; i++) { + lck_mtx_test_lock_spin_always(); + lck_mtx_test_unlock_spin(); + } + + disable_all_test_mtx_stats(); + + //warming up the test for try_lock_spin_always + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_test_try_lock_spin_always(); + lck_mtx_test_unlock_spin(); + } + + enable_all_test_mtx_stats(); + + for (i = 0; i < iter; i++) { + lck_mtx_test_try_lock_spin_always(); + lck_mtx_test_unlock_spin(); + } + + return KERN_SUCCESS; +} + +int +lck_mtx_test_mtx_uncontended( + int iter, + char *buffer, + int size) +{ + erase_all_test_mtx_stats(); + lck_mtx_test_mtx_lock_uncontended(iter); + lck_mtx_test_mtx_spin_uncontended(iter); + + return get_test_mtx_stats_string(buffer,size); +} + +static int synch; +static int wait_barrier; +static int iterations; +static uint64_t start_loop_time; +static uint64_t start_loop_time_run; +static uint64_t end_loop_time; +static uint64_t end_loop_time_run; + +struct lck_mtx_thread_arg { + int my_locked; + int* other_locked; + thread_t other_thread; +}; + +static void +test_mtx_lock_unlock_contended_thread( + void *arg, + __unused wait_result_t wr) +{ + int i, val; + struct lck_mtx_thread_arg *info = (struct lck_mtx_thread_arg *) arg; + thread_t other_thread; + int* my_locked; + int* other_locked; + + printf("Starting thread %p\n", current_thread()); + + while(os_atomic_load(&info->other_thread, acquire) == NULL); + other_thread = info->other_thread; + + printf("Other thread %p\n", other_thread); + + my_locked = &info->my_locked; + other_locked = info->other_locked; + + *my_locked = 0; + val = os_atomic_inc(&synch, relaxed); + while(os_atomic_load(&synch, relaxed) < 2); + + //warming up the test + for (i = 0; i < 
WARMUP_ITER; i++) { + lck_mtx_test_lock(); + + os_atomic_xchg(my_locked, 1 , relaxed); + if (i != WARMUP_ITER - 1) { + while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN); + os_atomic_xchg(my_locked, 0 , relaxed); + } + + lck_mtx_test_unlock(); + + if (i != WARMUP_ITER - 1) + while(os_atomic_load(other_locked, relaxed) == 0); + } + + printf("warmup done %p\n", current_thread()); + os_atomic_inc(&synch, relaxed); + while(os_atomic_load(&synch, relaxed) < 4); + + //erase statistics + if (val == 1) + erase_all_test_mtx_stats(); + + *my_locked = 0; + /* + * synch the threads so they start + * concurrently. + */ + os_atomic_inc(&synch, relaxed); + while(os_atomic_load(&synch, relaxed) < 6); + + for (i = 0; i < iterations; i++) { + lck_mtx_test_lock(); + + os_atomic_xchg(my_locked, 1 , relaxed); + if (i != iterations - 1) { + while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN); + os_atomic_xchg(my_locked, 0 , relaxed); + } + lck_mtx_test_unlock_mtx(); + + if (i != iterations - 1) + while(os_atomic_load(other_locked, relaxed) == 0); + + } + + os_atomic_inc(&wait_barrier, relaxed); + thread_wakeup((event_t) &wait_barrier); + thread_terminate_self(); +} + + +kern_return_t +lck_mtx_test_mtx_contended( + int iter, + char* buffer, + int buffer_size) +{ + thread_t thread1, thread2; + kern_return_t result; + struct lck_mtx_thread_arg targs[2] = {}; + synch = 0; + wait_barrier = 0; + iterations = iter; + + erase_all_test_mtx_stats(); + + targs[0].other_thread = NULL; + targs[1].other_thread = NULL; + + result = kernel_thread_start((thread_continue_t)test_mtx_lock_unlock_contended_thread, &targs[0], &thread1); + if (result != KERN_SUCCESS) { + return 0; + } + + result = kernel_thread_start((thread_continue_t)test_mtx_lock_unlock_contended_thread, &targs[1], &thread2); + if (result != KERN_SUCCESS) { + thread_deallocate(thread1); + return 0; + } + + /* this are t1 args */ + targs[0].my_locked = 0; + targs[0].other_locked = &targs[1].my_locked; + + 
os_atomic_xchg(&targs[0].other_thread, thread2, release); + + /* this are t2 args */ + targs[1].my_locked = 0; + targs[1].other_locked = &targs[0].my_locked; + + os_atomic_xchg(&targs[1].other_thread, thread1, release); + + while (os_atomic_load(&wait_barrier, relaxed) != 2) { + assert_wait((event_t) &wait_barrier, THREAD_UNINT); + if (os_atomic_load(&wait_barrier, relaxed) != 2) { + (void) thread_block(THREAD_CONTINUE_NULL); + } else { + clear_wait(current_thread(), THREAD_AWAKENED); + } + } + + thread_deallocate(thread1); + thread_deallocate(thread2); + + return get_test_mtx_stats_string(buffer, buffer_size); +} + +static void +test_mtx_lck_unlock_contended_loop_time_thread( + __unused void *arg, + __unused wait_result_t wr) +{ + int i, val; + struct lck_mtx_thread_arg *info = (struct lck_mtx_thread_arg *) arg; + thread_t other_thread; + int* my_locked; + int* other_locked; + + printf("Starting thread %p\n", current_thread()); + + while(os_atomic_load(&info->other_thread, acquire) == NULL); + other_thread = info->other_thread; + + printf("Other thread %p\n", other_thread); + + my_locked = &info->my_locked; + other_locked = info->other_locked; + + *my_locked = 0; + val = os_atomic_inc(&synch, relaxed); + while(os_atomic_load(&synch, relaxed) < 2); + + //warming up the test + for (i = 0; i < WARMUP_ITER; i++) { + lck_mtx_lock(&test_mtx); + + os_atomic_xchg(my_locked, 1 , relaxed); + if (i != WARMUP_ITER - 1) { + while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN); + os_atomic_xchg(my_locked, 0 , relaxed); + } + + lck_mtx_unlock(&test_mtx); + + if (i != WARMUP_ITER - 1) + while(os_atomic_load(other_locked, relaxed) == 0); + } + + printf("warmup done %p\n", current_thread()); + + os_atomic_inc(&synch, relaxed); + while(os_atomic_load(&synch, relaxed) < 4); + + *my_locked = 0; + + /* + * synch the threads so they start + * concurrently. 
+ */ + os_atomic_inc(&synch, relaxed); + while(os_atomic_load(&synch, relaxed) < 6); + + if (val == 1) { + start_loop_time_run = thread_get_runtime_self(); + start_loop_time = mach_absolute_time(); + } + + for (i = 0; i < iterations; i++) { + lck_mtx_lock(&test_mtx); + + os_atomic_xchg(my_locked, 1 , relaxed); + if (i != iterations - 1) { + while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN); + os_atomic_xchg(my_locked, 0 , relaxed); + } + + lck_mtx_unlock(&test_mtx); + + if (i != iterations - 1) + while(os_atomic_load(other_locked, relaxed) == 0); + } + + if (val == 1) { + end_loop_time = mach_absolute_time(); + end_loop_time_run = thread_get_runtime_self(); + } + + os_atomic_inc(&wait_barrier, relaxed); + thread_wakeup((event_t) &wait_barrier); + thread_terminate_self(); +} + + +int +lck_mtx_test_mtx_contended_loop_time( + int iter, + char *buffer, + int buffer_size) +{ + thread_t thread1, thread2; + kern_return_t result; + int ret; + struct lck_mtx_thread_arg targs[2] = {}; + synch = 0; + wait_barrier = 0; + iterations = iter; + uint64_t time, time_run; + + targs[0].other_thread = NULL; + targs[1].other_thread = NULL; + + result = kernel_thread_start((thread_continue_t)test_mtx_lck_unlock_contended_loop_time_thread, &targs[0], &thread1); + if (result != KERN_SUCCESS) { + return 0; + } + + result = kernel_thread_start((thread_continue_t)test_mtx_lck_unlock_contended_loop_time_thread, &targs[1], &thread2); + if (result != KERN_SUCCESS) { + thread_deallocate(thread1); + return 0; + } + + /* this are t1 args */ + targs[0].my_locked = 0; + targs[0].other_locked = &targs[1].my_locked; + + os_atomic_xchg(&targs[0].other_thread, thread2, release); + + /* this are t2 args */ + targs[1].my_locked = 0; + targs[1].other_locked = &targs[0].my_locked; + + os_atomic_xchg(&targs[1].other_thread, thread1, release); + + while (os_atomic_load(&wait_barrier, acquire) != 2) { + assert_wait((event_t) &wait_barrier, THREAD_UNINT); + if (os_atomic_load(&wait_barrier, acquire) 
!= 2) { + (void) thread_block(THREAD_CONTINUE_NULL); + } else { + clear_wait(current_thread(), THREAD_AWAKENED); + } + } + + thread_deallocate(thread1); + thread_deallocate(thread2); + + absolutetime_to_nanoseconds(end_loop_time - start_loop_time, &time); + absolutetime_to_nanoseconds(end_loop_time_run - start_loop_time_run, &time_run); + + ret = snprintf(buffer, buffer_size, "\n"); + ret += snprintf(&buffer[ret], buffer_size - ret, "total time %llu ns total run time %llu ns ", time, time_run); + ret += print_test_mtx_stats_string_name(TEST_MTX_LOCK_STATS, &buffer[ret], buffer_size - ret); + ret += snprintf(&buffer[ret], buffer_size - ret, "\n"); + + return ret; +} + diff --git a/osfmk/kern/thread.c b/osfmk/kern/thread.c index 433a1ae90..81f934a17 100644 --- a/osfmk/kern/thread.c +++ b/osfmk/kern/thread.c @@ -121,6 +121,7 @@ #include #include #include +#include #include #if KPC @@ -169,8 +170,14 @@ static queue_head_t thread_stack_queue; decl_simple_lock_data(static,thread_terminate_lock) static queue_head_t thread_terminate_queue; +static queue_head_t thread_deallocate_queue; + +static queue_head_t turnstile_deallocate_queue; + static queue_head_t crashed_threads_queue; +static queue_head_t workq_deallocate_queue; + decl_simple_lock_data(static,thread_exception_lock) static queue_head_t thread_exception_queue; @@ -182,10 +189,8 @@ struct thread_exception_elt { }; static struct thread thread_template, init_thread; - -static void sched_call_null( - int type, - thread_t thread); +static void thread_deallocate_enqueue(thread_t thread); +static void thread_deallocate_complete(thread_t thread); #ifdef MACH_BSD extern void proc_exit(void *); @@ -193,6 +198,7 @@ extern mach_exception_data_type_t proc_encode_exit_exception_code(void *); extern uint64_t get_dispatchqueue_offset_from_proc(void *); extern uint64_t get_return_to_kernel_offset_from_proc(void *p); extern int proc_selfpid(void); +extern void proc_name(int, char*, int); extern char * proc_name_address(void *p); 
#endif /* MACH_BSD */ @@ -212,6 +218,13 @@ static void init_thread_ledgers(void); void jetsam_on_ledger_cpulimit_exceeded(void); #endif +extern int task_thread_soft_limit; +extern int exc_via_corpse_forking; + +#if DEVELOPMENT || DEBUG +extern int exc_resource_threads_enabled; +#endif /* DEVELOPMENT || DEBUG */ + /* * Level (in terms of percentage of the limit) at which the CPU usage monitor triggers telemetry. * @@ -222,6 +235,9 @@ void jetsam_on_ledger_cpulimit_exceeded(void); int cpumon_ustackshots_trigger_pct; /* Percentage. Level at which we start gathering telemetry. */ void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void); +#if DEVELOPMENT || DEBUG +void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t, int); +#endif /* DEVELOPMENT || DEBUG */ /* * The smallest interval over which we support limiting CPU consumption is 1ms @@ -274,10 +290,8 @@ thread_bootstrap(void) thread_template.max_priority = 0; thread_template.task_priority = 0; thread_template.promotions = 0; - thread_template.pending_promoter_index = 0; - thread_template.pending_promoter[0] = NULL; - thread_template.pending_promoter[1] = NULL; thread_template.rwlock_count = 0; + thread_template.waiting_for_mutex = NULL; thread_template.realtime.deadline = UINT64_MAX; @@ -307,11 +321,12 @@ thread_bootstrap(void) thread_template.bound_processor = PROCESSOR_NULL; thread_template.last_processor = PROCESSOR_NULL; - thread_template.sched_call = sched_call_null; + thread_template.sched_call = NULL; timer_init(&thread_template.user_timer); timer_init(&thread_template.system_timer); timer_init(&thread_template.ptime); + timer_init(&thread_template.runnable_timer); thread_template.user_timer_save = 0; thread_template.system_timer_save = 0; thread_template.vtimer_user_save = 0; @@ -331,6 +346,9 @@ thread_bootstrap(void) thread_template.recover = (vm_offset_t)NULL; thread_template.map = VM_MAP_NULL; +#if DEVELOPMENT || DEBUG + 
thread_template.pmap_footprint_suspended = FALSE; +#endif /* DEVELOPMENT || DEBUG */ #if CONFIG_DTRACE thread_template.t_dtrace_predcache = 0; @@ -483,7 +501,6 @@ thread_terminate_self(void) { thread_t thread = current_thread(); task_t task; - spl_t s; int threadcnt; pal_thread_terminate_self(thread); @@ -496,34 +513,12 @@ thread_terminate_self(void) thread_mtx_unlock(thread); - s = splsched(); - thread_lock(thread); - - /* - * Cancel priority depression, wait for concurrent expirations - * on other processors. - */ - if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK; - - /* If our priority was low because of a depressed yield, restore it in case we block below */ - thread_recompute_sched_pri(thread, FALSE); - - if (timer_call_cancel(&thread->depress_timer)) - thread->depress_timer_active--; - } - - while (thread->depress_timer_active > 0) { - thread_unlock(thread); - splx(s); - - delay(1); + thread_sched_call(thread, NULL); - s = splsched(); - thread_lock(thread); - } + spl_t s = splsched(); + thread_lock(thread); - thread_sched_call(thread, NULL); + thread_depress_abort_locked(thread); thread_unlock(thread); splx(s); @@ -608,6 +603,32 @@ thread_terminate_self(void) s = splsched(); thread_lock(thread); + /* + * Ensure that the depress timer is no longer enqueued, + * so the timer (stored in the thread) can be safely deallocated + * + * TODO: build timer_call_cancel_wait + */ + + assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0); + + uint32_t delay_us = 1; + + while (thread->depress_timer_active > 0) { + thread_unlock(thread); + splx(s); + + delay(delay_us++); + + if (delay_us > USEC_PER_SEC) + panic("depress timer failed to inactivate!" + "thread: %p depress_timer_active: %d", + thread, thread->depress_timer_active); + + s = splsched(); + thread_lock(thread); + } + /* * Cancel wait timer, and wait for * concurrent expirations. 
@@ -619,11 +640,18 @@ thread_terminate_self(void) thread->wait_timer_active--; } + delay_us = 1; + while (thread->wait_timer_active > 0) { thread_unlock(thread); splx(s); - delay(1); + delay(delay_us++); + + if (delay_us > USEC_PER_SEC) + panic("wait timer failed to inactivate!" + "thread: %p wait_timer_active: %d", + thread, thread->wait_timer_active); s = splsched(); thread_lock(thread); @@ -642,10 +670,16 @@ thread_terminate_self(void) */ thread->state |= TH_TERMINATE; thread_mark_wait_locked(thread, THREAD_UNINT); + + assert((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) == 0); + assert((thread->sched_flags & TH_SFLAG_RW_PROMOTED) == 0); + assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0); assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0); assert(thread->promotions == 0); - assert(!(thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED)); + assert(thread->was_promoted_on_wakeup == 0); + assert(thread->waiting_for_mutex == NULL); assert(thread->rwlock_count == 0); + thread_unlock(thread); /* splsched */ @@ -653,23 +687,33 @@ thread_terminate_self(void) /*NOTREACHED*/ } -/* Drop a thread refcount that definitely isn't the last one. 
*/ +/* Drop a thread refcount safely without triggering a zfree */ void thread_deallocate_safe(thread_t thread) { + __assert_only uint32_t th_ref_count; + + if (thread == THREAD_NULL) + return; + assert_thread_magic(thread); - uint32_t old_refcount = atomic_fetch_sub_explicit(&thread->ref_count, 1, memory_order_release); + if (__probable(atomic_fetch_sub_explicit(&thread->ref_count, 1, + memory_order_release) - 1 > 0)) { + return; + } + + th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire); + assert(th_ref_count == 0); - if (__improbable(old_refcount <= 1)) - panic("bad thread refcount: %d", old_refcount); + /* enqueue the thread for thread deallocate deamon to call thread_deallocate_complete */ + thread_deallocate_enqueue(thread); } void thread_deallocate( thread_t thread) { - task_t task; __assert_only uint32_t th_ref_count; if (thread == THREAD_NULL) @@ -685,6 +729,19 @@ thread_deallocate( th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire); assert(th_ref_count == 0); + thread_deallocate_complete(thread); +} + +void +thread_deallocate_complete( + thread_t thread) +{ + task_t task; + + assert_thread_magic(thread); + + assert(thread->ref_count == 0); + assert(thread_owned_workloops_count(thread) == 0); if (!(thread->state & TH_TERMINATE2)) @@ -692,8 +749,6 @@ thread_deallocate( assert(thread->runq == PROCESSOR_NULL); - assert(thread->user_promotions == 0); - #if KPC kpc_thread_destroy(thread); #endif @@ -718,6 +773,10 @@ thread_deallocate( if (thread->t_threadledger) ledger_dereference(thread->t_threadledger); + assert(thread->turnstile != TURNSTILE_NULL); + if (thread->turnstile) + turnstile_deallocate(thread->turnstile); + if (IPC_VOUCHER_NULL != thread->ith_voucher) ipc_voucher_release(thread->ith_voucher); @@ -856,7 +915,6 @@ thread_copy_resource_info( thread_t dst_thread, thread_t src_thread) { - dst_thread->thread_tag = src_thread->thread_tag; dst_thread->c_switch = src_thread->c_switch; 
dst_thread->p_switch = src_thread->p_switch; dst_thread->ps_switch = src_thread->ps_switch; @@ -865,6 +923,7 @@ thread_copy_resource_info( dst_thread->user_timer_save = src_thread->user_timer_save; dst_thread->system_timer = src_thread->system_timer; dst_thread->system_timer_save = src_thread->system_timer_save; + dst_thread->runnable_timer = src_thread->runnable_timer; dst_thread->vtimer_user_save = src_thread->vtimer_user_save; dst_thread->vtimer_prof_save = src_thread->vtimer_prof_save; dst_thread->vtimer_rlim_save = src_thread->vtimer_rlim_save; @@ -892,6 +951,7 @@ thread_terminate_daemon(void) (void)splsched(); simple_lock(&thread_terminate_lock); +thread_terminate_start: while ((thread = qe_dequeue_head(&thread_terminate_queue, struct thread, runq_links)) != THREAD_NULL) { assert_thread_magic(thread); @@ -913,6 +973,7 @@ thread_terminate_daemon(void) task_lock(task); task->total_user_time += timer_grab(&thread->user_timer); task->total_ptime += timer_grab(&thread->ptime); + task->total_runnable_time += timer_grab(&thread->runnable_timer); if (thread->precise_user_kernel_time) { task->total_system_time += timer_grab(&thread->system_timer); } else { @@ -960,6 +1021,55 @@ thread_terminate_daemon(void) simple_lock(&thread_terminate_lock); } + while ((thread = qe_dequeue_head(&thread_deallocate_queue, struct thread, runq_links)) != THREAD_NULL) { + assert_thread_magic(thread); + + simple_unlock(&thread_terminate_lock); + (void)spllo(); + + thread_deallocate_complete(thread); + + (void)splsched(); + simple_lock(&thread_terminate_lock); + } + + struct turnstile *turnstile; + while ((turnstile = qe_dequeue_head(&turnstile_deallocate_queue, struct turnstile, ts_deallocate_link)) != TURNSTILE_NULL) { + + simple_unlock(&thread_terminate_lock); + (void)spllo(); + + turnstile_destroy(turnstile); + + (void)splsched(); + simple_lock(&thread_terminate_lock); + } + + queue_entry_t qe; + + /* + * see workq_deallocate_enqueue: struct workqueue is opaque to thread.c and + * we 
just link pieces of memory here + */ + while ((qe = dequeue_head(&workq_deallocate_queue))) { + simple_unlock(&thread_terminate_lock); + (void)spllo(); + + workq_destroy((struct workqueue *)qe); + + (void)splsched(); + simple_lock(&thread_terminate_lock); + } + + /* + * Check if something enqueued in thread terminate/deallocate queue + * while processing workq deallocate queue + */ + if (!queue_empty(&thread_terminate_queue) || + !queue_empty(&thread_deallocate_queue) || + !queue_empty(&turnstile_deallocate_queue)) + goto thread_terminate_start; + assert_wait((event_t)&thread_terminate_queue, THREAD_UNINT); simple_unlock(&thread_terminate_lock); /* splsched */ @@ -989,6 +1099,67 @@ thread_terminate_enqueue( thread_wakeup((event_t)&thread_terminate_queue); } +/* + * thread_deallocate_enqueue: + * + * Enqueue a thread for final deallocation. + */ +static void +thread_deallocate_enqueue( + thread_t thread) +{ + spl_t s = splsched(); + + simple_lock(&thread_terminate_lock); + enqueue_tail(&thread_deallocate_queue, &thread->runq_links); + simple_unlock(&thread_terminate_lock); + + thread_wakeup((event_t)&thread_terminate_queue); + splx(s); +} + +/* + * turnstile_deallocate_enqueue: + * + * Enqueue a turnstile for final deallocation. + */ +void +turnstile_deallocate_enqueue( + struct turnstile *turnstile) +{ + spl_t s = splsched(); + + simple_lock(&thread_terminate_lock); + enqueue_tail(&turnstile_deallocate_queue, &turnstile->ts_deallocate_link); + simple_unlock(&thread_terminate_lock); + + thread_wakeup((event_t)&thread_terminate_queue); + splx(s); +} + +/* + * workq_deallocate_enqueue: + * + * Enqueue a workqueue for final deallocation. + */ +void +workq_deallocate_enqueue( + struct workqueue *wq) +{ + spl_t s = splsched(); + + simple_lock(&thread_terminate_lock); + /* + * this is just to delay a zfree(), so we link the memory with no regards + * for how the struct looks like. 
+ */ + enqueue_tail(&workq_deallocate_queue, (queue_entry_t)wq); + simple_unlock(&thread_terminate_lock); + + thread_wakeup((event_t)&thread_terminate_queue); + splx(s); +} + /* * thread_terminate_crashed_threads: * walk the list of crashed threads and put back set of threads @@ -999,6 +1170,7 @@ thread_terminate_crashed_threads() { thread_t th_remove; boolean_t should_wake_terminate_queue = FALSE; + spl_t s = splsched(); simple_lock(&thread_terminate_lock); /* @@ -1017,6 +1189,7 @@ thread_terminate_crashed_threads() } simple_unlock(&thread_terminate_lock); + splx(s); if (should_wake_terminate_queue == TRUE) { thread_wakeup((event_t)&thread_terminate_queue); } @@ -1093,6 +1266,9 @@ thread_daemon_init(void) simple_lock_init(&thread_terminate_lock, 0); queue_init(&thread_terminate_queue); + queue_init(&thread_deallocate_queue); + queue_init(&workq_deallocate_queue); + queue_init(&turnstile_deallocate_queue); queue_init(&crashed_threads_queue); result = kernel_thread_start_priority((thread_continue_t)thread_terminate_daemon, NULL, MINPRI_KERNEL, &thread); @@ -1123,6 +1299,7 @@ thread_daemon_init(void) #define TH_OPTION_NONE 0x00 #define TH_OPTION_NOCRED 0x01 #define TH_OPTION_NOSUSP 0x02 +#define TH_OPTION_WORKQ 0x04 /* * Create a new thread. 
@@ -1135,6 +1312,7 @@ thread_create_internal( task_t parent_task, integer_t priority, thread_continue_t continuation, + void *parameter, int options, thread_t *out_thread) { @@ -1195,6 +1373,10 @@ thread_create_internal( ipc_thread_init(new_thread); new_thread->continuation = continuation; + new_thread->parameter = parameter; + new_thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE; + priority_queue_init(&new_thread->inheritor_queue, + PRIORITY_QUEUE_BUILTIN_MAX_HEAP); /* Allocate I/O Statistics structure */ new_thread->thread_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info)); @@ -1211,6 +1393,24 @@ thread_create_internal( new_thread->decmp_upl = NULL; #endif /* CONFIG_IOSCHED */ +#if DEVELOPMENT || DEBUG + task_lock(parent_task); + uint16_t thread_limit = parent_task->task_thread_limit; + if (exc_resource_threads_enabled && + thread_limit > 0 && + parent_task->thread_count >= thread_limit && + !parent_task->task_has_crossed_thread_limit && + !(parent_task->t_flags & TF_CORPSE)) { + int thread_count = parent_task->thread_count; + parent_task->task_has_crossed_thread_limit = TRUE; + task_unlock(parent_task); + SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(parent_task, thread_count); + } + else { + task_unlock(parent_task); + } +#endif + lck_mtx_lock(&tasks_threads_lock); task_lock(parent_task); @@ -1340,6 +1540,7 @@ thread_create_internal( new_thread->inspection = FALSE; } new_thread->corpse_dup = FALSE; + new_thread->turnstile = turnstile_alloc(); *out_thread = new_thread; if (kdebug_enable) { @@ -1390,7 +1591,7 @@ thread_create_internal2( if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); - result = thread_create_internal(task, -1, continuation, TH_OPTION_NONE, &thread); + result = thread_create_internal(task, -1, continuation, NULL, TH_OPTION_NONE, &thread); if (result != KERN_SUCCESS) return (result); @@ -1449,6 +1650,7 @@ thread_create_waiting_internal( task_t task, thread_continue_t continuation, event_t event, + 
block_hint_t block_hint, int options, thread_t *new_thread) { @@ -1458,7 +1660,8 @@ thread_create_waiting_internal( if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); - result = thread_create_internal(task, -1, continuation, options, &thread); + result = thread_create_internal(task, -1, continuation, NULL, + options, &thread); if (result != KERN_SUCCESS) return (result); @@ -1468,6 +1671,11 @@ thread_create_waiting_internal( thread_hold(thread); thread_mtx_lock(thread); + thread_set_pending_block_hint(thread, block_hint); + if (options & TH_OPTION_WORKQ) { + thread->static_param = true; + event = workq_thread_init_and_wq_lock(task, thread); + } thread_start_in_assert_wait(thread, event, THREAD_INTERRUPTIBLE); thread_mtx_unlock(thread); @@ -1487,7 +1695,7 @@ thread_create_waiting( thread_t *new_thread) { return thread_create_waiting_internal(task, continuation, event, - TH_OPTION_NONE, new_thread); + kThreadWaitNone, TH_OPTION_NONE, new_thread); } @@ -1506,14 +1714,23 @@ thread_create_running_internal2( if (task == TASK_NULL || task == kernel_task) return (KERN_INVALID_ARGUMENT); - result = thread_create_internal(task, -1, (thread_continue_t)thread_bootstrap_return, TH_OPTION_NONE, &thread); + result = thread_create_internal(task, -1, + (thread_continue_t)thread_bootstrap_return, NULL, + TH_OPTION_NONE, &thread); if (result != KERN_SUCCESS) return (result); if (task->suspend_count > 0) thread_hold(thread); - result = machine_thread_set_state(thread, flavor, new_state, new_state_count); + if (from_user) { + result = machine_thread_state_convert_from_user(thread, flavor, + new_state, new_state_count); + } + if (result == KERN_SUCCESS) { + result = machine_thread_set_state(thread, flavor, new_state, + new_state_count); + } if (result != KERN_SUCCESS) { task_unlock(task); lck_mtx_unlock(&tasks_threads_lock); @@ -1573,46 +1790,15 @@ thread_create_running_from_user( new_thread, TRUE); } -kern_return_t -thread_create_workq( - task_t task, - 
thread_continue_t thread_return, - thread_t *new_thread) -{ - kern_return_t result; - thread_t thread; - - if (task == TASK_NULL || task == kernel_task) - return (KERN_INVALID_ARGUMENT); - - result = thread_create_internal(task, -1, thread_return, TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread); - if (result != KERN_SUCCESS) - return (result); - - thread->user_stop_count = 1; - thread_hold(thread); - if (task->suspend_count > 0) - thread_hold(thread); - - task_unlock(task); - lck_mtx_unlock(&tasks_threads_lock); - - *new_thread = thread; - - return (KERN_SUCCESS); -} - kern_return_t thread_create_workq_waiting( task_t task, thread_continue_t continuation, - event_t event, thread_t *new_thread) { - - return thread_create_waiting_internal(task, continuation, event, - TH_OPTION_NOCRED | TH_OPTION_NOSUSP, - new_thread); + int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ; + return thread_create_waiting_internal(task, continuation, NULL, + kThreadWaitParkedWorkQueue, options, new_thread); } /* @@ -1632,7 +1818,8 @@ kernel_thread_create( thread_t thread; task_t task = kernel_task; - result = thread_create_internal(task, priority, continuation, TH_OPTION_NOCRED | TH_OPTION_NONE, &thread); + result = thread_create_internal(task, priority, continuation, parameter, + TH_OPTION_NOCRED | TH_OPTION_NONE, &thread); if (result != KERN_SUCCESS) return (result); @@ -1646,8 +1833,6 @@ kernel_thread_create( #endif thread->reserved_stack = thread->kernel_stack; - thread->parameter = parameter; - if(debug_task & 1) kprintf("kernel_thread_create: thread = %p continuation = %p\n", thread, continuation); *new_thread = thread; @@ -1697,7 +1882,7 @@ retrieve_thread_basic_info(thread_t thread, thread_basic_info_t basic_info) /* fill in info */ thread_read_times(thread, &basic_info->user_time, - &basic_info->system_time); + &basic_info->system_time, NULL); /* * Update lazy-evaluated scheduler info because someone wants it. 
@@ -1963,7 +2148,8 @@ void thread_read_times( thread_t thread, time_value_t *user_time, - time_value_t *system_time) + time_value_t *system_time, + time_value_t *runnable_time) { clock_sec_t secs; clock_usec_t usecs; @@ -1976,7 +2162,7 @@ thread_read_times( absolutetime_to_microtime(tval_user, &secs, &usecs); user_time->seconds = (typeof(user_time->seconds))secs; user_time->microseconds = usecs; - + absolutetime_to_microtime(tval_system, &secs, &usecs); system_time->seconds = (typeof(system_time->seconds))secs; system_time->microseconds = usecs; @@ -1990,6 +2176,13 @@ thread_read_times( system_time->seconds = 0; system_time->microseconds = 0; } + + if (runnable_time) { + uint64_t tval_runnable = timer_grab(&thread->runnable_timer); + absolutetime_to_microtime(tval_runnable, &secs, &usecs); + runnable_time->seconds = (typeof(runnable_time->seconds))secs; + runnable_time->microseconds = usecs; + } } uint64_t thread_get_runtime_self(void) @@ -2004,7 +2197,7 @@ uint64_t thread_get_runtime_self(void) /* Not interrupt safe, as the scheduler may otherwise update timer values underneath us */ interrupt_state = ml_set_interrupts_enabled(FALSE); processor = current_processor(); - timer_switch(PROCESSOR_DATA(processor, thread_timer), mach_absolute_time(), PROCESSOR_DATA(processor, thread_timer)); + timer_update(PROCESSOR_DATA(processor, thread_timer), mach_absolute_time()); runtime = (timer_grab(&thread->user_timer) + timer_grab(&thread->system_timer)); ml_set_interrupts_enabled(interrupt_state); @@ -2138,7 +2331,7 @@ clear_thread_rwlock_boost(void) if ((thread->rwlock_count-- == 1) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) { - lck_rw_clear_promotion(thread); + lck_rw_clear_promotion(thread, 0); } } @@ -2151,7 +2344,10 @@ thread_guard_violation(thread_t thread, mach_exception_data_type_t code, mach_exception_data_type_t subcode) { assert(thread == current_thread()); - assert(thread->task != kernel_task); + + /* don't set up the AST for kernel threads */ + if 
(thread->task == kernel_task) + return; spl_t s = splsched(); /* @@ -2184,10 +2380,16 @@ guard_ast(thread_t t) code = t->guard_exc_info.code, subcode = t->guard_exc_info.subcode; + t->guard_exc_info.code = 0; + t->guard_exc_info.subcode = 0; + switch (EXC_GUARD_DECODE_GUARD_TYPE(code)) { + case GUARD_TYPE_NONE: + /* lingering AST_GUARD on the processor? */ + break; case GUARD_TYPE_MACH_PORT: mach_port_guard_ast(t, code, subcode); - break; + break; case GUARD_TYPE_FD: fd_guard_ast(t, code, subcode); break; @@ -2196,6 +2398,9 @@ guard_ast(thread_t t) vn_guard_ast(t, code, subcode); break; #endif + case GUARD_TYPE_VIRT_MEMORY: + virt_memory_guard_ast(t, code, subcode); + break; default: panic("guard_exc_info %llx %llx", code, subcode); } @@ -2289,7 +2494,7 @@ SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void) interval_sec = (uint32_t)(interval_ns / NSEC_PER_SEC); - thread_read_times(thread, &thread_user_time, &thread_system_time); + thread_read_times(thread, &thread_user_time, &thread_system_time, NULL); time_value_add(&thread_total_time, &thread_user_time); time_value_add(&thread_total_time, &thread_system_time); ledger_get_entry_info(thread->t_threadledger, thread_ledgers.cpu_time, &lei); @@ -2376,6 +2581,52 @@ SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void) } } +#if DEVELOPMENT || DEBUG +void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t task, int thread_count) +{ + mach_exception_data_type_t code[EXCEPTION_CODE_MAX] = {0}; + int pid = task_pid(task); + char procname[MAXCOMLEN+1] = "unknown"; + + if (pid == 1) { + /* + * Cannot suspend launchd + */ + return; + } + + proc_name(pid, procname, sizeof(procname)); + + if (disable_exc_resource) { + printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE " + "supressed by a boot-arg. 
\n", procname, pid, thread_count); + return; + } + + if (audio_active) { + printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE " + "supressed due to audio playback.\n", procname, pid, thread_count); + return; + } + + if (exc_via_corpse_forking == 0) { + printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE " + "supressed due to corpse forking being disabled.\n", procname, pid, + thread_count); + return; + } + + printf("process %s[%d] crossed thread count high watermark (%d), sending " + "EXC_RESOURCE\n", procname, pid, thread_count); + + EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_THREADS); + EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_THREADS_HIGH_WATERMARK); + EXC_RESOURCE_THREADS_ENCODE_THREADS(code[0], thread_count); + + task_enqueue_exception_with_corpse(task, EXC_RESOURCE, code, EXCEPTION_CODE_MAX, NULL); +} +#endif /* DEVELOPMENT || DEBUG */ + void thread_update_io_stats(thread_t thread, int size, int io_flags) { int io_tier; @@ -2585,77 +2836,38 @@ thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns) return (0); } -static void -sched_call_null( -__unused int type, -__unused thread_t thread) -{ - return; -} - void thread_sched_call( thread_t thread, sched_call_t call) { - thread->sched_call = (call != NULL)? call: sched_call_null; + assert((thread->state & TH_WAIT_REPORT) == 0); + thread->sched_call = call; } -sched_call_t -thread_disable_sched_call( - thread_t thread, - sched_call_t call) +uint64_t +thread_tid( + thread_t thread) { - if (call) { - spl_t s = splsched(); - thread_lock(thread); - if (thread->sched_call == call) { - thread->sched_call = sched_call_null; - } else { - call = NULL; - } - thread_unlock(thread); - splx(s); - } - return call; + return (thread != THREAD_NULL? 
thread->thread_id: 0); } -void -thread_reenable_sched_call( - thread_t thread, - sched_call_t call) +uint16_t +thread_set_tag(thread_t th, uint16_t tag) { - if (call) { - spl_t s = splsched(); - thread_lock(thread); - thread_sched_call(thread, call); - thread_unlock(thread); - splx(s); - } + return thread_set_tag_internal(th, tag); } -void -thread_static_param( - thread_t thread, - boolean_t state) +uint16_t +thread_get_tag(thread_t th) { - thread_mtx_lock(thread); - thread->static_param = state; - thread_mtx_unlock(thread); + return thread_get_tag_internal(th); } uint64_t -thread_tid( - thread_t thread) +thread_last_run_time(thread_t th) { - return (thread != THREAD_NULL? thread->thread_id: 0); -} - -uint16_t thread_set_tag(thread_t th, uint16_t tag) { - return thread_set_tag_internal(th, tag); -} -uint16_t thread_get_tag(thread_t th) { - return thread_get_tag_internal(th); + return th->last_run_time; } uint64_t @@ -2718,6 +2930,22 @@ thread_rettokern_addr( * within the osfmk component. 
*/ +#undef thread_mtx_lock +void thread_mtx_lock(thread_t thread); +void +thread_mtx_lock(thread_t thread) +{ + lck_mtx_lock(&thread->mutex); +} + +#undef thread_mtx_unlock +void thread_mtx_unlock(thread_t thread); +void +thread_mtx_unlock(thread_t thread) +{ + lck_mtx_unlock(&thread->mutex); +} + #undef thread_reference void thread_reference(thread_t thread); void @@ -2754,7 +2982,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name) ipc_voucher_t new_voucher = IPC_VOUCHER_NULL; ipc_voucher_t voucher; ledger_t bankledger = NULL; - thread_group_t banktg = NULL; + struct thread_group *banktg = NULL; if (MACH_PORT_DEAD == voucher_name) return KERN_INVALID_RIGHT; @@ -2879,7 +3107,7 @@ thread_set_mach_voucher( { ipc_voucher_t old_voucher; ledger_t bankledger = NULL; - thread_group_t banktg = NULL; + struct thread_group *banktg = NULL; if (THREAD_NULL == thread) return KERN_INVALID_ARGUMENT; @@ -2916,78 +3144,22 @@ thread_set_mach_voucher( * Conditions: callers holds a reference on the new and presumed old voucher(s). * nothing locked. * - * If the old voucher is still the same as passed in, replace it with new voucher - * and discard the old (and the reference passed in). Otherwise, discard the new - * and return an updated old voucher. + * This function is no longer supported. 
*/ kern_return_t thread_swap_mach_voucher( - thread_t thread, - ipc_voucher_t new_voucher, - ipc_voucher_t *in_out_old_voucher) + __unused thread_t thread, + __unused ipc_voucher_t new_voucher, + ipc_voucher_t *in_out_old_voucher) { - mach_port_name_t old_voucher_name; - ipc_voucher_t old_voucher; - ledger_t bankledger = NULL; - thread_group_t banktg = NULL; - - if (THREAD_NULL == thread) - return KERN_INVALID_TASK; - - if (thread != current_thread() && thread->started) - return KERN_INVALID_ARGUMENT; - - bank_get_bank_ledger_and_thread_group(new_voucher, &bankledger, &banktg); - - thread_mtx_lock(thread); - - old_voucher = thread->ith_voucher; - - if (IPC_VOUCHER_NULL == old_voucher) { - old_voucher_name = thread->ith_voucher_name; - - /* perform lazy binding if needed */ - if (MACH_PORT_VALID(old_voucher_name)) { - old_voucher = convert_port_name_to_voucher(old_voucher_name); - thread->ith_voucher_name = MACH_PORT_NULL; - thread->ith_voucher = old_voucher; - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_IPC,MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - (uintptr_t)old_voucher_name, - VM_KERNEL_ADDRPERM((uintptr_t)old_voucher), - 4, 0); - - } - } - - /* swap in new voucher, if old voucher matches the one supplied */ - if (old_voucher == *in_out_old_voucher) { - ipc_voucher_reference(new_voucher); - thread->ith_voucher = new_voucher; - thread->ith_voucher_name = MACH_PORT_NULL; - thread_mtx_unlock(thread); - bank_swap_thread_bank_ledger(thread, bankledger); - - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, - MACHDBG_CODE(DBG_MACH_IPC,MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - (uintptr_t)MACH_PORT_NULL, - VM_KERNEL_ADDRPERM((uintptr_t)new_voucher), - 5, 0); - - ipc_voucher_release(old_voucher); - - *in_out_old_voucher = IPC_VOUCHER_NULL; - return KERN_SUCCESS; - } - - /* Otherwise, just return old voucher reference */ - ipc_voucher_reference(old_voucher); - thread_mtx_unlock(thread); - 
*in_out_old_voucher = old_voucher; - return KERN_SUCCESS; + /* + * Currently this function is only called from a MIG generated + * routine which doesn't release the reference on the voucher + * addressed by in_out_old_voucher. To avoid leaking this reference, + * a call to release it has been added here. + */ + ipc_voucher_release(*in_out_old_voucher); + return KERN_NOT_SUPPORTED; } /* @@ -3073,6 +3245,12 @@ kern_allocation_name_t thread_set_allocation_name(kern_allocation_name_t new_nam return ret; } +uint64_t +thread_get_last_wait_duration(thread_t thread) +{ + return thread->last_made_runnable_time - thread->last_run_time; +} + #if CONFIG_DTRACE uint32_t dtrace_get_thread_predcache(thread_t thread) { diff --git a/osfmk/kern/thread.h b/osfmk/kern/thread.h index ca2217584..d2cf4278e 100644 --- a/osfmk/kern/thread.h +++ b/osfmk/kern/thread.h @@ -109,6 +109,7 @@ #include #include #include + #include #include #include @@ -123,6 +124,7 @@ #include #include #include +#include #include #include @@ -132,6 +134,11 @@ #include #include +#ifdef XNU_KERNEL_PRIVATE +/* priority queue static asserts fail for __ARM64_ARCH_8_32__ kext builds */ +#include +#endif /* XNU_KERNEL_PRIVATE */ + #if MONOTONIC #include #include @@ -163,14 +170,18 @@ struct thread { * anywhere in the thread structure. 
*/ union { - queue_chain_t runq_links; /* run queue links */ - queue_chain_t wait_links; /* wait queue links */ + queue_chain_t runq_links; /* run queue links */ + queue_chain_t wait_links; /* wait queue links */ + struct priority_queue_entry wait_prioq_links; /* priority ordered waitq links */ }; processor_t runq; /* run queue assignment */ event64_t wait_event; /* wait queue event */ struct waitq *waitq; /* wait queue this thread is enqueued on */ + struct turnstile *turnstile; /* thread's turnstile, protected by primitives interlock */ + void *inheritor; /* inheritor of the primitive the thread will block on */ + struct priority_queue inheritor_queue; /* Inheritor queue */ /* Data updated during assert_wait/thread_wakeup */ #if __SMP__ @@ -218,9 +229,10 @@ struct thread { #define TH_SUSP 0x02 /* stopped or requested to stop */ #define TH_RUN 0x04 /* running or on runq */ #define TH_UNINT 0x08 /* waiting uninteruptibly */ -#define TH_TERMINATE 0x10 /* halted at termination */ -#define TH_TERMINATE2 0x20 /* added to termination queue */ - +#define TH_TERMINATE 0x10 /* halted at termination */ +#define TH_TERMINATE2 0x20 /* added to termination queue */ +#define TH_WAIT_REPORT 0x40 /* the wait is using the sched_call, + only set if TH_WAIT is also set */ #define TH_IDLE 0x80 /* idling processor */ /* Scheduling information */ @@ -240,7 +252,7 @@ struct thread { #define TH_SFLAG_THROTTLED 0x0004 /* throttled thread forced to timeshare mode (may be applied in addition to failsafe) */ #define TH_SFLAG_DEMOTED_MASK (TH_SFLAG_THROTTLED | TH_SFLAG_FAILSAFE) /* saved_mode contains previous sched_mode */ -#define TH_SFLAG_PROMOTED 0x0008 /* sched pri has been promoted */ +#define TH_SFLAG_PROMOTED 0x0008 /* sched pri has been promoted by kernel mutex priority promotion */ #define TH_SFLAG_ABORT 0x0010 /* abort interruptible waits */ #define TH_SFLAG_ABORTSAFELY 0x0020 /* ... 
but only those at safe point */ #define TH_SFLAG_ABORTED_MASK (TH_SFLAG_ABORT | TH_SFLAG_ABORTSAFELY) @@ -249,13 +261,15 @@ struct thread { #define TH_SFLAG_DEPRESSED_MASK (TH_SFLAG_DEPRESS | TH_SFLAG_POLLDEPRESS) /* unused TH_SFLAG_PRI_UPDATE 0x0100 */ #define TH_SFLAG_EAGERPREEMPT 0x0200 /* Any preemption of this thread should be treated as if AST_URGENT applied */ -#define TH_SFLAG_RW_PROMOTED 0x0400 /* sched pri has been promoted due to blocking with RW lock held */ +#define TH_SFLAG_RW_PROMOTED 0x0400 /* promote reason: blocking with RW lock held */ /* unused TH_SFLAG_THROTTLE_DEMOTED 0x0800 */ -#define TH_SFLAG_WAITQ_PROMOTED 0x1000 /* sched pri promoted from waitq wakeup (generally for IPC receive) */ +#define TH_SFLAG_WAITQ_PROMOTED 0x1000 /* promote reason: waitq wakeup (generally for IPC receive) */ -#define TH_SFLAG_EXEC_PROMOTED 0x8000 /* sched pri has been promoted since thread is in an exec */ -#define TH_SFLAG_PROMOTED_MASK (TH_SFLAG_PROMOTED | TH_SFLAG_RW_PROMOTED | TH_SFLAG_WAITQ_PROMOTED | TH_SFLAG_EXEC_PROMOTED) +#define TH_SFLAG_EXEC_PROMOTED 0x8000 /* promote reason: thread is in an exec */ + +/* 'promote reasons' that request a priority floor only, not a custom priority */ +#define TH_SFLAG_PROMOTE_REASON_MASK (TH_SFLAG_RW_PROMOTED | TH_SFLAG_WAITQ_PROMOTED | TH_SFLAG_EXEC_PROMOTED) #define TH_SFLAG_RW_PROMOTED_BIT (10) /* 0x400 */ @@ -263,22 +277,24 @@ struct thread { int16_t base_pri; /* base priority */ int16_t max_priority; /* copy of max base priority */ int16_t task_priority; /* copy of task base priority */ + int16_t promotion_priority; /* priority thread is currently promoted to */ #if defined(CONFIG_SCHED_GRRR) #if 0 uint16_t grrr_deficit; /* fixed point (1/1000th quantum) fractional deficit */ #endif #endif - + int16_t promotions; /* level of promotion */ - int16_t pending_promoter_index; + int iotier_override; /* atomic operations to set, cleared on ret to user */ _Atomic uint32_t ref_count; /* number of references to me */ - void 
*pending_promoter[2]; + + lck_mtx_t* waiting_for_mutex; /* points to mutex we're waiting for until we acquire it */ uint32_t rwlock_count; /* Number of lck_rw_t locks held by thread */ integer_t importance; /* task-relative importance */ - uint32_t was_promoted_on_wakeup; + uint32_t was_promoted_on_wakeup; /* thread promoted on wakeup to acquire mutex */ /* Priority depression expiration */ integer_t depress_timer_active; @@ -321,7 +337,7 @@ struct thread { #if defined(CONFIG_SCHED_PROTO) uint32_t runqueue_generation; /* last time runqueue was drained */ #endif - + /* Statistics and timesharing calculations */ #if defined(CONFIG_SCHED_TIMESHARE_CORE) natural_t sched_stamp; /* last scheduler tick */ @@ -347,6 +363,7 @@ struct thread { uint64_t vtimer_qos_save; timer_data_t ptime; /* time executing in P mode */ + timer_data_t runnable_timer; /* time the thread is runnable (including running) */ #if CONFIG_SCHED_SFI /* Timing for wait state */ @@ -369,13 +386,13 @@ struct thread { /* Various bits of state to stash across a continuation, exclusive to the current thread block point */ union { struct { - mach_msg_return_t state; /* receive state */ + mach_msg_return_t state; /* receive state */ mach_port_seqno_t seqno; /* seqno of recvd message */ - ipc_object_t object; /* object received on */ - mach_vm_address_t msg_addr; /* receive buffer pointer */ + ipc_object_t object; /* object received on */ + mach_vm_address_t msg_addr; /* receive buffer pointer */ mach_msg_size_t rsize; /* max size for recvd msg */ mach_msg_size_t msize; /* actual size for recvd msg */ - mach_msg_option_t option; /* options for receive */ + mach_msg_option_t option; /* options for receive */ mach_port_name_t receiver_name; /* the receive port name */ struct knote *knote; /* knote fired for rcv */ union { @@ -389,16 +406,12 @@ struct thread { mach_msg_continue_t continuation; } receive; struct { - struct semaphore *waitsemaphore; /* semaphore ref */ + struct semaphore *waitsemaphore; /* 
semaphore ref */ struct semaphore *signalsemaphore; /* semaphore ref */ int options; /* semaphore options */ kern_return_t result; /* primary result */ mach_msg_continue_t continuation; } sema; - struct { - int option; /* switch option */ - boolean_t reenable_workq_callback; /* on entry, callbacks were suspended */ - } swtch; } saved; /* Only user threads can cause guard exceptions, only kernel threads can be thread call threads */ @@ -439,6 +452,9 @@ struct thread { /* Task membership */ struct task *task; vm_map_t map; +#if DEVELOPMENT || DEBUG + boolean_t pmap_footprint_suspended; +#endif /* DEVELOPMENT || DEBUG */ decl_lck_mtx_data(,mutex) @@ -543,17 +559,12 @@ struct thread { user_addr_t override_resource; } *overrides; - _Atomic uint32_t kqwl_owning_count; uint32_t ipc_overrides; + _Atomic uint32_t kqwl_owning_count; uint32_t sync_ipc_overrides; - uint32_t user_promotions; uint16_t user_promotion_basepri; _Atomic uint16_t kevent_ast_bits; - block_hint_t pending_block_hint; - block_hint_t block_hint; /* What type of primitive last caused us to block. */ - - int iotier_override; /* atomic operations to set, cleared on ret to user */ io_stat_info_t thread_io_stats; /* per-thread I/O statistics */ #if CONFIG_EMBEDDED @@ -582,6 +593,9 @@ struct thread { #if SCHED_TRACE_THREAD_WAKEUPS uintptr_t thread_wakeup_bt[64]; #endif + turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */ + block_hint_t pending_block_hint; + block_hint_t block_hint; /* What type of primitive last caused us to block. */ }; #define ith_state saved.receive.state @@ -607,7 +621,15 @@ struct thread { #define ITH_KNOTE_NULL ((void *)NULL) #define ITH_KNOTE_PSEUDO ((void *)0xdeadbeef) -#define ITH_KNOTE_VALID(kn) ((kn) != ITH_KNOTE_NULL && (kn) != ITH_KNOTE_PSEUDO) +/* + * The ith_knote is used during message delivery, and can safely be interpreted + * only when used for one of these codepaths, which the test for the msgt_name + * being RECEIVE or SEND_ONCE is about. 
+ */ +#define ITH_KNOTE_VALID(kn, msgt_name) \ + (((kn) != ITH_KNOTE_NULL && (kn) != ITH_KNOTE_PSEUDO) && \ + ((msgt_name) == MACH_MSG_TYPE_PORT_RECEIVE || \ + (msgt_name) == MACH_MSG_TYPE_PORT_SEND_ONCE)) #if MACH_ASSERT #define assert_thread_magic(thread) assertf((thread)->thread_magic == THREAD_MAGIC, \ @@ -668,6 +690,9 @@ extern void thread_copy_resource_info( extern void thread_terminate_crashed_threads(void); +extern void turnstile_deallocate_enqueue( + struct turnstile *turnstile); + extern void thread_stack_enqueue( thread_t thread); @@ -778,9 +803,22 @@ extern kern_return_t machine_thread_get_state( thread_state_t state, mach_msg_type_number_t *count); +extern kern_return_t machine_thread_state_convert_from_user( + thread_t thread, + thread_flavor_t flavor, + thread_state_t tstate, + mach_msg_type_number_t count); + +extern kern_return_t machine_thread_state_convert_to_user( + thread_t thread, + thread_flavor_t flavor, + thread_state_t tstate, + mach_msg_type_number_t *count); + extern kern_return_t machine_thread_dup( thread_t self, - thread_t target); + thread_t target, + boolean_t is_corpse); extern void machine_thread_init(void); @@ -839,6 +877,10 @@ extern void thread_set_options(uint32_t thopt); __BEGIN_DECLS +extern void thread_mtx_lock(thread_t thread); + +extern void thread_mtx_unlock(thread_t thread); + extern thread_t current_thread(void); extern void thread_reference( @@ -900,6 +942,7 @@ __BEGIN_DECLS uint16_t thread_set_tag(thread_t, uint16_t); uint16_t thread_get_tag(thread_t); +uint64_t thread_last_run_time(thread_t); extern kern_return_t thread_state_initialize( thread_t thread); @@ -910,12 +953,24 @@ extern kern_return_t thread_setstatus( thread_state_t tstate, mach_msg_type_number_t count); +extern kern_return_t thread_setstatus_from_user( + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t count); + extern kern_return_t thread_getstatus( thread_t thread, int flavor, thread_state_t tstate, 
mach_msg_type_number_t *count); +extern kern_return_t thread_getstatus_to_user( + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t *count); + extern kern_return_t thread_create_with_continuation( task_t task, thread_t *new_thread, @@ -926,21 +981,15 @@ extern kern_return_t thread_create_waiting(task_t task, event_t event, thread_t *new_thread); -extern kern_return_t thread_create_workq( - task_t task, - thread_continue_t thread_return, - thread_t *new_thread); - extern kern_return_t thread_create_workq_waiting( task_t task, thread_continue_t thread_return, - event_t event, thread_t *new_thread); extern void thread_yield_internal( mach_msg_timeout_t interval); -extern void thread_yield_to_preemption(void); +extern void thread_yield_to_preemption(void); /* * Thread-private CPU limits: apply a private CPU limit to this thread only. Available actions are: @@ -963,9 +1012,10 @@ extern int thread_get_cpulimit(int *action, uint8_t *percentage, uint64_t *inter extern int thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns); extern void thread_read_times( - thread_t thread, + thread_t thread, time_value_t *user_time, - time_value_t *system_time); + time_value_t *system_time, + time_value_t *runnable_time); extern uint64_t thread_get_runtime_self(void); @@ -1034,29 +1084,27 @@ extern void thread_sched_call( thread_t thread, sched_call_t call); -extern sched_call_t thread_disable_sched_call( - thread_t thread, - sched_call_t call); - -extern void thread_reenable_sched_call( - thread_t thread, - sched_call_t call); - -extern void thread_static_param( - thread_t thread, - boolean_t state); - extern boolean_t thread_is_static_param( thread_t thread); extern task_t get_threadtask(thread_t); -#define thread_is_64bit(thd) \ - task_has_64BitAddr(get_threadtask(thd)) +/* + * Thread is running within a 64-bit address space. 
+ */ +#define thread_is_64bit_addr(thd) \ + task_has_64Bit_addr(get_threadtask(thd)) + +/* + * Thread is using 64-bit machine state. + */ +#define thread_is_64bit_data(thd) \ + task_has_64Bit_data(get_threadtask(thd)) extern void *get_bsdthread_info(thread_t); extern void set_bsdthread_info(thread_t, void *); extern void *uthread_alloc(task_t, thread_t, int); +extern event_t workq_thread_init_and_wq_lock(task_t, thread_t); // bsd/pthread/ extern void uthread_cleanup_name(void *uthread); extern void uthread_cleanup(task_t, void *, void *); extern void uthread_zone_free(void *); @@ -1119,6 +1167,8 @@ extern void vn_guard_ast(thread_t, #endif extern void mach_port_guard_ast(thread_t, mach_exception_code_t, mach_exception_subcode_t); +extern void virt_memory_guard_ast(thread_t, + mach_exception_code_t, mach_exception_subcode_t); extern void thread_guard_violation(thread_t, mach_exception_code_t, mach_exception_subcode_t); extern void thread_update_io_stats(thread_t, int size, int io_flags); @@ -1147,6 +1197,23 @@ extern void thread_set_thread_name(thread_t th, const char* name); extern void thread_enable_send_importance(thread_t thread, boolean_t enable); +/* + * Translate signal context data pointer to userspace representation + */ + +extern kern_return_t machine_thread_siguctx_pointer_convert_to_user( + thread_t thread, + user_addr_t *uctxp); + +/* + * Translate array of function pointer syscall arguments from userspace representation + */ + +extern kern_return_t machine_thread_function_pointers_convert_from_user( + thread_t thread, + user_addr_t *fptrs, + uint32_t count); + /* Get a backtrace for a threads kernel or user stack (user_p), with pc and optionally * frame pointer (getfp). 
Returns bytes added to buffer, and kThreadTruncatedBT in * thread_trace_flags if a user page is not present after kdp_lightweight_fault() is @@ -1163,12 +1230,18 @@ extern int machine_trace_thread( uint32_t *thread_trace_flags); extern int machine_trace_thread64(thread_t thread, - char *tracepos, - char *tracebound, - int nframes, - boolean_t user_p, - boolean_t getfp, - uint32_t *thread_trace_flags); + char *tracepos, + char *tracebound, + int nframes, + boolean_t user_p, + boolean_t getfp, + uint32_t *thread_trace_flags, + uint64_t *sp); + +/* + * Get the duration of the given thread's last wait. + */ +uint64_t thread_get_last_wait_duration(thread_t thread); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/kern/thread_act.c b/osfmk/kern/thread_act.c index 9c7aa300c..4faa1e9b5 100644 --- a/osfmk/kern/thread_act.c +++ b/osfmk/kern/thread_act.c @@ -49,6 +49,7 @@ * * Thread management routines */ + #include #include #include @@ -315,27 +316,26 @@ thread_resume(thread_t thread) } /* - * thread_depress_abort: + * thread_depress_abort_from_user: * * Prematurely abort priority depression if there is one. 
*/ kern_return_t -thread_depress_abort( - thread_t thread) +thread_depress_abort_from_user(thread_t thread) { - kern_return_t result; + kern_return_t result; - if (thread == THREAD_NULL) + if (thread == THREAD_NULL) return (KERN_INVALID_ARGUMENT); - thread_mtx_lock(thread); + thread_mtx_lock(thread); if (thread->active) - result = thread_depress_abort_internal(thread); + result = thread_depress_abort(thread); else result = KERN_TERMINATED; - thread_mtx_unlock(thread); + thread_mtx_unlock(thread); return (result); } @@ -358,6 +358,7 @@ act_abort( if (!(thread->sched_flags & TH_SFLAG_ABORT)) { thread->sched_flags |= TH_SFLAG_ABORT; thread_set_apc_ast_locked(thread); + thread_depress_abort_locked(thread); } else { thread->sched_flags &= ~TH_SFLAG_ABORTSAFELY; } @@ -409,6 +410,7 @@ thread_abort_safely( if (!(thread->sched_flags & TH_SFLAG_ABORT)) { thread->sched_flags |= TH_SFLAG_ABORTED_MASK; thread_set_apc_ast_locked(thread); + thread_depress_abort_locked(thread); } } thread_unlock(thread); @@ -452,12 +454,13 @@ thread_info( return (result); } -kern_return_t -thread_get_state( +static inline kern_return_t +thread_get_state_internal( thread_t thread, int flavor, thread_state_t state, /* pointer to OUT array */ - mach_msg_type_number_t *state_count) /*IN/OUT*/ + mach_msg_type_number_t *state_count, /*IN/OUT*/ + boolean_t to_user) { kern_return_t result = KERN_SUCCESS; @@ -497,16 +500,50 @@ thread_get_state( else result = KERN_TERMINATED; + if (to_user && result == KERN_SUCCESS) { + result = machine_thread_state_convert_to_user(thread, flavor, state, + state_count); + } + thread_mtx_unlock(thread); return (result); } +/* No prototype, since thread_act_server.h has the _to_user version if KERNEL_SERVER */ + +kern_return_t +thread_get_state( + thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t *state_count); + +kern_return_t +thread_get_state( + thread_t thread, + int flavor, + thread_state_t state, /* pointer to OUT array */ + 
mach_msg_type_number_t *state_count) /*IN/OUT*/ +{ + return thread_get_state_internal(thread, flavor, state, state_count, FALSE); +} + +kern_return_t +thread_get_state_to_user( + thread_t thread, + int flavor, + thread_state_t state, /* pointer to OUT array */ + mach_msg_type_number_t *state_count) /*IN/OUT*/ +{ + return thread_get_state_internal(thread, flavor, state, state_count, TRUE); +} + /* * Change thread's machine-dependent state. Called with nothing * locked. Returns same way. */ -static kern_return_t +static inline kern_return_t thread_set_state_internal( thread_t thread, int flavor, @@ -522,6 +559,13 @@ thread_set_state_internal( thread_mtx_lock(thread); if (thread->active) { + if (from_user) { + result = machine_thread_state_convert_from_user(thread, flavor, + state, state_count); + if (result != KERN_SUCCESS) { + goto out; + } + } if (thread != current_thread()) { thread_hold(thread); @@ -550,6 +594,7 @@ thread_set_state_internal( if ((result == KERN_SUCCESS) && from_user) extmod_statistics_incr_thread_set_state(thread); +out: thread_mtx_unlock(thread); return (result); @@ -650,7 +695,8 @@ thread_dup( if (thread_stop(target, TRUE)) { thread_mtx_lock(target); - result = machine_thread_dup(self, target); + result = machine_thread_dup(self, target, FALSE); + if (self->affinity_set != AFFINITY_SET_NULL) thread_affinity_dup(self, target); thread_unstop(target); @@ -699,7 +745,7 @@ thread_dup2( if (thread_stop(target, TRUE)) { thread_mtx_lock(target); - result = machine_thread_dup(source, target); + result = machine_thread_dup(source, target, TRUE); if (source->affinity_set != AFFINITY_SET_NULL) thread_affinity_dup(source, target); thread_unstop(target); @@ -736,6 +782,17 @@ thread_setstatus( return (thread_set_state(thread, flavor, tstate, count)); } +kern_return_t +thread_setstatus_from_user( + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t count) +{ + + return (thread_set_state_from_user(thread, flavor, tstate, count)); 
+} + /* * thread_getstatus: * @@ -751,6 +808,16 @@ thread_getstatus( return (thread_get_state(thread, flavor, tstate, count)); } +kern_return_t +thread_getstatus_to_user( + thread_t thread, + int flavor, + thread_state_t tstate, + mach_msg_type_number_t *count) +{ + return (thread_get_state_to_user(thread, flavor, tstate, count)); +} + /* * Change thread's machine-dependent userspace TSD base. * Called with nothing locked. Returns same way. @@ -826,16 +893,6 @@ thread_set_apc_ast(thread_t thread) static void thread_set_apc_ast_locked(thread_t thread) { - /* - * Temporarily undepress, so target has - * a chance to do locking required to - * block itself in thread_suspended. - * - * Leaves the depress flag set so we can reinstate when it's blocked. - */ - if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) - thread_recompute_sched_pri(thread, TRUE); - thread_ast_set(thread, AST_APC); if (thread == current_thread()) { @@ -861,9 +918,7 @@ thread_set_apc_ast_locked(thread_t thread) * * Continuation routine for thread suspension. It checks * to see whether there has been any new suspensions. If so, it - * installs the AST_APC handler again. Otherwise, it checks to see - * if the current depression needs to be re-instated (it may have - * been temporarily removed in order to get to this point in a hurry). + * installs the AST_APC handler again. 
*/ __attribute__((noreturn)) static void @@ -878,27 +933,8 @@ thread_suspended(__unused void *parameter, wait_result_t result) else assert(thread->suspend_parked == FALSE); - if (thread->suspend_count > 0) { + if (thread->suspend_count > 0) thread_set_apc_ast(thread); - } else { - spl_t s = splsched(); - - thread_lock(thread); - if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - thread->sched_pri = DEPRESSPRI; - thread->last_processor->current_pri = thread->sched_pri; - thread->last_processor->current_perfctl_class = thread_get_perfcontrol_class(thread); - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY), - (uintptr_t)thread_tid(thread), - thread->base_pri, - thread->sched_pri, - thread->sched_usage, - 0); - } - thread_unlock(thread); - splx(s); - } thread_mtx_unlock(thread); @@ -938,7 +974,8 @@ thread_apc_ast(thread_t thread) /* If we're suspended, go to sleep and wait for someone to wake us up. */ if (thread->suspend_count > 0) { thread->suspend_parked = TRUE; - assert_wait(&thread->suspend_count, THREAD_ABORTSAFE); + assert_wait(&thread->suspend_count, + THREAD_ABORTSAFE | THREAD_WAIT_NOREPORT_USER); thread_mtx_unlock(thread); thread_block(thread_suspended); @@ -984,6 +1021,14 @@ act_set_state_from_user( } +/* Prototype, see justification above */ +kern_return_t +act_get_state( + thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t *count); + kern_return_t act_get_state( thread_t thread, @@ -997,6 +1042,19 @@ act_get_state( return (thread_get_state(thread, flavor, state, count)); } +kern_return_t +act_get_state_to_user( + thread_t thread, + int flavor, + thread_state_t state, + mach_msg_type_number_t *count) +{ + if (thread == current_thread()) + return (KERN_INVALID_ARGUMENT); + + return (thread_get_state_to_user(thread, flavor, state, count)); +} + static void act_set_ast( thread_t thread, diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index d43248ee5..ec92802f8 100644 --- 
a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -1285,7 +1285,6 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s) uint64_t time; uint32_t flags; boolean_t signal; - boolean_t dowake = FALSE; boolean_t repend = FALSE; call->tc_finish_count++; @@ -1328,22 +1327,8 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s) } } - if ((flags & THREAD_CALL_WAIT) != 0) { - dowake = TRUE; - - /* - * Dropping lock here because the sched call for the - * high-pri group can take the big lock from under - * a thread lock. - */ - thread_call_unlock(); - thread_wakeup((event_t)call); - thread_call_lock_spin(); - /* THREAD_CALL_SIGNAL call may have been freed */ - } - if (!signal && (call->tc_refs == 0)) { - if (dowake) { + if ((flags & THREAD_CALL_WAIT) != 0) { panic("Someone waiting on a thread call that is scheduled for free: %p\n", call->tc_call.func); } @@ -1356,6 +1341,18 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s) *s = disable_ints_and_lock(); } + if ((flags & THREAD_CALL_WAIT) != 0) { + /* + * Dropping lock here because the sched call for the + * high-pri group can take the big lock from under + * a thread lock. 
+ */ + thread_call_unlock(); + thread_wakeup((event_t)call); + thread_call_lock_spin(); + /* THREAD_CALL_SIGNAL call may have been freed */ + } + return (repend); } diff --git a/osfmk/kern/thread_group.h b/osfmk/kern/thread_group.h index 6e7991507..e19269ee2 100644 --- a/osfmk/kern/thread_group.h +++ b/osfmk/kern/thread_group.h @@ -33,7 +33,6 @@ #define _KERN_THREAD_GROUP_H_ struct thread_group; -typedef struct thread_group *thread_group_t; #include /* for proc_reg.h / CONFIG_THREAD_GROUPS */ diff --git a/osfmk/kern/thread_policy.c b/osfmk/kern/thread_policy.c index 6ede110d1..7b7e4f87d 100644 --- a/osfmk/kern/thread_policy.c +++ b/osfmk/kern/thread_policy.c @@ -53,13 +53,12 @@ uint32_t qos_override_mode; #define QOS_OVERRIDE_MODE_OVERHANG_PEAK 0 #define QOS_OVERRIDE_MODE_IGNORE_OVERRIDE 1 #define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE 2 -#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH 3 -#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 4 +#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 3 extern zone_t thread_qos_override_zone; -static boolean_t -proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset, boolean_t squash); +static void +proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset); /* * THREAD_QOS_UNSPECIFIED is assigned the highest tier available, so it does not provide a limit @@ -83,15 +82,6 @@ const qos_policy_params_t thread_qos_policy_params = { * This table defines the highest IO priority that a thread marked with this * QoS class can have. 
*/ -#if CONFIG_EMBEDDED - .qos_iotier[THREAD_QOS_UNSPECIFIED] = THROTTLE_LEVEL_TIER0, - .qos_iotier[THREAD_QOS_USER_INTERACTIVE] = THROTTLE_LEVEL_TIER0, - .qos_iotier[THREAD_QOS_USER_INITIATED] = THROTTLE_LEVEL_TIER0, - .qos_iotier[THREAD_QOS_LEGACY] = THROTTLE_LEVEL_TIER0, - .qos_iotier[THREAD_QOS_UTILITY] = THROTTLE_LEVEL_TIER0, - .qos_iotier[THREAD_QOS_BACKGROUND] = THROTTLE_LEVEL_TIER3, - .qos_iotier[THREAD_QOS_MAINTENANCE] = THROTTLE_LEVEL_TIER3, -#else .qos_iotier[THREAD_QOS_UNSPECIFIED] = THROTTLE_LEVEL_TIER0, .qos_iotier[THREAD_QOS_USER_INTERACTIVE] = THROTTLE_LEVEL_TIER0, .qos_iotier[THREAD_QOS_USER_INITIATED] = THROTTLE_LEVEL_TIER0, @@ -99,7 +89,6 @@ const qos_policy_params_t thread_qos_policy_params = { .qos_iotier[THREAD_QOS_UTILITY] = THROTTLE_LEVEL_TIER1, .qos_iotier[THREAD_QOS_BACKGROUND] = THROTTLE_LEVEL_TIER2, /* possibly overridden by bg_iotier */ .qos_iotier[THREAD_QOS_MAINTENANCE] = THROTTLE_LEVEL_TIER3, -#endif /* * This table defines the highest QoS level that @@ -643,35 +632,133 @@ thread_set_mode_and_absolute_pri_internal(thread_t thread, return kr; } +uint8_t +thread_workq_pri_for_qos(thread_qos_t qos) +{ + assert(qos < THREAD_QOS_LAST); + return (uint8_t)thread_qos_policy_params.qos_pri[qos]; +} + +thread_qos_t +thread_workq_qos_for_pri(int priority) +{ + int qos; + if (priority > thread_qos_policy_params.qos_pri[THREAD_QOS_USER_INTERACTIVE]) { + // indicate that workq should map >UI threads to workq's + // internal notation for above-UI work. + return THREAD_QOS_UNSPECIFIED; + } + for (qos = THREAD_QOS_USER_INTERACTIVE; qos > THREAD_QOS_MAINTENANCE; qos--) { + // map a given priority up to the next nearest qos band. 
+ if (thread_qos_policy_params.qos_pri[qos - 1] < priority) { + return qos; + } + } + return THREAD_QOS_MAINTENANCE; +} + /* - * KPI for pthread kext + * private interface for pthread workqueues * * Set scheduling policy & absolute priority for thread - * May be called from waitqueue callout context with spinlocks held + * May be called with spinlocks held * Thread mutex lock is not held */ -kern_return_t +void +thread_reset_workq_qos(thread_t thread, uint32_t qos) +{ + struct task_pend_token pend_token = {}; + + assert(qos < THREAD_QOS_LAST); + + spl_t s = splsched(); + thread_lock(thread); + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_AND_RELPRIO, qos, 0, &pend_token); + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_WORKQ_OVERRIDE, THREAD_QOS_UNSPECIFIED, 0, + &pend_token); + + assert(pend_token.tpt_update_sockets == 0); + + thread_unlock(thread); + splx(s); + + thread_policy_update_complete_unlocked(thread, &pend_token); +} + +/* + * private interface for pthread workqueues + * + * Set scheduling policy & absolute priority for thread + * May be called with spinlocks held + * Thread mutex lock is held + */ +void +thread_set_workq_override(thread_t thread, uint32_t qos) +{ + struct task_pend_token pend_token = {}; + + assert(qos < THREAD_QOS_LAST); + + spl_t s = splsched(); + thread_lock(thread); + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_WORKQ_OVERRIDE, qos, 0, &pend_token); + + assert(pend_token.tpt_update_sockets == 0); + + thread_unlock(thread); + splx(s); + + thread_policy_update_complete_unlocked(thread, &pend_token); +} + +/* + * private interface for pthread workqueues + * + * Set scheduling policy & absolute priority for thread + * May be called with spinlocks held + * Thread mutex lock is not held + */ +void thread_set_workq_pri(thread_t thread, + thread_qos_t qos, integer_t priority, integer_t policy) { struct task_pend_token 
pend_token = {}; sched_mode_t mode = convert_policy_to_sched_mode(policy); + assert(qos < THREAD_QOS_LAST); assert(thread->static_param); - if (!thread->static_param) - return KERN_FAILURE; + + if (!thread->static_param || !thread->active) + return; + + spl_t s = splsched(); + thread_lock(thread); + + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_AND_RELPRIO, qos, 0, &pend_token); + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_WORKQ_OVERRIDE, THREAD_QOS_UNSPECIFIED, + 0, &pend_token); + + thread_unlock(thread); + splx(s); /* Concern: this doesn't hold the mutex... */ - if (!thread->active) - return KERN_TERMINATED; - kern_return_t kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, &pend_token); + __assert_only kern_return_t kr; + kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, + &pend_token); + assert(kr == KERN_SUCCESS); if (pend_token.tpt_update_thread_sfi) sfi_reevaluate(thread); - - return kr; } /* @@ -762,7 +849,7 @@ thread_update_qos_cpu_time_locked(thread_t thread) * last context switch (embedded) or last user/kernel boundary transition (desktop) * because user_timer and system_timer are only updated then. * - * TODO: Consider running a thread_timer_event operation here to update it first. + * TODO: Consider running a timer_update operation here to update it first. * Maybe doable with interrupts disabled from current thread. * If the thread is on a different core, may not be easy to get right. * @@ -779,7 +866,7 @@ thread_update_qos_cpu_time_locked(thread_t thread) /* Update the task-level effective and requested qos stats atomically, because we don't have the task lock. 
*/ switch (thread->effective_policy.thep_qos) { - case THREAD_QOS_DEFAULT: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_default; break; + case THREAD_QOS_UNSPECIFIED: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_default; break; case THREAD_QOS_MAINTENANCE: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_maintenance; break; case THREAD_QOS_BACKGROUND: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_background; break; case THREAD_QOS_UTILITY: task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_utility; break; @@ -794,7 +881,7 @@ thread_update_qos_cpu_time_locked(thread_t thread) /* Update the task-level qos stats atomically, because we don't have the task lock. */ switch (thread->requested_policy.thrp_qos) { - case THREAD_QOS_DEFAULT: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_default; break; + case THREAD_QOS_UNSPECIFIED: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_default; break; case THREAD_QOS_MAINTENANCE: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_maintenance; break; case THREAD_QOS_BACKGROUND: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_background; break; case THREAD_QOS_UTILITY: task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_utility; break; @@ -1183,7 +1270,7 @@ thread_policy_get( info->thps_requested_policy = *(uint64_t*)(void*)(&thread->requested_policy); info->thps_effective_policy = *(uint64_t*)(void*)(&thread->effective_policy); - info->thps_user_promotions = thread->user_promotions; + info->thps_user_promotions = 0; info->thps_user_promotion_basepri = thread->user_promotion_basepri; info->thps_ipc_overrides = thread->ipc_overrides; @@ -1346,14 +1433,10 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr uint32_t next_qos = requested.thrp_qos; if (requested.thrp_qos != THREAD_QOS_UNSPECIFIED) { - if (requested.thrp_qos_override != THREAD_QOS_UNSPECIFIED) - next_qos = MAX(requested.thrp_qos_override, next_qos); - - if 
(requested.thrp_qos_promote != THREAD_QOS_UNSPECIFIED) - next_qos = MAX(requested.thrp_qos_promote, next_qos); - - if (requested.thrp_qos_ipc_override != THREAD_QOS_UNSPECIFIED) - next_qos = MAX(requested.thrp_qos_ipc_override, next_qos); + next_qos = MAX(requested.thrp_qos_override, next_qos); + next_qos = MAX(requested.thrp_qos_promote, next_qos); + next_qos = MAX(requested.thrp_qos_ipc_override, next_qos); + next_qos = MAX(requested.thrp_qos_workq_override, next_qos); } next.thep_qos = next_qos; @@ -1379,8 +1462,7 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr } /* Apply the sync ipc qos override */ - if (requested.thrp_qos_sync_ipc_override != THREAD_QOS_UNSPECIFIED) - next.thep_qos = MAX(requested.thrp_qos_sync_ipc_override, next.thep_qos); + assert(requested.thrp_qos_sync_ipc_override == THREAD_QOS_UNSPECIFIED); /* * The QoS relative priority is only applicable when the original programmer's @@ -1597,55 +1679,6 @@ proc_set_thread_policy(thread_t thread, thread_policy_update_complete_unlocked(thread, &pend_token); } -/* - * KPI for pthread kext to call to set thread base QoS values during a workq wakeup - * May be called with interrupts disabled and workqueue/waitqueue/kqueue locks held - * - * Does NOT do update completion, so the thread MUST be in a safe place WRT - * IO throttling and SFI. - * - * TODO: Can I assert 'it must be in a safe place'? 
- */ -kern_return_t -thread_set_workq_qos(thread_t thread, - int qos_tier, - int relprio) /* relprio is -16 to 0 */ -{ - assert(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST); - assert(relprio <= 0 && relprio >= THREAD_QOS_MIN_TIER_IMPORTANCE); - - if (!(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST)) - return KERN_FAILURE; - if (!(relprio <= 0 && relprio >= THREAD_QOS_MIN_TIER_IMPORTANCE)) - return KERN_FAILURE; - - if (qos_tier == THREAD_QOS_UNSPECIFIED) { - assert(relprio == 0); - if (relprio != 0) - return KERN_FAILURE; - } - - assert(thread->static_param); - if (!thread->static_param) { - return KERN_FAILURE; - } - - /* Concern: this doesn't hold the mutex... */ - //if (!thread->active) - // return KERN_TERMINATED; - - struct task_pend_token pend_token = {}; - - proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, qos_tier, -relprio, &pend_token); - - assert(pend_token.tpt_update_sockets == 0); - /* we don't need to update throttle or sfi because pthread kext promises the thread is in a safe place */ - /* TODO: Do we need to update SFI to ensure it gets tagged with the AST? */ - - return KERN_SUCCESS; -} - - /* * Do the things that can't be done while holding a thread mutex. 
* These are set up to call back into thread policy to get the latest value, @@ -1804,6 +1837,11 @@ thread_set_requested_policy_spinlocked(thread_t thread, DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio); break; + case TASK_POLICY_QOS_WORKQ_OVERRIDE: + assert(category == TASK_POLICY_ATTRIBUTE); + requested.thrp_qos_workq_override = value; + break; + case TASK_POLICY_QOS_PROMOTE: assert(category == TASK_POLICY_ATTRIBUTE); requested.thrp_qos_promote = value; @@ -1814,11 +1852,6 @@ thread_set_requested_policy_spinlocked(thread_t thread, requested.thrp_qos_ipc_override = value; break; - case TASK_POLICY_QOS_SYNC_IPC_OVERRIDE: - assert(category == TASK_POLICY_ATTRIBUTE); - requested.thrp_qos_sync_ipc_override = value; - break; - case TASK_POLICY_TERMINATED: assert(category == TASK_POLICY_ATTRIBUTE); requested.thrp_terminated = value; @@ -1923,6 +1956,10 @@ thread_get_requested_policy_spinlocked(thread_t thread, assert(category == TASK_POLICY_ATTRIBUTE); value = requested.thrp_through_qos; break; + case TASK_POLICY_QOS_WORKQ_OVERRIDE: + assert(category == TASK_POLICY_ATTRIBUTE); + value = requested.thrp_qos_workq_override; + break; case TASK_POLICY_QOS_AND_RELPRIO: assert(category == TASK_POLICY_ATTRIBUTE); assert(value2 != NULL); @@ -2218,11 +2255,6 @@ static void canonicalize_resource_and_type(user_addr_t *resource, int *resource_ *resource_type = THREAD_QOS_OVERRIDE_TYPE_UNKNOWN; } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE) { /* no transform */ - } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH) { - /* Map all dispatch overrides to a single one, to avoid memory overhead */ - if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) { - *resource = USER_ADDR_NULL; - } } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE) { /* Map all mutex overrides to a single one, to avoid 
memory overhead */ if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX) { @@ -2314,11 +2346,7 @@ calculate_requested_qos_override(thread_t thread) override = thread->overrides; while (override) { - if (qos_override_mode != QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH || - override->override_resource_type != THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) { - qos_override = MAX(qos_override, override->override_qos); - } - + qos_override = MAX(qos_override, override->override_qos); override = override->override_next; } @@ -2329,19 +2357,13 @@ calculate_requested_qos_override(thread_t thread) * Returns: * - 0 on success * - EINVAL if some invalid input was passed - * - EFAULT if user_lock_addr != NULL and needs to be faulted (userland has to - * fault and retry) - * - ESTALE if user_lock_addr != NULL && - * ulock_owner_value_to_port_name(*user_lock_addr) != user_lock_owner */ static int proc_thread_qos_add_override_internal(thread_t thread, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, - int resource_type, - user_addr_t user_lock_addr, - mach_port_name_t user_lock_owner) + int resource_type) { struct task_pend_token pend_token = {}; int rc = 0; @@ -2373,26 +2395,6 @@ proc_thread_qos_add_override_internal(thread_t thread, thread_mtx_lock(thread); override = find_qos_override(thread, resource, resource_type); } - if (user_lock_addr) { - uint64_t val; - /* Workaround lack of explicit support for 'no-fault copyin' - * , as disabling preemption prevents paging in - */ - disable_preemption(); - rc = copyin_word(user_lock_addr, &val, sizeof(user_lock_owner)); - enable_preemption(); - if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != user_lock_owner) { - rc = ESTALE; - } - if (rc) { - prev_qos_override = proc_get_thread_policy_locked(thread, - TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL); - new_qos_override = prev_qos_override; - new_effective_qos = 
proc_get_effective_thread_policy(thread, TASK_POLICY_QOS); - thread_mtx_unlock(thread); - goto out; - } - } if (first_override_for_resource && override) { /* Someone else already allocated while the thread lock was dropped */ override->override_contended_resource_count++; @@ -2435,7 +2437,6 @@ proc_thread_qos_add_override_internal(thread_t thread, thread_policy_update_complete_unlocked(thread, &pend_token); -out: if (override_new) { zfree(thread_qos_override_zone, override_new); } @@ -2450,20 +2451,6 @@ proc_thread_qos_add_override_internal(thread_t thread, } int -proc_thread_qos_add_override_check_owner(thread_t thread, - int override_qos, - boolean_t first_override_for_resource, - user_addr_t resource, - int resource_type, - user_addr_t user_lock_addr, - mach_port_name_t user_lock_owner) -{ - return proc_thread_qos_add_override_internal(thread, override_qos, - first_override_for_resource, resource, resource_type, - user_lock_addr, user_lock_owner); -} - -boolean_t proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid, @@ -2482,33 +2469,31 @@ proc_thread_qos_add_override(task_t task, if (thread == THREAD_NULL) { KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_NONE, tid, 0, 0xdead, 0, 0); - return FALSE; + return ESRCH; } has_thread_reference = TRUE; } else { assert(thread->task == task); } rc = proc_thread_qos_add_override_internal(thread, override_qos, - first_override_for_resource, resource, resource_type, 0, 0); + first_override_for_resource, resource, resource_type); if (has_thread_reference) { thread_deallocate(thread); } - return rc == 0; + return rc; } -static int +static void proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, - boolean_t reset, - boolean_t squash) + boolean_t reset) { struct task_pend_token pend_token = {}; struct thread_qos_override *deferred_free_override_list = NULL; - int new_qos_override, prev_qos_override, 
new_effective_qos, prev_qos; - int new_qos = THREAD_QOS_UNSPECIFIED; + int new_qos_override, prev_qos_override, new_effective_qos; thread_mtx_lock(thread); @@ -2536,24 +2521,6 @@ proc_thread_qos_remove_override_internal(thread_t thread, */ prev_qos_override = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL); - if (squash) { - int prev_ipc_override; - int prev_override; - - /* - * Remove the specified overrides, and set the current override as the new base QoS. - * Return the new QoS value. - */ - prev_ipc_override = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_IPC_OVERRIDE, NULL); - prev_override = MAX(prev_qos_override, prev_ipc_override); - - prev_qos = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, NULL); - - new_qos = MAX(prev_qos, prev_override); - if (new_qos != prev_qos) - proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, new_qos, 0, &pend_token); - } - if (new_qos_override != prev_qos_override) proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, new_qos_override, 0, &pend_token); @@ -2577,12 +2544,10 @@ proc_thread_qos_remove_override_internal(thread_t thread, int, new_qos_override, int, new_effective_qos); KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_END, - thread_tid(thread), squash, 0, 0, 0); - - return new_qos; + thread_tid(thread), 0, 0, 0, 0); } -boolean_t +int proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid, @@ -2598,80 +2563,24 @@ proc_thread_qos_remove_override(task_t task, if (thread == THREAD_NULL) { KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE, tid, 0, 0xdead, 0, 0); - return FALSE; - } - has_thread_reference = TRUE; - } else { - assert(task == thread->task); - } - - 
proc_thread_qos_remove_override_internal(thread, resource, resource_type, FALSE, FALSE); - - if (has_thread_reference) - thread_deallocate(thread); - - return TRUE; -} - -boolean_t -proc_thread_qos_reset_override(task_t task, - thread_t thread, - uint64_t tid, - user_addr_t resource, - int resource_type) - -{ - boolean_t has_thread_reference = FALSE; - - if (thread == THREAD_NULL) { - thread = task_findtid(task, tid); - /* returns referenced thread */ - - if (thread == THREAD_NULL) { - KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE, - tid, 0, 0xdead, 0, 0); - return FALSE; + return ESRCH; } has_thread_reference = TRUE; } else { assert(task == thread->task); } - proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, FALSE); + proc_thread_qos_remove_override_internal(thread, resource, resource_type, FALSE); if (has_thread_reference) thread_deallocate(thread); - return TRUE; -} - -/* - * Clears the requested overrides, and replaces the current QoS with the max - * of the current QoS and the current override, then returns the new QoS. - * - * This is useful in order to reset overrides before parking a workqueue thread, - * but avoid dropping priority and getting preempted right before parking. - * - * Called without any locks held. - */ -int -proc_thread_qos_squash_override(thread_t thread, user_addr_t resource, int resource_type) -{ - return proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, TRUE); + return 0; } /* Deallocate before thread termination */ void proc_thread_qos_deallocate(thread_t thread) { - /* - * There are no more references to this thread, - * therefore this thread must not own any more locks, - * therefore there must not be any more user promotions. 
- */ - assert(thread->user_promotions == 0); - assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED); - assert(thread->user_promotion_basepri == 0); - /* This thread must have no more IPC overrides. */ assert(thread->ipc_overrides == 0); assert(thread->requested_policy.thrp_qos_ipc_override == THREAD_QOS_UNSPECIFIED); @@ -2746,148 +2655,77 @@ task_get_default_manager_qos(task_t task) return primordial_qos; } - /* - * Promote thread with the user level properties of 'promoter' - * Mutexes may be held, but it's OK to take the throttle lock + * Check if the user promotion on thread has changed + * and apply it. * - * if 'new_promotion' is TRUE, this is a new promotion. - * if FALSE, we are updating an existing promotion. + * thread locked on entry, might drop the thread lock + * and reacquire it. */ -static void -thread_user_promotion_promote(thread_t thread, - thread_t promoter, - struct promote_token* promote_token, - boolean_t new_promotion) +boolean_t +thread_recompute_user_promotion_locked(thread_t thread) { + boolean_t needs_update = FALSE; struct task_pend_token pend_token = {}; + int user_promotion_basepri = MIN(thread_get_inheritor_turnstile_priority(thread), MAXPRI_USER); + int old_base_pri = thread->base_pri; + thread_qos_t qos_promotion; - uint32_t promoter_base_pri = 0, promoter_qos = THREAD_QOS_UNSPECIFIED; - - spl_t s = splsched(); - thread_lock(promoter); - - /* - * We capture the 'promotion qos' here, which is captured - * before task-level clamping. - * - * This means that if the process gets unclamped while a promotion, - * is in effect, the owning thread ends up with the correct QoS. - * - * This does NOT work correctly across processes, as the correct QoS - * in one is not necessarily the correct QoS in another. - * When we add support for multi-process ulock boosting, we need to - * do something more complex. 
- */ - promoter_qos = promoter->effective_policy.thep_qos_promote; - - /* TODO: extract 'effective unclamped base pri' instead */ - promoter_base_pri = promoter->base_pri; - - thread_unlock(promoter); - splx(s); - - /* clamp out realtime to max user pri */ - promoter_base_pri = MIN(promoter_base_pri, MAXPRI_USER); - - /* add in the saved promotion token */ - assert(promote_token->pt_basepri <= MAXPRI_USER); - - promoter_base_pri = MAX(promoter_base_pri, promote_token->pt_basepri); - promoter_qos = MAX(promoter_qos, promote_token->pt_qos); - - /* save the max for later */ - promote_token->pt_basepri = promoter_base_pri; - promote_token->pt_qos = promoter_qos; - - s = splsched(); - thread_lock(thread); - - if (new_promotion) { - if (thread->user_promotions == 0) { - assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED); - assert(thread->user_promotion_basepri == 0); - } - - thread->user_promotions++; + /* Check if user promotion has changed */ + if (thread->user_promotion_basepri == user_promotion_basepri) { + return needs_update; } else { - assert(thread->user_promotions > 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_PRIORITY_OPERATIONS, (THREAD_USER_PROMOTION_CHANGE))) | DBG_FUNC_NONE, + thread_tid(thread), + user_promotion_basepri, + thread->user_promotion_basepri, + 0, 0); } - uint32_t thread_qos = thread->requested_policy.thrp_qos_promote; - uint32_t thread_basepri = thread->user_promotion_basepri; + /* Update the user promotion base pri */ + thread->user_promotion_basepri = user_promotion_basepri; + pend_token.tpt_force_recompute_pri = 1; - uint32_t new_qos = MAX(thread_qos, promoter_qos); - uint32_t new_basepri = MAX(thread_basepri, promoter_base_pri); - - /* TODO: Fast path the 'new is lower than effective' case to avoid full reevaluation */ - if (thread_qos != new_qos || thread_basepri != new_basepri) { - - thread->user_promotion_basepri = new_basepri; + if (user_promotion_basepri <= MAXPRI_THROTTLE) { + 
qos_promotion = THREAD_QOS_UNSPECIFIED; + } else { + qos_promotion = thread_user_promotion_qos_for_pri(user_promotion_basepri); + } - pend_token.tpt_force_recompute_pri = 1; + proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, + TASK_POLICY_QOS_PROMOTE, qos_promotion, 0, &pend_token); - proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_QOS_PROMOTE, new_qos, - 0, &pend_token); + if (thread_get_waiting_turnstile(thread) && + thread->base_pri != old_base_pri) { + needs_update = TRUE; } thread_unlock(thread); - splx(s); thread_policy_update_complete_unlocked(thread, &pend_token); -} -/* Add a user promotion to thread */ -void -thread_user_promotion_add(thread_t thread, - thread_t promoter, - struct promote_token* promote_token) -{ - thread_user_promotion_promote(thread, promoter, promote_token, TRUE); -} + thread_lock(thread); -/* Update an existing user promotion on thread */ -void -thread_user_promotion_update(thread_t thread, - thread_t promoter, - struct promote_token* promote_token) -{ - thread_user_promotion_promote(thread, promoter, promote_token, FALSE); + return needs_update; } /* - * Drop a user promotion on thread - * Mutexes may be held, but it's OK to take the throttle lock + * Convert the thread user promotion base pri to qos for threads in qos world. + * For priority above UI qos, the qos would be set to UI. 
*/ -void -thread_user_promotion_drop(thread_t thread) +thread_qos_t +thread_user_promotion_qos_for_pri(int priority) { - struct task_pend_token pend_token = {}; - - spl_t s = splsched(); - thread_lock(thread); - - assert(thread->user_promotions > 0); - - if (--thread->user_promotions == 0) { - thread->requested_policy.thrp_qos_promote = THREAD_QOS_UNSPECIFIED; - thread->user_promotion_basepri = 0; - - pend_token.tpt_force_recompute_pri = 1; - - proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_QOS_PROMOTE, THREAD_QOS_UNSPECIFIED, - 0, &pend_token); + int qos; + for (qos = THREAD_QOS_USER_INTERACTIVE; qos > THREAD_QOS_MAINTENANCE; qos--) { + if (thread_qos_policy_params.qos_pri[qos] <= priority) { + return qos; + } } - - thread_unlock(thread); - splx(s); - - thread_policy_update_complete_unlocked(thread, &pend_token); + return THREAD_QOS_MAINTENANCE; } - /* * Set the thread's QoS IPC override * Owned by the IPC subsystem @@ -2914,6 +2752,7 @@ thread_ipc_override(thread_t thread, assert(qos_override > THREAD_QOS_UNSPECIFIED); assert(qos_override < THREAD_QOS_LAST); + if (is_new_override) { if (thread->ipc_overrides++ == 0) { /* This add is the first override for this thread */ @@ -2948,10 +2787,6 @@ thread_ipc_override(thread_t thread, thread_unlock(thread); splx(s); - /* - * this is only safe after rethrottle_thread supports - * being called from spinlock context - */ thread_policy_update_complete_unlocked(thread, &pend_token); } @@ -2993,88 +2828,20 @@ thread_drop_ipc_override(thread_t thread) thread_unlock(thread); splx(s); - /* - * this is only safe after rethrottle_thread supports - * being called from spinlock context - */ thread_policy_update_complete_unlocked(thread, &pend_token); } -void -thread_add_sync_ipc_override(thread_t thread) +/* Get current requested qos / relpri, may be called from spinlock context */ +thread_qos_t +thread_get_requested_qos(thread_t thread, int *relpri) { - struct task_pend_token pend_token = {}; - - 
spl_t s = splsched(); - thread_lock(thread); - - uint32_t old_override __unused = thread->requested_policy.thrp_qos_sync_ipc_override; - - if (thread->sync_ipc_overrides++ == 0) { - /* This add is the first override for this thread */ - assert(old_override == THREAD_QOS_UNSPECIFIED); - } else { - /* There are already other overrides in effect for this thread */ - assert(old_override == THREAD_QOS_USER_INTERACTIVE); - thread_unlock(thread); - splx(s); - return; - } - - uint32_t new_override = THREAD_QOS_USER_INTERACTIVE; - - proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_QOS_SYNC_IPC_OVERRIDE, - new_override, 0, &pend_token); - - assert(pend_token.tpt_update_sockets == 0); + int relprio_value = 0; + thread_qos_t qos; - thread_unlock(thread); - splx(s); - - /* - * this is only safe after rethrottle_thread supports - * being called from spinlock context - */ - thread_policy_update_complete_unlocked(thread, &pend_token); -} - -void -thread_drop_sync_ipc_override(thread_t thread) -{ - struct task_pend_token pend_token = {}; - - spl_t s = splsched(); - thread_lock(thread); - - assert(thread->sync_ipc_overrides > 0); - - if (--thread->sync_ipc_overrides == 0) { - /* - * There are no more overrides for this thread, so we should - * clear out the saturated override value - */ - - proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, - TASK_POLICY_QOS_SYNC_IPC_OVERRIDE, THREAD_QOS_UNSPECIFIED, - 0, &pend_token); - } - - thread_unlock(thread); - splx(s); - - /* - * this is only safe after rethrottle_thread supports - * being called from spinlock context - */ - thread_policy_update_complete_unlocked(thread, &pend_token); -} - -/* Get current IPC override, may be called from spinlock context */ -uint32_t -thread_get_ipc_override(thread_t thread) -{ - return proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_IPC_OVERRIDE, NULL); + qos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, + 
TASK_POLICY_QOS_AND_RELPRIO, &relprio_value); + if (relpri) *relpri = -relprio_value; + return qos; } /* @@ -3082,27 +2849,16 @@ thread_get_ipc_override(thread_t thread) * since exec could block other threads calling * proc_find on the proc. This boost must be removed * via call to thread_clear_exec_promotion. + * + * This should be replaced with a generic 'priority inheriting gate' mechanism (24194397) */ void thread_set_exec_promotion(thread_t thread) { - spl_t s; - - s = splsched(); + spl_t s = splsched(); thread_lock(thread); - assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0); - - if (thread->sched_pri < EXEC_BOOST_PRIORITY || - !(thread->sched_flags & TH_SFLAG_EXEC_PROMOTED)) { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_PROMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - thread->sched_pri, thread->base_pri, - EXEC_BOOST_PRIORITY, 0); - thread->sched_flags |= TH_SFLAG_EXEC_PROMOTED; - if (thread->sched_pri < EXEC_BOOST_PRIORITY) - set_sched_pri(thread, EXEC_BOOST_PRIORITY); - } + sched_thread_promote_reason(thread, TH_SFLAG_EXEC_PROMOTED, 0); thread_unlock(thread); splx(s); @@ -3115,34 +2871,12 @@ thread_set_exec_promotion(thread_t thread) void thread_clear_exec_promotion(thread_t thread) { - spl_t s; - - s = splsched(); + spl_t s = splsched(); thread_lock(thread); - assert(thread->sched_flags & TH_SFLAG_EXEC_PROMOTED); - - if (thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) { - thread->sched_flags &= ~TH_SFLAG_EXEC_PROMOTED; - - if (thread->sched_flags & TH_SFLAG_PROMOTED_MASK) { - /* it still has other promotions (mutex/rw_lock) */ - } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - thread->sched_pri, - thread->base_pri, - DEPRESSPRI, 0); - set_sched_pri(thread, DEPRESSPRI); - } else { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE) | DBG_FUNC_NONE, - 
(uintptr_t)thread_tid(thread), - thread->sched_pri, - thread->base_pri, - thread->base_pri, 0); - thread_recompute_sched_pri(thread, FALSE); - } - } + + sched_thread_unpromote_reason(thread, TH_SFLAG_EXEC_PROMOTED, 0); thread_unlock(thread); splx(s); } + diff --git a/osfmk/kern/timer.c b/osfmk/kern/timer.c index f101eb171..8ccba9e2c 100644 --- a/osfmk/kern/timer.c +++ b/osfmk/kern/timer.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2018 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,39 +22,37 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University * All Rights Reserved. 
- * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ -/* - */ #include #include @@ -70,110 +68,81 @@ int precise_user_kernel_time = 0; int precise_user_kernel_time = 1; #endif -/* - * timer_init initializes a timer. - */ void -timer_init( - timer_t timer) +timer_init(timer_t timer) { - timer->tstamp = 0; -#if defined(__LP64__) - timer->all_bits = 0; -#else - timer->low_bits = 0; - timer->high_bits = 0; - timer->high_bits_check = 0; -#endif /* defined(__LP64__) */ + memset(timer, 0, sizeof(*timer)); } -/* - * Calculate the difference between a timer - * and saved value, and update the saved value. 
- */ uint64_t -timer_delta( - timer_t timer, - uint64_t *save) +timer_delta(timer_t timer, uint64_t *prev_in_cur_out) { - uint64_t new, old = *save; - - *save = new = timer_grab(timer); - + uint64_t old = *prev_in_cur_out; + uint64_t new = *prev_in_cur_out = timer_grab(timer); return (new - old); } -void -timer_advance( - timer_t timer, - uint64_t delta) +static void +timer_advance(timer_t timer, uint64_t delta) { -#if defined(__LP64__) +#if defined(__LP64__) timer->all_bits += delta; -#else - uint64_t low; - - low = delta + timer->low_bits; - if (low >> 32) - timer_update(timer, (uint32_t)(timer->high_bits + (low >> 32)), (uint32_t)low); - else +#else /* defined(__LP64__) */ + extern void timer_advance_internal_32(timer_t timer, uint32_t high, + uint32_t low); + uint64_t low = delta + timer->low_bits; + if (low >> 32) { + timer_advance_internal_32(timer, + (uint32_t)(timer->high_bits + (low >> 32)), (uint32_t)low); + } else { timer->low_bits = (uint32_t)low; -#endif /* defined(__LP64__) */ + } +#endif /* defined(__LP64__) */ } void -timer_start( - timer_t timer, - uint64_t tstamp) +timer_start(timer_t timer, uint64_t tstamp) { timer->tstamp = tstamp; } void -timer_stop( - timer_t timer, - uint64_t tstamp) +timer_stop(timer_t timer, uint64_t tstamp) { timer_advance(timer, tstamp - timer->tstamp); } -/* - * Update the timer and start a new one. - */ void -timer_switch( - timer_t timer, - uint64_t tstamp, - timer_t new_timer) +timer_update(timer_t timer, uint64_t tstamp) +{ + timer_advance(timer, tstamp - timer->tstamp); + timer->tstamp = tstamp; +} + +void +timer_switch(timer_t timer, uint64_t tstamp, timer_t new_timer) { timer_advance(timer, tstamp - timer->tstamp); new_timer->tstamp = tstamp; } /* - * Update the current thread timer and - * start the new timer. Requires a current - * and new timer. + * Update the current processor's thread timer with `tstamp` and switch the + * processor's thread timer to `new_timer`. * - * Called with interrupts disabled. 
+ * Called with interrupts disabled. */ void -thread_timer_event( - uint64_t tstamp, - timer_t new_timer) +processor_timer_switch_thread(uint64_t tstamp, timer_t new_timer) { - processor_t processor = current_processor(); - timer_t timer; + processor_t processor = current_processor(); + timer_t timer; - /* - * Update current timer. - */ + /* Update current timer. */ timer = PROCESSOR_DATA(processor, thread_timer); timer_advance(timer, tstamp - timer->tstamp); - /* - * Start new timer. - */ + /* Start new timer. */ PROCESSOR_DATA(processor, thread_timer) = new_timer; new_timer->tstamp = tstamp; } diff --git a/osfmk/kern/timer.h b/osfmk/kern/timer.h index a353c6c29..648a47c79 100644 --- a/osfmk/kern/timer.h +++ b/osfmk/kern/timer.h @@ -56,7 +56,7 @@ /* */ -#ifndef _KERN_TIMER_H_ +#ifndef _KERN_TIMER_H_ #define _KERN_TIMER_H_ #include @@ -80,85 +80,78 @@ extern int precise_user_kernel_time; * thread-local value (or in kernel debugger context). In the future, * we make take into account task-level or thread-level policy. */ -#define use_precise_user_kernel_time(thread) ( precise_user_kernel_time ) +#define use_precise_user_kernel_time(thread) (precise_user_kernel_time) /* - * Definitions for high resolution timers. A check - * word on the high portion allows atomic updates. + * Definitions for high resolution timers. */ struct timer { - uint64_t tstamp; -#if defined(__LP64__) - uint64_t all_bits; -#else - uint32_t low_bits; - uint32_t high_bits; - uint32_t high_bits_check; -#endif + uint64_t tstamp; +#if defined(__LP64__) + uint64_t all_bits; +#else /* defined(__LP64__) */ + /* A check word on the high portion allows atomic updates. */ + uint32_t low_bits; + uint32_t high_bits; + uint32_t high_bits_check; +#endif /* !defined(__LP64__) */ }; -typedef struct timer timer_data_t, *timer_t; +typedef struct timer timer_data_t, *timer_t; /* - * Exported kernel interface to timers + * Initialize the `timer`. 
*/ +void timer_init(timer_t timer); -/* Start a timer by setting the timestamp */ -extern void timer_start( - timer_t timer, - uint64_t tstamp); - -/* Stop a timer by updating from the timestamp */ -extern void timer_stop( - timer_t timer, - uint64_t tstamp); - -/* Update the timer and start a new one */ -extern void timer_switch( - timer_t timer, - uint64_t tstamp, - timer_t new_timer); - -/* Update the thread timer at an event */ -extern void thread_timer_event( - uint64_t tstamp, - timer_t new_timer); - -/* Initialize a timer */ -extern void timer_init( - timer_t timer); - -/* Update a saved timer value and return delta to current value */ -extern uint64_t timer_delta( - timer_t timer, - uint64_t *save); - -/* Advance a timer by a 64 bit value */ -extern void timer_advance( - timer_t timer, - uint64_t delta); +/* + * Start the `timer` at time `tstamp`. + */ +void timer_start(timer_t timer, uint64_t tstamp); + +/* + * Stop the `timer` and update it with time `tstamp`. + */ +void timer_stop(timer_t timer, uint64_t tstamp); + +/* + * Update the `timer` at time `tstamp`, leaving it running. + */ +void timer_update(timer_t timer, uint64_t tstamp); + +/* + * Update the `timer` with time `tstamp` and start `new_timer`. + */ +void timer_switch(timer_t timer, uint64_t tstamp, timer_t new_timer); /* - * Exported hardware interface to timers + * Update the thread timer at an "event," like a context switch, at time + * `tstamp`. This stops the current timer and starts the `new_timer` running. + * + * Must be called with interrupts disabled. */ +void processor_timer_switch_thread(uint64_t tstamp, timer_t new_timer); -/* Read timer value */ -#if defined(__LP64__) -static inline uint64_t timer_grab( - timer_t timer) +/* + * Return the difference between the `timer` and a previous value pointed to by + * `prev_in_cur_out`. Store the current value of the timer to + * `prev_in_cur_out`. 
+ */ +uint64_t timer_delta(timer_t timer, uint64_t *prev_in_cur_out); + +/* + * Read the accumulated time of `timer`. + */ +#if defined(__LP64__) +static inline +uint64_t +timer_grab(timer_t timer) { return timer->all_bits; } -#else -extern uint64_t timer_grab( - timer_t timer); - -/* Update timer value */ -extern void timer_update( - timer_t timer, - uint32_t new_high, - uint32_t new_low); -#endif /* defined(__LP64__) */ - -#endif /* _KERN_TIMER_H_ */ +#else /* defined(__LP64__) */ +uint64_t timer_grab(timer_t timer); +#endif /* !defined(__LP64__) */ + +#endif /* _KERN_TIMER_H_ */ diff --git a/osfmk/kern/trustcache.h b/osfmk/kern/trustcache.h new file mode 100644 index 000000000..4fd57d53a --- /dev/null +++ b/osfmk/kern/trustcache.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _KERN_TRUSTCACHE_H_ +#define _KERN_TRUSTCACHE_H_ + +#include + +#include + +#include + +/* Version 0 trust caches: No defined sorting order (thus only suitable for small trust caches). + * Used for loadable trust caches only, until phasing out support. */ +typedef uint8_t trust_cache_hash0[CS_CDHASH_LEN]; +struct trust_cache_module0 { + uint32_t version; + uuid_t uuid; + uint32_t num_hashes; + trust_cache_hash0 hashes[]; +} __attribute__((__packed__)); + + +/* Version 1 trust caches: Always sorted by cdhash, added hash type and flags field. + * Suitable for all trust caches. */ + +struct trust_cache_entry1 { + uint8_t cdhash[CS_CDHASH_LEN]; + uint8_t hash_type; + uint8_t flags; +} __attribute__((__packed__)); + +struct trust_cache_module1 { + uint32_t version; + uuid_t uuid; + uint32_t num_entries; + struct trust_cache_entry1 entries[]; +} __attribute__((__packed__)); + +// Trust Cache Entry Flags +#define CS_TRUST_CACHE_AMFID 0x1 // valid cdhash for amfid + +#define TC_LOOKUP_HASH_TYPE_SHIFT 16 +#define TC_LOOKUP_HASH_TYPE_MASK 0xff0000L // bits 16-23 +#define TC_LOOKUP_FLAGS_SHIFT 8 +#define TC_LOOKUP_FLAGS_MASK 0xff00L // bits 8-15 +#define TC_LOOKUP_RESULT_SHIFT 0 +#define TC_LOOKUP_RESULT_MASK 0xffL // bits 0-7 + +#define TC_LOOKUP_FOUND 1 +#define TC_LOOKUP_FALLBACK 2 + +#ifdef XNU_KERNEL_PRIVATE + +// Serialized Trust Caches + +/* This is how iBoot delivers them to us. */ +struct serialized_trust_caches { + uint32_t num_caches; + uint32_t offsets[0]; +} __attribute__((__packed__)); + + +// Legacy Static Trust Cache + +/* This is the old legacy trust cache baked into the AMFI kext. + * We support it for a transitionary period, until external trust caches + * are fully established, and the AMFI trust cache can be removed.
+ */ + +struct legacy_trust_cache_bucket { + uint16_t count; + uint16_t offset; +} __attribute__((__packed__)); + +#define LEGACY_TRUST_CACHE_ENTRY_LEN (CS_CDHASH_LEN-1) +#define LEGACY_TRUST_CACHE_BUCKET_COUNT (256) + +typedef uint8_t pmap_cs_legacy_stc_entry[CS_CDHASH_LEN-1]; // bucketized with first byte + +void trust_cache_init(void); + +uint32_t lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN]); + +bool lookup_in_trust_cache_module(struct trust_cache_module1 const * const module, + uint8_t const cdhash[CS_CDHASH_LEN], + uint8_t * const hash_type, + uint8_t * const flags); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _KERN_TRUSTCACHE_H_ */ diff --git a/osfmk/kern/turnstile.c b/osfmk/kern/turnstile.c new file mode 100644 index 000000000..7a113412e --- /dev/null +++ b/osfmk/kern/turnstile.c @@ -0,0 +1,2745 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static zone_t turnstiles_zone; +static int turnstile_max_hop; +#define MAX_TURNSTILES (thread_max) +#define TURNSTILES_CHUNK (THREAD_CHUNK) + +/* Global table for turnstile promote policy for all type of turnstiles */ +turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = { + [TURNSTILE_NONE] = TURNSTILE_PROMOTE_NONE, + [TURNSTILE_KERNEL_MUTEX] = TURNSTILE_KERNEL_PROMOTE, + [TURNSTILE_ULOCK] = TURNSTILE_USER_PROMOTE, + [TURNSTILE_PTHREAD_MUTEX] = TURNSTILE_USER_PROMOTE, + [TURNSTILE_SYNC_IPC] = TURNSTILE_USER_IPC_PROMOTE, + [TURNSTILE_WORKLOOPS] = TURNSTILE_USER_IPC_PROMOTE, + [TURNSTILE_WORKQS] = TURNSTILE_USER_IPC_PROMOTE, + [TURNSTILE_KNOTE] = TURNSTILE_USER_IPC_PROMOTE, +}; + +os_refgrp_decl(static, turnstile_refgrp, "turnstile", NULL); + +#if DEVELOPMENT || DEBUG +static queue_head_t turnstiles_list; +static lck_spin_t global_turnstile_lock; + +lck_grp_t turnstiles_dev_lock_grp; +lck_attr_t turnstiles_dev_lock_attr; +lck_grp_attr_t turnstiles_dev_lock_grp_attr; + +#define global_turnstiles_lock_init() \ + lck_spin_init(&global_turnstile_lock, &turnstiles_dev_lock_grp, &turnstiles_dev_lock_attr) +#define global_turnstiles_lock_destroy() \ + lck_spin_destroy(&global_turnstile_lock, &turnstiles_dev_lock_grp) +#define global_turnstiles_lock() \ + lck_spin_lock(&global_turnstile_lock) +#define global_turnstiles_lock_try() \ + lck_spin_try_lock(&global_turnstile_lock) +#define global_turnstiles_unlock() \ + lck_spin_unlock(&global_turnstile_lock) + +/* Array to store stats for multi-hop boosting */ +static struct turnstile_stats turnstile_boost_stats[TURNSTILE_MAX_HOP_DEFAULT] = {}; +static struct 
turnstile_stats turnstile_unboost_stats[TURNSTILE_MAX_HOP_DEFAULT] = {}; +uint64_t thread_block_on_turnstile_count; +uint64_t thread_block_on_regular_waitq_count; + +#endif + +#ifndef max +#define max(a,b) (((a) > (b)) ? (a) : (b)) +#endif /* max */ + +/* Static function declarations */ +static turnstile_type_t +turnstile_get_type(struct turnstile *turnstile); +static uint32_t +turnstile_get_gencount(struct turnstile *turnstile); +static void +turnstile_set_type_and_increment_gencount(struct turnstile *turnstile, turnstile_type_t type); +static void +turnstile_init(struct turnstile *turnstile); +static void +turnstile_update_inheritor_workq_priority_chain(struct turnstile *in_turnstile, spl_t s); +static void +turnstile_update_inheritor_thread_priority_chain(struct turnstile **in_turnstile, + thread_t *out_thread, int total_hop, turnstile_stats_update_flags_t tsu_flags); +static void +turnstile_update_inheritor_turnstile_priority_chain(struct turnstile **in_out_turnstile, + int total_hop, turnstile_stats_update_flags_t tsu_flags); +static void +thread_update_waiting_turnstile_priority_chain(thread_t *in_thread, + struct turnstile **out_turnstile, int thread_hop, int total_hop, + turnstile_stats_update_flags_t tsu_flags); +static boolean_t +turnstile_update_turnstile_promotion_locked(struct turnstile *dst_turnstile, + struct turnstile *src_turnstile); +static boolean_t +turnstile_update_turnstile_promotion(struct turnstile *dst_turnstile, + struct turnstile *src_turnstile); +static boolean_t +turnstile_need_turnstile_promotion_update(struct turnstile *dst_turnstile, + struct turnstile *src_turnstile); +static boolean_t +turnstile_add_turnstile_promotion(struct turnstile *dst_turnstile, + struct turnstile *src_turnstile); +static boolean_t +turnstile_remove_turnstile_promotion(struct turnstile *dst_turnstile, + struct turnstile *src_turnstile); +static boolean_t +turnstile_update_thread_promotion_locked(struct turnstile *dst_turnstile, + thread_t thread); +static 
boolean_t +turnstile_need_thread_promotion_update(struct turnstile *dst_turnstile, + thread_t thread); +static boolean_t +thread_add_turnstile_promotion( + thread_t thread, struct turnstile *turnstile); +static boolean_t +thread_remove_turnstile_promotion( + thread_t thread, struct turnstile *turnstile); +static boolean_t +thread_needs_turnstile_promotion_update(thread_t thread, + struct turnstile *turnstile); +static boolean_t +thread_update_turnstile_promotion( + thread_t thread, struct turnstile *turnstile); +static boolean_t +thread_update_turnstile_promotion_locked( + thread_t thread, struct turnstile *turnstile); +static boolean_t +workq_add_turnstile_promotion( + struct workqueue *wq_inheritor, struct turnstile *turnstile); +static turnstile_stats_update_flags_t +thread_get_update_flags_for_turnstile_propagation_stoppage(thread_t thread); +static turnstile_stats_update_flags_t +turnstile_get_update_flags_for_above_UI_pri_change(struct turnstile *turnstile); + +#if DEVELOPMENT || DEBUG +/* Test primitives and interfaces for testing turnstiles */ +struct tstile_test_prim { + struct turnstile *ttprim_turnstile; + thread_t ttprim_owner; + lck_spin_t ttprim_interlock; + uint32_t tt_prim_waiters; +}; + +struct tstile_test_prim *test_prim_ts_inline; +struct tstile_test_prim *test_prim_global_htable; +static void +tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr); +#endif + +union turnstile_type_gencount { + uint32_t value; + struct { + uint32_t ts_type:(8 * sizeof(turnstile_type_t)), + ts_gencount: (8 *(sizeof(uint32_t) - sizeof(turnstile_type_t))); + }; +}; + +static turnstile_type_t +turnstile_get_type(struct turnstile *turnstile) +{ + union turnstile_type_gencount type_and_gencount; + + type_and_gencount.value = atomic_load_explicit(&turnstile->ts_type_gencount, memory_order_relaxed); + return (turnstile_type_t) type_and_gencount.ts_type; +} + +static uint32_t +turnstile_get_gencount(struct turnstile *turnstile) +{ + union turnstile_type_gencount 
type_and_gencount; + + type_and_gencount.value = atomic_load_explicit(&turnstile->ts_type_gencount, memory_order_relaxed); + return (uint32_t) type_and_gencount.ts_gencount; +} + +static void +turnstile_set_type_and_increment_gencount(struct turnstile *turnstile, turnstile_type_t type) +{ + union turnstile_type_gencount type_and_gencount; + + /* No need to compare exchange since the store happens under interlock of the primitive */ + type_and_gencount.value = atomic_load_explicit(&turnstile->ts_type_gencount, memory_order_relaxed); + type_and_gencount.ts_type = type; + type_and_gencount.ts_gencount++; + atomic_store_explicit(&turnstile->ts_type_gencount, type_and_gencount.value, memory_order_relaxed); +} + + +/* Turnstile hashtable Implementation */ + +/* + * Maximum number of buckets in the turnstile hashtable. This number affects the + * performance of the hashtable since it determines the hash collision + * rate. To experiment with the number of buckets in this hashtable use the + * "ts_htable_buckets" boot-arg. 
+ */ +#define TURNSTILE_HTABLE_BUCKETS_DEFAULT 32 +#define TURNSTILE_HTABLE_BUCKETS_MAX 1024 + +SLIST_HEAD(turnstile_hashlist, turnstile); + +struct turnstile_htable_bucket { + lck_spin_t ts_ht_bucket_lock; + struct turnstile_hashlist ts_ht_bucket_list; +}; + +SECURITY_READ_ONLY_LATE(static uint32_t) ts_htable_buckets; +/* Global hashtable for turnstiles */ +SECURITY_READ_ONLY_LATE(static struct turnstile_htable_bucket *)turnstile_htable; + +/* Bucket locks for turnstile hashtable */ +lck_grp_t turnstiles_htable_lock_grp; +lck_attr_t turnstiles_htable_lock_attr; +lck_grp_attr_t turnstiles_htable_lock_grp_attr; + +#define turnstile_bucket_lock_init(bucket) \ + lck_spin_init(&bucket->ts_ht_bucket_lock, &turnstiles_htable_lock_grp, &turnstiles_htable_lock_attr) +#define turnstile_bucket_lock(bucket) \ + lck_spin_lock(&bucket->ts_ht_bucket_lock) +#define turnstile_bucket_unlock(bucket) \ + lck_spin_unlock(&bucket->ts_ht_bucket_lock) + +/* + * Name: turnstiles_hashtable_init + * + * Description: Initializes the global turnstile hash table. 
+ * + * Args: + * None + * + * Returns: + * None + */ +static void +turnstiles_hashtable_init(void) +{ + /* Initialize number of buckets in the hashtable */ + if (PE_parse_boot_argn("ts_htable_buckets", &ts_htable_buckets, sizeof(ts_htable_buckets)) != TRUE) + ts_htable_buckets = TURNSTILE_HTABLE_BUCKETS_DEFAULT; + + assert(ts_htable_buckets <= TURNSTILE_HTABLE_BUCKETS_MAX); + uint32_t ts_htable_size = ts_htable_buckets * sizeof(struct turnstile_htable_bucket); + turnstile_htable = (struct turnstile_htable_bucket *)kalloc(ts_htable_size); + if (turnstile_htable == NULL) + panic("Turnstiles hash table memory allocation failed!"); + + lck_grp_attr_setdefault(&turnstiles_htable_lock_grp_attr); + lck_grp_init(&turnstiles_htable_lock_grp, "turnstiles_htable_locks", &turnstiles_htable_lock_grp_attr); + lck_attr_setdefault(&turnstiles_htable_lock_attr); + + /* Initialize all the buckets of the hashtable */ + for (uint32_t i = 0; i < ts_htable_buckets; i++) { + struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[i]); + turnstile_bucket_lock_init(ts_bucket); + SLIST_INIT(&ts_bucket->ts_ht_bucket_list); + } +} + +/* + * Name: turnstile_freelist_empty + * + * Description: Checks if the turnstile's freelist is empty + * Should be called with the primitive IL held. + * + * Args: + * Arg1: turnstile + * + * Returns: + * true if freelist is empty; false otherwise + */ +static inline boolean_t +turnstile_freelist_empty( + struct turnstile *ts) +{ + return SLIST_EMPTY(&ts->ts_free_turnstiles); +} + + +/* + * Name: turnstile_freelist_insert + * + * Description: Inserts the turnstile into the freelist of another turnstile + * Should be called with the primitive IL held. 
+ * + * Args: + * Arg1: primitive turnstile + * Arg2: turnstile to add to the freelist + * + * Returns: + * None + */ +static void +turnstile_freelist_insert( + struct turnstile *dst_ts, + struct turnstile *free_ts) +{ + assert(turnstile_get_type(dst_ts) == turnstile_get_type(free_ts)); + assert(dst_ts->ts_proprietor == free_ts->ts_proprietor); + turnstile_state_add(free_ts, TURNSTILE_STATE_FREELIST); + SLIST_INSERT_HEAD(&dst_ts->ts_free_turnstiles, free_ts, ts_free_elm); +} + +/* + * Name: turnstile_freelist_remove + * + * Description: Removes a turnstile from the freelist of a turnstile + * Should be called with the primitive IL held. + * + * Args: + * Arg1: primitive turnstile + * + * Returns: + * turnstile removed from the freelist + */ +static struct turnstile * +turnstile_freelist_remove( + struct turnstile *ts) +{ + struct turnstile *ret_turnstile = TURNSTILE_NULL; + assert(!SLIST_EMPTY(&ts->ts_free_turnstiles)); + ret_turnstile = SLIST_FIRST(&ts->ts_free_turnstiles); + SLIST_REMOVE_HEAD(&ts->ts_free_turnstiles, ts_free_elm); + assert(ret_turnstile != TURNSTILE_NULL); + turnstile_state_remove(ret_turnstile, TURNSTILE_STATE_FREELIST); + /* Need to initialize the list again, since head and elm are in union */ + SLIST_INIT(&ret_turnstile->ts_free_turnstiles); + return ret_turnstile; +} + +/* + * Name: turnstile_hash + * + * Description: Calculates the hash bucket index for a given proprietor by hashing the bytes of the proprietor value itself. + * + * Args: + * Arg1: proprietor (key) for hashing + * + * Returns: + * hash table bucket index for provided proprietor (hash masked with ts_htable_buckets - 1, so the bucket count is expected to be a power of two) + */ +static inline uint32_t +turnstile_hash(uintptr_t proprietor) +{ + char *key = (char *)&proprietor; + uint32_t hash = jenkins_hash(key, sizeof(proprietor)); + hash &= (ts_htable_buckets - 1); + return hash; +} + +/* + * Name: turnstile_htable_lookup_add + * + * Description: Lookup the proprietor in the global turnstile hash table. + * If an entry is present, add the new turnstile to the entry's freelist.
+ * Otherwise add the passed in turnstile for that proprietor. + * The routine assumes that the turnstile->proprietor does not change + * while the turnstile is in the global hash table. + * + * Args: + * Arg1: proprietor + * Arg2: new turnstile for primitive + * + * Returns: + * Previous turnstile for proprietor in the hash table + */ +static struct turnstile * +turnstile_htable_lookup_add( + uintptr_t proprietor, + struct turnstile *new_turnstile) +{ + uint32_t index = turnstile_hash(proprietor); + assert(index < ts_htable_buckets); + struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]); + spl_t s; + + s = splsched(); + turnstile_bucket_lock(ts_bucket); + struct turnstile *ts; + + SLIST_FOREACH(ts, &ts_bucket->ts_ht_bucket_list, ts_htable_link) { + if (ts->ts_proprietor == proprietor) { + /* + * Found an entry in the hashtable for this proprietor; add thread turnstile to freelist + * and return this turnstile + */ + turnstile_bucket_unlock(ts_bucket); + splx(s); + turnstile_freelist_insert(ts, new_turnstile); + return ts; + } + } + + /* No entry for this proprietor; add the new turnstile in the hash table */ + SLIST_INSERT_HEAD(&ts_bucket->ts_ht_bucket_list, new_turnstile, ts_htable_link); + turnstile_state_add(new_turnstile, TURNSTILE_STATE_HASHTABLE); + turnstile_bucket_unlock(ts_bucket); + splx(s); + /* Since there was no previous entry for this proprietor, return TURNSTILE_NULL */ + return TURNSTILE_NULL; +} + +/* + * Name: turnstable_htable_lookup_remove + * + * Description: Lookup the proprietor in the global turnstile hash table. + * For the turnstile in the hash table, if the freelist has turnstiles on it + * return one of them from the freelist. Otherwise remove the turnstile from + * the hashtable and return that. + * The routine assumes that the turnstile->proprietor does not change + * while the turnstile is in the global hash table. 
+ * + * Args: + * Arg1: proprietor + * Arg2: free turnstile to be returned + * + * Returns: + * turnstile for this proprietor in the hashtable after the removal + */ +static struct turnstile * +turnstable_htable_lookup_remove( + uintptr_t proprietor, + struct turnstile **free_turnstile) +{ + uint32_t index = turnstile_hash(proprietor); + assert(index < ts_htable_buckets); + struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]); + struct turnstile *ret_turnstile = TURNSTILE_NULL; + spl_t s; + + s = splsched(); + turnstile_bucket_lock(ts_bucket); + struct turnstile *ts, **prev_tslink; + /* Find the turnstile for the given proprietor in the hashtable */ + SLIST_FOREACH_PREVPTR(ts, prev_tslink, &ts_bucket->ts_ht_bucket_list, ts_htable_link) { + if (ts->ts_proprietor == proprietor) { + ret_turnstile = ts; + break; + } + } + assert(ret_turnstile != TURNSTILE_NULL); + + /* Check if the turnstile has any turnstiles on its freelist */ + if (turnstile_freelist_empty(ret_turnstile)) { + /* No turnstiles on the freelist; remove the turnstile from the hashtable and mark it freed */ + *prev_tslink = SLIST_NEXT(ret_turnstile, ts_htable_link); + turnstile_state_remove(ret_turnstile, TURNSTILE_STATE_HASHTABLE); + turnstile_bucket_unlock(ts_bucket); + splx(s); + *free_turnstile = ret_turnstile; + return TURNSTILE_NULL; + } else { + /* + * Turnstile has free turnstiles on its list; leave the hashtable unchanged + * and return the first turnstile in the freelist as the free turnstile + */ + turnstile_bucket_unlock(ts_bucket); + splx(s); + *free_turnstile = turnstile_freelist_remove(ret_turnstile); + return ret_turnstile; + } +} + +/* + * Name: turnstile_htable_lookup + * + * Description: Lookup the proprietor in the global turnstile hash table. + * The routine assumes that the turnstile->proprietor does not change + * while the turnstile is in the global hash table. 
+ * + * Args: + * Arg1: proprietor + * + * Returns: + * Turnstile for proprietor in the hash table + */ +static struct turnstile * +turnstile_htable_lookup( + uintptr_t proprietor) +{ + uint32_t index = turnstile_hash(proprietor); + assert(index < ts_htable_buckets); + struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]); + spl_t s; + + s = splsched(); + turnstile_bucket_lock(ts_bucket); + struct turnstile *ts = TURNSTILE_NULL; + struct turnstile *ret_turnstile = TURNSTILE_NULL; + + SLIST_FOREACH(ts, &ts_bucket->ts_ht_bucket_list, ts_htable_link) { + if (ts->ts_proprietor == proprietor) { + /* Found an entry in the hashtable for this proprietor */ + ret_turnstile = ts; + break; + } + } + + turnstile_bucket_unlock(ts_bucket); + splx(s); + return ret_turnstile; +} + +/* + * Name: turnstiles_init + * + * Description: Initialize turnstile sub system. + * + * Args: None. + * + * Returns: None. + */ +void +turnstiles_init(void) +{ + turnstiles_zone = zinit(sizeof(struct turnstile), + MAX_TURNSTILES * sizeof(struct turnstile), + TURNSTILES_CHUNK * sizeof(struct turnstile), + "turnstiles"); + + if (!PE_parse_boot_argn("turnstile_max_hop", &turnstile_max_hop, sizeof(turnstile_max_hop))) { + turnstile_max_hop = TURNSTILE_MAX_HOP_DEFAULT; + } + + turnstiles_hashtable_init(); + +#if DEVELOPMENT || DEBUG + /* Initialize the global turnstile locks and lock group */ + + lck_grp_attr_setdefault(&turnstiles_dev_lock_grp_attr); + lck_grp_init(&turnstiles_dev_lock_grp, "turnstiles_dev_lock", &turnstiles_dev_lock_grp_attr); + lck_attr_setdefault(&turnstiles_dev_lock_attr); + global_turnstiles_lock_init(); + + queue_init(&turnstiles_list); + + /* Initialize turnstile test primitive */ + tstile_test_prim_init(&test_prim_ts_inline); + tstile_test_prim_init(&test_prim_global_htable); +#endif + return; +} + +/* + * Name: turnstile_alloc + * + * Description: Allocate a turnstile. + * + * Args: None. + * + * Returns: + * turnstile on Success. 
+ */ +struct turnstile * +turnstile_alloc(void) +{ + struct turnstile *turnstile = TURNSTILE_NULL; + + turnstile = zalloc(turnstiles_zone); + turnstile_init(turnstile); + +#if DEVELOPMENT || DEBUG + /* Add turnstile to global list */ + global_turnstiles_lock(); + queue_enter(&turnstiles_list, turnstile, + struct turnstile *, ts_global_elm); + global_turnstiles_unlock(); +#endif + return turnstile; +} + +/* + * Name: turnstile_init + * + * Description: Initialize the turnstile. + * + * Args: + * Arg1: turnstile to initialize + * + * Returns: None. + */ +static void +turnstile_init(struct turnstile *turnstile) +{ + kern_return_t kret; + + /* Initialize the waitq */ + kret = waitq_init(&turnstile->ts_waitq, SYNC_POLICY_DISABLE_IRQ | SYNC_POLICY_REVERSED | + SYNC_POLICY_TURNSTILE); + assert(kret == KERN_SUCCESS); + + turnstile->ts_inheritor = TURNSTILE_INHERITOR_NULL; + SLIST_INIT(&turnstile->ts_free_turnstiles); + turnstile->ts_type_gencount = 0; + turnstile_set_type_and_increment_gencount(turnstile, TURNSTILE_NONE); + turnstile_state_init(turnstile, TURNSTILE_STATE_THREAD); + os_ref_init_count(&turnstile->ts_refcount, &turnstile_refgrp, 1); + turnstile->ts_proprietor = TURNSTILE_PROPRIETOR_NULL; + turnstile->ts_priority = MAXPRI_THROTTLE; + turnstile->ts_inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE; + turnstile->ts_port_ref = 0; + priority_queue_init(&turnstile->ts_inheritor_queue, + PRIORITY_QUEUE_BUILTIN_MAX_HEAP); + +#if DEVELOPMENT || DEBUG + turnstile->ts_thread = current_thread(); + turnstile->ts_prev_thread = NULL; +#endif +} + +/* + * Name: turnstile_reference + * + * Description: Take a reference on the turnstile. + * + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_reference(struct turnstile *turnstile) +{ + if (turnstile == TURNSTILE_NULL) { + return; + } + os_ref_retain(&turnstile->ts_refcount); +} + +/* + * Name: turnstile_deallocate + * + * Description: Drop a reference on the turnstile. + * Destroy the turnstile if the last ref. 
+ * + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_deallocate(struct turnstile *turnstile) +{ + if (turnstile == TURNSTILE_NULL) { + return; + } + + if (__improbable(os_ref_release(&turnstile->ts_refcount) == 0)) { + turnstile_destroy(turnstile); + } +} + +/* + * Name: turnstile_deallocate_safe + * + * Description: Drop a reference on the turnstile safely without triggering zfree. + * + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_deallocate_safe(struct turnstile *turnstile) +{ + if (turnstile == TURNSTILE_NULL) { + return; + } + + if (__improbable(os_ref_release(&turnstile->ts_refcount) == 0)) { + /* enqueue the turnstile for thread deallocate daemon to call turnstile_destroy */ + turnstile_deallocate_enqueue(turnstile); + } +} + +/* + * Name: turnstile_destroy + * + * Description: Deallocates the turnstile. + * + * Args: + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_destroy(struct turnstile *turnstile) +{ + /* destroy the waitq */ + waitq_deinit(&turnstile->ts_waitq); + + assert(turnstile->ts_inheritor == TURNSTILE_INHERITOR_NULL); + assert(SLIST_EMPTY(&turnstile->ts_free_turnstiles)); + assert(turnstile->ts_state & TURNSTILE_STATE_THREAD); +#if DEVELOPMENT || DEBUG + /* Remove turnstile from global list */ + global_turnstiles_lock(); + queue_remove(&turnstiles_list, turnstile, + struct turnstile *, ts_global_elm); + global_turnstiles_unlock(); +#endif + zfree(turnstiles_zone, turnstile); +} + +/* + * Name: turnstile_prepare + * + * Description: Transfer current thread's turnstile to primitive or its free turnstile list. + * Function is called holding the interlock (spinlock) of the primitive. + * The turnstile returned by this function is safe to use until the thread calls turnstile_complete. + * When no turnstile is provided explicitly, the calling thread will not have a turnstile attached to + * it until it calls turnstile_complete.
+ * + * Args: + * Arg1: proprietor + * Arg2: pointer in primitive struct to store turnstile + * Arg3: turnstile to use instead of taking it from thread. + * Arg4: type of primitive + * + * Returns: + * turnstile. + */ +struct turnstile * +turnstile_prepare( + uintptr_t proprietor, + struct turnstile **tstore, + struct turnstile *turnstile, + turnstile_type_t type) +{ + thread_t thread = current_thread(); + struct turnstile *ret_turnstile = TURNSTILE_NULL; + struct turnstile *thread_turnstile = turnstile; + + /* No turnstile was provided: detach the current thread's turnstile and use it */ + if (thread_turnstile == TURNSTILE_NULL) { + thread_turnstile = thread->turnstile; + assert(thread_turnstile != TURNSTILE_NULL); + assert(thread->inheritor == NULL); + thread->turnstile = TURNSTILE_NULL; + } + + /* + * Prepare the thread turnstile to be the primitive turnstile: empty + * freelist, new type and gencount (turnstile_set_type_and_increment_gencount), + * no inheritor, owned by proprietor, TURNSTILE_STATE_THREAD removed and + * priority reset to MAXPRI_THROTTLE. + */ + SLIST_INIT(&thread_turnstile->ts_free_turnstiles); + turnstile_set_type_and_increment_gencount(thread_turnstile, type); + thread_turnstile->ts_inheritor = TURNSTILE_INHERITOR_NULL; + thread_turnstile->ts_proprietor = proprietor; + turnstile_state_remove(thread_turnstile, TURNSTILE_STATE_THREAD); + + thread_turnstile->ts_priority = MAXPRI_THROTTLE; +#if DEVELOPMENT || DEBUG + thread_turnstile->ts_prev_thread = thread_turnstile->ts_thread; + thread_turnstile->ts_thread = NULL; +#endif + + if (tstore != NULL) { + /* + * The primitive stores the turnstile in *tstore. + * If the primitive currently does not have a turnstile, install the + * thread turnstile as the turnstile of the primitive. + * Else, add the thread turnstile to freelist of the primitive turnstile.
+ */ + ret_turnstile = *tstore; + if (*tstore == TURNSTILE_NULL) { + turnstile_state_add(thread_turnstile, TURNSTILE_STATE_PROPRIETOR); + *tstore = thread_turnstile; + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS, (TURNSTILE_PREPARE))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(thread_turnstile), + VM_KERNEL_UNSLIDE_OR_PERM(proprietor), + turnstile_get_type(thread_turnstile), 0, 0); + } else { + turnstile_freelist_insert(ret_turnstile, thread_turnstile); + } + ret_turnstile = *tstore; + } else { + /* + * The primitive has no storage for the turnstile: lookup the primitive + * in the turnstile hash table and see if it already has an entry. + * On a NULL result the thread turnstile itself is returned to the caller. + */ + ret_turnstile = turnstile_htable_lookup_add(proprietor, thread_turnstile); + if (ret_turnstile == NULL) { + ret_turnstile = thread_turnstile; + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS, (TURNSTILE_PREPARE))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(thread_turnstile), + VM_KERNEL_UNSLIDE_OR_PERM(proprietor), + turnstile_get_type(thread_turnstile), 0, 0); + } + } + + return ret_turnstile; +} + +/* + * Name: turnstile_complete + * + * Description: Transfer the primitive's turnstile or from it's freelist to current thread. + * Function is called holding the interlock (spinlock) of the primitive. + * Current thread will have a turnstile attached to it after this call. + * + * Args: + * Arg1: proprietor + * Arg2: pointer in primitive struct to update turnstile + * Arg3: pointer to store the returned turnstile instead of attaching it to thread + * + * Returns: + * None.
+ */ +void +turnstile_complete( + uintptr_t proprietor, + struct turnstile **tstore, + struct turnstile **out_turnstile) +{ + thread_t thread = current_thread(); + struct turnstile *primitive_turnstile = TURNSTILE_NULL; + struct turnstile *thread_turnstile = TURNSTILE_NULL; + + assert(thread->inheritor == NULL); + + if (tstore != NULL) { + /* + * If the primitive stores the turnstile, check if the primitive turnstile + * has any turnstiles on its freelist. + */ + assert(*tstore != TURNSTILE_NULL); + if (turnstile_freelist_empty(*tstore)) { + /* Last turnstile scenario; take the primitive's turnstile and clear *tstore */ + thread_turnstile = *tstore; + *tstore = TURNSTILE_NULL; + turnstile_state_remove(thread_turnstile, TURNSTILE_STATE_PROPRIETOR); + } else { + /* Freelist has turnstiles; remove one from the freelist */ + thread_turnstile = turnstile_freelist_remove(*tstore); + } + primitive_turnstile = *tstore; + } else { + /* + * Use the global hash to find and remove a turnstile; the turnstile + * handed back to the thread is returned through thread_turnstile. + */ + primitive_turnstile = turnstable_htable_lookup_remove(proprietor, &thread_turnstile); + } + if (primitive_turnstile == NULL) { + /* + * Primitive no longer has a turnstile associated with it, thread_turnstile + * was the last turnstile attached to primitive, clear out the inheritor and + * set the old inheritor for turnstile cleanup.
+ */ + if (thread_turnstile->ts_inheritor != TURNSTILE_INHERITOR_NULL) { + turnstile_update_inheritor(thread_turnstile, TURNSTILE_INHERITOR_NULL, + (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + /* + * old inheritor is set in current thread and its priority propagation + * will happen in turnstile cleanup call + */ + } + assert(thread_turnstile->ts_inheritor == TURNSTILE_INHERITOR_NULL); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS, (TURNSTILE_COMPLETE))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(thread_turnstile), + VM_KERNEL_UNSLIDE_OR_PERM(proprietor), + turnstile_get_type(thread_turnstile), 0, 0); + } else { + /* + * If primitive's turnstile needs priority update, set it up for turnstile cleanup: + * take a reference on it and stash it as the current thread's inheritor. + */ + if (turnstile_recompute_priority(primitive_turnstile)) { + turnstile_reference(primitive_turnstile); + thread->inheritor = primitive_turnstile; + thread->inheritor_flags = (TURNSTILE_INHERITOR_TURNSTILE | + TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE); + } + } + + turnstile_set_type_and_increment_gencount(thread_turnstile, TURNSTILE_NONE); +#if DEVELOPMENT || DEBUG + thread_turnstile->ts_prev_thread = NULL; + thread_turnstile->ts_thread = thread; +#endif + + turnstile_state_add(thread_turnstile, TURNSTILE_STATE_THREAD); + if (out_turnstile == NULL) { + /* No out pointer was given: the turnstile becomes the thread's turnstile */ + thread->turnstile = thread_turnstile; + } else { + *out_turnstile = thread_turnstile; + } + return; +} + +/* + * Name: turnstile_update_inheritor_locked + * + * Description: Update the inheritor of the turnstile and boost the + * inheritor, called with turnstile locked. + * + * Args: + * Arg1: turnstile + * Implicit arg: new inheritor value is stashed in current thread's struct + * + * Returns: + * old inheritor reference is returned on current thread's struct.
+ */ +void +turnstile_update_inheritor_locked( + struct turnstile *turnstile) +{ + turnstile_inheritor_t old_inheritor = turnstile->ts_inheritor; + turnstile_update_flags_t old_inheritor_flags = turnstile->ts_inheritor_flags; + thread_t thread = current_thread(); + boolean_t old_inheritor_needs_update = FALSE; + boolean_t new_inheritor_needs_update = FALSE; + turnstile_stats_update_flags_t tsu_flags = + turnstile_get_update_flags_for_above_UI_pri_change(turnstile); + + assert(waitq_held(&turnstile->ts_waitq)); + + /* + * Get the new inheritor value from current thread's + * struct, the value was stashed by turnstile_update_inheritor + */ + turnstile_inheritor_t new_inheritor = thread->inheritor; + turnstile_update_flags_t new_inheritor_flags = thread->inheritor_flags; + + switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { + case TURNSTILE_USER_PROMOTE: + case TURNSTILE_USER_IPC_PROMOTE: + + /* No old and no new inheritor: no update is needed */ + if (old_inheritor == new_inheritor && old_inheritor == NULL) { + break; + } + + if (old_inheritor == new_inheritor) { + if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + thread_t thread_inheritor = (thread_t)new_inheritor; + + assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD); + + /* adjust turnstile position in the thread's inheritor list */ + new_inheritor_needs_update = thread_update_turnstile_promotion( + thread_inheritor, turnstile); + + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + struct turnstile *inheritor_turnstile = new_inheritor; + + assert(old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE); + + new_inheritor_needs_update = turnstile_update_turnstile_promotion( + inheritor_turnstile, turnstile); + + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + /* + * When we are still picking "WORKQ" then possible racing + * updates will call redrive through their own propagation + * and we don't need to update anything here.
+ */ + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); + } else { + panic("Inheritor flags lost along the way"); + } + + /* Update turnstile stats when the unchanged inheritor needs no further update */ + if (!new_inheritor_needs_update) { + turnstile_stats_update(1, TSU_PRI_PROPAGATION | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile); + } + break; + } + + if (old_inheritor != NULL) { + if (old_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + thread_t thread_inheritor = (thread_t)old_inheritor; + + /* inheritor changed: remove turnstile from the old thread's inheritor list */ + old_inheritor_needs_update = thread_remove_turnstile_promotion(thread_inheritor, turnstile); + + } else if (old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + struct turnstile *old_turnstile = old_inheritor; + + old_inheritor_needs_update = turnstile_remove_turnstile_promotion( + old_turnstile, turnstile); + + } else if (old_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + /* + * We don't need to do anything when the push was WORKQ + * because nothing is pushed on in the first place.
+ */ + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG, turnstile); + } else { + panic("Inheritor flags lost along the way"); + } + /* Update turnstile stats when the old inheritor needs no update */ + if (!old_inheritor_needs_update) { + turnstile_stats_update(1, TSU_PRI_PROPAGATION | TSU_TURNSTILE_ARG, + turnstile); + } + } + + if (new_inheritor != NULL) { + if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + thread_t thread_inheritor = (thread_t)new_inheritor; + + assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD); + /* add turnstile to the new thread's inheritor list */ + new_inheritor_needs_update = thread_add_turnstile_promotion( + thread_inheritor, turnstile); + + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + struct turnstile *new_turnstile = new_inheritor; + + new_inheritor_needs_update = turnstile_add_turnstile_promotion( + new_turnstile, turnstile); + + } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + struct workqueue *wq_inheritor = new_inheritor; + + new_inheritor_needs_update = workq_add_turnstile_promotion( + wq_inheritor, turnstile); + if (!new_inheritor_needs_update) { + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); + } + } else { + panic("Inheritor flags lost along the way"); + } + /* Update turnstile stats when the new inheritor needs no update */ + if (!new_inheritor_needs_update) { + turnstile_stats_update(1, TSU_PRI_PROPAGATION | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags,turnstile); + } + } + + break; + + case TURNSTILE_KERNEL_PROMOTE: + break; + default: + panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile)); + } + + if (old_inheritor_needs_update) { + old_inheritor_flags |= TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE; + } + + /* + * If new inheritor needs priority updated, then set TURNSTILE_NEEDS_PRI_UPDATE + * on the old_inheritor_flags which will be copied to the thread.
+ */ + if (new_inheritor_needs_update) { + old_inheritor_flags |= TURNSTILE_NEEDS_PRI_UPDATE; + } + + turnstile->ts_inheritor = new_inheritor; + turnstile->ts_inheritor_flags = new_inheritor_flags; + thread->inheritor = old_inheritor; + thread->inheritor_flags = old_inheritor_flags; + return; +} + +/* + * Name: turnstile_update_inheritor + * + * Description: Update the inheritor of the turnstile and boost the + * inheritor. It will take a reference on the inheritor (thread, + * turnstile or workqueue, as selected by the TURNSTILE_INHERITOR_* + * bit in flags). + * Called with the interlock of the primitive held. + * + * Args: + * Arg1: turnstile + * Arg2: inheritor + * Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait + * For a non-NULL inheritor, one of TURNSTILE_INHERITOR_THREAD, + * TURNSTILE_INHERITOR_TURNSTILE or TURNSTILE_INHERITOR_WORKQ + * must be set, the function panics otherwise. + * + * Returns: + * old inheritor reference is stashed on current thread's struct. + */ +void +turnstile_update_inheritor( + struct turnstile *turnstile, + turnstile_inheritor_t new_inheritor, + turnstile_update_flags_t flags) +{ + thread_t thread = current_thread(); + spl_t spl; + + /* + * Set the inheritor on calling thread struct, no need + * to take the turnstile waitq lock since the inheritor + * is protected by the primitive's interlock + */ + assert(thread->inheritor == TURNSTILE_INHERITOR_NULL); + thread->inheritor = new_inheritor; + thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE; + if (new_inheritor == TURNSTILE_INHERITOR_NULL) { + /* nothing to retain or remember */ + } else if (flags & TURNSTILE_INHERITOR_THREAD) { + thread->inheritor_flags |= TURNSTILE_INHERITOR_THREAD; + thread_reference((thread_t)new_inheritor); + } else if (flags & TURNSTILE_INHERITOR_TURNSTILE) { + thread->inheritor_flags |= TURNSTILE_INHERITOR_TURNSTILE; + turnstile_reference((struct turnstile *)new_inheritor); + } else if (flags & TURNSTILE_INHERITOR_WORKQ) { + thread->inheritor_flags |= TURNSTILE_INHERITOR_WORKQ; + workq_reference((struct workqueue *)new_inheritor); + } else { + panic("Missing type in flags (%x) for inheritor (%p)", flags, + new_inheritor); + } + + /* Do not perform the update if delayed update is
specified */ + if (flags & TURNSTILE_DELAYED_UPDATE) { + return; + } + + /* raise spl (splsched) and lock the turnstile waitq */ + spl = splsched(); + waitq_lock(&turnstile->ts_waitq); + + turnstile_update_inheritor_locked(turnstile); + + waitq_unlock(&turnstile->ts_waitq); + splx(spl); + + return; +} + + +/* + * Name: turnstile_need_thread_promotion_update + * + * Description: Check if thread's place in the turnstile waitq needs to be updated, + * i.e. whether the key of the thread's entry in the waitq priority + * queue differs from thread->base_pri. + * + * Arg1: dst turnstile + * Arg2: thread + * + * Returns: TRUE: if turnstile_update_thread_promotion_locked needs to be called. + * FALSE: otherwise. + * + * Condition: thread locked. + */ +static boolean_t +turnstile_need_thread_promotion_update( + struct turnstile *dst_turnstile __assert_only, + thread_t thread) +{ + int thread_link_priority; + boolean_t needs_update = FALSE; + + thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue), + &(thread->wait_prioq_links)); + + needs_update = (thread_link_priority == thread->base_pri) ? FALSE : TRUE; + return needs_update; +} + +/* + * Name: turnstile_priority_queue_update_entry_key + * + * Description: Updates the priority of an entry in a priority queue + * + * Arg1: a turnstile/thread/... priority queue + * Arg2: the element to change the priority of + * Arg3: the new priority + * + * Returns: whether the maximum priority of the queue changed.
+ */ +static boolean_t +turnstile_priority_queue_update_entry_key(struct priority_queue *q, + priority_queue_entry_t elt, priority_queue_key_t pri) +{ + priority_queue_key_t old_key = priority_queue_max_key(q); /* queue maximum before the update */ + + if (priority_queue_entry_key(q, elt) < pri) { /* the key of elt has to go up */ + if (priority_queue_entry_increase(q, elt, pri, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + return old_key != priority_queue_max_key(q); /* TRUE only if the queue maximum moved */ + } + } else if (priority_queue_entry_key(q, elt) > pri) { /* the key of elt has to go down */ + if (priority_queue_entry_decrease(q, elt, pri, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + return old_key != priority_queue_max_key(q); /* TRUE only if the queue maximum moved */ + } + } + + return FALSE; /* key already equals pri, or the increase/decrease call returned false */ +} + +/* + * Name: turnstile_update_thread_promotion_locked + * + * Description: Update dst turnstile's inheritor link since one of the waiting + * thread's priority has changed. + * + * Arg1: dst turnstile + * Arg2: thread + * + * Returns: TRUE: if the dst turnstile priority has changed and needs propagation. + * FALSE: if the dst turnstile priority did not change or it does not need propagation. + * + * Condition: dst turnstile and thread are locked.
+ */ +static boolean_t +turnstile_update_thread_promotion_locked( + struct turnstile *dst_turnstile, + thread_t thread) +{ + int thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue), + &(thread->wait_prioq_links)); + + if (thread->base_pri != thread_link_priority) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_MOVED_IN_TURNSTILE_WAITQ))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile), + thread_tid(thread), + thread->base_pri, + thread_link_priority, 0); + } + + if (!turnstile_priority_queue_update_entry_key( + &dst_turnstile->ts_waitq.waitq_prio_queue, + &thread->wait_prioq_links, thread->base_pri)) { + return FALSE; + } + + /* + * The maximum of the waitq priority queue changed: update dst turnstile's + * priority and report whether it needs propagation. + */ + return turnstile_recompute_priority_locked(dst_turnstile); +} + + +/* + * Name: thread_add_turnstile_promotion + * + * Description: Add a turnstile to thread's inheritor list and update thread's priority. + * + * Arg1: thread + * Arg2: turnstile + * + * Returns: TRUE: if the thread's priority has changed and needs propagation. + * FALSE: if the thread's priority did not change or it does not need propagation. + * + * Condition: turnstile locked.
+ */ +static boolean_t +thread_add_turnstile_promotion( + thread_t thread, + struct turnstile *turnstile) +{ + boolean_t needs_update = FALSE; + + /* Update the pairing heap (thread->inheritor_queue) with the thread lock held */ + thread_lock(thread); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_ADDED_TO_THREAD_HEAP))) | DBG_FUNC_NONE, + thread_tid(thread), + VM_KERNEL_UNSLIDE_OR_PERM(turnstile), + turnstile->ts_priority, 0, 0); + + priority_queue_entry_init(&(turnstile->ts_inheritor_links)); + if (priority_queue_insert(&thread->inheritor_queue, + &turnstile->ts_inheritor_links, turnstile->ts_priority, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + /* Update thread priority (the insert returned true, presumably the queue maximum changed) */ + needs_update = thread_recompute_user_promotion_locked(thread); + } + + /* No propagation needed: record in the turnstile stats why it stopped at this thread */ + if (!needs_update) { + turnstile_stats_update(1, + thread_get_update_flags_for_turnstile_propagation_stoppage(thread) | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, + turnstile); + } + + thread_unlock(thread); + return needs_update; +} + + +/* + * Name: thread_remove_turnstile_promotion + * + * Description: Remove turnstile from thread's inheritor list and update thread's priority. + * + * Arg1: thread + * Arg2: turnstile + * + * Returns: TRUE: if the thread's priority has changed and needs propagation. + * FALSE: if the thread's priority did not change or it does not need propagation. + * + * Condition: turnstile locked.
+ */ +static boolean_t +thread_remove_turnstile_promotion( + thread_t thread, + struct turnstile *turnstile) +{ + boolean_t needs_update = FALSE; + + /* Update the pairing heap (thread->inheritor_queue) with the thread lock held */ + thread_lock(thread); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_REMOVED_FROM_THREAD_HEAP))) | DBG_FUNC_NONE, + thread_tid(thread), + VM_KERNEL_UNSLIDE_OR_PERM(turnstile), + 0, 0, 0); + + if (priority_queue_remove(&thread->inheritor_queue, + &turnstile->ts_inheritor_links, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + /* Update thread priority (the remove returned true, presumably the queue maximum changed) */ + needs_update = thread_recompute_user_promotion_locked(thread); + } + + /* No propagation needed: record in the turnstile stats why it stopped at this thread */ + if (!needs_update) { + turnstile_stats_update(1, + thread_get_update_flags_for_turnstile_propagation_stoppage(thread) | TSU_TURNSTILE_ARG, + turnstile); + } + + thread_unlock(thread); + return needs_update; +} + +/* + * Name: thread_needs_turnstile_promotion_update + * + * Description: Check if turnstile position in thread's inheritor list needs to be updated. + * Compares the key the turnstile is linked at in thread->inheritor_queue + * with turnstile->ts_priority; nothing is modified and the thread lock + * is not taken by this function. + * + * Arg1: thread + * Arg2: turnstile + * + * Returns: TRUE: if thread_update_turnstile_promotion needs to be called. + * FALSE: otherwise. + * + * Condition: turnstile locked. + */ +static boolean_t +thread_needs_turnstile_promotion_update( + thread_t thread __assert_only, + struct turnstile *turnstile) +{ + boolean_t needs_update = FALSE; + int turnstile_link_priority; + + /* Read the key the turnstile is currently linked at in the pairing heap */ + turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue), + &(turnstile->ts_inheritor_links)); + + needs_update = (turnstile_link_priority == turnstile->ts_priority) ? FALSE : TRUE; + return needs_update; +} + +/* + * Name: thread_update_turnstile_promotion_locked + * + * Description: Update turnstile position in thread's inheritor list and update thread's priority. + * + * Arg1: thread + * Arg2: turnstile + * + * Returns: TRUE: if the thread's priority has changed and needs propagation.
+ * FALSE: if the thread's priority did not change or it does not need propagation. + * + * Condition: turnstile and thread are locked. + */ +static boolean_t +thread_update_turnstile_promotion_locked( + thread_t thread, + struct turnstile *turnstile) +{ + int turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue), + &(turnstile->ts_inheritor_links)); + + if (turnstile->ts_priority != turnstile_link_priority) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_MOVED_IN_THREAD_HEAP))) | DBG_FUNC_NONE, + thread_tid(thread), + VM_KERNEL_UNSLIDE_OR_PERM(turnstile), + turnstile->ts_priority, + turnstile_link_priority, 0); + } + + if (!turnstile_priority_queue_update_entry_key(&thread->inheritor_queue, + &turnstile->ts_inheritor_links, turnstile->ts_priority)) { + return FALSE; + } + + /* The maximum of the inheritor queue changed: update thread priority */ + return thread_recompute_user_promotion_locked(thread); +} + + +/* + * Name: thread_update_turnstile_promotion + * + * Description: Update turnstile position in thread's inheritor list and update thread's priority. + * + * Arg1: thread + * Arg2: turnstile + * + * Returns: TRUE: if the thread's priority has changed and needs propagation. + * FALSE: if the thread's priority did not change or it does not need propagation. + * + * Condition: turnstile locked.
+ */ +static boolean_t +thread_update_turnstile_promotion( + thread_t thread, + struct turnstile *turnstile) +{ + /* Before grabbing the thread lock, check if update is needed */ + boolean_t needs_update = thread_needs_turnstile_promotion_update(thread, turnstile); + + if (!needs_update) { + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile); + return needs_update; + } + + /* Update the pairing heap with the thread lock held */ + thread_lock(thread); + needs_update = thread_update_turnstile_promotion_locked(thread, turnstile); + + /* No propagation needed: record in the turnstile stats why it stopped at this thread */ + if (!needs_update) { + turnstile_stats_update(1, + thread_get_update_flags_for_turnstile_propagation_stoppage(thread) | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, + turnstile); + } + thread_unlock(thread); + return needs_update; +} + + +/* + * Name: thread_get_inheritor_turnstile_priority + * + * Description: Get the max priority of all the inheritor turnstiles + * + * Arg1: thread + * + * Returns: Max priority of all the inheritor turnstiles, i.e. the key of + * the max entry of thread->inheritor_queue. + * MAXPRI_THROTTLE: if priority_queue_max finds no entry. + * + * Condition: thread locked + */ +int +thread_get_inheritor_turnstile_priority(thread_t thread) +{ + struct turnstile *max_turnstile; + + max_turnstile = priority_queue_max(&thread->inheritor_queue, + struct turnstile, ts_inheritor_links); + + if (max_turnstile) { + return priority_queue_entry_key(&thread->inheritor_queue, + &max_turnstile->ts_inheritor_links); + } + + return MAXPRI_THROTTLE; +} + + +/* + * Name: thread_get_waiting_turnstile + * + * Description: Get the turnstile if the thread is waiting on a turnstile. + * + * Arg1: thread + * + * Returns: turnstile: if the thread is blocked on a turnstile. + * TURNSTILE_NULL: otherwise. + * + * Condition: thread locked.
+ */ +struct turnstile * +thread_get_waiting_turnstile(thread_t thread) +{ + struct turnstile *turnstile = TURNSTILE_NULL; + struct waitq *waitq = thread->waitq; + + /* Check if the thread is on a waitq; if not, TURNSTILE_NULL is returned */ + if (waitq == NULL) { + return turnstile; + } + + /* Get the safeq if the waitq is a port queue */ + if (waitq_is_port_queue(waitq)) { + waitq = waitq_get_safeq(waitq); + } + + /* Check if the waitq is a turnstile queue and convert it to its turnstile */ + if (waitq_is_turnstile_queue(waitq)) { + turnstile = waitq_to_turnstile(waitq); + } + return turnstile; +} + + +/* + * Name: turnstile_lookup_by_proprietor + * + * Description: Get turnstile for a proprietor from global + * turnstile hash. Thin wrapper around turnstile_htable_lookup. + * + * Arg1: proprietor + * + * Returns: turnstile: if the proprietor has a turnstile. + * TURNSTILE_NULL: otherwise. + * + * Condition: proprietor interlock held. + */ +struct turnstile * +turnstile_lookup_by_proprietor(uintptr_t proprietor) +{ + return turnstile_htable_lookup(proprietor); +} + + +/* + * Name: thread_get_update_flags_for_turnstile_propagation_stoppage + * + * Description: Get the turnstile stats flags based on the thread wait status. + * + * Arg1: thread + * + * Returns: TSU_THREAD_RUNNABLE: if the thread is runnable. + * TSU_NO_TURNSTILE: if thread waiting on a regular waitq. + * TSU_NO_PRI_CHANGE_NEEDED: otherwise. + * + * Condition: thread locked.
+ */ +static turnstile_stats_update_flags_t +thread_get_update_flags_for_turnstile_propagation_stoppage(thread_t thread) +{ + struct waitq *waitq = thread->waitq; + + /* Check if the thread is on a waitq; a thread without a waitq is reported as runnable */ + if (waitq == NULL) { + return TSU_THREAD_RUNNABLE; + } + + /* Get the safeq if the waitq is a port queue */ + if (waitq_is_port_queue(waitq)) { + waitq = waitq_get_safeq(waitq); + } + + /* Check if the waitq is a turnstile queue */ + if (!waitq_is_turnstile_queue(waitq)) { + return TSU_NO_TURNSTILE; + } + + /* Thread blocked on turnstile waitq but no propagation needed */ + return TSU_NO_PRI_CHANGE_NEEDED; +} + + +/* + * Name: turnstile_get_update_flags_for_above_UI_pri_change + * + * Description: Get the turnstile stats flags based on the turnstile priority. + * + * Arg1: turnstile + * + * Returns: TSU_ABOVE_UI_PRI_CHANGE: if turnstile priority is above 47 (checked as + * qos_pri[THREAD_QOS_USER_INTERACTIVE] + 1) and its type is not TURNSTILE_ULOCK. + * TSU_FLAGS_NONE: otherwise. + * + * Condition: turnstile locked. + */ +static turnstile_stats_update_flags_t +turnstile_get_update_flags_for_above_UI_pri_change(struct turnstile *turnstile) +{ + if (turnstile->ts_priority > + (thread_qos_policy_params.qos_pri[THREAD_QOS_USER_INTERACTIVE] + 1) && + turnstile_get_type(turnstile) != TURNSTILE_ULOCK) { + return TSU_ABOVE_UI_PRI_CHANGE; + + } + + return TSU_FLAGS_NONE; +} + + +/* + * Name: workq_add_turnstile_promotion + * + * Description: Connect the workqueue turnstile to the workqueue as a fake + * inheritor + * + * Arg1: workqueue (unused) + * Arg2: turnstile + * + * Returns: TRUE: if the turnstile priority is higher than MAXPRI_THROTTLE. + * FALSE: otherwise. + * + * Condition: turnstile locked. + */ +static boolean_t +workq_add_turnstile_promotion( + struct workqueue *wq_inheritor __unused, + struct turnstile *turnstile) +{ + /* + * If the push is higher than MAXPRI_THROTTLE then the workqueue should + * bring up a thread. + */ + return turnstile->ts_priority > MAXPRI_THROTTLE; +} + +/* + * Name: turnstile_need_turnstile_promotion_update + * + * Description: Check if turnstile position in turnstile's inheritor list needs to be updated.
+ * + * Arg1: dst turnstile + * Arg2: src turnstile + * + * Returns: TRUE: if turnstile_update_turnstile_promotion needs to be called. + * FALSE: otherwise. + * + * Condition: src turnstile locked. + */ +static boolean_t +turnstile_need_turnstile_promotion_update( + struct turnstile *dst_turnstile __assert_only, + struct turnstile *src_turnstile) +{ + int src_turnstile_link_priority; + boolean_t needs_update = FALSE; + + src_turnstile_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_inheritor_queue), + &(src_turnstile->ts_inheritor_links)); /* key src is currently linked at in the inheritor queue of dst */ + + needs_update = (src_turnstile_link_priority == src_turnstile->ts_priority) ? FALSE : TRUE; /* an update is needed only when the linked key is stale */ + return needs_update; +} + +/* + * Name: turnstile_update_turnstile_promotion_locked + * + * Description: Update dst turnstile's inheritor link since src turnstile's + * promote priority has changed. + * + * Arg1: dst turnstile + * Arg2: src turnstile + * + * Returns: TRUE: if the dst turnstile priority has changed and needs propagation. + * FALSE: if the dst turnstile priority did not change or it does not need propagation. + * + * Condition: src and dst turnstile locked.
+ */ +static boolean_t +turnstile_update_turnstile_promotion_locked( + struct turnstile *dst_turnstile, + struct turnstile *src_turnstile) +{ + int src_turnstile_link_priority; + src_turnstile_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_inheritor_queue), + &(src_turnstile->ts_inheritor_links)); + + if (src_turnstile->ts_priority != src_turnstile_link_priority) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_MOVED_IN_TURNSTILE_HEAP))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile), + VM_KERNEL_UNSLIDE_OR_PERM(src_turnstile), + src_turnstile->ts_priority, src_turnstile_link_priority, 0); + } + + if (!turnstile_priority_queue_update_entry_key( + &dst_turnstile->ts_inheritor_queue, &src_turnstile->ts_inheritor_links, + src_turnstile->ts_priority)) { + return FALSE; + } + + /* + * The maximum of the inheritor queue changed: update dst turnstile's + * priority and report whether it needs propagation. + */ + return turnstile_recompute_priority_locked(dst_turnstile); +} + +/* + * Name: turnstile_update_turnstile_promotion + * + * Description: Update dst turnstile's inheritor link since src turnstile's + * promote priority has changed. + * + * Arg1: dst turnstile + * Arg2: src turnstile + * + * Returns: TRUE: if the dst turnstile priority has changed and needs propagation. + * FALSE: if the dst turnstile priority did not change or it does not need propagation. + * + * Condition: src turnstile locked.
+ */ +static boolean_t +turnstile_update_turnstile_promotion( + struct turnstile *dst_turnstile, + struct turnstile *src_turnstile) +{ + /* Check if update is needed before grabbing the dst turnstile lock */ + boolean_t needs_update = turnstile_need_turnstile_promotion_update(dst_turnstile, src_turnstile); + if (!needs_update) { + turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, + src_turnstile); + return needs_update; + } + + /* Update the pairing heap with the dst turnstile waitq locked */ + waitq_lock(&dst_turnstile->ts_waitq); + needs_update = turnstile_update_turnstile_promotion_locked(dst_turnstile, src_turnstile); + + /* No propagation needed: record in the turnstile stats whether dst has an inheritor at all */ + if (!needs_update) { + turnstile_stats_update(1, + (dst_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, src_turnstile); + } + waitq_unlock(&dst_turnstile->ts_waitq); + return needs_update; +} + +/* + * Name: turnstile_add_turnstile_promotion + * + * Description: Add src turnstile to dst turnstile's inheritor link + * and update dst turnstile's priority. + * + * Arg1: dst turnstile + * Arg2: src turnstile + * + * Returns: TRUE: if the dst turnstile priority has changed and needs propagation. + * FALSE: if the dst turnstile priority did not change or it does not need propagation. + * + * Condition: src turnstile locked.
+ */ +static boolean_t +turnstile_add_turnstile_promotion( + struct turnstile *dst_turnstile, + struct turnstile *src_turnstile) +{ + boolean_t needs_update = FALSE; + + /* Update the pairing heap (dst inheritor queue) with the dst turnstile waitq locked */ + waitq_lock(&dst_turnstile->ts_waitq); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_ADDED_TO_TURNSTILE_HEAP))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile), + VM_KERNEL_UNSLIDE_OR_PERM(src_turnstile), + src_turnstile->ts_priority, 0, 0); + + priority_queue_entry_init(&(src_turnstile->ts_inheritor_links)); + if (priority_queue_insert(&dst_turnstile->ts_inheritor_queue, + &src_turnstile->ts_inheritor_links, src_turnstile->ts_priority, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + /* Update dst turnstile priority (the insert returned true, presumably the queue maximum changed) */ + needs_update = turnstile_recompute_priority_locked(dst_turnstile); + } + + /* No propagation needed: record in the turnstile stats whether dst has an inheritor at all */ + if (!needs_update) { + turnstile_stats_update(1, + (dst_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) | + TSU_TURNSTILE_ARG | TSU_BOOST_ARG, src_turnstile); + } + + waitq_unlock(&dst_turnstile->ts_waitq); + return needs_update; +} + +/* + * Name: turnstile_remove_turnstile_promotion + * + * Description: Remove src turnstile from dst turnstile's inheritor link + * and update dst turnstile's priority. + * + * Arg1: dst turnstile + * Arg2: src turnstile + * + * Returns: TRUE: if the dst turnstile priority has changed and needs propagation. + * FALSE: if the dst turnstile priority did not change or it does not need propagation. + * + * Condition: src turnstile locked.
+ */ +static boolean_t +turnstile_remove_turnstile_promotion( + struct turnstile *dst_turnstile, + struct turnstile *src_turnstile) +{ + boolean_t needs_update = FALSE; + + /* Update the pairing heap (dst inheritor queue) with the dst turnstile waitq locked */ + waitq_lock(&dst_turnstile->ts_waitq); + + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_REMOVED_FROM_TURNSTILE_HEAP))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile), + VM_KERNEL_UNSLIDE_OR_PERM(src_turnstile), + 0, 0, 0); + + if (priority_queue_remove(&dst_turnstile->ts_inheritor_queue, + &src_turnstile->ts_inheritor_links, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) { + /* Update dst turnstile priority (the remove returned true, presumably the queue maximum changed) */ + needs_update = turnstile_recompute_priority_locked(dst_turnstile); + } + + /* No propagation needed: record in the turnstile stats whether dst has an inheritor at all */ + if (!needs_update) { + turnstile_stats_update(1, + (dst_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) | + TSU_TURNSTILE_ARG, src_turnstile); + } + + waitq_unlock(&dst_turnstile->ts_waitq); + return needs_update; +} + +/* + * Name: turnstile_recompute_priority_locked + * + * Description: Update turnstile priority based + * on highest waiter thread and highest blocking + * turnstile. + * + * Args: turnstile + * + * Returns: TRUE: if the turnstile priority changed and needs propagation. + * FALSE: if the turnstile priority did not change or it does not need propagation.
+ * + * Condition: turnstile locked + */ +boolean_t +turnstile_recompute_priority_locked( + struct turnstile *turnstile) +{ + int old_priority; + int new_priority; + boolean_t needs_priority_update = FALSE; + thread_t max_thread = THREAD_NULL; + struct turnstile *max_turnstile; + int thread_max_pri = MAXPRI_THROTTLE; + int turnstile_max_pri = MAXPRI_THROTTLE; + + switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) { + + case TURNSTILE_USER_PROMOTE: + case TURNSTILE_USER_IPC_PROMOTE: + + old_priority = turnstile->ts_priority; + /* Highest priority waiter on this turnstile's waitq, if any */ + max_thread = priority_queue_max(&turnstile->ts_waitq.waitq_prio_queue, + struct thread, wait_prioq_links); + + if (max_thread) { + thread_max_pri = priority_queue_entry_key(&turnstile->ts_waitq.waitq_prio_queue, + &max_thread->wait_prioq_links); + } + /* Highest priority turnstile queued on our inheritor queue, if any */ + max_turnstile = priority_queue_max(&turnstile->ts_inheritor_queue, + struct turnstile, ts_inheritor_links); + + if (max_turnstile) { + turnstile_max_pri = priority_queue_entry_key(&turnstile->ts_inheritor_queue, + &max_turnstile->ts_inheritor_links); + } + /* Either side stays at MAXPRI_THROTTLE when no max entry was found, so that is the floor */ + new_priority = max(thread_max_pri, turnstile_max_pri); + turnstile->ts_priority = new_priority; + + if (old_priority != new_priority) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_PRIORITY_OPERATIONS, + (TURNSTILE_PRIORITY_CHANGE))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(turnstile), + new_priority, + old_priority, + 0, 0); + } + needs_priority_update = (!(old_priority == new_priority)) && + (turnstile->ts_inheritor != NULL); /* propagate only if the priority changed and there is an inheritor */ + break; + + case TURNSTILE_PROMOTE_NONE: + case TURNSTILE_KERNEL_PROMOTE: + + /* The turnstile was repurposed, do nothing */ + break; + + default: + + panic("Needs implementation for turnstile_recompute_priority"); + break; + + } + return needs_priority_update; + +} + + +/* + * Name: turnstile_recompute_priority + * + * Description: Update turnstile priority based + * on highest waiter thread and highest blocking + * turnstile. Takes the turnstile waitq lock under splsched itself. 
+ * + * Args: turnstile + * + * Returns: TRUE: if the turnstile priority changed and needs propagation. + * FALSE: if the turnstile priority did not change or it does not need propagation. + */ +boolean_t +turnstile_recompute_priority( + struct turnstile *turnstile) +{ + boolean_t needs_priority_update = FALSE; + spl_t s = splsched(); + + waitq_lock(&turnstile->ts_waitq); + + needs_priority_update = turnstile_recompute_priority_locked(turnstile); + + waitq_unlock(&turnstile->ts_waitq); + splx(s); + return needs_priority_update; + +} + + +/* + * Name: turnstile_workq_proprietor_of_max_turnstile + * + * Description: Returns the highest priority and proprietor of a turnstile + * pushing on a workqueue turnstile. + * + * This will not return waiters that are at priority + * MAXPRI_THROTTLE or lower. + * + * Args: turnstile (must be of type TURNSTILE_WORKQS), proprietor_out (optional, may be NULL) + * + * Returns: + * Priority of the max entry, or 0 + * Pointer to the max entry proprietor, or 0, through proprietor_out + */ +int +turnstile_workq_proprietor_of_max_turnstile( + struct turnstile *turnstile, + uintptr_t *proprietor_out) +{ + struct turnstile *max_turnstile; + int max_priority = 0; + uintptr_t proprietor = 0; + + assert(turnstile_get_type(turnstile) == TURNSTILE_WORKQS); + + spl_t s = splsched(); + + waitq_lock(&turnstile->ts_waitq); + + max_turnstile = priority_queue_max(&turnstile->ts_inheritor_queue, + struct turnstile, ts_inheritor_links); + if (max_turnstile) { + max_priority = priority_queue_entry_key(&turnstile->ts_inheritor_queue, + &max_turnstile->ts_inheritor_links); + proprietor = max_turnstile->ts_proprietor; + } + + waitq_unlock(&turnstile->ts_waitq); + splx(s); + /* Entries at or below MAXPRI_THROTTLE are reported as "nothing pushing" */ + if (max_priority <= MAXPRI_THROTTLE) { + max_priority = 0; + proprietor = 0; + } + if (proprietor_out) *proprietor_out = proprietor; + return max_priority; +} + + +/* + * Name: turnstile_update_inheritor_priority_chain + * + * Description: Update turnstile inheritor's priority and propagate + * the priority if the inheritor is blocked on a turnstile. 
+ * + * Arg1: inheritor + * Arg2: inheritor flags + * + * Returns: None. + */ +static void +turnstile_update_inheritor_priority_chain( + turnstile_inheritor_t inheritor, + turnstile_update_flags_t turnstile_flags) +{ + struct turnstile *turnstile = TURNSTILE_NULL; + thread_t thread = THREAD_NULL; + int total_hop = 0, thread_hop = 0; + spl_t s; + turnstile_stats_update_flags_t tsu_flags = ((turnstile_flags & TURNSTILE_UPDATE_BOOST) ? + TSU_BOOST_ARG : TSU_FLAGS_NONE) | TSU_PRI_PROPAGATION; + + if (inheritor == NULL) { + return; + } + + s = splsched(); + + if (turnstile_flags & TURNSTILE_INHERITOR_THREAD) { + thread = inheritor; + thread_lock(thread); + //TODO: Need to call sched promotion for kernel mutex. + thread_recompute_user_promotion_locked(thread); + } else if (turnstile_flags & TURNSTILE_INHERITOR_TURNSTILE) { + turnstile = inheritor; + waitq_lock(&turnstile->ts_waitq); + turnstile_recompute_priority_locked(turnstile); + tsu_flags |= turnstile_get_update_flags_for_above_UI_pri_change(turnstile); + } else { + /* + * we should never call turnstile_update_inheritor_priority_chain() + * for a workqueue, they have no "chain" after them. 
+ */ + assert((turnstile_flags & TURNSTILE_INHERITOR_WORKQ) == 0); + } + /* Walk the chain: each helper unlocks the current object and hands back the next locked one, or NULL to stop */ + while (turnstile != TURNSTILE_NULL || thread != THREAD_NULL) { + if (turnstile != TURNSTILE_NULL) { + if (turnstile->ts_inheritor == NULL) { + turnstile_stats_update(total_hop + 1, TSU_NO_INHERITOR | + TSU_TURNSTILE_ARG | tsu_flags, + turnstile); + waitq_unlock(&turnstile->ts_waitq); + turnstile = TURNSTILE_NULL; + break; + } + if (turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + turnstile_update_inheritor_thread_priority_chain(&turnstile, &thread, + total_hop, tsu_flags); + + } else if (turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + turnstile_update_inheritor_turnstile_priority_chain(&turnstile, + total_hop, tsu_flags); + + } else if (turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + turnstile_update_inheritor_workq_priority_chain(turnstile, s); + turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED | tsu_flags, + NULL); + /* the workq helper already unlocked the turnstile and called splx(s), so skip the common exit */ return; + + } else { + panic("Inheritor flags not passed in turnstile_update_inheritor"); + } + } else if (thread != THREAD_NULL) { + thread_update_waiting_turnstile_priority_chain(&thread, &turnstile, + thread_hop, total_hop, tsu_flags); + thread_hop++; + } + total_hop++; + } + + splx(s); + return; +} + +/* + * Name: turnstile_update_inheritor_complete + * + * Description: Update turnstile inheritor's priority and propagate the + * priority if the inheritor is blocked on a turnstile. + * Consumes thread ref of old inheritor returned by + * turnstile_update_inheritor. Recursive priority update + * will only happen when called with interlock dropped. + * + * Args: + * Arg1: turnstile + * Arg2: flags: whether the interlock is held (currently unused by the implementation) + * + * Returns: None. 
+ */ +void +turnstile_update_inheritor_complete( + struct turnstile *turnstile, + turnstile_update_complete_flags_t flags __unused) +{ + thread_t thread = current_thread(); + /* Snapshot the flags first: turnstile_cleanup() resets thread->inheritor_flags */ + turnstile_update_flags_t inheritor_flags = thread->inheritor_flags; + + turnstile_cleanup(); + + /* Perform priority update for new inheritor */ + if (inheritor_flags & TURNSTILE_NEEDS_PRI_UPDATE) { + turnstile_update_inheritor_priority_chain(turnstile, + TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_UPDATE_BOOST); + } +} + +/* + * Name: turnstile_cleanup + * + * Description: Update priority of the old turnstile inheritor + * if needed, then drop the reference held on it. + * + * Args: inheritor and flags passed on thread struct. + * + * Returns: None. + */ +void +turnstile_cleanup(void) +{ + thread_t thread = current_thread(); + + /* Get the old inheritor from calling thread struct and clear the stashed state */ + turnstile_inheritor_t old_inheritor = thread->inheritor; + turnstile_update_flags_t inheritor_flags = thread->inheritor_flags; + thread->inheritor = THREAD_NULL; + thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE; + + if (old_inheritor == TURNSTILE_INHERITOR_NULL) { + /* no cleanup to do */ + return; + } + + /* Perform priority demotion for old inheritor */ + if (inheritor_flags & TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE) { + turnstile_update_inheritor_priority_chain(old_inheritor, + inheritor_flags); + } + + /* Drop the reference on the old inheritor, using the deallocator matching its type (thread, turnstile or workq) */ + if (inheritor_flags & TURNSTILE_INHERITOR_THREAD) { + thread_deallocate_safe(old_inheritor); + } else if (inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) { + turnstile_deallocate_safe((struct turnstile *)old_inheritor); + } else if (inheritor_flags & TURNSTILE_INHERITOR_WORKQ) { + workq_deallocate_safe((struct workqueue *)old_inheritor); + } else { + panic("Inheritor flags lost along the way"); + } +} + +/* + * Name: turnstile_update_inheritor_workq_priority_chain + * + * Description: Helper function to update turnstile's inheritor(workq) + * priority and possibly redrive thread creation + * + * 
Arg1: turnstile: turnstile + * Arg2: s: saved interrupt level from the caller's splsched(), restored here with splx(). + * + * Condition: turnstile is locked on entry, it is unlocked on exit, + * and interrupts re-enabled. + */ +static void +turnstile_update_inheritor_workq_priority_chain(struct turnstile *turnstile, spl_t s) +{ + struct workqueue *wq = turnstile->ts_inheritor; + bool workq_lock_held = workq_is_current_thread_updating_turnstile(wq); + /* Nothing above MAXPRI_THROTTLE is pushing: just unlock and return without a redrive */ + if (__improbable(turnstile->ts_priority <= MAXPRI_THROTTLE)) { + waitq_unlock(&turnstile->ts_waitq); + splx(s); + return; + } + /* Take a workq reference before dropping the turnstile lock (presumably to keep wq valid for the redrive -- confirm) */ + if (!workq_lock_held) workq_reference(wq); + waitq_unlock(&turnstile->ts_waitq); + splx(s); + + workq_schedule_creator_turnstile_redrive(wq, workq_lock_held); + + if (!workq_lock_held) workq_deallocate_safe(wq); +} + +/* + * Name: turnstile_update_inheritor_thread_priority_chain + * + * Description: Helper function to update turnstile's inheritor(thread) + * priority. + * + * Arg1: in_turnstile: address to turnstile + * Arg2: out_thread: address to return the thread inheritor + * Arg3: total_hop: number of hops taken so far in propagation chain + * Arg4: tsu_flags: turnstile update flags + * + * Returns: Implicit returns locked thread in out_thread if it needs + * further propagation. + * + * Condition: *in_turnstile is locked on entry, it is unlocked on exit and + * *in_turnstile is set to NULL. 
+ */ +static void +turnstile_update_inheritor_thread_priority_chain( + struct turnstile **in_turnstile, + thread_t *out_thread, + int total_hop, + turnstile_stats_update_flags_t tsu_flags) +{ + boolean_t needs_update = FALSE; + struct turnstile *turnstile = *in_turnstile; + thread_t thread_inheritor = turnstile->ts_inheritor; + boolean_t first_update = !total_hop; + + assert(turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD); + *in_turnstile = TURNSTILE_NULL; /* every return path below unlocks the turnstile, so the caller must not reuse it */ + + /* Check if update is needed before grabbing the thread lock */ + needs_update = thread_needs_turnstile_promotion_update(thread_inheritor, turnstile); + if (!needs_update && !first_update) { + turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | tsu_flags, turnstile); + waitq_unlock(&turnstile->ts_waitq); + return; + } + /* Lock order: turnstile lock is already held, thread lock is taken second */ + thread_lock(thread_inheritor); + + /* adjust turnstile position in the thread's inheritor list */ + needs_update = thread_update_turnstile_promotion_locked( + thread_inheritor, turnstile); + + /* + * Check if thread needs further priority propagation, + * since the first hop priority update was done in + * turnstile_update_inheritor, do not bailout if it is + * the first update as needs_update flag would evaluate to + * false for that case. + */ + if (!needs_update && !first_update) { + /* Update turnstile stats before returning */ + turnstile_stats_update(total_hop + 1, + (thread_get_update_flags_for_turnstile_propagation_stoppage(thread_inheritor)) | + TSU_TURNSTILE_ARG | tsu_flags, + turnstile); + thread_unlock(thread_inheritor); + waitq_unlock(&turnstile->ts_waitq); + return; + } + + /* Unlock the turnstile and hand the still-locked thread back to the caller */ + waitq_unlock(&turnstile->ts_waitq); + *out_thread = thread_inheritor; + return; +} + +/* + * Name: turnstile_update_inheritor_turnstile_priority_chain + * + * Description: Helper function to update turnstile's inheritor(turnstile) + * priority. 
+ * + * Arg1: in_out_turnstile: address to turnstile + * Arg2: thread_hop: number of thread hop in propagation chain + * Arg3: tsu_flags: turnstile update flags + * + * Returns: Implicit returns locked turnstile in in_out_turnstile if it needs + * further propagation. + * + * Condition: *in_out_turnstile is locked on entry, *in_out_turnstile on exit, + * but the value of *in_out_turnstile might change and turnstile lock + * will be dropped for old value and will be acquired for the new value. + */ +static void +turnstile_update_inheritor_turnstile_priority_chain( + struct turnstile **in_out_turnstile, + int total_hop, + turnstile_stats_update_flags_t tsu_flags) +{ + boolean_t needs_update = FALSE; + struct turnstile *turnstile = *in_out_turnstile; + struct turnstile *inheritor_turnstile = turnstile->ts_inheritor; + boolean_t first_update = !total_hop; + + assert(turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE); + *in_out_turnstile = TURNSTILE_NULL; /* stays NULL on every bailout path; only set again when propagation continues */ + + /* Check if the inheritor turnstile needs to be updated before grabbing the lock */ + needs_update = turnstile_need_turnstile_promotion_update(inheritor_turnstile, turnstile); + if (!needs_update && !first_update) { + turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_TURNSTILE_ARG | tsu_flags, + turnstile); + waitq_unlock(&turnstile->ts_waitq); + return; + } + /* Lock order: our turnstile is already locked, the inheritor turnstile is locked second */ + waitq_lock(&inheritor_turnstile->ts_waitq); + + needs_update = turnstile_update_turnstile_promotion_locked( + inheritor_turnstile, turnstile); + + /* + * Check if turnstile needs further priority propagation, + * since the first hop priority update was done in + * turnstile_update_inheritor, do not bailout if it is + * the first update as needs_update flag would evaluate to + * false for that case. + */ + if (!needs_update && !first_update) { + /* Update turnstile stats before returning */ + turnstile_stats_update(total_hop + 1, + (inheritor_turnstile->ts_inheritor ? 
TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) | + TSU_TURNSTILE_ARG | tsu_flags, + turnstile); + waitq_unlock(&inheritor_turnstile->ts_waitq); + waitq_unlock(&turnstile->ts_waitq); + return; + } + + /* Unlock the outer turnstile and hand the still-locked inheritor turnstile back to the caller */ + waitq_unlock(&turnstile->ts_waitq); + *in_out_turnstile = inheritor_turnstile; + return; +} + +/* + * Name: thread_update_waiting_turnstile_priority_chain + * + * Description: Helper function to update thread's waiting + * turnstile priority. + * + * Arg1: in_thread: pointer to thread + * Arg2: out_turnstile: pointer to turnstile to return to caller + * Arg3: thread_hop: Number of thread hops visited + * Arg4: total_hop: total hops visited + * Arg5: tsu_flags: turnstile update flags + * + * Returns: *out_turnstile returns the locked waiting turnstile if it needs further propagation. + * + * Condition: *in_thread locked on entry, unlocked on exit and set to NULL. + */ +static void +thread_update_waiting_turnstile_priority_chain( + thread_t *in_thread, + struct turnstile **out_turnstile, + int thread_hop, + int total_hop, + turnstile_stats_update_flags_t tsu_flags) +{ + boolean_t needs_update = FALSE; + thread_t thread = *in_thread; + struct turnstile *waiting_turnstile = TURNSTILE_NULL; + uint32_t turnstile_gencount; + boolean_t first_update = !total_hop; + + *in_thread = THREAD_NULL; + + /* Check if thread waiting on a turnstile */ + waiting_turnstile = thread_get_waiting_turnstile(thread); + /* Stop when the thread is not waiting on a turnstile, or when the thread hop limit is exceeded */ + if (waiting_turnstile == TURNSTILE_NULL || thread_hop > turnstile_max_hop) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, + (waiting_turnstile ? 
TURNSTILE_UPDATE_STOPPED_BY_LIMIT : THREAD_NOT_WAITING_ON_TURNSTILE) + )) | DBG_FUNC_NONE, + thread_tid(thread), + turnstile_max_hop, + thread_hop, + VM_KERNEL_UNSLIDE_OR_PERM(waiting_turnstile), 0); + turnstile_stats_update(total_hop + 1, TSU_NO_TURNSTILE | + TSU_THREAD_ARG | tsu_flags, thread); + thread_unlock(thread); + return; + } + + /* Check if the thread needs to update the waiting turnstile */ + needs_update = turnstile_need_thread_promotion_update(waiting_turnstile, thread); + if (!needs_update && !first_update) { + turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_THREAD_ARG | tsu_flags, thread); + thread_unlock(thread); + return; + } + + /* take a reference on thread, turnstile and snapshot of gencount */ + turnstile_gencount = turnstile_get_gencount(waiting_turnstile); + turnstile_reference(waiting_turnstile); + thread_reference(thread); + + /* drop the thread lock and acquire the turnstile lock (turnstile lock first, then thread lock) */ + thread_unlock(thread); + waitq_lock(&waiting_turnstile->ts_waitq); + thread_lock(thread); + + /* Check if the gencount matches and thread is still waiting on same turnstile */ + if (turnstile_gencount != turnstile_get_gencount(waiting_turnstile) || + waiting_turnstile != thread_get_waiting_turnstile(thread)) { + turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED | + TSU_THREAD_ARG | tsu_flags, thread); + /* No updates required, bail out */ + thread_unlock(thread); + waitq_unlock(&waiting_turnstile->ts_waitq); + thread_deallocate_safe(thread); + turnstile_deallocate_safe(waiting_turnstile); + return; + } + + /* + * The thread is waiting on the waiting_turnstile and we have thread lock, + * we can drop the thread and turnstile reference since it's on the waitq and + * it could not be removed from the waitq without the thread lock. 
+ */ + thread_deallocate_safe(thread); + turnstile_deallocate_safe(waiting_turnstile); + + /* adjust thread's position on turnstile waitq */ + needs_update = turnstile_update_thread_promotion_locked(waiting_turnstile, thread); + + /* + * Check if thread needs further priority propagation, + * since the first hop priority update was done in + * turnstile_update_inheritor, do not bailout if it is + * the first update as needs_update flag would evaluate to + * false for that case. + */ + if (!needs_update && !first_update) { + turnstile_stats_update(total_hop + 1, + (waiting_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) | + TSU_THREAD_ARG | tsu_flags, thread); + thread_unlock(thread); + waitq_unlock(&waiting_turnstile->ts_waitq); + return; + } + + /* drop the thread lock and hand the still-locked turnstile back to the caller */ + thread_unlock(thread); + *out_turnstile = waiting_turnstile; +} + +/* + * Name: turnstile_stats_update + * + * Description: Function to update turnstile stats for dev kernel. + * Compiles to an empty body unless DEVELOPMENT || DEBUG. + * + * Arg1: hops : number of thread hops in priority propagation + * Arg2: flags : turnstile stats update flags + * Arg3: inheritor: inheritor + * + * Returns: Nothing + */ +void +turnstile_stats_update( + int hop __assert_only, + turnstile_stats_update_flags_t flags __assert_only, + turnstile_inheritor_t inheritor __assert_only) +{ +#if DEVELOPMENT || DEBUG + if (flags & TSU_TURNSTILE_BLOCK_COUNT) { + os_atomic_inc(&thread_block_on_turnstile_count, relaxed); + } + + if (flags & TSU_REGULAR_WAITQ_BLOCK_COUNT) { + os_atomic_inc(&thread_block_on_regular_waitq_count, relaxed); + } + /* Per-hop counters are indexed by hop - 1, so only hops 1..TURNSTILE_MAX_HOP_DEFAULT are recorded */ + if (hop > TURNSTILE_MAX_HOP_DEFAULT || hop == 0) { + return; + } + + assert(hop >= 0); + + /* + * Check if turnstile stats needs to be updated. + * Bail out if the turnstile or thread does not + * have any user promotion, i.e. pri 4. + * Bail out if it is the first hop of WQ turnstile + * since WQ's use of a turnstile for the admission check + * introduces a lot of noise due to state changes. 
+ */ + if (flags & TSU_TURNSTILE_ARG) { + struct turnstile *ts = (struct turnstile *)inheritor; + if (ts->ts_priority <= MAXPRI_THROTTLE) { + return; + } + + if (hop == 1 && turnstile_get_type(ts) == TURNSTILE_WORKQS) { + return; + } + } else if (flags & TSU_THREAD_ARG) { + thread_t thread = (thread_t)inheritor; + if (thread->user_promotion_basepri <= MAXPRI_THROTTLE) { + return; + } + } else { + assert(inheritor == NULL); + } + /* TSU_BOOST_ARG selects the boost table; everything else is counted as an unboost */ + struct turnstile_stats *turnstile_stats; + if (flags & TSU_BOOST_ARG) { + turnstile_stats = turnstile_boost_stats; + } else { + turnstile_stats = turnstile_unboost_stats; + } + + if (flags & TSU_PRI_PROPAGATION) { + os_atomic_inc(&turnstile_stats[hop - 1].ts_priority_propagation, relaxed); + } + + if (flags & TSU_NO_INHERITOR) { + os_atomic_inc(&turnstile_stats[hop - 1].ts_no_inheritor, relaxed); + } + + if (flags & TSU_NO_TURNSTILE) { + os_atomic_inc(&turnstile_stats[hop - 1].ts_no_turnstile, relaxed); + } + + if (flags & TSU_NO_PRI_CHANGE_NEEDED) { + os_atomic_inc(&turnstile_stats[hop - 1].ts_no_priority_change_required, relaxed); + } + + if (flags & TSU_THREAD_RUNNABLE) { + os_atomic_inc(&turnstile_stats[hop - 1].ts_thread_runnable, relaxed); + } + + if (flags & TSU_ABOVE_UI_PRI_CHANGE) { + os_atomic_inc(&turnstile_stats[hop - 1].ts_above_ui_pri_change, relaxed); + } +#endif +} + + +#if DEVELOPMENT || DEBUG + +int sysctl_io_opaque(void *req,void *pValue, size_t valueSize, int *changed); + +/* + * Name: turnstile_get_boost_stats_sysctl + * + * Description: Function to get turnstile boost stats. + * + * Args: req : opaque struct to pass to sysctl_io_opaque + * + * Returns: errno + */ +int +turnstile_get_boost_stats_sysctl( + void *req) +{ + return sysctl_io_opaque(req, turnstile_boost_stats, sizeof (struct turnstile_stats) * TURNSTILE_MAX_HOP_DEFAULT, NULL); +} + +/* + * Name: turnstile_get_unboost_stats_sysctl + * + * Description: Function to get turnstile unboost stats. 
+ * + * Args: req : opaque struct to pass to sysctl_io_opaque + * + * Returns: errorno + */ +int +turnstile_get_unboost_stats_sysctl( + void *req) +{ + return sysctl_io_opaque(req, turnstile_unboost_stats, sizeof (struct turnstile_stats) * TURNSTILE_MAX_HOP_DEFAULT, NULL); +} + +/* Testing interface for Development kernels: a simple test lock primitive built on a turnstile */ +#define tstile_test_prim_lock_interlock(test_prim) \ + lck_spin_lock(&test_prim->ttprim_interlock) +#define tstile_test_prim_unlock_interlock(test_prim) \ + lck_spin_unlock(&test_prim->ttprim_interlock) + +static void +tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr) +{ + struct tstile_test_prim *test_prim = (struct tstile_test_prim *) kalloc(sizeof(struct tstile_test_prim)); + /* NOTE(review): the kalloc() result is dereferenced without a NULL check */ + test_prim->ttprim_turnstile = TURNSTILE_NULL; + test_prim->ttprim_owner = NULL; + lck_spin_init(&test_prim->ttprim_interlock, &turnstiles_dev_lock_grp, &turnstiles_dev_lock_attr); + test_prim->tt_prim_waiters = 0; + + *test_prim_ptr = test_prim; + return; +} + +int +tstile_test_prim_lock(boolean_t use_hashtable) +{ + struct tstile_test_prim *test_prim = use_hashtable ? test_prim_global_htable : test_prim_ts_inline; +lock_start: + /* take the interlock of the primitive */ + tstile_test_prim_lock_interlock(test_prim); + + /* Check if the lock is available (no owner and no waiters): take it, holding a reference on the owner thread */ + if (test_prim->ttprim_owner == NULL && test_prim->tt_prim_waiters == 0) { + thread_reference(current_thread()); + test_prim->ttprim_owner = current_thread(); + tstile_test_prim_unlock_interlock(test_prim); + return 0; + } + + struct turnstile *prim_turnstile = TURNSTILE_NULL; + + /* primitive locked, get a turnstile */ + prim_turnstile = turnstile_prepare((uintptr_t)test_prim, + use_hashtable ? 
NULL : &test_prim->ttprim_turnstile, + TURNSTILE_NULL, TURNSTILE_ULOCK); + + assert(prim_turnstile != TURNSTILE_NULL); + + /* This is the contended acquire case: no owner, but other threads are waiting */ + if (test_prim->ttprim_owner == NULL) { + thread_reference(current_thread()); + test_prim->ttprim_owner = current_thread(); + + /* Update the turnstile owner */ + turnstile_update_inheritor(prim_turnstile, + current_thread(), + (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD); + + turnstile_complete((uintptr_t)test_prim, + use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL); + + tstile_test_prim_unlock_interlock(test_prim); + + turnstile_cleanup(); + + return 0; + } + /* Lock is owned: register as a waiter and push on the current owner */ + test_prim->tt_prim_waiters++; + turnstile_update_inheritor(prim_turnstile, + test_prim->ttprim_owner, + (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + waitq_assert_wait64(&prim_turnstile->ts_waitq, + CAST_EVENT64_T(test_prim), THREAD_ABORTSAFE, + TIMEOUT_WAIT_FOREVER); + + /* drop the interlock */ + tstile_test_prim_unlock_interlock(test_prim); + + turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_NOT_HELD); + + wait_result_t result; + result = thread_block(THREAD_CONTINUE_NULL); + + /* re-acquire the interlock to get turnstile back */ + tstile_test_prim_lock_interlock(test_prim); + test_prim->tt_prim_waiters--; + turnstile_complete((uintptr_t)test_prim, + use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL); + + tstile_test_prim_unlock_interlock(test_prim); + + turnstile_cleanup(); + + /* Return if thread interrupted */ + if (result == THREAD_INTERRUPTED) { + return 1; + } + /* Woken up: retry the acquire from the top */ + goto lock_start; +} + +int +tstile_test_prim_unlock(boolean_t use_hashtable) +{ + + struct tstile_test_prim *test_prim = use_hashtable ? 
test_prim_global_htable : test_prim_ts_inline; + /* take the interlock of the primitive */ + tstile_test_prim_lock_interlock(test_prim); + /* Unlocking a primitive that has no owner is reported as an error (returns 1) */ + if (test_prim->ttprim_owner == NULL) { + tstile_test_prim_unlock_interlock(test_prim); + return 1; + } + + /* Check if the lock is contended */ + if (test_prim->ttprim_owner != NULL && test_prim->tt_prim_waiters == 0) { + /* lock is not contended */ + thread_t old_owner = test_prim->ttprim_owner; + test_prim->ttprim_owner = NULL; + tstile_test_prim_unlock_interlock(test_prim); + + thread_deallocate(old_owner); + return 0; + } + + struct turnstile *prim_turnstile = TURNSTILE_NULL; + + thread_t old_owner = test_prim->ttprim_owner; + test_prim->ttprim_owner = NULL; + + /* primitive locked, get a turnstile */ + prim_turnstile = turnstile_prepare((uintptr_t)test_prim, + use_hashtable ? NULL : &test_prim->ttprim_turnstile, + TURNSTILE_NULL, TURNSTILE_ULOCK); + + assert(prim_turnstile != TURNSTILE_NULL); + + /* Update the turnstile owner: clear the inheritor since the lock no longer has an owner */ + turnstile_update_inheritor(prim_turnstile, + NULL, + (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD)); + + waitq_wakeup64_one(&prim_turnstile->ts_waitq, + CAST_EVENT64_T(test_prim), + THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI); + + turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD); + + turnstile_complete((uintptr_t)test_prim, + use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL); + + tstile_test_prim_unlock_interlock(test_prim); + + turnstile_cleanup(); + + if (old_owner) { + /* Changing this to thread_deallocate_safe to exercise thread_deallocate_safe path */ + thread_deallocate_safe(old_owner); + } + + return 0; +} + +#endif diff --git a/osfmk/kern/turnstile.h b/osfmk/kern/turnstile.h new file mode 100644 index 000000000..f8f9ebe87 --- /dev/null +++ b/osfmk/kern/turnstile.h @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _TURNSTILE_H_ +#define _TURNSTILE_H_ + +#include +#include +#include + +#if PRIVATE +#define TURNSTILE_MAX_HOP_DEFAULT (10) /* number of per-hop slots in the boost/unboost stats arrays exported via sysctl */ +struct turnstile_stats { /* one entry per propagation hop; counters are bumped by turnstile_stats_update() */ + uint64_t ts_priority_propagation; /* incremented for TSU_PRI_PROPAGATION */ + uint64_t ts_no_inheritor; /* incremented for TSU_NO_INHERITOR */ + uint64_t ts_thread_runnable; /* incremented for TSU_THREAD_RUNNABLE */ + uint64_t ts_no_priority_change_required; /* incremented for TSU_NO_PRI_CHANGE_NEEDED */ + uint64_t ts_above_ui_pri_change; /* incremented for TSU_ABOVE_UI_PRI_CHANGE */ + uint64_t ts_no_turnstile; /* incremented for TSU_NO_TURNSTILE */ +}; +#endif + +#ifdef KERNEL_PRIVATE +#include +#include +#include +#include +#include +#include +#include + +/* + * turnstile_type_t : Indicates the type of primitive the turnstile is associated with + * Please populate turnstile_promote_policy array if a new type is added here. 
+ */ +typedef enum __attribute__((packed)) turnstile_type { + TURNSTILE_NONE = 0, + TURNSTILE_KERNEL_MUTEX = 1, + TURNSTILE_ULOCK = 2, + TURNSTILE_PTHREAD_MUTEX = 3, + TURNSTILE_SYNC_IPC = 4, + TURNSTILE_WORKLOOPS = 5, + TURNSTILE_WORKQS = 6, + TURNSTILE_KNOTE = 7, + TURNSTILE_TOTAL_TYPES = 8, +} turnstile_type_t; + +/* + * For each type of turnstile, following are the type of + * inheritors passed: + * + * TURNSTILE_KERNEL_MUTEX + * Interlock: kernel mutex interlock. + * Inheritor: threads. + * Lock order: turnstile lock, thread lock. + * + * TURNSTILE_ULOCK + * Interlock: ulocks interlock. + * Inheritor: threads. + * Lock order: turnstile lock, thread lock. + * + * TURNSTILE_PTHREAD_MUTEX + * Interlock: pthread mtx interlock. + * Inheritor: threads. + * Lock order: turnstile lock, thread lock. + * + * TURNSTILE_SYNC_IPC + * Interlock: port's mqueue lock + * Inheritor: turnstile (of port in which we are enqueued) or WL turnstile. + * Lock order: Our turnstile, then turnstile of the port we are enqueued in. + * Port circularity will make sure there is never a cycle formation + * and lock order is maintained. + * + * TURNSTILE_WORKLOOPS + * Interlock: + * - kq req lock + * - wq lock when "filt_wlworkq_interlock_needed() is true" + * Inheritor: thread, turnstile (of workq) + * Lock order: turnstile lock, thread lock + * WL turnstile lock, Workq turnstile lock + * + * TURNSTILE_WORKQS + * Interlock: workqueue lock + * Inheritor: thread + * Lock order: turnstile lock, thread lock. + * + * TURNSTILE_KNOTE + * Interlock: the knote lock + * Inheritor: WL turnstile + */ + +typedef enum __attribute__((flag_enum)) turnstile_promote_policy { + TURNSTILE_PROMOTE_NONE = 0, + TURNSTILE_KERNEL_PROMOTE = 0x1, + TURNSTILE_USER_PROMOTE = 0x2, + TURNSTILE_USER_IPC_PROMOTE = 0x4, +} turnstile_promote_policy_t; + +/* + * Turnstile state flags + * + * The turnstile state flags represent the current ownership of a turnstile. 
+ * The supported flags are: + * - TURNSTILE_STATE_THREAD : Turnstile is attached to a thread + * - TURNSTILE_STATE_FREELIST : Turnstile is hanging off the freelist of another turnstile + * - TURNSTILE_STATE_HASHTABLE : Turnstile is in the global hash table as the turnstile for a primitive + * - TURNSTILE_STATE_PROPRIETOR : Turnstile is attached to a proprietor + * + * The flag updates are done while holding the primitive interlock. + * */ + +#define TURNSTILE_STATE_THREAD 0x1 +#define TURNSTILE_STATE_FREELIST 0x2 +#define TURNSTILE_STATE_HASHTABLE 0x4 +#define TURNSTILE_STATE_PROPRIETOR 0x8 + +/* Helper macros to set/unset turnstile state flags */ +#if DEVELOPMENT || DEBUG + +#define turnstile_state_init(ts, state) \ +MACRO_BEGIN \ + ts->ts_state = state; \ +MACRO_END + +#define turnstile_state_add(ts, state) \ +MACRO_BEGIN \ + assert((ts->ts_state & (state)) == 0); \ + ts->ts_state |= state; \ +MACRO_END + +#define turnstile_state_remove(ts, state) \ +MACRO_BEGIN \ + assert(ts->ts_state & (state)); \ + ts->ts_state &= ~(state); \ +MACRO_END + +#else /* DEVELOPMENT || DEBUG */ + +#define turnstile_state_init(ts, state) \ +MACRO_BEGIN \ + (void)ts; \ +MACRO_END + +#define turnstile_state_add(ts, state) \ +MACRO_BEGIN \ + (void)ts; \ +MACRO_END + +#define turnstile_state_remove(ts, state) \ +MACRO_BEGIN \ + (void)ts; \ +MACRO_END + +#endif /* DEVELOPMENT || DEBUG */ + +/* Foward declaration of turnstile */ +struct turnstile; + +/* + * Turnstile update flags + * + * TURNSTILE_IMMEDIATE_UPDATE + * When passed to turnstile_update_inheritor + * update the inheritor of the turnstile in + * the same call. + * + * TURNSTILE_DELAYED_UPDATE + * When passed to turnstile_update_inheritor + * it stashed the inheritor on the thread and + * turnstile's inheritor is updated in + * assert wait. + * + * TURNSTILE_INHERITOR_THREAD + * The turnstile inheritor is of type thread. + * + * TURNSTILE_INHERITOR_TURNSTILE + * The turnstile inheritor is of type turnstile. 
+ * + * TURNSTILE_INHERITOR_WORKQ + * The turnstile inheritor is of type workqueue + * + * TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE + * The inheritor needs a chain priority update. + * + * TURNSTILE_NEEDS_PRI_UPDATE + * Current turnstile needs a chain priority update. + * + * Locking order for passing thread and turnstile as inheritor + * + * Thread as an inheritor: + * When thread is passed as an inheritor of a turnstile + * turnstile lock is taken and then thread lock. + * + * Turnstile as an inheritor: + * When turnstile (T1) is passed as an inheritor of + * a turnstile (T2), turnstile lock of T2 is taken + * and then turnstile lock of T1 is taken. + * + * Caution: While passing turnstile as an inheritor, it is the + * job of the adopter to make sure that there is no + * lock inversion. + */ +typedef enum __attribute__((flag_enum)) __attribute__((packed)) turnstile_update_flags { + TURNSTILE_UPDATE_FLAGS_NONE = 0, + TURNSTILE_IMMEDIATE_UPDATE = 0x1, + TURNSTILE_DELAYED_UPDATE = 0x2, + TURNSTILE_INHERITOR_THREAD = 0x4, + TURNSTILE_INHERITOR_TURNSTILE = 0x8, + TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE = 0x10, + TURNSTILE_NEEDS_PRI_UPDATE = 0x20, + TURNSTILE_INHERITOR_WORKQ = 0x40, + TURNSTILE_UPDATE_BOOST = 0x80, +} turnstile_update_flags_t; + +#define TURNSTILE_NULL ((struct turnstile *)0) + +typedef void * turnstile_inheritor_t; + +#define TURNSTILE_INHERITOR_NULL NULL + +#ifdef XNU_KERNEL_PRIVATE + +/* Turnstile stats update flags + * + * TSU_TURNSTILE_BLOCK_COUNT + * thread blocking on turnstile waitq, increment global + * thread block on turnstile count. + * + * TSU_REGULAR_WAITQ_BLOCK_COUNT + * thread blocking on regular waitq, increment global + * thread block on regular waitq count. + * + * TSU_PRI_PROPAGATION + * turnstile propagation update stopped at nth hop, update + * priority change count for nth element in stats array.
+ * + * TSU_NO_INHERITOR + * turnstile propagation update stopped due to turnstile + * not having an inheritor after nth hop, update the no + * inheritor count for nth element in the stats array. + * + * TSU_NO_TURNSTILE + * turnstile propagation update stopped due to thread + * not blocked on a turnstile waitq after nth hop, update + * the no turnstile count for the nth element in the stats + * array. + * + * TSU_NO_PRI_CHANGE_NEEDED + * turnstile propagation update stopped due to thread or + * turnstile having the correct priority or not blocked. + * update the no priority change count for the nth element + * in the stats array. + * + * TSU_THREAD_RUNNABLE + * turnstile propagation update stopped due to thread + * being runnable, update the thread runnable count for + * the nth element in the stats array. + * + * TSU_ABOVE_UI_PRI_CHANGE + * turnstile propagation caused an above UI priority change. + */ +typedef enum __attribute__((flag_enum)) turnstile_stats_update_flags { + TSU_FLAGS_NONE = 0, + TSU_TURNSTILE_BLOCK_COUNT = 0x1, + TSU_REGULAR_WAITQ_BLOCK_COUNT = 0x2, + TSU_PRI_PROPAGATION = 0x4, + TSU_NO_INHERITOR = 0x8, + TSU_NO_TURNSTILE = 0x10, + TSU_NO_PRI_CHANGE_NEEDED = 0x20, + TSU_THREAD_RUNNABLE = 0x40, + TSU_ABOVE_UI_PRI_CHANGE = 0x80, + TSU_THREAD_ARG = 0x100, + TSU_TURNSTILE_ARG = 0x200, + TSU_BOOST_ARG = 0x400, +} turnstile_stats_update_flags_t; + +SLIST_HEAD(turnstile_list, turnstile); + +struct turnstile { + struct waitq ts_waitq; /* waitq embedded in turnstile */ + turnstile_inheritor_t ts_inheritor; /* thread/turnstile inheriting the priority (IL, WL) */ + union { + struct turnstile_list ts_free_turnstiles; /* turnstile free list (IL) */ + SLIST_ENTRY(turnstile) ts_free_elm; /* turnstile free list element (IL) */ + }; + struct priority_queue ts_inheritor_queue; /* Queue of turnstile with us as an inheritor (WL) */ + union { + struct priority_queue_entry ts_inheritor_links; /* Inheritor queue links */ + queue_chain_t ts_deallocate_link; /* thread 
deallocate link */ + }; + SLIST_ENTRY(turnstile) ts_htable_link; /* linkage for turnstile in global hash table */ + uintptr_t ts_proprietor; /* hash key lookup turnstile (IL) */ + os_refcnt_t ts_refcount; /* reference count for turnstiles */ + _Atomic uint32_t ts_type_gencount; /* gen count used for priority chaining (IL), type of turnstile (IL) */ + uint32_t ts_port_ref; /* number of explicit refs from ports on send turnstile */ + turnstile_update_flags_t ts_inheritor_flags; /* flags for turnstile inheritor (IL, WL) */ + uint8_t ts_priority; /* priority of turnstile (WL) */ + +#if DEVELOPMENT || DEBUG + uint8_t ts_state; /* current state of turnstile (IL) */ + queue_chain_t ts_global_elm; /* global turnstile chain */ + thread_t ts_thread; /* thread the turnstile is attached to */ + thread_t ts_prev_thread; /* thread the turnstile was attached before donation */ +#endif +}; + +#define waitq_to_turnstile(waitq) __container_of(waitq, struct turnstile, ts_waitq) + +/* IL - interlock, WL - turnstile lock i.e. waitq lock */ + +#define TURNSTILE_PROPRIETOR_NULL 0 + +/* + * Name: turnstiles_init + * + * Description: Initialize turnstile sub system. + * + * Args: None. + * + * Returns: None. + */ +void +turnstiles_init(void); + +/* + * Name: turnstile_alloc + * + * Description: Allocate a turnstile. + * + * Args: None. + * + * Returns: + * turnstile on Success. + */ +struct turnstile * +turnstile_alloc(void); + +/* + * Name: turnstile_destroy + * + * Description: Deallocates the turnstile. + * + * Args: + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_destroy(struct turnstile *turnstile); + +/* + * Name: turnstile_reference + * + * Description: Take a reference on the turnstile. + * + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_reference(struct turnstile *turnstile); + +/* + * Name: turnstile_deallocate + * + * Description: Drop a reference on the turnstile. + * Destroy the turnstile if the last ref. 
+ * + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_deallocate(struct turnstile *turnstile); + +/* + * Name: turnstile_deallocate_safe + * + * Description: Drop a reference on the turnstile safely without triggering zfree. + * + * Arg1: turnstile + * + * Returns: None. + */ +void +turnstile_deallocate_safe(struct turnstile *turnstile); + +/* + * Name: turnstile_recompute_priority_locked + * + * Description: Update turnstile priority based + * on highest waiter thread and highest blocking + * turnstile. + * + * Args: turnstile + * + * Returns: TRUE: if the turnstile priority changed and needs propagation. + * FALSE: if the turnstile priority did not change or it does not need propagation. + * + * Condition: turnstile locked + */ +boolean_t +turnstile_recompute_priority_locked( + struct turnstile *turnstile); + +/* + * Name: turnstile_recompute_priority + * + * Description: Update turnstile priority based + * on highest waiter thread and highest blocking + * turnstile. + * + * Args: turnstile + * + * Returns: TRUE: if the turnstile priority changed and needs propagation. + * FALSE: if the turnstile priority did not change or it does not need propagation. + */ +boolean_t +turnstile_recompute_priority( + struct turnstile *turnstile); + +/* + * Name: turnstile_workq_proprietor_of_max_turnstile + * + * Description: Returns the highest priority and proprietor of a turnstile + * pushing on a workqueue turnstile. + * + * This will not return waiters that are at priority + * MAXPRI_THROTTLE or lower. + * + * Args: turnstile + * + * Returns: + * Priority of the max entry, or 0 + * Pointer to the max entry proprietor + */ +int +turnstile_workq_proprietor_of_max_turnstile( + struct turnstile *turnstile, + uintptr_t *proprietor); + +/* + * Name: turnstile_cleanup + * + * Description: Update priority of a turnstile inheritor + * if needed. + * + * Args: inheritor and flags passed on thread struct. + * + * Returns: None. 
+ */ +void +turnstile_cleanup(void); + +/* + * Name: turnstile_update_inheritor_locked + * + * Description: Update the inheritor of the turnstile and boost the + * inheritor, called with turnstile locked. + * + * Args: + * Arg1: turnstile + * Implicit arg: new inheritor value is stashed in current thread's struct + * + * Returns: + * old inheritor reference is returned on current thread's struct. + */ +void +turnstile_update_inheritor_locked(struct turnstile *turnstile); + +/* + * Name: thread_get_inheritor_turnstile_priority + * + * Description: Get the max priority of all the inheritor turnstiles + * + * Arg1: thread + * + * Returns: Max priority of all the inheritor turnstiles. + * + * Condition: thread locked + */ +int +thread_get_inheritor_turnstile_priority(thread_t thread); + +/* + * Name: thread_get_waiting_turnstile + * + * Description: Get the turnstile if the thread is waiting on a turnstile. + * + * Arg1: thread + * + * Returns: turnstile: if the thread is blocked on a turnstile. + * TURNSTILE_NULL: otherwise. + * + * Condition: thread locked. + */ +struct turnstile * +thread_get_waiting_turnstile(thread_t thread); + +/* + * Name: turnstile_lookup_by_proprietor + * + * Description: Get turnstile for a proprietor from global + * turnstile hash. + * + * Arg1: port + * + * Returns: turnstile: if the proprietor has a turnstile. + * TURNSTILE_NULL: otherwise. + * + * Condition: proprietor interlock held. + */ +struct turnstile * +turnstile_lookup_by_proprietor(uintptr_t proprietor); + +/* + * Name: turnstile_stats_update + * + * Description: Function to update turnstile stats for dev kernel. 
+ * + * Arg1: hop : number of thread hops in priority propagation + * Arg2: flags : turnstile stats update flags + * Arg3: inheritor: inheritor + * + * Returns: Nothing + */ +void +turnstile_stats_update( + int hop __assert_only, + turnstile_stats_update_flags_t flags __assert_only, + turnstile_inheritor_t inheritor __assert_only); + +#if DEVELOPMENT || DEBUG + +/* Functions used by debug test primitive exported by sysctls */ +int +tstile_test_prim_lock(boolean_t use_hashtable); + +int +tstile_test_prim_unlock(boolean_t use_hashtable); + +int +turnstile_get_boost_stats_sysctl(void *req); +int +turnstile_get_unboost_stats_sysctl(void *req); +#endif /* DEVELOPMENT || DEBUG */ +#endif /* XNU_KERNEL_PRIVATE */ + +/* Interface */ + +/* + * Name: turnstile_prepare + * + * Description: Transfer current thread's turnstile to primitive or its free turnstile list. + * Function is called holding the interlock (spinlock) of the primitive. + * The turnstile returned by this function is safe to use until the thread calls turnstile_complete. + * When no turnstile is provided explicitly, the calling thread will not have a turnstile attached to + * it until it calls turnstile_complete. + * + * Args: + * Arg1: proprietor + * Arg2: pointer in primitive struct to store turnstile + * Arg3: turnstile to use instead of taking it from thread. + * Arg4: type of primitive + * + * Returns: + * turnstile. + */ +struct turnstile * +turnstile_prepare( + uintptr_t proprietor, + struct turnstile **tstore, + struct turnstile *turnstile, + turnstile_type_t type); + +/* + * Name: turnstile_complete + * + * Description: Transfer the primitive's turnstile or from its freelist to current thread. + * Function is called holding the interlock (spinlock) of the primitive. + * Current thread will have a turnstile attached to it after this call.
+ * + * Args: + * Arg1: proprietor + * Arg2: pointer in primitive struct to update turnstile + * Arg3: pointer to store the returned turnstile instead of attaching it to thread + * + * Returns: + * None. + */ +void +turnstile_complete( + uintptr_t proprietor, + struct turnstile **tstore, + struct turnstile **turnstile); + +/* + * Name: turnstile_update_inheritor + * + * Description: Update the inheritor of the turnstile and boost the + * inheritor. It will take a thread reference on the inheritor. + * Called with the interlock of the primitive held. + * + * Args: + * Arg1: turnstile + * Arg2: inheritor + * Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait + * + * Returns: + * old inheritor reference is stashed on current thread's struct. + */ +void +turnstile_update_inheritor( + struct turnstile *turnstile, + turnstile_inheritor_t new_inheritor, + turnstile_update_flags_t flags); + +typedef enum turnstile_update_complete_flags { + TURNSTILE_INTERLOCK_NOT_HELD = 0x1, + TURNSTILE_INTERLOCK_HELD = 0x2, +} turnstile_update_complete_flags_t; + +/* + * Name: turnstile_update_inheritor_complete + * + * Description: Update turnstile inheritor's priority and propagate the + * priority if the inheritor is blocked on a turnstile. + * Consumes thread ref of old inheritor returned by + * turnstile_update_inheritor. Recursive priority update + * will only happen when called with interlock dropped. + * + * Args: + * Arg1: turnstile + * Arg2: interlock held + * + * Returns: None. 
+ */ +void +turnstile_update_inheritor_complete( + struct turnstile *turnstile, + turnstile_update_complete_flags_t flags); + +#endif /* KERNEL_PRIVATE */ +#if XNU_KERNEL_PRIVATE + +struct workqueue; + +/* pthread_workqueue.c */ +extern void workq_reference(struct workqueue *wq); +extern void workq_deallocate_safe(struct workqueue *wq); +extern void workq_destroy(struct workqueue *wq); +extern bool workq_is_current_thread_updating_turnstile(struct workqueue *wq); +extern void workq_schedule_creator_turnstile_redrive(struct workqueue *wq, + bool locked); + +/* thread.c */ +extern void workq_deallocate_enqueue(struct workqueue *wq); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* _TURNSTILE_H_ */ diff --git a/osfmk/kern/ux_handler.c b/osfmk/kern/ux_handler.c new file mode 100644 index 000000000..5f81a2ea9 --- /dev/null +++ b/osfmk/kern/ux_handler.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +#include + +/* + * Mach kobject port to reflect Mach exceptions into Unix signals. + * + * This is the default Mach exception handler for initproc, which + * then filters to all subprocesses as the host level exception handler for + * most Mach exceptions. + */ + +static const void *ux_handler_kobject = NULL; +SECURITY_READ_ONLY_LATE(ipc_port_t) ux_handler_port = IP_NULL; + +/* + * init is called early in Mach initialization + * when we can initialize read-only memory + */ +void +ux_handler_init(void) +{ + ux_handler_port = ipc_port_alloc_kernel(); + + if (ux_handler_port == IP_NULL) + panic("can't allocate unix exception port"); + + ipc_kobject_set(ux_handler_port, (ipc_kobject_t)&ux_handler_kobject, IKOT_UX_HANDLER); +} + +/* + * setup is called late in BSD initialization from initproc's context + * so the MAC hook goo inside host_set_exception_ports will be able to + * set up labels without falling over. + */ +void +ux_handler_setup(void) +{ + ipc_port_t ux_handler_send_right = ipc_port_make_send(ux_handler_port); + + if (!IP_VALID(ux_handler_send_right)) + panic("Couldn't allocate send right for ux_handler_port!\n"); + + kern_return_t kr = KERN_SUCCESS; + + /* + * Consumes 1 send right. + * + * Instruments uses the RPC_ALERT port, so don't register for that. 
+ */ + kr = host_set_exception_ports(host_priv_self(), + EXC_MASK_ALL & ~(EXC_MASK_RPC_ALERT), + ux_handler_send_right, + EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES, + 0); + + if (kr != KERN_SUCCESS) + panic("host_set_exception_ports failed to set ux_handler! %d", kr); +} + +/* + * Is this port the ux_handler? + * If so, it's safe to send an exception without checking labels. + */ +boolean_t +is_ux_handler_port(mach_port_t port) +{ + if (ux_handler_port == port) + return TRUE; + else + return FALSE; +} + +kern_return_t +catch_mach_exception_raise( + mach_port_t exception_port, + mach_port_t thread_port, + mach_port_t task_port, + exception_type_t exception, + mach_exception_data_t code, + __unused mach_msg_type_number_t codeCnt) +{ + if (exception_port != ux_handler_port) + return KERN_FAILURE; + + kern_return_t kr = KERN_SUCCESS; + + thread_t target_thread = THREAD_NULL; + task_t target_task = TASK_NULL; + + if ((target_thread = convert_port_to_thread(thread_port)) == THREAD_NULL) { + kr = KERN_INVALID_ARGUMENT; + goto out; + } + + if ((target_task = convert_port_to_task(task_port)) == TASK_NULL) { + kr = KERN_INVALID_ARGUMENT; + goto out; + } + + kr = handle_ux_exception(target_thread, exception, code[0], code[1]); + +out: + if (kr == KERN_SUCCESS) { + /* + * Following the MIG 'consume on success' protocol, + * consume references to the port arguments. + * (but NOT the exception_port, as the first argument is borrowed) + * + * If we return non-success, the kobject server will eat the port + * references for us. 
+ */ + + ipc_port_release_send(thread_port); + ipc_port_release_send(task_port); + } + + thread_deallocate(target_thread); + task_deallocate(target_task); + + return kr; +} + +kern_return_t +catch_exception_raise( + mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + exception_data_t code, + mach_msg_type_number_t codeCnt) +{ + if (exception_port != ux_handler_port) + return KERN_FAILURE; + + mach_exception_data_type_t big_code[EXCEPTION_CODE_MAX] = { + [0] = code[0], + [1] = code[1], + }; + + return catch_mach_exception_raise(exception_port, + thread, + task, + exception, + big_code, + codeCnt); +} + +kern_return_t +catch_exception_raise_state( + __unused mach_port_t exception_port, + __unused exception_type_t exception, + __unused const exception_data_t code, + __unused mach_msg_type_number_t codeCnt, + __unused int *flavor, + __unused const thread_state_t old_state, + __unused mach_msg_type_number_t old_stateCnt, + __unused thread_state_t new_state, + __unused mach_msg_type_number_t *new_stateCnt) +{ + return(KERN_INVALID_ARGUMENT); +} + +kern_return_t +catch_mach_exception_raise_state( + __unused mach_port_t exception_port, + __unused exception_type_t exception, + __unused const mach_exception_data_t code, + __unused mach_msg_type_number_t codeCnt, + __unused int *flavor, + __unused const thread_state_t old_state, + __unused mach_msg_type_number_t old_stateCnt, + __unused thread_state_t new_state, + __unused mach_msg_type_number_t *new_stateCnt) +{ + return(KERN_INVALID_ARGUMENT); +} + +kern_return_t +catch_exception_raise_state_identity( + __unused mach_port_t exception_port, + __unused mach_port_t thread, + __unused mach_port_t task, + __unused exception_type_t exception, + __unused exception_data_t code, + __unused mach_msg_type_number_t codeCnt, + __unused int *flavor, + __unused thread_state_t old_state, + __unused mach_msg_type_number_t old_stateCnt, + __unused thread_state_t new_state, + __unused 
mach_msg_type_number_t *new_stateCnt) +{ + return(KERN_INVALID_ARGUMENT); +} + +kern_return_t +catch_mach_exception_raise_state_identity( + __unused mach_port_t exception_port, + __unused mach_port_t thread, + __unused mach_port_t task, + __unused exception_type_t exception, + __unused mach_exception_data_t code, + __unused mach_msg_type_number_t codeCnt, + __unused int *flavor, + __unused thread_state_t old_state, + __unused mach_msg_type_number_t old_stateCnt, + __unused thread_state_t new_state, + __unused mach_msg_type_number_t *new_stateCnt) +{ + return(KERN_INVALID_ARGUMENT); +} + diff --git a/osfmk/corecrypto/ccaes/src/ccaes_private_types.h b/osfmk/kern/ux_handler.h similarity index 74% rename from osfmk/corecrypto/ccaes/src/ccaes_private_types.h rename to osfmk/kern/ux_handler.h index 7a30fad3e..a3c473b84 100644 --- a/osfmk/corecrypto/ccaes/src/ccaes_private_types.h +++ b/osfmk/kern/ux_handler.h @@ -1,11 +1,5 @@ /* - * ccaes_private_types.h - * corecrypto - * - * Created on 02/15/2012 - * - * Copyright (c) 2012,2015 Apple Inc. All rights reserved. - * + * Copyright (c) 2017 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,17 +26,14 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _CORECRYPTO_CCAES_PRIVATE_TYPES_H_ -#define _CORECRYPTO_CCAES_PRIVATE_TYPES_H_ +#ifndef _KERN_UX_HANDLER_H_ +#define _KERN_UX_HANDLER_H_ -#include -#include +#include -typedef struct ltc_rijndael_key -{ - uint32_t eK[60], dK[60]; - int Nr; -} ltc_rijndael_keysched; +extern void ux_handler_init(void); +extern void ux_handler_setup(void); +extern boolean_t is_ux_handler_port(mach_port_t port); +#endif /* !defined(_KERN_UX_HANDLER_H_) */ -#endif // _CORECRYPTO_CCAES_PRIVATE_TYPES_H_ diff --git a/osfmk/kern/waitq.c b/osfmk/kern/waitq.c index 380b3f1f9..98ee900ba 100644 --- a/osfmk/kern/waitq.c +++ b/osfmk/kern/waitq.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -135,7 +136,7 @@ static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int s #if __arm64__ #define waitq_lock_to(wq,to) \ - (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, (uint32_t)to)) + (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, to)) #define waitq_lock_unlock(wq) \ (hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK)) @@ -146,7 +147,7 @@ static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int s #else #define waitq_lock_to(wq,to) \ - (hw_lock_to(&(wq)->waitq_interlock, (uint32_t)to)) + (hw_lock_to(&(wq)->waitq_interlock, to)) #define waitq_lock_unlock(wq) \ (hw_lock_unlock(&(wq)->waitq_interlock)) @@ -533,7 +534,7 @@ int walk_waitq_links(int walk_type, struct waitq *waitq, * invalidated before we grabbed the lock! 
*/ if (wqset->wqset_id != link->wql_setid.id) { - /*This is the bottom of the tree: just get out */ + /* This is the bottom of the tree: just get out */ if (should_unlock) { waitq_set_unlock(wqset); } @@ -1390,6 +1391,8 @@ static void wq_prepost_do_post_locked(struct waitq_set *wqset, if (wq_is_preposted_on_set(waitq, wqset)) return; + assert(waitqs_is_linked(wqset)); + /* * This function is called because an event is being posted to 'waitq'. * We need a prepost object associated with this queue. Allocate one @@ -1683,7 +1686,7 @@ static __inline__ void waitq_stats_count_fail(struct waitq *waitq) int waitq_is_valid(struct waitq *waitq) { - return (waitq != NULL) && waitq->waitq_isvalid && ((waitq->waitq_type & ~1) == WQT_QUEUE); + return (waitq != NULL) && waitq->waitq_isvalid; } int waitq_set_is_valid(struct waitq_set *wqset) @@ -1704,6 +1707,20 @@ int waitq_irq_safe(struct waitq *waitq) return waitq->waitq_irq; } +struct waitq * waitq_get_safeq(struct waitq *waitq) +{ + struct waitq *safeq; + + /* Check if it's a port waitq */ + if (waitq_is_port_queue(waitq)) { + assert(!waitq_irq_safe(waitq)); + safeq = ipc_port_rcv_turnstile_waitq(waitq); + } else { + safeq = global_eventq(waitq); + } + return safeq; +} + static uint32_t waitq_hash_size(void) { uint32_t hsize, queues; @@ -1717,6 +1734,65 @@ static uint32_t waitq_hash_size(void) return hsize; } +/* + * Since the priority ordered waitq uses basepri as the + * ordering key assert that this value fits in a uint8_t. 
+ */ +static_assert(MAXPRI <= UINT8_MAX); + +static inline void waitq_thread_insert(struct waitq *wq, + thread_t thread, boolean_t fifo) +{ + if (waitq_is_turnstile_queue(wq)) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)), + thread_tid(thread), + thread->base_pri, 0, 0); + + turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL); + + /* + * For turnstile queues (which use priority queues), + * insert the thread in the heap based on its current + * base_pri. Note that the priority queue implementation + * is currently not stable, so does not maintain fifo for + * threads at the same base_pri. Also, if the base_pri + * of the thread changes while its blocked in the waitq, + * the thread position should be updated in the priority + * queue by calling priority queue increase/decrease + * operations. + */ + priority_queue_entry_init(&(thread->wait_prioq_links)); + priority_queue_insert(&wq->waitq_prio_queue, + &thread->wait_prioq_links, thread->base_pri, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + } else { + turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL); + if (fifo) { + enqueue_tail(&wq->waitq_queue, &thread->wait_links); + } else { + enqueue_head(&wq->waitq_queue, &thread->wait_links); + } + } +} + +static inline void waitq_thread_remove(struct waitq *wq, + thread_t thread) +{ + if (waitq_is_turnstile_queue(wq)) { + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, + (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE, + VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)), + thread_tid(thread), + 0, 0, 0); + priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + } else { + remqueue(&(thread->wait_links)); + } +} + void waitq_bootstrap(void) { kern_return_t kret; @@ -1914,6 +1990,8 @@ static int 
waitq_select_walk_cb(struct waitq *waitq, void *ctx, if (wqset->wqset_id != link->wql_setid.id) goto out_unlock; + assert(waitqs_is_linked(wqset)); + /* * Find any threads waiting on this wait queue set, * and recurse into any waitq set to which this set belongs. @@ -1956,6 +2034,187 @@ static int waitq_select_walk_cb(struct waitq *waitq, void *ctx, return ret; } +/** + * Routine to iterate over the waitq for non-priority ordered waitqs + * + * Conditions: + * args->waitq (and args->posted_waitq) is locked + * + * Notes: + * Uses the optional select callback function to refine the selection + * of one or more threads from a waitq. The select callback is invoked + * once for every thread that is found to be waiting on the input args->waitq. + * + * If one or more threads are selected, this may disable interrupts. + * The previous interrupt state is returned in args->spl and should + * be used in a call to splx() if threads are returned to the caller. + */ +static thread_t waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq, + spl_t spl, struct waitq_select_args *args, + uint32_t *remaining_eventmask) +{ + int max_threads = args->max_threads; + int *nthreads = args->nthreads; + thread_t thread = THREAD_NULL; + thread_t first_thread = THREAD_NULL; + + qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) { + thread_t t = THREAD_NULL; + assert_thread_magic(thread); + + /* + * For non-priority ordered waitqs, we allow multiple events to be + * mux'ed into the same waitq. Also safeqs may contain threads from + * multiple waitqs. Only pick threads that match the + * requested wait event. 
+ */ + if (thread->waitq == waitq && thread->wait_event == args->event) { + t = thread; + if (first_thread == THREAD_NULL) + first_thread = thread; + + /* allow the caller to further refine the selection */ + if (args->select_cb) + t = args->select_cb(args->select_ctx, waitq, + waitq_is_global(waitq), thread); + if (t != THREAD_NULL) { + *nthreads += 1; + if (args->threadq) { + /* if output queue, add locked thread to it */ + if (*nthreads == 1) + *(args->spl) = (safeq != waitq) ? spl : splsched(); + thread_lock(t); + thread_clear_waitq_state(t); + re_queue_tail(args->threadq, &t->wait_links); + } + /* only enqueue up to 'max' threads */ + if (*nthreads >= max_threads && max_threads > 0) + break; + } + } + /* thread wasn't selected so track its event */ + if (t == THREAD_NULL) { + *remaining_eventmask |= (thread->waitq != safeq) ? + _CAST_TO_EVENT_MASK(thread->waitq) : _CAST_TO_EVENT_MASK(thread->wait_event); + } + } + + return first_thread; +} + +/** + * Routine to iterate and remove threads from priority ordered waitqs + * + * Conditions: + * args->waitq (and args->posted_waitq) is locked + * + * Notes: + * The priority ordered waitqs only support maximum priority element removal. + * + * Also, the implementation makes sure that all threads in a priority ordered + * waitq are waiting on the same wait event. This is not necessarily true for + * non-priority ordered waitqs. If one or more threads are selected, this may + * disable interrupts. The previous interrupt state is returned in args->spl + * and should be used in a call to splx() if threads are returned to the caller. + * + * In the future, we could support priority ordered waitqs with multiple wait + * events in the same queue. The way to implement that would be to keep removing + * elements from the waitq and if the event does not match the requested one, + * add it to a local list.
This local list of elements needs to be re-inserted + * into the priority queue at the end and the select_cb return value & + * remaining_eventmask would need to be handled appropriately. The implementation + * is not very efficient but would work functionally. + */ +static thread_t waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq, + spl_t spl, struct waitq_select_args *args, + uint32_t *remaining_eventmask) +{ + int max_threads = args->max_threads; + int *nthreads = args->nthreads; + thread_t first_thread = THREAD_NULL; + thread_t thread = THREAD_NULL; + + /* + * The waitq select routines need to handle two cases: + * Case 1: Peek at maximum priority thread in the waitq (remove_op = 0) + * Get the maximum priority thread from the waitq without removing it. + * In that case args->threadq == NULL and max_threads == 1. + * Case 2: Remove 'n' highest priority threads from waitq (remove_op = 1) + * Get max_threads (if available) while removing them from the waitq. + * In that case args->threadq != NULL and max_threads is one of {-1, 1}. + * + * The only possible values for remaining_eventmask for the priority queue + * waitq are either 0 (for the remove all threads case) or the original + * safeq->waitq_eventmask (for the lookup/remove one thread cases). 
+ */ + *remaining_eventmask = safeq->waitq_eventmask; + boolean_t remove_op = !!(args->threadq); + + while ((max_threads <= 0) || (*nthreads < max_threads)) { + + if (priority_queue_empty(&(safeq->waitq_prio_queue))) { + *remaining_eventmask = 0; + break; + } + + if (remove_op) { + thread = priority_queue_remove_max(&safeq->waitq_prio_queue, + struct thread, wait_prioq_links, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + } else { + /* For the peek operation, the only valid value for max_threads is 1 */ + assert(max_threads == 1); + thread = priority_queue_max(&safeq->waitq_prio_queue, + struct thread, wait_prioq_links); + } + /* + * Ensure the wait event matches since priority ordered waitqs do not + * support multiple events in the same waitq. + */ + assert((thread->waitq == waitq) && (thread->wait_event == args->event)); + + if (args->select_cb) { + /* + * Call the select_cb passed into the waitq_select args. The callback + * updates the select_ctx with information about the highest priority + * thread which is eventually used by the caller. + */ + thread_t __assert_only ret_thread = args->select_cb(args->select_ctx, waitq, + waitq_is_global(waitq), thread); + if (!remove_op) { + /* For the peek operation, the thread should not be selected for addition */ + assert(ret_thread == THREAD_NULL); + } else { + /* + * For the remove operation, the select routine should always return a valid + * thread for priority waitqs. Since all threads in a prioq are equally + * eligible, it should match the thread removed from the prioq. If this + * invariant changes, the implementation would need to handle the + * remaining_eventmask here correctly. + */ + assert(ret_thread == thread); + } + } + + if (first_thread == THREAD_NULL) + first_thread = thread; + + /* For the peek operation, break out early */ + if (!remove_op) + break; + + /* Add the thread to the result thread list */ + *nthreads += 1; + if (*nthreads == 1) + *(args->spl) = (safeq != waitq) ? 
spl : splsched(); + thread_lock(thread); + thread_clear_waitq_state(thread); + enqueue_tail(args->threadq, &(thread->wait_links)); + } + + return first_thread; +} + /** * generic thread selection from a waitq (and sets to which the waitq belongs) * @@ -1976,7 +2235,7 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) { struct waitq *waitq = args->waitq; int max_threads = args->max_threads; - thread_t thread = THREAD_NULL, first_thread = THREAD_NULL; + thread_t first_thread = THREAD_NULL; struct waitq *safeq; uint32_t remaining_eventmask = 0; uint32_t eventmask; @@ -1988,7 +2247,7 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) if (!waitq_irq_safe(waitq)) { /* JMM - add flag to waitq to avoid global lookup if no waiters */ eventmask = _CAST_TO_EVENT_MASK(waitq); - safeq = global_eventq(waitq); + safeq = waitq_get_safeq(waitq); if (*nthreads == 0) spl = splsched(); waitq_lock(safeq); @@ -2005,41 +2264,14 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) if (!waitq_is_global(safeq) || (safeq->waitq_eventmask & eventmask) == eventmask) { - /* look through each thread waiting directly on the safeq */ - qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) { - thread_t t = THREAD_NULL; - assert_thread_magic(thread); - - if (thread->waitq == waitq && thread->wait_event == args->event) { - t = thread; - if (first_thread == THREAD_NULL) - first_thread = thread; - - /* allow the caller to futher refine the selection */ - if (args->select_cb) - t = args->select_cb(args->select_ctx, waitq, - waitq_is_global(waitq), thread); - if (t != THREAD_NULL) { - *nthreads += 1; - if (args->threadq) { - if (*nthreads == 1) - *(args->spl) = (safeq != waitq) ? 
spl : splsched(); - thread_lock(t); - thread_clear_waitq_state(t); - /* put locked thread on output queue */ - re_queue_tail(args->threadq, &t->wait_links); - } - /* only enqueue up to 'max' threads */ - if (*nthreads >= max_threads && max_threads > 0) - break; - } - } - /* thread wasn't selected so track it's event */ - if (t == THREAD_NULL) { - remaining_eventmask |= (thread->waitq != safeq) ? - _CAST_TO_EVENT_MASK(thread->waitq): - _CAST_TO_EVENT_MASK(thread->wait_event); - } + if (waitq_is_turnstile_queue(safeq)) { + first_thread = waitq_prioq_iterate_locked(safeq, waitq, + spl, args, + &remaining_eventmask); + } else { + first_thread = waitq_queue_iterate_locked(safeq, waitq, + spl, args, + &remaining_eventmask); } /* @@ -2052,7 +2284,7 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) * computed is complete - so reset it. */ if (waitq_is_global(safeq)) { - if (queue_empty(&safeq->waitq_queue)) + if (waitq_empty(safeq)) safeq->waitq_eventmask = 0; else if (max_threads < 0 || *nthreads < max_threads) safeq->waitq_eventmask = remaining_eventmask; @@ -2070,10 +2302,11 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) *(args->spl) = (safeq != waitq) ? spl : splsched(); thread_lock(first_thread); thread_clear_waitq_state(first_thread); - re_queue_tail(args->threadq, &first_thread->wait_links); + waitq_thread_remove(safeq, first_thread); + enqueue_tail(args->threadq, &(first_thread->wait_links)); /* update the eventmask on [now] empty global queues */ - if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue)) + if (waitq_is_global(safeq) && waitq_empty(safeq)) safeq->waitq_eventmask = 0; } @@ -2127,8 +2360,8 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args) * been placed onto the input 'threadq' * * Notes: - * The 'select_cb' function is invoked for every thread found waiting - * on 'waitq' for 'event'. 
The thread is _not_ locked upon callback + * The 'select_cb' function is invoked for every thread found waiting on + * 'waitq' for 'event'. The thread is _not_ locked upon callback * invocation. This parameter may be NULL. * * If one or more threads are returned in 'threadq' then the caller is @@ -2269,8 +2502,9 @@ waitq_select_max_locked(struct waitq *waitq, event64_t event, * Scan the waitq to find the highest priority thread. * This doesn't remove any thread from the queue */ - nthreads = waitq_select_n_locked(waitq, event, waitq_find_max_pri_cb, &ctx, - reserved_preposts, NULL, 1, spl); + nthreads = waitq_select_n_locked(waitq, event, + waitq_find_max_pri_cb, + &ctx, reserved_preposts, NULL, 1, spl); assert(nthreads == 0); @@ -2336,14 +2570,14 @@ static int waitq_select_thread_cb(struct waitq *waitq, void *ctx, s = splsched(); /* find and lock the interrupt-safe waitq the thread is thought to be on */ - safeq = global_eventq(wqsetq); + safeq = waitq_get_safeq(wqsetq); waitq_lock(safeq); thread_lock(thread); if ((thread->waitq == wqsetq) && (thread->wait_event == event)) { - remqueue(&thread->wait_links); - if (queue_empty(&safeq->waitq_queue)) { + waitq_thread_remove(wqsetq, thread); + if (waitq_empty(safeq)) { safeq->waitq_eventmask = 0; } thread_clear_waitq_state(thread); @@ -2387,7 +2621,7 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq, /* Find and lock the interrupts disabled queue the thread is actually on */ if (!waitq_irq_safe(waitq)) { - safeq = global_eventq(waitq); + safeq = waitq_get_safeq(waitq); waitq_lock(safeq); } else { safeq = waitq; @@ -2396,8 +2630,8 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq, thread_lock(thread); if ((thread->waitq == waitq) && (thread->wait_event == event)) { - remqueue(&thread->wait_links); - if (queue_empty(&safeq->waitq_queue)) { + waitq_thread_remove(safeq, thread); + if (waitq_empty(safeq)) { safeq->waitq_eventmask = 0; } thread_clear_waitq_state(thread); @@ -2517,7 
+2751,7 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, * Otherwise, determine a global queue to use and lock it. */ if (!waitq_irq_safe(waitq)) { - safeq = global_eventq(waitq); + safeq = waitq_get_safeq(waitq); eventmask = _CAST_TO_EVENT_MASK(waitq); waitq_lock(safeq); } else { @@ -2551,9 +2785,9 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, if (!safeq->waitq_fifo || (thread->options & TH_OPT_VMPRIV) || realtime) - enqueue_head(&safeq->waitq_queue, &thread->wait_links); + waitq_thread_insert(safeq, thread, false); else - enqueue_tail(&safeq->waitq_queue, &thread->wait_links); + waitq_thread_insert(safeq, thread, true); /* mark the event and real waitq, even if enqueued on a global safeq */ thread->wait_event = wait_event; @@ -2580,6 +2814,12 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq, /* unlock the thread */ thread_unlock(thread); + /* update the inheritor's thread priority if the waitq is embedded in turnstile */ + if (waitq_is_turnstile_queue(safeq) && wait_result == THREAD_WAITING) { + turnstile_recompute_priority_locked(waitq_to_turnstile(safeq)); + turnstile_update_inheritor_locked(waitq_to_turnstile(safeq)); + } + /* unlock the safeq if we locked it here */ if (safeq != waitq) { waitq_unlock(safeq); @@ -2610,7 +2850,7 @@ int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread) /* Find the interrupts disabled queue thread is waiting on */ if (!waitq_irq_safe(waitq)) { - safeq = global_eventq(waitq); + safeq = waitq_get_safeq(waitq); } else { safeq = waitq; } @@ -2619,12 +2859,12 @@ int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread) if (!waitq_lock_try(safeq)) return 0; - remqueue(&thread->wait_links); + waitq_thread_remove(safeq, thread); thread_clear_waitq_state(thread); waitq_stats_count_clear_wakeup(waitq); /* clear the global event mask if this was the last thread there! 
*/ - if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue)) { + if (waitq_is_global(safeq) && waitq_empty(safeq)) { safeq->waitq_eventmask = 0; /* JMM - also mark no-waiters on waitq (if not the same as the safeq) */ } @@ -2636,80 +2876,58 @@ int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread) static __inline__ -void maybe_adjust_thread_pri(thread_t thread, int priority) { - if (thread->sched_pri < priority) { - if (priority <= MAXPRI) { - set_sched_pri(thread, priority); - - thread->was_promoted_on_wakeup = 1; - thread->sched_flags |= TH_SFLAG_PROMOTED; - } - return; - } +void maybe_adjust_thread_pri(thread_t thread, + int priority, + __kdebug_only struct waitq *waitq) +{ /* * If the caller is requesting the waitq subsystem to promote the * priority of the awoken thread, then boost the thread's priority to * the default WAITQ_BOOST_PRIORITY (if it's not already equal or * higher priority). This boost must be removed via a call to - * waitq_clear_promotion_locked. + * waitq_clear_promotion_locked before the thread waits again. + * + * WAITQ_PROMOTE_PRIORITY is -2. + * Anything above 0 represents a mutex promotion. + * The default 'no action' value is -1. 
+ * TODO: define this in a header */ - if (priority == WAITQ_PROMOTE_PRIORITY && - (thread->sched_pri < WAITQ_BOOST_PRIORITY || - !(thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED))) { - - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - thread->sched_pri, thread->base_pri, - WAITQ_BOOST_PRIORITY, 0); - thread->sched_flags |= TH_SFLAG_WAITQ_PROMOTED; - if (thread->sched_pri < WAITQ_BOOST_PRIORITY) - set_sched_pri(thread, WAITQ_BOOST_PRIORITY); + if (priority == WAITQ_PROMOTE_PRIORITY) { + uintptr_t trace_waitq = 0; + if (__improbable(kdebug_enable)) + trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq); + + sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq); + } else if (priority > 0) { + /* Mutex subsystem wants to see this thread before we 'go' it */ + lck_mtx_wakeup_adjust_pri(thread, priority); } } -/** - * Clear a thread's waitq priority promotion state and the waitq's boost flag +/* + * Clear a potential thread priority promotion from a waitq wakeup + * with WAITQ_PROMOTE_PRIORITY. * - * This function will always clear the waitq's 'waitq_boost' flag. If the - * 'thread' parameter is non-null, the this function will also check the - * priority promotion (boost) state of that thread. If this thread was boosted - * (by having been awoken from a boosting waitq), then this boost state is - * cleared. This function is to be paired with waitq_enable_promote_locked. + * This must be called on the thread which was woken up with TH_SFLAG_WAITQ_PROMOTED. 
*/ void waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread) { spl_t s; assert(waitq_held(waitq)); - if (thread == THREAD_NULL) + assert(thread != THREAD_NULL); + assert(thread == current_thread()); + + /* This flag is only cleared by the thread itself, so safe to check outside lock */ + if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) return; if (!waitq_irq_safe(waitq)) s = splsched(); thread_lock(thread); - if (thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) { - thread->sched_flags &= ~TH_SFLAG_WAITQ_PROMOTED; - - if (thread->sched_flags & TH_SFLAG_PROMOTED_MASK) { - /* it still has other promotions (mutex/rw_lock) */ - } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - thread->sched_pri, - thread->base_pri, - DEPRESSPRI, 0); - set_sched_pri(thread, DEPRESSPRI); - } else { - KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE, - (uintptr_t)thread_tid(thread), - thread->sched_pri, - thread->base_pri, - thread->base_pri, 0); - thread_recompute_sched_pri(thread, FALSE); - } - } + sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0); thread_unlock(thread); if (!waitq_irq_safe(waitq)) @@ -2763,7 +2981,7 @@ kern_return_t waitq_wakeup64_all_locked(struct waitq *waitq, qe_foreach_element_safe(thread, &wakeup_queue, wait_links) { assert_thread_magic(thread); remqueue(&thread->wait_links); - maybe_adjust_thread_pri(thread, priority); + maybe_adjust_thread_pri(thread, priority, waitq); ret = thread_go(thread, result); assert(ret == KERN_SUCCESS); thread_unlock(thread); @@ -2817,7 +3035,7 @@ kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq, waitq_unlock(waitq); if (thread != THREAD_NULL) { - maybe_adjust_thread_pri(thread, priority); + maybe_adjust_thread_pri(thread, priority, waitq); kern_return_t ret = thread_go(thread, result); assert(ret == 
KERN_SUCCESS); thread_unlock(thread); @@ -2954,13 +3172,21 @@ kern_return_t waitq_init(struct waitq *waitq, int policy) waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ); waitq->waitq_prepost = 0; waitq->waitq_type = WQT_QUEUE; + waitq->waitq_turnstile_or_port = !!(policy & SYNC_POLICY_TURNSTILE); waitq->waitq_eventmask = 0; waitq->waitq_set_id = 0; waitq->waitq_prepost_id = 0; waitq_lock_init(waitq); - queue_init(&waitq->waitq_queue); + if (waitq_is_turnstile_queue(waitq)) { + /* For turnstile, initialize it as a priority queue */ + priority_queue_init(&waitq->waitq_prio_queue, + PRIORITY_QUEUE_BUILTIN_MAX_HEAP); + assert(waitq->waitq_fifo == 0); + } else { + queue_init(&waitq->waitq_queue); + } waitq->waitq_isvalid = 1; return KERN_SUCCESS; @@ -3050,7 +3276,6 @@ void waitq_deinit(struct waitq *waitq) return; } - waitq->waitq_type = WQT_INVALID; waitq->waitq_isvalid = 0; if (!waitq_irq_safe(waitq)) { @@ -3061,7 +3286,7 @@ void waitq_deinit(struct waitq *waitq) splx(s); } - assert(queue_empty(&waitq->waitq_queue)); + assert(waitq_empty(waitq)); } void waitq_invalidate_locked(struct waitq *waitq) @@ -3095,7 +3320,10 @@ static int wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset, * may block * * Returns: - * allocated / initialized waitq_set object + * allocated / initialized waitq_set object. + * the waits_set object returned does not have + * a waitq_link associated. + * * NULL on failure */ struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook) @@ -3119,9 +3347,9 @@ struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook) /** * initialize a waitq set object * - * Conditions: - * may (rarely) block if link table needs to grow, and - * no 'reserved_link' object is passed. + * if no 'reserved_link' object is passed + * the waitq_link will be lazily allocated + * on demand through waitq_set_lazy_init_link. 
*/ kern_return_t waitq_set_init(struct waitq_set *wqset, int policy, uint64_t *reserved_link, @@ -3148,21 +3376,96 @@ kern_return_t waitq_set_init(struct waitq_set *wqset, if (reserved_link && *reserved_link != 0) { link = wql_get_reserved(*reserved_link, WQL_WQS); + + if (!link) + panic("Can't allocate link object for waitq set: %p", wqset); + /* always consume the caller's reference */ *reserved_link = 0; + + link->wql_wqs.wql_set = wqset; + wql_mkvalid(link); + + wqset->wqset_id = link->wql_setid.id; + wql_put_link(link); + } else { - link = wql_alloc_link(WQL_WQS); + /* + * Lazy allocate the link only when an actual id is needed. + */ + wqset->wqset_id = WQSET_NOT_LINKED; } + + return KERN_SUCCESS; +} + +#if DEVELOPMENT || DEBUG + +int +sysctl_helper_waitq_set_nelem(void) +{ + return ltable_nelem(&g_wqlinktable); +} + +#endif + +/** + * initialize a waitq set link. + * + * Conditions: + * may block + * locks and unlocks the waiq set lock + * + */ +void +waitq_set_lazy_init_link(struct waitq_set *wqset) +{ + struct waitq_link *link; + + assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread())); + + waitq_set_lock(wqset); + if (!waitq_set_should_lazy_init_link(wqset)){ + waitq_set_unlock(wqset); + return; + } + + assert(wqset->wqset_id == WQSET_NOT_LINKED); + waitq_set_unlock(wqset); + + link = wql_alloc_link(WQL_WQS); if (!link) panic("Can't allocate link object for waitq set: %p", wqset); link->wql_wqs.wql_set = wqset; - wql_mkvalid(link); - wqset->wqset_id = link->wql_setid.id; + waitq_set_lock(wqset); + if (waitq_set_should_lazy_init_link(wqset)) { + wql_mkvalid(link); + wqset->wqset_id = link->wql_setid.id; + } + + assert(wqset->wqset_id != 0); + assert(wqset->wqset_id != WQSET_NOT_LINKED); + + waitq_set_unlock(wqset); + wql_put_link(link); - return KERN_SUCCESS; + return; +} + +/** + * checks if a waitq set needs to be linked. 
+ * + */ +boolean_t +waitq_set_should_lazy_init_link(struct waitq_set *wqset) +{ + if (waitqs_is_linked(wqset) || wqset->wqset_id == 0) { + return FALSE; + } + return TRUE; } /** @@ -3183,27 +3486,32 @@ void waitq_set_deinit(struct waitq_set *wqset) panic("trying to de-initialize an invalid wqset @%p", wqset); assert(!waitq_irq_safe(&wqset->wqset_q)); + waitq_set_lock(wqset); set_id = wqset->wqset_id; - /* grab the set's link object */ - link = wql_get_link(set_id); - if (link) - wql_invalidate(link); + if (waitqs_is_linked(wqset) || set_id == 0) { - /* someone raced us to deinit */ - if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) { - if (link) - wql_put_link(link); - waitq_set_unlock(wqset); - return; - } + /* grab the set's link object */ + link = wql_get_link(set_id); + if (link) { + wql_invalidate(link); + } + /* someone raced us to deinit */ + if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) { + if (link) { + wql_put_link(link); + } + waitq_set_unlock(wqset); + return; + } - /* every wait queue set should have a valid link object */ - assert(link != NULL && wql_type(link) == WQL_WQS); + /* the link should be a valid link object at this point */ + assert(link != NULL && wql_type(link) == WQL_WQS); - wqset->wqset_id = 0; + wqset->wqset_id = 0; + } /* * This set may have a lot of preposts, or may have been a member of @@ -3213,12 +3521,13 @@ void waitq_set_deinit(struct waitq_set *wqset) * objects and free those outside the critical section. */ prepost_id = 0; - if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) + if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) { + assert(link != NULL); prepost_id = wqset->wqset_prepost_id; + } /* else { TODO: notify kqueue subsystem? 
} */ wqset->wqset_prepost_id = 0; - wqset->wqset_q.waitq_type = WQT_INVALID; wqset->wqset_q.waitq_fifo = 0; wqset->wqset_q.waitq_prepost = 0; wqset->wqset_q.waitq_isvalid = 0; @@ -3229,16 +3538,19 @@ void waitq_set_deinit(struct waitq_set *wqset) waitq_unlink_all_unlock(&wqset->wqset_q); /* wqset->wqset_q unlocked and set links deallocated */ - /* - * walk_waitq_links may race with us for access to the waitq set. - * If walk_waitq_links has a reference to the set, then we should wait - * until the link's refcount goes to 1 (our reference) before we exit - * this function. That way we ensure that the waitq set memory will - * remain valid even though it's been cleared out. - */ - while (wql_refcnt(link) > 1) - delay(1); - wql_put_link(link); + + if (link) { + /* + * walk_waitq_links may race with us for access to the waitq set. + * If walk_waitq_links has a reference to the set, then we should wait + * until the link's refcount goes to 1 (our reference) before we exit + * this function. That way we ensure that the waitq set memory will + * remain valid even though it's been cleared out. + */ + while (wql_refcnt(link) > 1) + delay(1); + wql_put_link(link); + } /* drop / unlink all the prepost table objects */ /* JMM - can this happen before the delay? 
*/ @@ -3274,6 +3586,11 @@ uint64_t wqset_id(struct waitq_set *wqset) return 0; assert(waitqs_is_set(wqset)); + + if (!waitqs_is_linked(wqset)) { + waitq_set_lazy_init_link(wqset); + } + return wqset->wqset_id; } @@ -3483,12 +3800,13 @@ boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset) if (!waitqs_is_set(wqset)) return FALSE; - + waitq_lock(waitq); + if (!waitqs_is_linked(wqset)) + goto out_unlock; + setid = wqset->wqset_id; - if (!setid) - goto out_unlock; /* fast path: most waitqs are members of only 1 set */ if (waitq->waitq_set_id == setid) { @@ -3606,6 +3924,8 @@ static kern_return_t waitq_link_internal(struct waitq *waitq, kern_return_t kr; assert(waitq_held(waitq)); + assert(setid != 0); + assert(setid != WQSET_NOT_LINKED); /* * If the waitq_set_id field is empty, then this waitq is not @@ -3636,7 +3956,7 @@ static kern_return_t waitq_link_internal(struct waitq *waitq, kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id, WQL_ALL, (void *)&setid, waitq_inset_cb); if (kr == WQ_ITERATE_FOUND) - return kr; + return KERN_ALREADY_IN_SET; /* * This wait queue is a member of at least one set already, @@ -3666,9 +3986,14 @@ static kern_return_t waitq_link_internal(struct waitq *waitq, * may (rarely) block on link table allocation if the table has to grow, * and no 'reserved_link' object is passed. * + * may block and acquire wqset lock if the wqset passed has no link. + * * Notes: * The caller can guarantee that this function will never block by - * pre-allocating a link table object and passing its ID in 'reserved_link' + * - pre-allocating a link table object and passing its ID in 'reserved_link' + * - and pre-allocating the waitq set link calling waitq_set_lazy_init_link. + * It is not possible to provide a reserved_link without having also linked + * the wqset. 
*/ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset, waitq_lock_state_t lock_state, uint64_t *reserved_link) @@ -3683,6 +4008,12 @@ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset, if (!waitqs_is_set(wqset)) return KERN_INVALID_ARGUMENT; + if (!reserved_link || *reserved_link == 0) { + if (!waitqs_is_linked(wqset)) { + waitq_set_lazy_init_link(wqset); + } + } + wqdbg_v("Link waitq %p to wqset 0x%llx", (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id); @@ -3990,8 +4321,6 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq, assert(!waitq_irq_safe(waitq)); - setid = wqset->wqset_id; - if (waitq->waitq_set_id == 0) { /* * TODO: @@ -4004,6 +4333,16 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq, return KERN_NOT_IN_SET; } + if (!waitqs_is_linked(wqset)) { + /* + * No link has been allocated for the wqset, + * so no waitq could have been linked to it. + */ + return KERN_NOT_IN_SET; + } + + setid = wqset->wqset_id; + if (waitq->waitq_set_id == setid) { waitq->waitq_set_id = 0; /* @@ -4284,24 +4623,27 @@ kern_return_t waitq_set_unlink_all_unlock(struct waitq_set *wqset) * constituent wait queues. 
All we have to do is invalidate the SetID */ - /* invalidate and re-alloc the link object first */ - link = wql_get_link(wqset->wqset_id); + if (waitqs_is_linked(wqset)){ - /* we may have raced with a waitq_set_deinit: handle this */ - if (!link) { - waitq_set_unlock(wqset); - return KERN_SUCCESS; - } + /* invalidate and re-alloc the link object first */ + link = wql_get_link(wqset->wqset_id); + + /* we may have raced with a waitq_set_deinit: handle this */ + if (!link) { + waitq_set_unlock(wqset); + return KERN_SUCCESS; + } - wql_invalidate(link); + wql_invalidate(link); - /* re-alloc the object to get a new generation ID */ - wql_realloc_link(link, WQL_WQS); - link->wql_wqs.wql_set = wqset; + /* re-alloc the object to get a new generation ID */ + wql_realloc_link(link, WQL_WQS); + link->wql_wqs.wql_set = wqset; - wqset->wqset_id = link->wql_setid.id; - wql_mkvalid(link); - wql_put_link(link); + wqset->wqset_id = link->wql_setid.id; + wql_mkvalid(link); + wql_put_link(link); + } /* clear any preposts attached to this set */ prepost_id = 0; diff --git a/osfmk/kern/waitq.h b/osfmk/kern/waitq.h index e5874895e..c3fee4a8c 100644 --- a/osfmk/kern/waitq.h +++ b/osfmk/kern/waitq.h @@ -39,6 +39,11 @@ #include +#ifdef XNU_KERNEL_PRIVATE +/* priority queue static asserts fail for __ARM64_ARCH_8_32__ kext builds */ +#include +#endif /* XNU_KERNEL_PRIVATE */ + /* * Constants and types used in the waitq APIs */ @@ -102,13 +107,12 @@ jenkins_hash(char *key, size_t length) #include #include -#include #include #include /* machine_timeout_suspended() */ /* - * The event mask is of 59 bits on 64 bit architeture and 27 bits on + * The event mask is of 57 bits on 64 bit architeture and 25 bits on * 32 bit architecture and so we calculate its size using sizeof(long). * If the bitfield for wq_type and wq_fifo is changed, then value of * EVENT_MASK_BITS will also change. 
@@ -116,9 +120,8 @@ jenkins_hash(char *key, size_t length) * New plan: this is an optimization anyway, so I'm stealing 32bits * from the mask to shrink the waitq object even further. */ -#define _EVENT_MASK_BITS ((sizeof(uint32_t) * 8) - 6) +#define _EVENT_MASK_BITS ((sizeof(uint32_t) * 8) - 7) -#define WAITQ_BOOST_PRIORITY 31 enum waitq_type { WQT_INVALID = 0, @@ -162,6 +165,7 @@ struct waitq { waitq_prepost:1, /* waitq supports prepost? */ waitq_irq:1, /* waitq requires interrupts disabled */ waitq_isvalid:1, /* waitq structure is valid */ + waitq_turnstile_or_port:1, /* waitq is embedded in a turnstile (if irq safe), or port (if not irq safe) */ waitq_eventmask:_EVENT_MASK_BITS; /* the wait queue set (set-of-sets) to which this queue belongs */ #if __arm64__ @@ -172,7 +176,10 @@ struct waitq { uint64_t waitq_set_id; uint64_t waitq_prepost_id; - queue_head_t waitq_queue; /* queue of elements */ + union { + queue_head_t waitq_queue; /* queue of elements */ + struct priority_queue waitq_prio_queue; /* priority ordered queue of elements */ + }; }; static_assert(sizeof(struct waitq) == WQ_OPAQUE_SIZE, "waitq structure size mismatch"); @@ -192,6 +199,7 @@ struct waitq_set { }; }; +#define WQSET_NOT_LINKED ((uint64_t)(~0)) static_assert(sizeof(struct waitq_set) == WQS_OPAQUE_SIZE, "waitq_set structure size mismatch"); static_assert(__alignof(struct waitq_set) == WQS_OPAQUE_ALIGN, "waitq_set structure alignment mismatch"); @@ -200,6 +208,12 @@ extern void waitq_bootstrap(void); #define waitq_is_queue(wq) \ ((wq)->waitq_type == WQT_QUEUE) +#define waitq_is_turnstile_queue(wq) \ + (((wq)->waitq_irq) && (wq)->waitq_turnstile_or_port) + +#define waitq_is_port_queue(wq) \ + (!((wq)->waitq_irq) && (wq)->waitq_turnstile_or_port) + #define waitq_is_set(wq) \ ((wq)->waitq_type == WQT_SET && ((struct waitq_set *)(wq))->wqset_id != 0) @@ -207,7 +221,10 @@ extern void waitq_bootstrap(void); (((wqs)->wqset_q.waitq_type == WQT_SET) && ((wqs)->wqset_id != 0)) #define waitq_valid(wq) \ 
- ((wq) != NULL && (wq)->waitq_isvalid && ((wq)->waitq_type & ~1) == WQT_QUEUE) + ((wq) != NULL && (wq)->waitq_isvalid) + +#define waitqs_is_linked(wqs) \ + (((wqs)->wqset_id != WQSET_NOT_LINKED) && ((wqs)->wqset_id != 0)) /* * Invalidate a waitq. The only valid waitq functions to call after this are: @@ -216,8 +233,14 @@ extern void waitq_bootstrap(void); */ extern void waitq_invalidate_locked(struct waitq *wq); -#define waitq_empty(wq) \ - (queue_empty(&(wq)->waitq_queue)) +static inline boolean_t waitq_empty(struct waitq *wq) +{ + if (waitq_is_turnstile_queue(wq)) { + return priority_queue_empty(&(wq->waitq_prio_queue)); + } else { + return queue_empty(&(wq->waitq_queue)); + } +} #if __arm64__ @@ -400,6 +423,7 @@ extern void waitq_set_deinit(struct waitq_set *wqset); extern kern_return_t waitq_set_free(struct waitq_set *wqset); #if DEVELOPMENT || DEBUG +extern int sysctl_helper_waitq_set_nelem(void); #if CONFIG_WAITQ_DEBUG extern uint64_t wqset_id(struct waitq_set *wqset); @@ -412,6 +436,8 @@ struct waitq *wqset_waitq(struct waitq_set *wqset); * set membership */ extern uint64_t waitq_link_reserve(struct waitq *waitq); +extern void waitq_set_lazy_init_link(struct waitq_set *wqset); +extern boolean_t waitq_set_should_lazy_init_link(struct waitq_set *wqset); extern void waitq_link_release(uint64_t id); @@ -458,6 +484,8 @@ extern int waitq_is_global(struct waitq *waitq); extern int waitq_irq_safe(struct waitq *waitq); +extern struct waitq * waitq_get_safeq(struct waitq *waitq); + #if CONFIG_WAITQ_STATS /* * waitq statistics diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index 8da4fe3c8..a9091abf4 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include @@ -102,6 +101,7 @@ #include #include +#include #include #include @@ -191,6 +191,9 @@ sample_counter(volatile uint32_t * count_p, uint32_t factor) #define ZP_POISON 0xdeadbeef #endif +boolean_t zfree_poison_element(zone_t zone, 
vm_offset_t elem); +void zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr); + #define ZP_DEFAULT_SAMPLING_FACTOR 16 #define ZP_DEFAULT_SCALE_FACTOR 4 @@ -202,7 +205,12 @@ sample_counter(volatile uint32_t * count_p, uint32_t factor) */ /* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */ -uint32_t zp_factor = 0; +#if DEBUG +#define DEFAULT_ZP_FACTOR (1) +#else +#define DEFAULT_ZP_FACTOR (0) +#endif +uint32_t zp_factor = DEFAULT_ZP_FACTOR; /* set by zp-scale=N boot arg, scales zp_factor by zone size */ uint32_t zp_scale = 0; @@ -218,6 +226,7 @@ uintptr_t zp_nopoison_cookie = 0; boolean_t zone_tagging_on; #endif /* VM_MAX_TAG_ZONES */ +SECURITY_READ_ONLY_LATE(boolean_t) copyio_zalloc_check = TRUE; static struct bool_gen zone_bool_gen; /* @@ -362,6 +371,33 @@ struct zone_free_element { /* void *backup_ptr; */ }; +#if CONFIG_ZCACHE + +#if !CONFIG_GZALLOC +bool use_caching = TRUE; +#else +bool use_caching = FALSE; +#endif /* !CONFIG_GZALLOC */ + +/* + * Decides whether per-cpu zone caching is to be enabled for all zones. + * Can be set to TRUE via the boot-arg '-zcache_all'. + */ +bool cache_all_zones = FALSE; + +/* + * Specifies a single zone to enable CPU caching for. 
+ * Can be set using boot-args: zcc_enable_for_zone_name= + */ +static char cache_zone_name[MAX_ZONE_NAME]; + +static inline bool zone_caching_enabled(zone_t z) +{ + return (z->cpu_cache_enabled && !z->tags && !z->zleak_on); +} + +#endif /* CONFIG_ZCACHE */ + /* * Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap */ @@ -446,6 +482,7 @@ struct zone_page_metadata { /* Magic value to indicate empty element free list */ #define PAGE_METADATA_EMPTY_FREELIST ((uint32_t)(~0)) +vm_map_copy_t create_vm_map_copy(vm_offset_t start_addr, vm_size_t total_size, vm_size_t used_size); boolean_t get_zone_info(zone_t z, mach_zone_name_t *zn, mach_zone_info_t *zi); boolean_t is_zone_map_nearing_exhaustion(void); extern void vm_pageout_garbage_collect(int collect); @@ -513,14 +550,22 @@ zone_populate_metadata_page(struct zone_page_metadata *page_meta) { vm_offset_t page_metadata_begin = trunc_page(page_meta); vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata)); - + for(;page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) { +#if !KASAN + /* + * This can race with another thread doing a populate on the same metadata + * page, where we see an updated pmap but unmapped KASan shadow, causing a + * fault in the shadow when we first access the metadata page. Avoid this + * by always synchronizing on the zone_metadata_region lock with KASan. 
+ */ if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) continue; +#endif /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */ lck_mtx_lock(&zone_metadata_region_lck); if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) { - kern_return_t __unused ret = kernel_memory_populate(zone_map, + kern_return_t __assert_only ret = kernel_memory_populate(zone_map, page_metadata_begin, PAGE_SIZE, KMA_KOBJECT, @@ -559,8 +604,9 @@ get_zone_page_metadata(struct zone_free_element *element, boolean_t init) } else { page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element)); } - if (init) - __nosan_bzero((char *)page_meta, sizeof(struct zone_page_metadata)); + if (init) { + bzero((char *)page_meta, sizeof(struct zone_page_metadata)); + } return ((PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? page_meta : page_metadata_get_realmeta(page_meta)); } @@ -1200,16 +1246,17 @@ free_to_zone(zone_t zone, assert(PAGE_METADATA_GET_ZONE(page_meta) == zone); old_head = (vm_offset_t)page_metadata_get_freelist(page_meta); -#if MACH_ASSERT if (__improbable(!is_sane_zone_element(zone, old_head))) panic("zfree: invalid head pointer %p for freelist of zone %s\n", (void *) old_head, zone->zone_name); -#endif if (__improbable(!is_sane_zone_element(zone, element))) panic("zfree: freeing invalid pointer %p to zone %s\n", (void *) element, zone->zone_name); + if (__improbable(old_head == element)) + panic("zfree: double free of %p to zone %s\n", + (void *) element, zone->zone_name); /* * Always write a redundant next pointer * So that it is more difficult to forge, xor it with a random cookie @@ -1485,7 +1532,7 @@ static int num_zones_logged = 0; static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging, if any */ /* Log allocations and frees to help debug a zone element corruption */ -boolean_t corruption_debug_flag = FALSE; /* enabled by "-zc" boot-arg */ +boolean_t 
corruption_debug_flag = DEBUG; /* enabled by "-zc" boot-arg */ /* Making pointer scanning leaks detection possible for all zones */ #if DEBUG || DEVELOPMENT @@ -1515,13 +1562,6 @@ boolean_t leak_scan_debug_flag = FALSE; /* enabled by "-zl" boot-ar */ -/* - * Opcodes for the btlog operation field: - */ - -#define ZOP_ALLOC 1 -#define ZOP_FREE 0 - /* * Decide if we want to log this zone by doing a string compare between a zone name and the name * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not @@ -1532,7 +1572,7 @@ boolean_t leak_scan_debug_flag = FALSE; /* enabled by "-zl" boot-ar int track_this_zone(const char *zonename, const char *logname) { - int len; + unsigned int len; const char *zc = zonename; const char *lc = logname; @@ -2068,6 +2108,101 @@ compute_element_size(vm_size_t requested_size) return element_size; } +#if KASAN_ZALLOC + +/* + * Called from zinit(). + * + * Fixes up the zone's element size to incorporate the redzones. + */ +static void +kasan_update_element_size_for_redzone( + zone_t zone, /* the zone that needs to be updated */ + vm_size_t *size, /* requested zone element size */ + vm_size_t *max, /* maximum memory to use */ + const char *name) /* zone name */ +{ + /* Expand the zone allocation size to include the redzones. For page-multiple + * zones add a full guard page because they likely require alignment. kalloc + * and fakestack handles its own KASan state, so ignore those zones. 
*/ + /* XXX: remove this when zinit_with_options() is a thing */ + const char *kalloc_name = "kalloc."; + const char *fakestack_name = "fakestack."; + if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) { + zone->kasan_redzone = 0; + } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) { + zone->kasan_redzone = 0; + } else { + if ((*size % PAGE_SIZE) != 0) { + zone->kasan_redzone = KASAN_GUARD_SIZE; + } else { + zone->kasan_redzone = PAGE_SIZE; + } + *max = (*max / *size) * (*size + zone->kasan_redzone * 2); + *size += zone->kasan_redzone * 2; + } +} + +/* + * Called from zalloc_internal() to fix up the address of the newly + * allocated element. + * + * Returns the element address skipping over the redzone on the left. + */ +static vm_offset_t +kasan_fixup_allocated_element_address( + zone_t zone, /* the zone the element belongs to */ + vm_offset_t addr) /* address of the element, including the redzone */ +{ + /* Fixup the return address to skip the redzone */ + if (zone->kasan_redzone) { + addr = kasan_alloc(addr, zone->elem_size, + zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone); + } + return addr; +} + +/* + * Called from zfree() to add the element being freed to the KASan quarantine. + * + * Returns true if the newly-freed element made it into the quarantine without + * displacing another, false otherwise. In the latter case, addrp points to the + * address of the displaced element, which will be freed by the zone. + */ +static bool +kasan_quarantine_freed_element( + zone_t *zonep, /* the zone the element is being freed to */ + void **addrp) /* address of the element being freed */ +{ + zone_t zone = *zonep; + void *addr = *addrp; + + /* + * Resize back to the real allocation size and hand off to the KASan + * quarantine. `addr` may then point to a different allocation, if the + * current element replaced another in the quarantine. The zone then + * takes ownership of the swapped out free element. 
+ */ + vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone; + vm_size_t sz = usersz; + + if (addr && zone->kasan_redzone) { + kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC); + addr = (void *)kasan_dealloc((vm_address_t)addr, &sz); + assert(sz == zone->elem_size); + } + if (addr && zone->kasan_quarantine) { + kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true); + if (!addr) { + return TRUE; + } + } + *addrp = addr; + return FALSE; +} + +#endif /* KASAN_ZALLOC */ + /* * zinit initializes a new zone. The zone data structures themselves * are stored in a zone, which is initially a static structure that @@ -2138,25 +2273,7 @@ zinit( simple_unlock(&all_zones_lock); #if KASAN_ZALLOC - /* Expand the zone allocation size to include the redzones. For page-multiple - * zones add a full guard page because they likely require alignment. kalloc - * and fakestack handles its own KASan state, so ignore those zones. */ - /* XXX: remove this when zinit_with_options() is a thing */ - const char *kalloc_name = "kalloc."; - const char *fakestack_name = "fakestack."; - if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) { - z->kasan_redzone = 0; - } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) { - z->kasan_redzone = 0; - } else { - if ((size % PAGE_SIZE) != 0) { - z->kasan_redzone = KASAN_GUARD_SIZE; - } else { - z->kasan_redzone = PAGE_SIZE; - } - max = (max / size) * (size + z->kasan_redzone * 2); - size += z->kasan_redzone * 2; - } + kasan_update_element_size_for_redzone(z, &size, &max, name); #endif max = round_page(max); @@ -2213,6 +2330,7 @@ zinit( z->zp_count = 0; z->kasan_quarantine = TRUE; z->zone_valid = TRUE; + z->cpu_cache_enabled = FALSE; #if CONFIG_ZLEAKS z->zleak_capture = 0; @@ -2367,6 +2485,13 @@ zinit( gzalloc_zone_init(z); #endif +#if CONFIG_ZCACHE + /* Check if boot-arg specified it should have a cache */ + if (cache_all_zones || track_this_zone(name, cache_zone_name)) { + zone_change(z, 
Z_CACHING_ENABLED, TRUE); + } +#endif + return(z); } unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count; @@ -2486,6 +2611,13 @@ zdestroy(zone_t z) #endif unlock_zone(z); +#if CONFIG_ZCACHE + /* Drain the per-cpu caches if caching is enabled for the zone. */ + if (zone_caching_enabled(z)) { + panic("zdestroy: Zone caching enabled for zone %s", z->zone_name); + } +#endif /* CONFIG_ZCACHE */ + /* Dump all the free elements */ drop_free_elements(z); @@ -2545,6 +2677,7 @@ zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadat return; } + static void random_free_to_zone( zone_t zone, @@ -2558,7 +2691,7 @@ random_free_to_zone( vm_size_t elem_size; int index; - assert(element_count <= ZONE_CHUNK_MAXELEMENTS); + assert(element_count && element_count <= ZONE_CHUNK_MAXELEMENTS); elem_size = zone->elem_size; last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size); for (index = 0; index < element_count; index++) { @@ -2668,11 +2801,11 @@ zcram( } else { first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT)); } - element_count = (int)((PAGE_SIZE - first_element_offset) / elem_size); + element_count = (unsigned int)((PAGE_SIZE - first_element_offset) / elem_size); random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer); } } else { - element_count = (int)(size / elem_size); + element_count = (unsigned int)(size / elem_size); random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer); } unlock_zone(zone); @@ -2742,9 +2875,13 @@ zone_bootstrap(void) /* should zlog log to debug zone corruption instead of leaks? */ if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) { corruption_debug_flag = TRUE; - } + } #if DEBUG || DEVELOPMENT + /* should perform zone element size checking in copyin/copyout? 
*/ + if (PE_parse_boot_argn("-no-copyio-zalloc-check", temp_buf, sizeof(temp_buf))) { + copyio_zalloc_check = FALSE; + } #if VM_MAX_TAG_ZONES /* enable tags for zones that ask for */ if (PE_parse_boot_argn("-zt", temp_buf, sizeof(temp_buf))) { @@ -2777,6 +2914,19 @@ zone_bootstrap(void) lck_attr_setdefault(&zone_metadata_lock_attr); lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr); + +#if CONFIG_ZCACHE + /* zcc_enable_for_zone_name=: enable per-cpu zone caching for . */ + if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) { + printf("zcache: caching enabled for zone %s\n", cache_zone_name); + } + + /* -zcache_all: enable per-cpu zone caching for all zones, overrides 'zcc_enable_for_zone_name'. */ + if (PE_parse_boot_argn("-zcache_all", temp_buf, sizeof(temp_buf))) { + cache_all_zones = TRUE; + printf("zcache: caching enabled for all zones\n"); + } +#endif /* CONFIG_ZCACHE */ } /* @@ -2854,8 +3004,8 @@ static void kill_process_in_largest_zone(void) * vm_map_entry_zone as the largest. This lets us target a specific process to jetsam to quickly recover from the zone map bloat. */ if (largest_zone == vm_object_zone) { - int vm_object_zone_count = vm_object_zone->count; - int vm_map_entry_zone_count = vm_map_entry_zone->count; + unsigned int vm_object_zone_count = vm_object_zone->count; + unsigned int vm_map_entry_zone_count = vm_map_entry_zone->count; /* Is the VM map entries zone count >= 98% of the VM objects zone count? 
*/ if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) { largest_zone = vm_map_entry_zone; @@ -2904,6 +3054,7 @@ zone_init( #if CONFIG_GZALLOC gzalloc_init(max_zonemap_size); #endif + /* * Setup garbage collection information: */ @@ -2953,13 +3104,42 @@ zone_init( zone_map_jetsam_limit = jetsam_limit_temp; } -extern volatile SInt32 kfree_nop_count; - #pragma mark - #pragma mark zalloc_canblock extern boolean_t early_boot_complete; +void +zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr) +{ + vm_offset_t inner_size = zone->elem_size; + if (__improbable(check_poison && addr)) { + vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1; + vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr); + + for ( ; element_cursor < backup ; element_cursor++) + if (__improbable(*element_cursor != ZP_POISON)) + zone_element_was_modified_panic(zone, + addr, + *element_cursor, + ZP_POISON, + ((vm_offset_t)element_cursor) - addr); + } + + if (addr) { + /* + * Clear out the old next pointer and backup to avoid leaking the cookie + * and so that only values on the freelist have a valid cookie + */ + + vm_offset_t *primary = (vm_offset_t *) addr; + vm_offset_t *backup = get_backup_ptr(inner_size, primary); + + *primary = ZP_POISON; + *backup = ZP_POISON; + } +} + /* * zalloc returns an element from the specified zone. 
*/ @@ -2978,7 +3158,7 @@ zalloc_internal( vm_offset_t addr = 0; kern_return_t retval; uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ - int numsaved = 0; + unsigned int numsaved = 0; boolean_t zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE; thread_t thr = current_thread(); boolean_t check_poison = FALSE; @@ -3033,6 +3213,21 @@ zalloc_internal( if (__improbable(zone->tags)) vm_tag_will_update_zone(tag, zone->tag_zone_index); #endif /* VM_MAX_TAG_ZONES */ +#if CONFIG_ZCACHE + if (__probable(addr == 0)) { + if (zone_caching_enabled(zone)) { + addr = zcache_alloc_from_cpu_cache(zone); + if (addr) { +#if KASAN_ZALLOC + addr = kasan_fixup_allocated_element_address(zone, addr); +#endif + DTRACE_VM2(zalloc, zone_t, zone, void*, addr); + return((void *)addr); + } + } + } +#endif /* CONFIG_ZCACHE */ + lock_zone(zone); assert(zone->zone_valid); @@ -3220,7 +3415,7 @@ zalloc_internal( (unsigned long)zone_largest->cur_size, zone_largest->count); } - panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count); + panic("zalloc: \"%s\" (%d elements) retry fail %d", zone->zone_name, zone->count, retval); } } else { break; @@ -3288,43 +3483,19 @@ zalloc_internal( unlock_zone(zone); - vm_offset_t inner_size = zone->elem_size; - if (__improbable(DO_LOGGING(zone) && addr)) { btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved); } - if (__improbable(check_poison && addr)) { - vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1; - vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr); - - for ( ; element_cursor < backup ; element_cursor++) - if (__improbable(*element_cursor != ZP_POISON)) - zone_element_was_modified_panic(zone, - addr, - *element_cursor, - ZP_POISON, - ((vm_offset_t)element_cursor) - addr); - } + zalloc_poison_element(check_poison, zone, addr); if (addr) { - /* - * Clear out the old next pointer 
and backup to avoid leaking the cookie - * and so that only values on the freelist have a valid cookie - */ - - vm_offset_t *primary = (vm_offset_t *) addr; - vm_offset_t *backup = get_backup_ptr(inner_size, primary); - - *primary = ZP_POISON; - *backup = ZP_POISON; - #if DEBUG || DEVELOPMENT if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) { - int count, idx; + unsigned int count, idx; /* Fill element, from tail, with backtrace in reverse order */ if (numsaved == 0) numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH); - count = (int) (zone->elem_size / sizeof(uintptr_t)); + count = (unsigned int)(zone->elem_size / sizeof(uintptr_t)); if (count >= numsaved) count = numsaved - 1; for (idx = 0; idx < count; idx++) ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1]; } @@ -3333,12 +3504,9 @@ zalloc_internal( TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr); + #if KASAN_ZALLOC - /* Fixup the return address to skip the redzone */ - if (zone->kasan_redzone) { - addr = kasan_alloc(addr, zone->elem_size, - zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone); - } + addr = kasan_fixup_allocated_element_address(zone, addr); #endif DTRACE_VM2(zalloc, zone_t, zone, void*, addr); @@ -3376,6 +3544,22 @@ zalloc_canblock(zone_t zone, boolean_t canblock) return (zalloc_internal(zone, canblock, FALSE, 0, VM_KERN_MEMORY_NONE)); } +void * +zalloc_attempt(zone_t zone) +{ + boolean_t check_poison = FALSE; + vm_offset_t addr = try_alloc_from_zone(zone, VM_KERN_MEMORY_NONE, &check_poison); + zalloc_poison_element(check_poison, zone, addr); + return (void *)addr; +} + +void +zfree_direct(zone_t zone, vm_offset_t elem) +{ + boolean_t poison = zfree_poison_element(zone, elem); + free_to_zone(zone, elem, poison); +} + void zalloc_async( @@ -3467,6 +3651,41 @@ static void zone_check_freelist(zone_t zone, vm_offset_t elem) } } +boolean_t +zfree_poison_element(zone_t zone, vm_offset_t elem) +{ + boolean_t poison = FALSE; + if (zp_factor 
!= 0 || zp_tiny_zone_limit != 0) { + /* + * Poison the memory before it ends up on the freelist to catch + * use-after-free and use of uninitialized memory + * + * Always poison tiny zones' elements (limit is 0 if -no-zp is set) + * Also poison larger elements periodically + */ + + vm_offset_t inner_size = zone->elem_size; + + uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale); + + if (inner_size <= zp_tiny_zone_limit) + poison = TRUE; + else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE) + poison = TRUE; + + if (__improbable(poison)) { + + /* memset_pattern{4|8} could help make this faster: */ + /* Poison everything but primary and backup */ + vm_offset_t *element_cursor = ((vm_offset_t *) elem) + 1; + vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *)elem); + + for ( ; element_cursor < backup; element_cursor++) + *element_cursor = ZP_POISON; + } + } + return poison; +} void zfree( zone_t zone, @@ -3474,7 +3693,7 @@ zfree( { vm_offset_t elem = (vm_offset_t) addr; uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */ - int numsaved = 0; + unsigned int numsaved = 0; boolean_t gzfreed = FALSE; boolean_t poison = FALSE; #if VM_MAX_TAG_ZONES @@ -3483,24 +3702,9 @@ zfree( assert(zone != ZONE_NULL); DTRACE_VM2(zfree, zone_t, zone, void*, addr); - #if KASAN_ZALLOC - /* - * Resize back to the real allocation size and hand off to the KASan - * quarantine. `addr` may then point to a different allocation. 
- */ - vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone; - vm_size_t sz = usersz; - if (addr && zone->kasan_redzone) { - kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC); - addr = (void *)kasan_dealloc((vm_address_t)addr, &sz); - assert(sz == zone->elem_size); - } - if (addr && zone->kasan_quarantine) { - kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, &zone, usersz, true); - if (!addr) { - return; - } + if (kasan_quarantine_freed_element(&zone, &addr)) { + return; } elem = (vm_offset_t)addr; #endif @@ -3536,34 +3740,8 @@ zfree( panic("zfree: non-allocated memory in collectable zone!"); } - if ((zp_factor != 0 || zp_tiny_zone_limit != 0) && !gzfreed) { - /* - * Poison the memory before it ends up on the freelist to catch - * use-after-free and use of uninitialized memory - * - * Always poison tiny zones' elements (limit is 0 if -no-zp is set) - * Also poison larger elements periodically - */ - - vm_offset_t inner_size = zone->elem_size; - - uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale); - - if (inner_size <= zp_tiny_zone_limit) - poison = TRUE; - else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE) - poison = TRUE; - - if (__improbable(poison)) { - - /* memset_pattern{4|8} could help make this faster: */ - /* Poison everything but primary and backup */ - vm_offset_t *element_cursor = ((vm_offset_t *) elem) + 1; - vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *)elem); - - for ( ; element_cursor < backup; element_cursor++) - *element_cursor = ZP_POISON; - } + if (!gzfreed) { + poison = zfree_poison_element(zone, elem); } /* @@ -3589,6 +3767,14 @@ zfree( } } +#if CONFIG_ZCACHE + if (zone_caching_enabled(zone)) { + int __assert_only ret = zcache_free_to_cpu_cache(zone, addr); + assert(ret != FALSE); + return; + } +#endif /* CONFIG_ZCACHE */ + lock_zone(zone); assert(zone->zone_valid); @@ -3607,12 +3793,10 @@ zfree( free_to_zone(zone, elem, poison); } -#if MACH_ASSERT - if 
(zone->count < 0) + if (__improbable(zone->count < 0)) { panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone", zone->zone_name, addr); -#endif - + } #if CONFIG_ZLEAKS /* @@ -3699,6 +3883,18 @@ zone_change( case Z_KASAN_QUARANTINE: zone->kasan_quarantine = value; break; + case Z_CACHING_ENABLED: +#if CONFIG_ZCACHE + if (value == TRUE && use_caching) { + if (zcache_ready()) { + zcache_init(zone); + } else { + zone->cpu_cache_enable_when_ready = TRUE; + } + + } +#endif + break; default: panic("Zone_change: Wrong Item Type!"); /* break; */ @@ -3731,7 +3927,7 @@ void drop_free_elements(zone_t z) { vm_size_t elt_size, size_freed; - int total_freed_pages = 0; + unsigned int total_freed_pages = 0; uint64_t old_all_free_count; struct zone_page_metadata *page_meta; queue_head_t page_meta_head; @@ -3834,7 +4030,11 @@ zone_gc(boolean_t consider_jetsams) if (!z->collectable) { continue; } - +#if CONFIG_ZCACHE + if (zone_caching_enabled(z)) { + zcache_drain_depot(z); + } +#endif /* CONFIG_ZCACHE */ if (queue_empty(&z->pages.all_free)) { continue; } @@ -3873,6 +4073,40 @@ consider_zone_gc(boolean_t consider_jetsams) zone_gc(consider_jetsams); } +/* + * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls + * requesting zone information. + * Frees unused pages towards the end of the region, and zero'es out unused + * space on the last page. 
+ */ +vm_map_copy_t +create_vm_map_copy( + vm_offset_t start_addr, + vm_size_t total_size, + vm_size_t used_size) +{ + kern_return_t kr; + vm_offset_t end_addr; + vm_size_t free_size; + vm_map_copy_t copy; + + if (used_size != total_size) { + end_addr = start_addr + used_size; + free_size = total_size - (round_page(end_addr) - start_addr); + + if (free_size >= PAGE_SIZE) { + kmem_free(ipc_kernel_map, + round_page(end_addr), free_size); + } + bzero((char *) end_addr, round_page(end_addr) - end_addr); + } + + kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr, + (vm_map_size_t)used_size, TRUE, ©); + assert(kr == KERN_SUCCESS); + + return copy; +} boolean_t get_zone_info( @@ -3960,15 +4194,13 @@ mach_memory_info( vm_offset_t memory_info_addr; vm_size_t memory_info_size; vm_size_t memory_info_vmsize; - unsigned int num_info; + unsigned int num_info; unsigned int max_zones, used_zones, i; mach_zone_name_t *zn; mach_zone_info_t *zi; kern_return_t kr; - vm_size_t used; - vm_map_copy_t copy; uint64_t zones_collectable_bytes = 0; if (host == HOST_NULL) @@ -4018,42 +4250,10 @@ mach_memory_info( zi++; } - used = used_zones * sizeof *names; - if (used != names_size) { - vm_offset_t names_addr_end = names_addr + used; - vm_size_t free_size = names_size - (round_page(names_addr_end) - names_addr); - - if (free_size >= PAGE_SIZE) { - kmem_free(ipc_kernel_map, - round_page(names_addr_end), free_size); - } - bzero((char *) names_addr_end, round_page(names_addr_end) - names_addr_end); - } - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr, - (vm_map_size_t)used, TRUE, ©); - assert(kr == KERN_SUCCESS); - - *namesp = (mach_zone_name_t *) copy; + *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names); *namesCntp = used_zones; - used = used_zones * sizeof *info; - if (used != info_size) { - vm_offset_t info_addr_end = info_addr + used; - vm_size_t free_size = info_size - (round_page(info_addr_end) - info_addr); 
- - if (free_size >= PAGE_SIZE) { - kmem_free(ipc_kernel_map, - round_page(info_addr_end), free_size); - } - bzero((char *) info_addr_end, round_page(info_addr_end) - info_addr_end); - } - - kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr, - (vm_map_size_t)used, TRUE, ©); - assert(kr == KERN_SUCCESS); - - *infop = (mach_zone_info_t *) copy; + *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info); *infoCntp = used_zones; num_info = 0; @@ -4061,6 +4261,7 @@ mach_memory_info( if (memoryInfop && memoryInfoCntp) { + vm_map_copy_t copy; num_info = vm_page_diagnose_estimate(); memory_info_size = num_info * sizeof(*memory_info); memory_info_vmsize = round_page(memory_info_size); @@ -4121,7 +4322,7 @@ mach_zone_info_for_zone( assert(z != ZONE_NULL); /* Find the requested zone by name */ - if (!strncmp(name.mzn_name, z->zone_name, strlen(z->zone_name))) { + if (track_this_zone(z->zone_name, name.mzn_name)) { zone_ptr = z; break; } @@ -4181,6 +4382,143 @@ get_zones_collectable_bytes(void) return zones_collectable_bytes; } +kern_return_t +mach_zone_get_zlog_zones( + host_priv_t host, + mach_zone_name_array_t *namesp, + mach_msg_type_number_t *namesCntp) +{ +#if DEBUG || DEVELOPMENT + unsigned int max_zones, logged_zones, i; + kern_return_t kr; + zone_t zone_ptr; + mach_zone_name_t *names; + vm_offset_t names_addr; + vm_size_t names_size; + + if (host == HOST_NULL) + return KERN_INVALID_HOST; + + if (namesp == NULL || namesCntp == NULL) + return KERN_INVALID_ARGUMENT; + + simple_lock(&all_zones_lock); + max_zones = (unsigned int)(num_zones); + simple_unlock(&all_zones_lock); + + names_size = round_page(max_zones * sizeof *names); + kr = kmem_alloc_pageable(ipc_kernel_map, + &names_addr, names_size, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) + return kr; + names = (mach_zone_name_t *) names_addr; + + zone_ptr = ZONE_NULL; + logged_zones = 0; + for (i = 0; i < max_zones; i++) { + zone_t z = &(zone_array[i]); + 
assert(z != ZONE_NULL); + + /* Copy out the zone name if zone logging is enabled */ + if(z->zlog_btlog) { + get_zone_info(z, &names[logged_zones], NULL); + logged_zones++; + } + } + + *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names); + *namesCntp = logged_zones; + + return KERN_SUCCESS; + +#else /* DEBUG || DEVELOPMENT */ +#pragma unused(host, namesp, namesCntp) + return KERN_FAILURE; +#endif /* DEBUG || DEVELOPMENT */ +} + +kern_return_t +mach_zone_get_btlog_records( + host_priv_t host, + mach_zone_name_t name, + zone_btrecord_array_t *recsp, + mach_msg_type_number_t *recsCntp) +{ +#if DEBUG || DEVELOPMENT + unsigned int max_zones, i, numrecs = 0; + zone_btrecord_t *recs; + kern_return_t kr; + zone_t zone_ptr; + vm_offset_t recs_addr; + vm_size_t recs_size; + + if (host == HOST_NULL) + return KERN_INVALID_HOST; + + if (recsp == NULL || recsCntp == NULL) + return KERN_INVALID_ARGUMENT; + + simple_lock(&all_zones_lock); + max_zones = (unsigned int)(num_zones); + simple_unlock(&all_zones_lock); + + zone_ptr = ZONE_NULL; + for (i = 0; i < max_zones; i++) { + zone_t z = &(zone_array[i]); + assert(z != ZONE_NULL); + + /* Find the requested zone by name */ + if (track_this_zone(z->zone_name, name.mzn_name)) { + zone_ptr = z; + break; + } + } + + /* No zones found with the requested zone name */ + if (zone_ptr == ZONE_NULL) { + return KERN_INVALID_ARGUMENT; + } + + /* Logging not turned on for the requested zone */ + if (!DO_LOGGING(zone_ptr)) { + return KERN_FAILURE; + } + + /* Allocate memory for btlog records */ + numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog)); + recs_size = round_page(numrecs * sizeof *recs); + + kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC); + if (kr != KERN_SUCCESS) { + return kr; + } + + /* + * We will call get_btlog_records() below which populates this region while holding a spinlock + * (the btlog lock). 
So these pages need to be wired. + */ + kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size, + VM_PROT_READ|VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE); + assert(kr == KERN_SUCCESS); + + recs = (zone_btrecord_t *)recs_addr; + get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs); + + kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE); + assert(kr == KERN_SUCCESS); + + *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs); + *recsCntp = numrecs; + + return KERN_SUCCESS; + +#else /* DEBUG || DEVELOPMENT */ +#pragma unused(host, name, recsp, recsCntp) + return KERN_FAILURE; +#endif /* DEBUG || DEVELOPMENT */ +} + + #if DEBUG || DEVELOPMENT kern_return_t @@ -4235,6 +4573,8 @@ mach_memory_info_check(void) return (kr); } +extern boolean_t (* volatile consider_buffer_cache_collect)(int); + #endif /* DEBUG || DEVELOPMENT */ kern_return_t @@ -4245,6 +4585,10 @@ mach_zone_force_gc( return KERN_INVALID_HOST; #if DEBUG || DEVELOPMENT + /* Callout to buffer cache GC to drop elements in the apfs zones */ + if (consider_buffer_cache_collect != NULL) { + (void)(*consider_buffer_cache_collect)(0); + } consider_zone_gc(FALSE); #endif /* DEBUG || DEVELOPMENT */ return (KERN_SUCCESS); @@ -4445,7 +4789,7 @@ kdp_is_in_zone(void *addr, const char *zone_name) boolean_t run_zone_test(void) { - int i = 0, max_iter = 5; + unsigned int i = 0, max_iter = 5; void * test_ptr; zone_t test_zone; diff --git a/osfmk/kern/zalloc.h b/osfmk/kern/zalloc.h index 6a585b83f..b45020f11 100644 --- a/osfmk/kern/zalloc.h +++ b/osfmk/kern/zalloc.h @@ -68,6 +68,7 @@ #define _KERN_ZALLOC_H_ #include +#include #include #include @@ -84,6 +85,10 @@ #include #endif +#ifdef CONFIG_ZCACHE +#include +#endif + #if CONFIG_GZALLOC typedef struct gzalloc_data { uint32_t gzfc_index; @@ -103,6 +108,9 @@ struct zone_free_element; struct zone_page_metadata; struct zone { +#ifdef CONFIG_ZCACHE + struct zone_cache *zcache; +#endif /* 
CONFIG_ZCACHE */ struct zone_free_element *free_elements; /* free elements directly linked */ struct { queue_head_t any_free_foreign; /* foreign pages crammed into zone */ @@ -145,7 +153,9 @@ struct zone { /* boolean_t */ tags_inline :1, /* future */ tag_zone_index :6, /* boolean_t */ zone_valid :1, - /* future */ _reserved :5; + /* boolean_t */ cpu_cache_enable_when_ready :1, + /* boolean_t */ cpu_cache_enabled :1, + /* future */ _reserved :3; int index; /* index into zone_info arrays for this zone */ const char *zone_name; /* a name for the zone */ @@ -267,12 +277,19 @@ __BEGIN_DECLS #ifdef XNU_KERNEL_PRIVATE #define Z_TAGS_ENABLED 11 /* Store tags */ #endif /* XNU_KERNEL_PRIVATE */ +#define Z_CACHING_ENABLED 12 /*enable and initialize per-cpu caches for the zone*/ #ifdef XNU_KERNEL_PRIVATE extern vm_offset_t zone_map_min_address; extern vm_offset_t zone_map_max_address; +/* free an element with no regard for gzalloc, zleaks, or kasan*/ +extern void zfree_direct( zone_t zone, + vm_offset_t elem); + +/* attempts to allocate an element with no regard for gzalloc, zleaks, or kasan*/ +extern void * zalloc_attempt( zone_t zone); /* Non-waiting for memory version of zalloc */ extern void * zalloc_nopagewait( @@ -321,16 +338,6 @@ extern vm_size_t zone_element_size( void *addr, zone_t *z); -/* - * MAX_ZTRACE_DEPTH configures how deep of a stack trace is taken on each zalloc in the zone of interest. 15 - * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual - * caller is up above these lower levels. - * - * This is used both for the zone leak detector and the zone corruption log. - */ - -#define MAX_ZTRACE_DEPTH 15 - /* * Structure for keeping track of a backtrace, used for leak detection. 
* This is in the .h file because it is used during panic, see kern/debug.c diff --git a/osfmk/kern/zcache.c b/osfmk/kern/zcache.c new file mode 100644 index 000000000..dab30e61e --- /dev/null +++ b/osfmk/kern/zcache.c @@ -0,0 +1,642 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + + +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + +#if defined (__arm__) || defined (__arm64__) +#include +#endif + +#define DEFAULT_MAGAZINE_SIZE 8 /* Default number of elements for all magazines allocated from the magazine_zone */ +#define DEFAULT_DEPOT_SIZE 8 /* Default number of elements for the array zcc_depot_list */ +#define ZCC_MAX_CPU_CACHE_LINE_SIZE 64 /* We should use a platform specific macro for this in the future, right now this is the max cache line size for all platforms*/ + +lck_grp_t zcache_locks_grp; /* lock group for depot_lock */ +zone_t magazine_zone; /* zone to allocate zcc_magazine structs from */ +uint16_t magazine_element_count = 0; /* Size of array in magazine determined by boot-arg or default */ +uint16_t depot_element_count = 0; /* Size of depot lists determined by boot-arg or default */ +bool zone_cache_ready = FALSE; /* Flag to check if zone caching has been set up by zcache_bootstrap */ +uintptr_t zcache_canary = 0; /* Canary used for the caching layer to prevent UaF attacks */ + +/* The zcc_magazine is used as a stack to store cached zone elements. These + * sets of elements can be moved around to perform bulk operations. 
+*/ +struct zcc_magazine { + uint32_t zcc_magazine_index; /* Used as a stack pointer to acess elements in the array */ + uint32_t zcc_magazine_capacity; /* Number of pointers able to be stored in the zcc_elements array */ + void *zcc_elements[0]; /* Array of pointers to objects */ +}; + + +/* Each CPU will use one of these to store its elements +*/ +struct zcc_per_cpu_cache { + struct zcc_magazine *current; /* Magazine from which we will always try to allocate from and free to first */ + struct zcc_magazine *previous; /* Dedicated magazine for a quick reload and to prevent thrashing wen we swap with the depot */ +} __attribute__(( aligned(ZCC_MAX_CPU_CACHE_LINE_SIZE) )); /* we want to align this to a cache line size so it does not thrash when multiple cpus want to access their caches in paralell */ + + +/* + * The depot layer can be invalid while zone_gc() is draining it out. + * During that time, the CPU caches are active. For CPU magazine allocs and + * frees, the caching layer reaches directly into the zone allocator. + */ +#define ZCACHE_DEPOT_INVALID -1 +#define zcache_depot_available(zcache) (zcache->zcc_depot_index != ZCACHE_DEPOT_INVALID) + +/* This is the basic struct to take care of cahing and is included within + * the zone. 
+*/ +struct zone_cache { + lck_mtx_t zcc_depot_lock; /* Lock for the depot layer of caching */ + struct zcc_per_cpu_cache zcc_per_cpu_caches[MAX_CPUS]; /* An array of caches, one for each CPU */ + int zcc_depot_index; /* marks the point in the array where empty magazines begin (equivalently, the number of full magazines), or ZCACHE_DEPOT_INVALID while the depot is being drained */ + struct zcc_magazine *zcc_depot_list[0]; /* Stores full and empty magazines in the depot layer */ +}; + + +void zcache_init_marked_zones(void); +bool zcache_mag_fill(zone_t zone, struct zcc_magazine *mag); +void zcache_mag_drain(zone_t zone, struct zcc_magazine *mag); +void zcache_mag_init(struct zcc_magazine *mag, int count); +void *zcache_mag_pop(struct zcc_magazine *mag); +void zcache_mag_push(struct zcc_magazine *mag, void *elem); +bool zcache_mag_has_space(struct zcc_magazine *mag); +bool zcache_mag_has_elements(struct zcc_magazine *mag); +void zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b); +void zcache_mag_depot_swap_for_alloc(struct zone_cache *depot, struct zcc_per_cpu_cache *cache); +void zcache_mag_depot_swap_for_free(struct zone_cache *depot, struct zcc_per_cpu_cache *cache); +void zcache_mag_depot_swap(struct zone_cache *depot, struct zcc_per_cpu_cache *cache, boolean_t load_full); +void zcache_canary_add(zone_t zone, void *addr); +void zcache_canary_validate(zone_t zone, void *addr); + +/* + * zcache_ready + * + * Description: returns whether or not the zone caches are ready to use, i.e. the value of the zone_cache_ready flag + * + */ +bool zcache_ready(void){ + return zone_cache_ready; +} + +/* + * zcache_init_marked_zones + * + * Description: Initializes all parts of the per-cpu caches for the list of + * marked zones once we are able to initialize caches. This should + * only be called once, and will be called during the time that the + * system is single threaded so we don't have to take the lock.
+ * + */ +void zcache_init_marked_zones(void){ + unsigned int i; + for(i = 0; i < num_zones; i ++){ + if(zone_array[i].cpu_cache_enable_when_ready){ + zcache_init(&zone_array[i]); + zone_array[i].cpu_cache_enable_when_ready = FALSE; + } + } +} + +/* + * zcache_bootstrap + * + * Description: initializes zone to allocate magazines from, sets + * magazine_element_count and depot_element_count from + * boot-args or default values, generates the canary, marks caching ready and initializes the marked zones + * + */ +void zcache_bootstrap(void) +{ + /* use boot-arg for custom magazine size*/ + if (! PE_parse_boot_argn("zcc_magazine_element_count", &magazine_element_count, sizeof (uint16_t))) + magazine_element_count = DEFAULT_MAGAZINE_SIZE; + + vm_size_t magazine_size = sizeof(struct zcc_magazine) + magazine_element_count * sizeof(void *); /* vm_size_t rather than int: 100000 * magazine_size below would overflow a signed int for large boot-arg values */ + + magazine_zone = zinit(magazine_size, 100000 * magazine_size , magazine_size, "zcc_magazine_zone"); + + assert(magazine_zone != NULL); + + /* use boot-arg for custom depot size*/ + if (! PE_parse_boot_argn("zcc_depot_element_count", &depot_element_count, sizeof (uint16_t))) + depot_element_count = DEFAULT_DEPOT_SIZE; + + lck_grp_init(&zcache_locks_grp, "zcc_depot_lock", LCK_GRP_ATTR_NULL); + + /* Generate the canary value for zone caches */ + zcache_canary = (uintptr_t) early_random(); + + zone_cache_ready = TRUE; + + zcache_init_marked_zones(); +} + + +/* + * zcache_init + * + * Description: Initializes all parts of the per-cpu caches for a given zone + * + * Parameters: zone pointer to zone on which to initialize caching + * + */ + void zcache_init(zone_t zone) + { + int i; /* used as index in for loops */ + vm_size_t total_size; /* Used for allocating the zone_cache struct with the proper size of depot list */ + struct zone_cache *temp_cache; /* Temporary variable to initialize a zone_cache before assigning to the specified zone */ + + /* Allocate chunk of memory for all structs */ + total_size = sizeof(struct zone_cache) + (depot_element_count * sizeof(void *)); + + temp_cache = (struct zone_cache *)
kalloc(total_size); + assert(temp_cache != NULL); /* checked like every other allocation in this function before it is dereferenced */ + + /* Initialize a cache for every CPU */ + for (i = 0; i < MAX_CPUS; i++) { + temp_cache->zcc_per_cpu_caches[i].current = (struct zcc_magazine *)zalloc(magazine_zone); + temp_cache->zcc_per_cpu_caches[i].previous = (struct zcc_magazine *)zalloc(magazine_zone); + + assert(temp_cache->zcc_per_cpu_caches[i].current != NULL && temp_cache->zcc_per_cpu_caches[i].previous != NULL); + + zcache_mag_init(temp_cache->zcc_per_cpu_caches[i].current, magazine_element_count); + zcache_mag_init(temp_cache->zcc_per_cpu_caches[i].previous, magazine_element_count); + } + + /* Initialize the lock on the depot layer */ + lck_mtx_init(&(temp_cache->zcc_depot_lock), &zcache_locks_grp, LCK_ATTR_NULL); + + /* Initialize empty magazines in the depot list */ + for (i = 0; i < depot_element_count; i++) { + temp_cache->zcc_depot_list[i] = (struct zcc_magazine *)zalloc(magazine_zone); + + assert(temp_cache->zcc_depot_list[i] != NULL); + + zcache_mag_init(temp_cache->zcc_depot_list[i], magazine_element_count); + } + + temp_cache->zcc_depot_index = 0; + + lock_zone(zone); + zone->zcache = temp_cache; + /* Set flag to know caching is enabled; done under the zone lock together with publishing the cache pointer */ + zone->cpu_cache_enabled = TRUE; + unlock_zone(zone); + return; + } + +/* + * zcache_drain_depot + * + * Description: Frees all the full magazines from the depot layer to the zone allocator as part + * of zone_gc(). The routine assumes that only one zone_gc() is in progress (zone_gc_lock + * ensures that) + * + * Parameters: zone pointer to zone for which the depot layer needs to be drained + * + * Returns: None + * + */ +void zcache_drain_depot(zone_t zone) +{ + struct zone_cache *zcache = zone->zcache; + int drain_depot_index = 0; + + /* + * Grab the current depot list from the zone cache. If it has full magazines, + * mark the depot as invalid and drain it.
+ */ + lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock)); + if (!zcache_depot_available(zcache) || (zcache->zcc_depot_index == 0)) { + /* no full magazines in the depot or depot unavailable; nothing to drain here */ + lck_mtx_unlock(&(zcache->zcc_depot_lock)); + return; + } + drain_depot_index = zcache->zcc_depot_index; + /* Mark the depot as unavailable so the alloc/free paths skip it while it is drained below without the depot lock held */ + zcache->zcc_depot_index = ZCACHE_DEPOT_INVALID; + lck_mtx_unlock(&(zcache->zcc_depot_lock)); + + /* Now drain the full magazines in the depot (slots 0 .. drain_depot_index - 1) */ + for (int i = 0; i < drain_depot_index; i++) + zcache_mag_drain(zone, zcache->zcc_depot_list[i]); + + lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock)); + /* Mark the depot as available again; every drained magazine is now empty, so the full/empty boundary is 0 */ + zcache->zcc_depot_index = 0; + lck_mtx_unlock(&(zcache->zcc_depot_lock)); +} + + +/* + * zcache_free_to_cpu_cache + * + * Description: Checks per-cpu caches to free element there if possible + * + * Parameters: zone pointer to zone for which element comes from + * addr pointer to element to free + * + * Returns: TRUE if successful, FALSE otherwise + * + * Precondition: check that caching is enabled for zone + */ +bool zcache_free_to_cpu_cache(zone_t zone, void *addr) +{ + int curcpu; /* Current cpu is used to index into array of zcc_per_cpu_cache structs */ + struct zone_cache *zcache; /* local storage of the zone's cache */ + struct zcc_per_cpu_cache *per_cpu_cache; /* locally store the current per_cpu_cache */ + + disable_preemption(); + curcpu = current_processor()->cpu_id; + zcache = zone->zcache; + per_cpu_cache = &zcache->zcc_per_cpu_caches[curcpu]; + + if (zcache_mag_has_space(per_cpu_cache->current)) { + /* If able, free into current magazine */ + goto free_to_current; + } else if (zcache_mag_has_space(per_cpu_cache->previous)) { + /* If able, swap current and previous magazine and retry */ + zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current); + goto free_to_current; + } else{ + lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock)); + if
(zcache_depot_available(zcache) && (zcache->zcc_depot_index < depot_element_count)) { + /* If able, rotate in a new empty magazine from the depot and retry */ + zcache_mag_depot_swap_for_free(zcache, per_cpu_cache); + lck_mtx_unlock(&(zcache->zcc_depot_lock)); + goto free_to_current; + } + lck_mtx_unlock(&(zcache->zcc_depot_lock)); + /* Attempt to free an entire magazine of elements back to the zone (current is full at this point) */ + zcache_mag_drain(zone, per_cpu_cache->current); + if(zcache_mag_has_space(per_cpu_cache->current)){ + goto free_to_current; + } + } + + /* If not able to use cache return FALSE and fall through to zfree */ + enable_preemption(); + return FALSE; + +free_to_current: + assert(zcache_mag_has_space(per_cpu_cache->current)); + zcache_canary_add(zone, addr); + zcache_mag_push(per_cpu_cache->current, addr); + +#if KASAN_ZALLOC + kasan_poison_range((vm_offset_t)addr, zone->elem_size, ASAN_HEAP_FREED); +#endif + + enable_preemption(); + return TRUE; +} + + +/* + * zcache_alloc_from_cpu_cache + * + * Description: Checks per-cpu caches to allocate element from there if possible + * + * Parameters: zone pointer to zone for which element will come from + * + * Returns: pointer to usable element, or NULL (0) if the cache could not satisfy the allocation + * + * Precondition: check that caching is enabled for zone + */ +vm_offset_t zcache_alloc_from_cpu_cache(zone_t zone) +{ + int curcpu; /* Current cpu is used to index into array of zcc_per_cpu_cache structs */ + void *ret = NULL; /* Points to the element which will be returned */ + struct zone_cache *zcache; /* local storage of the zone's cache */ + struct zcc_per_cpu_cache *per_cpu_cache; /* locally store the current per_cpu_cache */ + + disable_preemption(); + curcpu = current_processor()->cpu_id; + zcache = zone->zcache; + per_cpu_cache = &zcache->zcc_per_cpu_caches[curcpu]; + + if (zcache_mag_has_elements(per_cpu_cache->current)) { + /* If able, allocate from current magazine */ + goto allocate_from_current; + } else if (zcache_mag_has_elements(per_cpu_cache->previous)) { + /* If able, swap current
and previous magazine and retry */ + zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current); + goto allocate_from_current; + } else { + lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock)); + if (zcache_depot_available(zcache) && (zcache->zcc_depot_index > 0)) { + /* If able, rotate in a full magazine from the depot */ + zcache_mag_depot_swap_for_alloc(zcache, per_cpu_cache); + lck_mtx_unlock(&(zcache->zcc_depot_lock)); + goto allocate_from_current; + } + lck_mtx_unlock(&(zcache->zcc_depot_lock)); + /* Attempt to allocate an entire magazine of elements from the zone (current is empty at this point) */ + if(zcache_mag_fill(zone, per_cpu_cache->current)){ + goto allocate_from_current; + } + } + + /* If unable to allocate from cache return NULL and fall through to zalloc */ + enable_preemption(); + return (vm_offset_t) NULL; + +allocate_from_current: + ret = zcache_mag_pop(per_cpu_cache->current); + assert(ret != NULL); + zcache_canary_validate(zone, ret); + +#if KASAN_ZALLOC + kasan_poison_range((vm_offset_t)ret, zone->elem_size, ASAN_VALID); +#endif + + enable_preemption(); + return (vm_offset_t) ret; +} + + +/* + * zcache_mag_init + * + * Description: initializes fields in a zcc_magazine struct (index reset to 0, capacity set to count) + * + * Parameters: mag pointer to magazine to initialize; count number of element slots in the magazine + * + */ +void zcache_mag_init(struct zcc_magazine *mag, int count) +{ + mag->zcc_magazine_index = 0; + mag->zcc_magazine_capacity = count; +} + + +/* + * zcache_mag_fill + * + * Description: fills a magazine with as many elements as the zone can give + * without blocking to carve out more memory + * + * Parameters: zone zone from which to allocate + * mag pointer to magazine to fill (must be empty; asserted) + * + * Return: True if able to allocate elements, false if mag is still empty + */ +bool zcache_mag_fill(zone_t zone, struct zcc_magazine *mag) +{ + assert(mag->zcc_magazine_index == 0); + void* elem = NULL; + uint32_t i; + lock_zone(zone); + for(i = mag->zcc_magazine_index; i < mag->zcc_magazine_capacity; i ++){ + elem = zalloc_attempt(zone); + if(elem) {
zcache_canary_add(zone, elem); + zcache_mag_push(mag, elem); +#if KASAN_ZALLOC + kasan_poison_range((vm_offset_t)elem, zone->elem_size, ASAN_HEAP_FREED); +#endif + } else { + break; + } + } + unlock_zone(zone); + if (i == 0){ + return FALSE; + } + return TRUE; +} + +/* + * zcache_mag_drain + * + * Description: frees all elements in a magazine back to the zone, validating each canary first; the magazine must be full (asserted) + * + * Parameters: zone zone to which elements will be freed + * mag pointer to magazine to empty + * + */ +void zcache_mag_drain(zone_t zone, struct zcc_magazine *mag) +{ + assert(mag->zcc_magazine_index == mag->zcc_magazine_capacity); + lock_zone(zone); + while(mag->zcc_magazine_index > 0){ + uint32_t index = --mag->zcc_magazine_index; + zcache_canary_validate(zone, mag->zcc_elements[index]); + zfree_direct(zone,(vm_offset_t)mag->zcc_elements[index]); + mag->zcc_elements[mag->zcc_magazine_index] = 0; + } + unlock_zone(zone); +} + +/* + * zcache_mag_pop + * + * Description: removes last element from magazine in a stack pop fashion + * zcc_magazine_index represents the number of elements on the + * stack, so it is the index of where to save the next element, when + * full, it will be 1 past the last index of the array + * + * Parameters: mag pointer to magazine from which to remove element + * + * Returns: pointer to element removed from magazine + * + * Precondition: must check that magazine is not empty before calling + */ +void *zcache_mag_pop(struct zcc_magazine *mag) +{ + void *elem; + assert(zcache_mag_has_elements(mag)); + elem = mag->zcc_elements[--mag->zcc_magazine_index]; + /* Ensure pointer to element cannot be accessed after we pop it */ + mag->zcc_elements[mag->zcc_magazine_index] = NULL; + assert(elem != NULL); + return elem; +} + + +/* + * zcache_mag_push + * + * Description: adds element to magazine and increments zcc_magazine_index + * zcc_magazine_index represents the number of elements on the + * stack, so it is the index of where to save the next element, when + * full, it will be 1 past the last index of the
array + * + * Parameters: mag pointer to magazine to which to add element + * elem pointer to element to add + * + * Precondition: must check that magazine is not full before calling + */ +void zcache_mag_push(struct zcc_magazine *mag, void *elem) +{ + assert(zcache_mag_has_space(mag)); + mag->zcc_elements[mag->zcc_magazine_index ++] = elem; +} + + +/* + * zcache_mag_has_space + * + * Description: checks if magazine still has capacity + * + * Parameters: mag pointer to magazine to check + * + * Returns: true if magazine has room for at least one more element (i.e. it is not full) + * + */ +bool zcache_mag_has_space(struct zcc_magazine *mag) +{ + return (mag->zcc_magazine_index < mag->zcc_magazine_capacity); +} + + +/* + * zcache_mag_has_elements + * + * Description: checks if magazine holds any elements + * + * Parameters: mag pointer to magazine to check + * + * Returns: true if magazine has at least one element (i.e. it is not empty) + * + */ +bool zcache_mag_has_elements(struct zcc_magazine *mag) +{ + return (mag->zcc_magazine_index > 0); +} + + +/* + * zcache_swap_magazines + * + * Description: Function which swaps two magazine pointers + * + * Parameters: a pointer to first magazine pointer + * b pointer to second magazine pointer + */ +void zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b) +{ + struct zcc_magazine *temp = *a; + *a = *b; + *b = temp; +} + + +/* + * zcache_mag_depot_swap_for_alloc + * + * Description: Swaps a full magazine into the current position + * + * Parameters: zcache pointer to the zone_cache to access the depot + * cache pointer to the current per-cpu cache + * + * Precondition: Check that the depot list has full elements + */ +void zcache_mag_depot_swap_for_alloc(struct zone_cache *zcache, struct zcc_per_cpu_cache *cache) +{ + /* Loads a full magazine from which we can allocate: the last full magazine sits just below the index, so decrement first and swap with that slot */ + assert(zcache_depot_available(zcache)); + assert(zcache->zcc_depot_index > 0); + zcache->zcc_depot_index --; + zcache_swap_magazines(&cache->current, &zcache->zcc_depot_list[zcache->zcc_depot_index]); +} + + +/* + *
zcache_mag_depot_swap_for_free + * + * Description: Swaps an empty magazine into the current position + * + * Parameters: zcache pointer to the zone_cache to access the depot + * cache pointer to the current per-cpu cache + * + * Precondition: Check that the depot list has empty elements + */ +void zcache_mag_depot_swap_for_free(struct zone_cache *zcache, struct zcc_per_cpu_cache *cache) +{ + /* Loads an empty magazine into which we can free: the first empty magazine sits at the index, so swap with that slot and then advance the index past the magazine left behind */ + assert(zcache_depot_available(zcache)); + assert(zcache->zcc_depot_index < depot_element_count); + zcache_swap_magazines(&cache->current, &zcache->zcc_depot_list[zcache->zcc_depot_index]); + zcache->zcc_depot_index ++; +} + +/* + * zcache_canary_add + * + * Description: Adds a canary to an element by storing zcache_canary XORed with the element's address in the first + * and last vm_offset_t-sized word of the element + * + * Parameters: zone zone for the element (its elem_size locates the last word) + * element element address to add canary to + * + */ +void zcache_canary_add(zone_t zone, void *element) +{ + vm_offset_t *primary = (vm_offset_t *)element; + vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary + zone->elem_size - sizeof(vm_offset_t)); + *primary = *backup = (zcache_canary ^ (uintptr_t)element); +} + +/* + * zcache_canary_validate + * + * Description: Validates an element of the zone cache to make sure it still contains the zone + * caching canary in both its first and last word; panics otherwise. + * + * Parameters: zone zone for the element + * element element address to validate + * + */ +void zcache_canary_validate(zone_t zone, void *element) +{ + vm_offset_t *primary = (vm_offset_t *)element; + vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary + zone->elem_size - sizeof(vm_offset_t)); + + vm_offset_t primary_value = (*primary ^ (uintptr_t)element); + if (primary_value != zcache_canary) { + panic("Zone cache element was used after free!
Element %p was corrupted at beginning; Expected %p but found %p; canary %p", + element, (void *)(zcache_canary ^ (uintptr_t)element) , (void *)(*primary), (void *)zcache_canary); + } + + vm_offset_t backup_value = (*backup ^ (uintptr_t)element); + if (backup_value != zcache_canary) { + panic("Zone cache element was used after free! Element %p was corrupted at end; Expected %p but found %p; canary %p", + element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*backup), (void *)zcache_canary); + } +} diff --git a/osfmk/kern/zcache.h b/osfmk/kern/zcache.h new file mode 100644 index 000000000..6919aa5b8 --- /dev/null +++ b/osfmk/kern/zcache.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Below is a diagram of the caching system. This design is based on the + * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and + * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams. It is divided into 3 + * layers: the Per-cpu Layer, the Depot Layer, and the Zone Allocator. The + * Per-CPU and Depot layers store elements using arrays we call magazines. + * + * Magazines function like a stack (we push and pop elements) and can be + * moved around for bulk operations. + * _________ _________ _________ + * | CPU 1 | | CPU 2 | | CPU 3 | + * | _ _ | | _ _ | | _ _ | + * | |#| | | | | | | |#| | | |#| |#| | Per-CPU Layer + * | |#| |_| | | |_| |#| | | |#| |#| | + * |_________| |_________| |_________| + * + * ______________________________________________ + * | _ _ _ _ _ _ | + * | |#| |#| |#| | | | | | | | Depot Layer + * | |#| |#| |#| |_| |_| |_| | + * |______________________________________________| + * + * _______________________________________________ + * | # | # | # | # | # | # | # | # | # | # | # | # | Zone Allocator + * |_______________________________________________| + * + * The top layer is the per-cpu cache and consists of a current and + * previous magazine for each CPU. The current magazine is the one we always try + * to allocate from and free to first. Only if we are unable, do we check the + * previous magazine. If the previous magazine can satisfy the allocate or free, + * then we switch the two and allocate from the new current magazine. This layer + * requires no locking, so we can access multiple CPU's caches concurrently. + * This is the main source of the speedup. + * + * We have two magazines here to prevent thrashing when swapping magazines + * with the depot layer. If a certain pattern of alloc and free are called we + * can waste a lot of time swapping magazines to and from the depot layer.
We + * prevent this by dividing the per-cpu cache into two separate magazines. + * + * The middle layer is the magazine depot. This layer consists of a + * collection of full and empty magazines. These are used to reload the per-cpu + * caches when needed. This is implemented as an array of magazines which are + * initially all empty and as we fill up magazines we increment the index to + * point at the first empty magazine. Since this layer is per-zone, it allows us + * to balance the cache between cpus, but does require taking a lock. + * + * When neither the current nor previous magazine for a given CPU can + * satisfy the free or allocation, we look to the depot layer. If there are + * magazines in the depot that can satisfy the free or allocation we swap + * that magazine into the current position. In the example below, to allocate on + * the given CPU we must lock the depot layer and swap magazine A with magazine + * B and decrement the depot index. + * + * _____________________ _______________________________________ + * | Per-CPU Cache | | Depot Layer | + * | | | | + * | A___ ____ | | ____ B___ ____ ____ | + * | | | | | | | | ## | | ## | | | | | | + * | | | | | | | | ## | | ## | | | | | | + * | | | | | | | | ## | | ## | | | | | | + * | | | | | | | | ## | | ## | | | | | | + * | |____| |____| | | |_##_| |_##_| |____| |____| | + * | Current Previous | | | + * |_____________________| |_______________________________________| + * + * The bottom layer is the Zone Allocator. This is already implemented in + * XNU and will remain mostly unchanged. Implementation for this can be found + * in zalloc.c and zalloc.h. We will only use the zone if all other layers are + * unable to satisfy the allocation or free. When we do use the zone, we will + * try to allocate an entire magazine of elements or free an entire magazine of + * elements at once.
+ * + * Caching must be enabled explicitly, by calling zone_change() with the + * Z_CACHING_ENABLED flag, for every zone you want to cache elements for. Zones + * which are good candidates for this are ones with highly contended zone locks. + * + * Some good potential candidates are kalloc.16, kalloc.48, Vm objects, VM map + * entries, ipc vouchers, and ipc ports. + * + * + * Some factors can be tuned by boot-arg: + * zcc_enable_for_zone_name name of a single zone to enable caching for + * (replace space characters with '.') + * + * zcc_magazine_element_count integer value for magazine size used for all + * zones (default 8 is used if not specified) + * + * zcc_depot_element_count integer value for how many magazines (full plus + * empty) to store in the depot; if N is specified the + * depot will hold N magazines in total + * (default 8 used if not specified) +*/ +#include +#include + + +/* + * zcache_ready + * + * Description: returns whether or not the zone caches are ready to use + * + */ +bool zcache_ready(void); + + +/* + * zcache_bootstrap + * + * Description: initializes zone to allocate magazines from and reads the magazine and depot size boot-args + * + */ +void zcache_bootstrap(void); + + +/* + * zcache_init + * + * Description: Initializes all parts of the per-cpu caches for a given zone + * + * Parameters: zone pointer to zone on which to initialize caching + * + */ +void zcache_init(zone_t zone); + + +/* + * zcache_free_to_cpu_cache + * + * Description: Checks per-cpu caches to free element there if possible + * + * Parameters: zone pointer to zone for which element comes from + * addr pointer to element to free + * + * Returns: TRUE if successful, FALSE otherwise + * + * Precondition: check that caching is enabled for zone + */ +bool zcache_free_to_cpu_cache(zone_t zone, void *addr); + + +/* + * zcache_alloc_from_cpu_cache + * + * Description: Checks per-cpu caches to allocate element from there if possible + * + * Parameters: zone pointer to zone for which element will come from + * + * Returns:
pointer to usable element + * + * Precondition: check that caching is enabled for zone + */ +vm_offset_t zcache_alloc_from_cpu_cache(zone_t zone); + +/* + * zcache_drain_depot + * + * Description: Frees all the full magazines from the depot layer to the zone allocator + * Invoked by zone_gc() + * + * Parameters: zone pointer to zone for which the depot layer needs to be drained + * + * Returns: None + * + */ +void zcache_drain_depot(zone_t zone); diff --git a/osfmk/kperf/Makefile b/osfmk/kperf/Makefile index 385bc0520..467e33774 100644 --- a/osfmk/kperf/Makefile +++ b/osfmk/kperf/Makefile @@ -13,6 +13,7 @@ EXPORT_ONLY_FILES = \ kperfbsd.h \ kperf_timer.h \ kdebug_trigger.h \ + lazy.h \ pet.h EXPORT_MI_DIR = kperf diff --git a/osfmk/kperf/action.c b/osfmk/kperf/action.c index ab33ded44..94afda342 100644 --- a/osfmk/kperf/action.c +++ b/osfmk/kperf/action.c @@ -2,7 +2,7 @@ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -57,8 +58,7 @@ #define ACTION_MAX (32) /* the list of different actions to take */ -struct action -{ +struct action { uint32_t sample; uint32_t ucallstack_depth; uint32_t kcallstack_depth; @@ -67,14 +67,14 @@ struct action }; /* the list of actions */ -static unsigned actionc = 0; +static unsigned int actionc = 0; static struct action *actionv = NULL; /* should emit tracepoint on context switch */ int kperf_kdebug_cswitch = 0; bool -kperf_sample_has_non_system(unsigned actionid) +kperf_action_has_non_system(unsigned int actionid) { if (actionid > actionc) { return false; @@ -87,6 +87,26 @@ kperf_sample_has_non_system(unsigned actionid) } } +bool +kperf_action_has_task(unsigned int actionid) +{ + if (actionid > actionc) { + return false; + } + + return (actionv[actionid - 1].sample & SAMPLER_TASK_MASK); +} + +bool +kperf_action_has_thread(unsigned int actionid) +{ + if (actionid > actionc) { + return false; + } + + return (actionv[actionid - 1].sample & SAMPLER_THREAD_MASK); +} + static void kperf_system_memory_log(void) { @@ -94,6 +114,10 @@ kperf_system_memory_log(void) (uintptr_t)vm_page_wire_count, (uintptr_t)vm_page_external_count, (uintptr_t)(vm_page_active_count + vm_page_inactive_count + vm_page_speculative_count)); + BUF_DATA(PERF_MI_SYS_DATA_2, (uintptr_t)vm_page_anonymous_count, + (uintptr_t)vm_page_internal_count, + (uintptr_t)vm_pageout_vminfo.vm_pageout_compressions, + (uintptr_t)VM_PAGE_COMPRESSOR_COUNT); } static kern_return_t @@ -106,6 +130,7 @@ kperf_sample_internal(struct kperf_sample *sbuf, int pended_th_dispatch = 0; bool on_idle_thread = false; uint32_t userdata = actionid; + bool task_only = false; /* not much point continuing here, but what to do ? return * Shutdown? cut a tracepoint and continue? 
@@ -123,8 +148,20 @@ kperf_sample_internal(struct kperf_sample *sbuf, sample_what &= SAMPLER_SYS_MEM; } - context->cur_thread->kperf_pet_gen = kperf_pet_gen; - boolean_t is_kernel = (context->cur_pid == 0); + assert((sample_flags & (SAMPLE_FLAG_THREAD_ONLY | SAMPLE_FLAG_TASK_ONLY)) + != (SAMPLE_FLAG_THREAD_ONLY | SAMPLE_FLAG_TASK_ONLY)); + if (sample_flags & SAMPLE_FLAG_THREAD_ONLY) { + sample_what &= SAMPLER_THREAD_MASK; + } + if (sample_flags & SAMPLE_FLAG_TASK_ONLY) { + task_only = true; + sample_what &= SAMPLER_TASK_MASK; + } + + if (!task_only) { + context->cur_thread->kperf_pet_gen = kperf_pet_gen; + } + bool is_kernel = (context->cur_pid == 0); if (actionid && actionid <= actionc) { sbuf->kcallstack.nframes = actionv[actionid - 1].kcallstack_depth; @@ -175,13 +212,13 @@ kperf_sample_internal(struct kperf_sample *sbuf, } } if (sample_what & SAMPLER_TK_SNAPSHOT) { - kperf_task_snapshot_sample(&(sbuf->tk_snapshot), context); + kperf_task_snapshot_sample(context->cur_task, &(sbuf->tk_snapshot)); } /* sensitive ones */ if (!is_kernel) { if (sample_what & SAMPLER_MEMINFO) { - kperf_meminfo_sample(&(sbuf->meminfo), context); + kperf_meminfo_sample(context->cur_task, &(sbuf->meminfo)); } if (sample_flags & SAMPLE_FLAG_PEND_USER) { @@ -257,6 +294,9 @@ kperf_sample_internal(struct kperf_sample *sbuf, if (sample_what & SAMPLER_TK_SNAPSHOT) { kperf_task_snapshot_log(&(sbuf->tk_snapshot)); } + if (sample_what & SAMPLER_TK_INFO) { + kperf_task_info_log(context); + } /* dump user stuff */ if (!is_kernel) { @@ -331,7 +371,6 @@ void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp) { uint32_t sample_flags = SAMPLE_FLAG_PEND_USER; - struct kperf_context ctx; struct kperf_sample *sample = NULL; kern_return_t kr = KERN_SUCCESS; int s; @@ -342,10 +381,15 @@ kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp) BUF_VERB(PERF_KDBG_HNDLR | DBG_FUNC_START, debugid); - ctx.cur_thread = current_thread(); - ctx.cur_pid = task_pid(get_threadtask(ctx.cur_thread)); - 
ctx.trigger_type = TRIGGER_TYPE_KDEBUG; - ctx.trigger_id = 0; + thread_t thread = current_thread(); + task_t task = get_threadtask(thread); + struct kperf_context ctx = { + .cur_thread = thread, + .cur_task = task, + .cur_pid = task_pid(task), + .trigger_type = TRIGGER_TYPE_KDEBUG, + .trigger_id = 0, + }; s = ml_set_interrupts_enabled(0); @@ -385,9 +429,11 @@ kperf_thread_ast_handler(thread_t thread) } /* make a context, take a sample */ - struct kperf_context ctx; - ctx.cur_thread = thread; - ctx.cur_pid = task_pid(task); + struct kperf_context ctx = { + .cur_thread = thread, + .cur_task = task, + .cur_pid = task_pid(task), + }; /* decode the flags to determine what to sample */ unsigned int sample_what = 0; diff --git a/osfmk/kperf/action.h b/osfmk/kperf/action.h index f4e2e72bd..be150c401 100644 --- a/osfmk/kperf/action.h +++ b/osfmk/kperf/action.h @@ -30,6 +30,7 @@ #define KPERF_ACTION_H #include +#include #include /* fwd decl */ @@ -50,6 +51,13 @@ struct kperf_context; #define SAMPLER_TK_SNAPSHOT (1U << 10) #define SAMPLER_SYS_MEM (1U << 11) #define SAMPLER_TH_INSCYC (1U << 12) +#define SAMPLER_TK_INFO (1U << 13) + +#define SAMPLER_TASK_MASK (SAMPLER_MEMINFO | SAMPLER_TK_SNAPSHOT | \ + SAMPLER_TK_INFO) +#define SAMPLER_THREAD_MASK (SAMPLER_TH_INFO | SAMPLER_TH_SNAPSHOT | \ + SAMPLER_KSTACK | SAMPLER_USTACK | SAMPLER_PMC_THREAD | \ + SAMPLER_TH_SCHEDULING | SAMPLER_TH_DISPATCH | SAMPLER_TH_INSCYC) /* flags for sample calls */ @@ -67,6 +75,10 @@ struct kperf_context; #define SAMPLE_FLAG_SYSTEM (1U << 5) /* sample should not include non-system samplers */ #define SAMPLE_FLAG_ONLY_SYSTEM (1U << 6) +/* sample should only include task samplers */ +#define SAMPLE_FLAG_TASK_ONLY (1U << 7) +/* sample should only include thread samplers */ +#define SAMPLE_FLAG_THREAD_ONLY (1U << 8) /* Take a sample into "sbuf" using current thread "cur_thread" */ kern_return_t kperf_sample(struct kperf_sample *sbuf, @@ -75,7 +87,9 @@ kern_return_t kperf_sample(struct kperf_sample *sbuf, 
unsigned sample_flags); /* Whether the action provided samples non-system values. */ -bool kperf_sample_has_non_system(unsigned actionid); +bool kperf_action_has_non_system(unsigned actionid); +bool kperf_action_has_thread(unsigned int actionid); +bool kperf_action_has_task(unsigned int actionid); /* return codes from taking a sample * either keep trigger, or something went wrong (or we're shutting down) @@ -105,4 +119,11 @@ int kperf_action_get_kcallstack_depth(unsigned int actionid, uint32_t * depth_ou int kperf_action_set_filter(unsigned int actionid, int pid); int kperf_action_get_filter(unsigned int actionid, int *pid_out); +void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp); + +/* whether to output tracepoints on context-switch */ +extern int kperf_kdebug_cswitch; +int kperf_kdbg_cswitch_get(void); +int kperf_kdbg_cswitch_set(int newval); + #endif /* !defined(KPERF_ACTION_H) */ diff --git a/osfmk/kperf/buffer.h b/osfmk/kperf/buffer.h index 8dbe06fbc..fb24d4097 100644 --- a/osfmk/kperf/buffer.h +++ b/osfmk/kperf/buffer.h @@ -43,7 +43,7 @@ #define PERF_KPC (6) #define PERF_KDBG (7) #define PERF_TASK (8) -/* 9 unused */ +#define PERF_LAZY (9) #define PERF_MEMINFO (10) /* helpers for 32-bit */ @@ -78,6 +78,8 @@ #define PERF_TI_INSCYCDATA_32 PERF_TI_CODE(18) #define PERF_TI_SCHEDDATA_2 PERF_TI_CODE(19) #define PERF_TI_SCHEDDATA2_32_2 PERF_TI_CODE(20) +#define PERF_TI_SCHEDDATA3_32 PERF_TI_CODE(21) +#define PERF_TI_SCHEDDATA_3 PERF_TI_CODE(22) #define PERF_CS_CODE(code) PERF_CODE(PERF_CALLSTACK, code) #define PERF_CS_KSAMPLE PERF_CS_CODE(0) @@ -136,11 +138,18 @@ #define PERF_TK_SNAP_DATA PERF_TK_CODE(1) #define PERF_TK_SNAP_DATA1_32 PERF_TK_CODE(2) #define PERF_TK_SNAP_DATA2_32 PERF_TK_CODE(3) +#define PERF_TK_INFO_DATA PERF_TK_CODE(4) + +#define PERF_LZ_CODE(code) PERF_CODE(PERF_LAZY, code) +#define PERF_LZ_MKRUNNABLE PERF_LZ_CODE(0) +#define PERF_LZ_WAITSAMPLE PERF_LZ_CODE(1) +#define PERF_LZ_CPUSAMPLE PERF_LZ_CODE(2) #define PERF_MI_CODE(code) 
PERF_CODE(PERF_MEMINFO, code) #define PERF_MI_SAMPLE PERF_MI_CODE(0) #define PERF_MI_DATA PERF_MI_CODE(1) #define PERF_MI_SYS_DATA PERF_MI_CODE(2) +#define PERF_MI_SYS_DATA_2 PERF_MI_CODE(3) /* error sub-codes for trace data */ enum diff --git a/osfmk/kperf/callstack.c b/osfmk/kperf/callstack.c index 2fe676882..7c93e8137 100644 --- a/osfmk/kperf/callstack.c +++ b/osfmk/kperf/callstack.c @@ -662,6 +662,8 @@ chudxnu_thread_get_callstack64_kperf( return chudxnu_thread_get_callstack64_internal( thread, callStack, count, user_only, 0 ); } #elif __arm64__ + + // chudxnu_thread_get_callstack gathers a raw callstack along with any information needed to // fix it up later (in case we stopped program as it was saving values into prev stack frame, etc.) // after sampling has finished. diff --git a/osfmk/kperf/context.h b/osfmk/kperf/context.h index 14eadfe7a..d0fd4c290 100644 --- a/osfmk/kperf/context.h +++ b/osfmk/kperf/context.h @@ -36,11 +36,12 @@ struct kperf_context { /* who was running during the event */ int cur_pid; thread_t cur_thread; + task_t cur_task; uintptr_t *starting_fp; /* who caused the event */ - unsigned trigger_type; - unsigned trigger_id; + unsigned int trigger_type; + unsigned int trigger_id; }; #endif /* !defined(KPERF_CONTEXT_H) */ diff --git a/osfmk/kperf/kdebug_trigger.c b/osfmk/kperf/kdebug_trigger.c index 7c343631b..b649888ac 100644 --- a/osfmk/kperf/kdebug_trigger.c +++ b/osfmk/kperf/kdebug_trigger.c @@ -157,14 +157,14 @@ kperf_kdebug_set_filter(user_addr_t user_filter, uint32_t user_size) return err; } + n_debugids_provided = (uint32_t)KPERF_KDEBUG_N_DEBUGIDS(user_size); + /* detect disabling the filter completely */ - if (user_filter == USER_ADDR_NULL || user_size == 0) { + if (n_debugids_provided == 0) { bzero(kperf_kdebug_filter, sizeof(*kperf_kdebug_filter)); goto out; } - n_debugids_provided = (uint32_t)KPERF_KDEBUG_N_DEBUGIDS(user_size); - if ((err = kperf_kdebug_set_n_debugids(n_debugids_provided))) { goto out; } diff --git 
a/osfmk/kperf/kperf.c b/osfmk/kperf/kperf.c index 19f7d8704..831f3afd8 100644 --- a/osfmk/kperf/kperf.c +++ b/osfmk/kperf/kperf.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,9 @@ static boolean_t kperf_initted = FALSE; /* whether or not to callback to kperf on context switch */ boolean_t kperf_on_cpu_active = FALSE; +unsigned int kperf_thread_blocked_action; +unsigned int kperf_cpu_sample_action; + struct kperf_sample * kperf_intr_sample_buffer(void) { @@ -140,6 +144,7 @@ kperf_reset(void) (void)kperf_sampling_disable(); /* cleanup miscellaneous configuration first */ + kperf_lazy_reset(); (void)kperf_kdbg_cswitch_set(0); (void)kperf_set_lightweight_pet(0); kperf_kdebug_reset(); @@ -209,12 +214,17 @@ kperf_kernel_configure(const char *config) } } while (*(config++) == ','); - kperf_sampling_enable(); + int error = kperf_sampling_enable(); + if (error) { + kprintf("kperf: cannot enable sampling at boot: %d", error); + } out: ktrace_end_single_threaded(); } +void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_fp); void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation, uintptr_t *starting_fp) @@ -222,19 +232,22 @@ kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation, if (kperf_kdebug_cswitch) { /* trace the new thread's PID for Instruments */ int pid = task_pid(get_threadtask(thread)); - BUF_DATA(PERF_TI_CSWITCH, thread_tid(thread), pid); } if (kperf_lightweight_pet_active) { kperf_pet_on_cpu(thread, continuation, starting_fp); } + if (kperf_lazy_wait_action != 0) { + kperf_lazy_wait_sample(thread, continuation, starting_fp); + } } void kperf_on_cpu_update(void) { kperf_on_cpu_active = kperf_kdebug_cswitch || - kperf_lightweight_pet_active; + kperf_lightweight_pet_active || + kperf_lazy_wait_action != 0; } /* random misc-ish functions */ @@ -321,21 +334,16 @@ kperf_thread_set_dirty(thread_t thread, boolean_t dirty) int 
kperf_port_to_pid(mach_port_name_t portname) { - task_t task; - int pid; - if (!MACH_PORT_VALID(portname)) { return -1; } - task = port_name_to_task(portname); - + task_t task = port_name_to_task(portname); if (task == TASK_NULL) { return -1; } - - pid = task_pid(task); - + pid_t pid = task_pid(task); + /* drop the ref taken by port_name_to_task */ task_deallocate_internal(task); return pid; diff --git a/osfmk/kperf/kperf.h b/osfmk/kperf/kperf.h index 040d032c1..673a02cd3 100644 --- a/osfmk/kperf/kperf.h +++ b/osfmk/kperf/kperf.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,7 +22,7 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ @@ -35,9 +35,11 @@ extern lck_grp_t kperf_lck_grp; /* the trigger types supported by kperf */ -#define TRIGGER_TYPE_TIMER (0) -#define TRIGGER_TYPE_PMI (1) -#define TRIGGER_TYPE_KDEBUG (2) +#define TRIGGER_TYPE_TIMER (0) +#define TRIGGER_TYPE_PMI (1) +#define TRIGGER_TYPE_KDEBUG (2) +#define TRIGGER_TYPE_LAZY_WAIT (3) +#define TRIGGER_TYPE_LAZY_CPU (3) /* helpers to get and set AST flags on a thread */ uint32_t kperf_get_thread_flags(thread_t thread); @@ -69,51 +71,78 @@ extern int kperf_sampling_disable(void); struct kperf_sample *kperf_intr_sample_buffer(void); /* - * kperf AST handler + * Callbacks into kperf from other systems. */ -extern __attribute__((noinline)) void kperf_thread_ast_handler(thread_t thread); /* - * thread on core callback + * kperf AST handler + * + * Prevent inlining, since the sampling function allocates on the stack and + * branches calling ast_taken (but never on a kperf AST) may blow their stacks. */ - -/* controls whether the callback is called on context switch */ -extern boolean_t kperf_on_cpu_active; +extern __attribute__((noinline)) void kperf_thread_ast_handler(thread_t thread); /* update whether the callback is set */ void kperf_on_cpu_update(void); -/* handle a thread being switched on */ -void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation, - uintptr_t *starting_fp); - -/* for scheduler threads switching threads on */ +/* for scheduler switching threads on */ static inline void kperf_on_cpu(thread_t thread, thread_continue_t continuation, uintptr_t *starting_fp) { + extern boolean_t kperf_on_cpu_active; + void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_fp); + if (__improbable(kperf_on_cpu_active)) { kperf_on_cpu_internal(thread, continuation, starting_fp); } } -/* - * kdebug callback - */ +/* for scheduler switching threads off */ +static inline void +kperf_off_cpu(thread_t thread) +{ + extern unsigned 
int kperf_lazy_cpu_action; + void kperf_lazy_off_cpu(thread_t thread); + + if (__improbable(kperf_lazy_cpu_action != 0)) { + kperf_lazy_off_cpu(thread); + } +} -/* controls whether the kdebug callback is called */ -extern boolean_t kperf_kdebug_active; +/* for scheduler making threads runnable */ +static inline void +kperf_make_runnable(thread_t thread, int interrupt) +{ + extern unsigned int kperf_lazy_cpu_action; + void kperf_lazy_make_runnable(thread_t thread, bool interrupt); -/* handle the kdebug event */ -void kperf_kdebug_callback_internal(uint32_t debugid); + if (__improbable(kperf_lazy_cpu_action != 0)) { + kperf_lazy_make_runnable(thread, interrupt); + } +} -/* handle a kdebug event */ -void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp); +/* for interrupt handler epilogue */ +static inline void +kperf_interrupt(void) +{ + extern unsigned int kperf_lazy_cpu_action; + extern void kperf_lazy_cpu_sample(thread_t thread, unsigned int flags, + bool interrupt); + + if (__improbable(kperf_lazy_cpu_action != 0)) { + kperf_lazy_cpu_sample(current_thread(), 0, true); + } +} -/* called inside of kernel_debug_internal */ +/* for kdebug on every traced event */ static inline void kperf_kdebug_callback(uint32_t debugid, uintptr_t *starting_fp) { + extern boolean_t kperf_kdebug_active; + void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp); + if (__improbable(kperf_kdebug_active)) { kperf_kdebug_handler(debugid, starting_fp); } @@ -129,21 +158,11 @@ extern void kperf_reset(void); */ void kperf_kernel_configure(const char *config); -/* get and set whether we're recording stacks on interesting kdebug events */ -extern int kperf_kdbg_get_stacks(void); -extern int kperf_kdbg_set_stacks(int); - -extern int kperf_kdebug_cswitch; +/* given a task port, find out its pid */ +int kperf_port_to_pid(mach_port_name_t portname); #if DEVELOPMENT || DEBUG extern _Atomic long long kperf_pending_ipis; #endif /* DEVELOPMENT || DEBUG */ -/* get and set 
whether to output tracepoints on context-switch */ -extern int kperf_kdbg_cswitch_get(void); -extern int kperf_kdbg_cswitch_set(int newval); - -/* given a task port, find out its pid */ -int kperf_port_to_pid(mach_port_name_t portname); - #endif /* !defined(KPERF_H) */ diff --git a/osfmk/kperf/kperf_arch.h b/osfmk/kperf/kperf_arch.h index c83992191..6c84d89f5 100644 --- a/osfmk/kperf/kperf_arch.h +++ b/osfmk/kperf/kperf_arch.h @@ -32,6 +32,5 @@ struct kperf_timer; bool kperf_mp_broadcast_other_running(struct kperf_timer *trigger); void kperf_signal_handler(unsigned int cpu_number); -kern_return_t kperf_get_phys_footprint(task_t, uint64_t *); #endif /* KPERF_ARCH_H */ diff --git a/osfmk/kperf/kperf_timer.c b/osfmk/kperf/kperf_timer.c index 86ed35d87..49c16419c 100644 --- a/osfmk/kperf/kperf_timer.c +++ b/osfmk/kperf/kperf_timer.c @@ -118,12 +118,15 @@ kperf_sample_cpu(struct kperf_timer *timer, bool system_sample, #endif /* DEVELOPMENT || DEBUG */ /* On a timer, we can see the "real" current thread */ + thread_t thread = current_thread(); + task_t task = get_threadtask(thread); struct kperf_context ctx = { - .cur_thread = current_thread(), + .cur_thread = thread, + .cur_task = task, + .cur_pid = task_pid(task), .trigger_type = TRIGGER_TYPE_TIMER, .trigger_id = (unsigned int)(timer - kperf_timerv), }; - ctx.cur_pid = task_pid(get_threadtask(ctx.cur_thread)); if (ctx.trigger_id == pet_timer_id && ncpu < machine_info.logical_cpu_max) { kperf_tid_on_cpus[ncpu] = thread_tid(ctx.cur_thread); @@ -192,7 +195,7 @@ kperf_timer_handler(void *param0, __unused void *param1) /* * IPI other cores only if the action has non-system samplers. */ - if (kperf_sample_has_non_system(timer->actionid)) { + if (kperf_action_has_non_system(timer->actionid)) { /* * If the core that's handling the timer is not scheduling * threads, only run system samplers. 
diff --git a/osfmk/kperf/kperf_timer.h b/osfmk/kperf/kperf_timer.h index fcc642217..3d5b91cad 100644 --- a/osfmk/kperf/kperf_timer.h +++ b/osfmk/kperf/kperf_timer.h @@ -66,23 +66,23 @@ void kperf_ipi_handler(void *param); #if defined(__x86_64__) #define KP_MIN_PERIOD_NS (20 * NSEC_PER_USEC) -#define KP_MIN_PERIOD_BG_NS (10 * NSEC_PER_MSEC) +#define KP_MIN_PERIOD_BG_NS (1 * NSEC_PER_MSEC) #define KP_MIN_PERIOD_PET_NS (2 * NSEC_PER_MSEC) -#define KP_MIN_PERIOD_PET_BG_NS (10 * NSEC_PER_MSEC) +#define KP_MIN_PERIOD_PET_BG_NS (5 * NSEC_PER_MSEC) #elif defined(__arm64__) #define KP_MIN_PERIOD_NS (50 * NSEC_PER_USEC) -#define KP_MIN_PERIOD_BG_NS (20 * NSEC_PER_MSEC) +#define KP_MIN_PERIOD_BG_NS (1 * NSEC_PER_MSEC) #define KP_MIN_PERIOD_PET_NS (2 * NSEC_PER_MSEC) -#define KP_MIN_PERIOD_PET_BG_NS (50 * NSEC_PER_MSEC) +#define KP_MIN_PERIOD_PET_BG_NS (10 * NSEC_PER_MSEC) #elif defined(__arm__) #define KP_MIN_PERIOD_NS (100 * NSEC_PER_USEC) -#define KP_MIN_PERIOD_BG_NS (50 * NSEC_PER_MSEC) +#define KP_MIN_PERIOD_BG_NS (10 * NSEC_PER_MSEC) #define KP_MIN_PERIOD_PET_NS (2 * NSEC_PER_MSEC) -#define KP_MIN_PERIOD_PET_BG_NS (100 * NSEC_PER_MSEC) +#define KP_MIN_PERIOD_PET_BG_NS (50 * NSEC_PER_MSEC) #else /* defined(__x86_64__) */ #error "unsupported architecture" diff --git a/osfmk/kperf/kperfbsd.c b/osfmk/kperf/kperfbsd.c index b89125126..6fe1b5c29 100644 --- a/osfmk/kperf/kperfbsd.c +++ b/osfmk/kperf/kperfbsd.c @@ -44,32 +44,44 @@ #include #include #include +#include #include -/* IDs for dispatch from SYSCTL macros */ -#define REQ_SAMPLING (1) -#define REQ_ACTION_COUNT (2) -#define REQ_ACTION_SAMPLERS (3) -#define REQ_TIMER_COUNT (4) -#define REQ_TIMER_PERIOD (5) -#define REQ_TIMER_PET (6) -#define REQ_TIMER_ACTION (7) -#define REQ_BLESS (8) -#define REQ_ACTION_USERDATA (9) -#define REQ_ACTION_FILTER_BY_TASK (10) -#define REQ_ACTION_FILTER_BY_PID (11) -/* 12 unused */ -#define REQ_PET_IDLE_RATE (13) -#define REQ_BLESS_PREEMPT (14) -#define REQ_KDBG_CSWITCH (15) -#define 
REQ_RESET (16) -/* 17 unused */ -#define REQ_ACTION_UCALLSTACK_DEPTH (18) -#define REQ_ACTION_KCALLSTACK_DEPTH (19) -#define REQ_LIGHTWEIGHT_PET (20) -#define REQ_KDEBUG_ACTION (21) -#define REQ_KDEBUG_FILTER (22) +/* Requests from kperf sysctls. */ +enum kperf_request { + REQ_SAMPLING, + REQ_RESET, + + REQ_ACTION_COUNT, + REQ_ACTION_SAMPLERS, + REQ_ACTION_USERDATA, + REQ_ACTION_FILTER_BY_TASK, + REQ_ACTION_FILTER_BY_PID, + REQ_ACTION_UCALLSTACK_DEPTH, + REQ_ACTION_KCALLSTACK_DEPTH, + + REQ_TIMER_COUNT, + REQ_TIMER_PERIOD, + REQ_TIMER_PET, + REQ_TIMER_ACTION, + + REQ_KDBG_CSWITCH, + + REQ_BLESS, + REQ_BLESS_PREEMPT, + + REQ_PET_IDLE_RATE, + REQ_LIGHTWEIGHT_PET, + + REQ_KDEBUG_FILTER, + REQ_KDEBUG_ACTION, + + REQ_LAZY_WAIT_TIME_THRESHOLD, + REQ_LAZY_WAIT_ACTION, + REQ_LAZY_CPU_TIME_THRESHOLD, + REQ_LAZY_CPU_ACTION, +}; int kperf_debug_level = 0; @@ -78,7 +90,7 @@ _Atomic long long kperf_pending_ipis = 0; #endif /* DEVELOPMENT || DEBUG */ /* - * kperf has a different sysctl model than others. + * kperf has unique requirements from sysctl. * * For simple queries like the number of actions, the normal sysctl style * of get/set works well. 
@@ -137,6 +149,28 @@ kperf_sysctl_get_set_int(struct sysctl_req *req, return set(value); } +static int +kperf_sysctl_get_set_uint64(struct sysctl_req *req, + uint64_t (*get)(void), int (*set)(uint64_t)) +{ + assert(req != NULL); + assert(get != NULL); + assert(set != NULL); + + uint64_t value = 0; + if (req->oldptr) { + value = get(); + } + + int error = sysctl_io_number(req, value, sizeof(value), &value, NULL); + + if (error || !req->newptr) { + return error; + } + + return set(value); +} + static int kperf_sysctl_get_set_unsigned_uint32(struct sysctl_req *req, int (*get)(unsigned int, uint32_t *), int (*set)(unsigned int, uint32_t)) @@ -311,9 +345,11 @@ sysctl_kdebug_filter(struct sysctl_req *req) } return SYSCTL_OUT(req, filter, filter_size); + } else if (req->newptr != USER_ADDR_NULL) { + return kperf_kdebug_set_filter(req->newptr, (uint32_t)req->newlen); + } else { + return EINVAL; } - - return kperf_kdebug_set_filter(req->newptr, (uint32_t)req->newlen); } static int @@ -407,12 +443,40 @@ sysctl_kdbg_cswitch(struct sysctl_req *req) kperf_kdbg_cswitch_set); } +static int +sysctl_lazy_wait_time_threshold(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_uint64(req, kperf_lazy_get_wait_time_threshold, + kperf_lazy_set_wait_time_threshold); +} + +static int +sysctl_lazy_wait_action(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_int(req, kperf_lazy_get_wait_action, + kperf_lazy_set_wait_action); +} + +static int +sysctl_lazy_cpu_time_threshold(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_uint64(req, kperf_lazy_get_cpu_time_threshold, + kperf_lazy_set_cpu_time_threshold); +} + +static int +sysctl_lazy_cpu_action(struct sysctl_req *req) +{ + return kperf_sysctl_get_set_int(req, kperf_lazy_get_cpu_action, + kperf_lazy_set_cpu_action); +} + static int kperf_sysctl SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg2) int ret; - uintptr_t type = (uintptr_t)arg1; + enum kperf_request type = (enum kperf_request)arg1; ktrace_lock(); @@ -487,6 
+551,18 @@ kperf_sysctl SYSCTL_HANDLER_ARGS case REQ_LIGHTWEIGHT_PET: ret = sysctl_lightweight_pet(req); break; + case REQ_LAZY_WAIT_TIME_THRESHOLD: + ret = sysctl_lazy_wait_time_threshold(req); + break; + case REQ_LAZY_WAIT_ACTION: + ret = sysctl_lazy_wait_action(req); + break; + case REQ_LAZY_CPU_TIME_THRESHOLD: + ret = sysctl_lazy_cpu_time_threshold(req); + break; + case REQ_LAZY_CPU_ACTION: + ret = sysctl_lazy_cpu_action(req); + break; default: ret = ENOENT; break; @@ -552,7 +628,8 @@ SYSCTL_NODE(_kperf, OID_AUTO, action, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "action"); SYSCTL_PROC(_kperf_action, OID_AUTO, count, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | + CTLFLAG_MASKED, (void *)REQ_ACTION_COUNT, sizeof(int), kperf_sysctl, "I", "Number of actions"); @@ -598,7 +675,8 @@ SYSCTL_NODE(_kperf, OID_AUTO, timer, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "timer"); SYSCTL_PROC(_kperf_timer, OID_AUTO, count, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED + | CTLFLAG_MASKED, (void *)REQ_TIMER_COUNT, sizeof(int), kperf_sysctl, "I", "Number of time triggers"); @@ -615,7 +693,8 @@ SYSCTL_PROC(_kperf_timer, OID_AUTO, action, "Timer number and actionid"); SYSCTL_PROC(_kperf_timer, OID_AUTO, pet_timer, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED + | CTLFLAG_MASKED, (void *)REQ_TIMER_PET, sizeof(int), kperf_sysctl, "I", "Which timer ID does PET"); @@ -625,7 +704,8 @@ SYSCTL_NODE(_kperf, OID_AUTO, kdebug, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "kdebug"); SYSCTL_PROC(_kperf_kdebug, OID_AUTO, action, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED + | CTLFLAG_MASKED, (void*)REQ_KDEBUG_ACTION, sizeof(int), kperf_sysctl, "I", "ID of action to trigger on kdebug events"); @@ -634,10 +714,40 
@@ SYSCTL_PROC(_kperf_kdebug, OID_AUTO, filter, (void*)REQ_KDEBUG_FILTER, sizeof(int), kperf_sysctl, "P", "The filter that determines which kdebug events trigger a sample"); +/* lazy sampling */ + +SYSCTL_NODE(_kperf, OID_AUTO, lazy, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "lazy"); + +SYSCTL_PROC(_kperf_lazy, OID_AUTO, wait_time_threshold, + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_LAZY_WAIT_TIME_THRESHOLD, + sizeof(uint64_t), kperf_sysctl, "UQ", + "How many ticks a thread must wait to take a sample"); + +SYSCTL_PROC(_kperf_lazy, OID_AUTO, wait_action, + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_LAZY_WAIT_ACTION, + sizeof(uint64_t), kperf_sysctl, "UQ", + "Which action to fire when a thread waits longer than threshold"); + +SYSCTL_PROC(_kperf_lazy, OID_AUTO, cpu_time_threshold, + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_LAZY_CPU_TIME_THRESHOLD, + sizeof(uint64_t), kperf_sysctl, "UQ", + "Minimum number of ticks a CPU must run between samples"); + +SYSCTL_PROC(_kperf_lazy, OID_AUTO, cpu_action, + CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED, + (void *)REQ_LAZY_CPU_ACTION, + sizeof(uint64_t), kperf_sysctl, "UQ", + "Which action to fire for lazy CPU samples"); + /* misc */ SYSCTL_PROC(_kperf, OID_AUTO, sampling, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED + | CTLFLAG_MASKED, (void *)REQ_SAMPLING, sizeof(int), kperf_sysctl, "I", "Sampling running"); @@ -647,29 +757,34 @@ SYSCTL_PROC(_kperf, OID_AUTO, reset, 0, kperf_sysctl, "-", "Reset kperf"); SYSCTL_PROC(_kperf, OID_AUTO, blessed_pid, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, /* must be root */ + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED /* must be root */ + | CTLFLAG_MASKED, (void *)REQ_BLESS, sizeof(int), kperf_sysctl_bless_handler, "I", "Blessed pid"); SYSCTL_PROC(_kperf, OID_AUTO, blessed_preempt, - CTLTYPE_INT | 
CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED | + CTLFLAG_MASKED, (void *)REQ_BLESS_PREEMPT, sizeof(int), kperf_sysctl, "I", "Blessed preemption"); SYSCTL_PROC(_kperf, OID_AUTO, kdbg_cswitch, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED + | CTLFLAG_MASKED, (void *)REQ_KDBG_CSWITCH, sizeof(int), kperf_sysctl, "I", "Generate context switch info"); SYSCTL_PROC(_kperf, OID_AUTO, pet_idle_rate, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED + | CTLFLAG_MASKED, (void *)REQ_PET_IDLE_RATE, sizeof(int), kperf_sysctl, "I", "Rate at which unscheduled threads are forced to be sampled in " "PET mode"); SYSCTL_PROC(_kperf, OID_AUTO, lightweight_pet, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED + | CTLFLAG_MASKED, (void *)REQ_LIGHTWEIGHT_PET, sizeof(int), kperf_sysctl, "I", "Status of lightweight PET mode"); @@ -679,16 +794,18 @@ SYSCTL_PROC(_kperf, OID_AUTO, lightweight_pet, SYSCTL_NODE(_kperf, OID_AUTO, limits, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "limits"); -#define REQ_LIM_PERIOD_NS (1) -#define REQ_LIM_BG_PERIOD_NS (2) -#define REQ_LIM_PET_PERIOD_NS (3) -#define REQ_LIM_BG_PET_PERIOD_NS (4) +enum kperf_limit_request { + REQ_LIM_PERIOD_NS, + REQ_LIM_BG_PERIOD_NS, + REQ_LIM_PET_PERIOD_NS, + REQ_LIM_BG_PET_PERIOD_NS, +}; static int kperf_sysctl_limits SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg2) - int type = (int)arg1; + enum kperf_limit_request type = (enum kperf_limit_request)arg1; uint64_t limit = 0; switch (type) { diff --git a/osfmk/kperf/kperfbsd.h b/osfmk/kperf/kperfbsd.h index 16bfb7c91..2e71d403c 100644 --- a/osfmk/kperf/kperfbsd.h +++ b/osfmk/kperf/kperfbsd.h @@ -29,9 +29,7 @@ #ifndef __KPERF_BSD_H__ #define __KPERF_BSD_H__ -/* bless a process to allow kperf access to a 
non-root process - */ -extern int kperf_bless_pid(pid_t newpid); - +/* bless a process to allow kperf access to a non-root process */ +int kperf_bless_pid(pid_t newpid); #endif /* __KPERF_BSD_H__ */ diff --git a/osfmk/kperf/lazy.c b/osfmk/kperf/lazy.c new file mode 100644 index 000000000..78e01b206 --- /dev/null +++ b/osfmk/kperf/lazy.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include + +#include +#include +#include +#include +#include + +unsigned int kperf_lazy_wait_action = 0; +unsigned int kperf_lazy_cpu_action = 0; +uint64_t kperf_lazy_wait_time_threshold = 0; +uint64_t kperf_lazy_cpu_time_threshold = 0; + +void +kperf_lazy_reset(void) +{ + kperf_lazy_wait_action = 0; + kperf_lazy_wait_time_threshold = 0; + kperf_lazy_cpu_action = 0; + kperf_lazy_cpu_time_threshold = 0; + kperf_on_cpu_update(); +} + +void +kperf_lazy_off_cpu(thread_t thread) +{ + /* try to lazily sample the CPU if the thread was pre-empted */ + if ((thread->reason & AST_SCHEDULING) != 0) { + kperf_lazy_cpu_sample(thread, 0, 0); + } +} + +void +kperf_lazy_make_runnable(thread_t thread, bool in_interrupt) +{ + assert(thread->last_made_runnable_time != THREAD_NOT_RUNNABLE); + /* ignore threads that race to wait and in waking up */ + if (thread->last_run_time > thread->last_made_runnable_time) { + return; + } + + uint64_t wait_time = thread_get_last_wait_duration(thread); + if (wait_time > kperf_lazy_wait_time_threshold) { + BUF_DATA(PERF_LZ_MKRUNNABLE, (uintptr_t)thread_tid(thread), + thread->sched_pri, in_interrupt ? 
1 : 0); + } +} + +void +kperf_lazy_wait_sample(thread_t thread, thread_continue_t continuation, + uintptr_t *starting_fp) +{ + /* ignore idle threads */ + if (thread->last_made_runnable_time == THREAD_NOT_RUNNABLE) { + return; + } + /* ignore invalid made runnable times */ + if (thread->last_made_runnable_time < thread->last_run_time) { + return; + } + + /* take a sample if thread was waiting for longer than threshold */ + uint64_t wait_time = thread_get_last_wait_duration(thread); + if (wait_time > kperf_lazy_wait_time_threshold) { + uint64_t time_now = mach_absolute_time(); + timer_update(&thread->runnable_timer, time_now); + timer_update(&thread->system_timer, time_now); + + uint64_t runnable_time = timer_grab(&thread->runnable_timer); + uint64_t running_time = timer_grab(&thread->user_timer) + + timer_grab(&thread->system_timer); + + BUF_DATA(PERF_LZ_WAITSAMPLE, wait_time, runnable_time, running_time); + + task_t task = get_threadtask(thread); + struct kperf_context ctx = { + .cur_thread = thread, + .cur_task = task, + .cur_pid = task_pid(task), + .trigger_type = TRIGGER_TYPE_LAZY_WAIT, + .starting_fp = starting_fp, + }; + + struct kperf_sample *sample = kperf_intr_sample_buffer(); + if (!sample) { + return; + } + + unsigned int flags = SAMPLE_FLAG_PEND_USER; + flags |= continuation ? SAMPLE_FLAG_CONTINUATION : 0; + flags |= !ml_at_interrupt_context() ? 
SAMPLE_FLAG_NON_INTERRUPT : 0; + + kperf_sample(sample, &ctx, kperf_lazy_wait_action, flags); + } +} + +void +kperf_lazy_cpu_sample(thread_t thread, unsigned int flags, bool interrupt) +{ + assert(ml_get_interrupts_enabled() == FALSE); + + /* take a sample if this CPU's last sample time is beyond the threshold */ + processor_t processor = current_processor(); + uint64_t time_now = mach_absolute_time(); + uint64_t since_last_sample = time_now - processor->kperf_last_sample_time; + if (since_last_sample > kperf_lazy_cpu_time_threshold) { + processor->kperf_last_sample_time = time_now; + timer_update(&thread->runnable_timer, time_now); + timer_update(&thread->system_timer, time_now); + + uint64_t runnable_time = timer_grab(&thread->runnable_timer); + uint64_t running_time = timer_grab(&thread->user_timer) + + timer_grab(&thread->system_timer); + + BUF_DATA(PERF_LZ_CPUSAMPLE, running_time, runnable_time, + thread->sched_pri, interrupt ? 1 : 0); + + task_t task = get_threadtask(thread); + struct kperf_context ctx = { + .cur_thread = thread, + .cur_task = task, + .cur_pid = task_pid(task), + .trigger_type = TRIGGER_TYPE_LAZY_CPU, + .starting_fp = 0, + }; + + struct kperf_sample *sample = kperf_intr_sample_buffer(); + if (!sample) { + return; + } + + kperf_sample(sample, &ctx, kperf_lazy_cpu_action, + SAMPLE_FLAG_PEND_USER | flags); + } +} + +/* + * Accessors for configuration. 
+ */ + +int kperf_lazy_get_wait_action(void) { return kperf_lazy_wait_action; } + +int +kperf_lazy_set_wait_action(int action_id) +{ + if (action_id < 0 || (unsigned int)action_id > kperf_action_get_count()) { + return 1; + } + + kperf_lazy_wait_action = action_id; + kperf_on_cpu_update(); + return 0; +} + +uint64_t +kperf_lazy_get_wait_time_threshold(void) +{ + return kperf_lazy_wait_time_threshold; +} + +int +kperf_lazy_set_wait_time_threshold(uint64_t threshold) +{ + kperf_lazy_wait_time_threshold = threshold; + return 0; +} + +int kperf_lazy_get_cpu_action(void) { return kperf_lazy_cpu_action; } + +int +kperf_lazy_set_cpu_action(int action_id) +{ + if (action_id < 0 || (unsigned int)action_id > kperf_action_get_count()) { + return 1; + } + + kperf_lazy_cpu_action = action_id; + return 0; +} + +uint64_t +kperf_lazy_get_cpu_time_threshold(void) +{ + return kperf_lazy_cpu_time_threshold; +} + +int +kperf_lazy_set_cpu_time_threshold(uint64_t threshold) +{ + kperf_lazy_cpu_time_threshold = threshold; + return 0; +} + diff --git a/osfmk/kperf/arm/kperf_meminfo.c b/osfmk/kperf/lazy.h similarity index 55% rename from osfmk/kperf/arm/kperf_meminfo.c rename to osfmk/kperf/lazy.h index e9d6b1049..c09fabe7c 100644 --- a/osfmk/kperf/arm/kperf_meminfo.c +++ b/osfmk/kperf/lazy.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2015 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. 
- * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,33 +22,35 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include /* task_ledgers */ -#include -#include +#ifndef KPERF_LAZY_H +#define KPERF_LAZY_H -#include +#include +#include +#include -kern_return_t -kperf_get_phys_footprint(task_t task, uint64_t *phys_footprint_out) -{ - kern_return_t kr; - ledger_amount_t credit, debit; - uint64_t phys_footprint; +extern unsigned int kperf_lazy_wait_action; +extern unsigned int kperf_lazy_cpu_action; - kr = ledger_get_entries(task->ledger, task_ledgers.phys_footprint, - &credit, &debit); - if (kr == KERN_SUCCESS) { - phys_footprint = credit - debit; - } else { - return kr; - } +void kperf_lazy_reset(void); +void kperf_lazy_off_cpu(thread_t thread); +void kperf_lazy_make_runnable(thread_t thread, bool in_interrupt); +void kperf_lazy_wait_sample(thread_t thread, + thread_continue_t continuation, uintptr_t *starting_fp); +void kperf_lazy_cpu_sample(thread_t thread, unsigned int flags, bool interrupt); - *phys_footprint_out = phys_footprint; - return KERN_SUCCESS; -} +/* accessors for configuration */ +int kperf_lazy_get_wait_action(void); +int kperf_lazy_get_cpu_action(void); +int kperf_lazy_set_wait_action(int action_id); +int kperf_lazy_set_cpu_action(int action_id); +uint64_t kperf_lazy_get_wait_time_threshold(void); +uint64_t kperf_lazy_get_cpu_time_threshold(void); +int kperf_lazy_set_wait_time_threshold(uint64_t threshold); +int kperf_lazy_set_cpu_time_threshold(uint64_t threshold); 
+#endif /* !defined(KPERF_LAZY_H) */ diff --git a/osfmk/kperf/meminfo.c b/osfmk/kperf/meminfo.c index 15de26436..03616d085 100644 --- a/osfmk/kperf/meminfo.c +++ b/osfmk/kperf/meminfo.c @@ -2,7 +2,7 @@ * Copyright (c) 2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,17 +22,15 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ #include #include /* task_ledgers */ -#include #include #include -#include #include #include @@ -40,28 +38,16 @@ /* collect current memory info */ void -kperf_meminfo_sample(struct meminfo *mi, struct kperf_context *context) +kperf_meminfo_sample(task_t task, struct meminfo *mi) { - task_t task; ledger_amount_t credit, debit; - uint64_t phys_footprint; kern_return_t kr; - assert(mi); - assert(context); - - thread_t thread = context->cur_thread; + assert(mi != NULL); - BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread)); + BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_START); - task = get_threadtask(thread); - - kr = kperf_get_phys_footprint(task, &phys_footprint); - if (kr == KERN_SUCCESS) { - mi->phys_footprint = phys_footprint; - } else { - mi->phys_footprint = UINT64_MAX; - } + mi->phys_footprint = get_task_phys_footprint(task); kr = ledger_get_entries(task->ledger, task_ledgers.purgeable_volatile, &credit, &debit); @@ -80,7 +66,7 @@ kperf_meminfo_sample(struct meminfo *mi, struct kperf_context *context) mi->purgeable_volatile_compressed = UINT64_MAX; } - BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread)); + BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_END); } /* log an existing sample into the buffer */ @@ -90,4 +76,3 @@ kperf_meminfo_log(struct meminfo *mi) BUF_DATA(PERF_MI_DATA, mi->phys_footprint, mi->purgeable_volatile, mi->purgeable_volatile_compressed); } - diff --git a/osfmk/kperf/meminfo.h b/osfmk/kperf/meminfo.h index 5103e1ef4..a51c1794f 100644 --- a/osfmk/kperf/meminfo.h +++ b/osfmk/kperf/meminfo.h @@ -31,6 +31,7 @@ #include #include +#include struct meminfo { uint64_t phys_footprint; @@ -39,7 +40,7 @@ struct meminfo { }; struct kperf_context; -extern void kperf_meminfo_sample(struct meminfo *, struct kperf_context *); +extern void kperf_meminfo_sample(task_t, struct meminfo *); extern void kperf_meminfo_log(struct meminfo *mi); #endif /* __KPERF_MEMINFO_H__ */ diff --git 
a/osfmk/kperf/pet.c b/osfmk/kperf/pet.c index 5af12821c..0bfb626ce 100644 --- a/osfmk/kperf/pet.c +++ b/osfmk/kperf/pet.c @@ -120,7 +120,8 @@ static kern_return_t pet_threads_prepare(task_t task); static void pet_sample_all_tasks(uint32_t idle_rate); static void pet_sample_task(task_t task, uint32_t idle_rate); -static void pet_sample_thread(int pid, thread_t thread, uint32_t idle_rate); +static void pet_sample_thread(int pid, task_t task, thread_t thread, + uint32_t idle_rate); /* functions called by other areas of kperf */ @@ -161,9 +162,11 @@ kperf_pet_on_cpu(thread_t thread, thread_continue_t continuation, if (thread->kperf_pet_gen != kperf_pet_gen) { BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START, kperf_pet_gen, thread->kperf_pet_gen); + task_t task = get_threadtask(thread); struct kperf_context ctx = { .cur_thread = thread, - .cur_pid = task_pid(get_threadtask(thread)), + .cur_task = task, + .cur_pid = task_pid(task), .starting_fp = starting_fp, }; /* @@ -345,17 +348,18 @@ pet_thread_loop(void *param, wait_result_t wr) /* sampling */ static void -pet_sample_thread(int pid, thread_t thread, uint32_t idle_rate) +pet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate) { lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED); - uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS; + uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS | SAMPLE_FLAG_THREAD_ONLY; BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START); /* work out the context */ struct kperf_context ctx = { .cur_thread = thread, + .cur_task = task, .cur_pid = pid, }; @@ -441,21 +445,51 @@ pet_sample_task(task_t task, uint32_t idle_rate) BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START); - kern_return_t kr = pet_threads_prepare(task); - if (kr != KERN_SUCCESS) { - BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr); - BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1); + int pid = task_pid(task); + if (kperf_action_has_task(pet_action_id)) { + struct kperf_context ctx = { + .cur_task = task, + .cur_pid = pid, + 
}; + + kperf_sample(pet_sample, &ctx, pet_action_id, SAMPLE_FLAG_TASK_ONLY); + } + + if (!kperf_action_has_thread(pet_action_id)) { + BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END); return; } - int pid = task_pid(task); + kern_return_t kr = KERN_SUCCESS; + + /* + * Suspend the task to see an atomic snapshot of all its threads. This + * is expensive, and disruptive. + */ + bool needs_suspend = task != kernel_task; + if (needs_suspend) { + kr = task_suspend_internal(task); + if (kr != KERN_SUCCESS) { + BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1); + return; + } + needs_suspend = true; + } + + kr = pet_threads_prepare(task); + if (kr != KERN_SUCCESS) { + BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr); + goto out; + } for (unsigned int i = 0; i < pet_threads_count; i++) { thread_t thread = pet_threads[i]; - int cpu; - assert(thread); + assert(thread != THREAD_NULL); - /* do not sample the thread if it was on a CPU during the IPI. */ + /* + * Do not sample the thread if it was on a CPU when the timer fired. 
+ */ + int cpu = 0; for (cpu = 0; cpu < machine_info.logical_cpu_max; cpu++) { if (kperf_tid_on_cpus[cpu] == thread_tid(thread)) { break; @@ -464,12 +498,17 @@ pet_sample_task(task_t task, uint32_t idle_rate) /* the thread was not on a CPU */ if (cpu == machine_info.logical_cpu_max) { - pet_sample_thread(pid, thread, idle_rate); + pet_sample_thread(pid, task, thread, idle_rate); } thread_deallocate(pet_threads[i]); } +out: + if (needs_suspend) { + task_resume_internal(task); + } + BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, pet_threads_count); } @@ -556,18 +595,7 @@ pet_sample_all_tasks(uint32_t idle_rate) for (unsigned int i = 0; i < pet_tasks_count; i++) { task_t task = pet_tasks[i]; - if (task != kernel_task) { - kr = task_suspend_internal(task); - if (kr != KERN_SUCCESS) { - continue; - } - } - pet_sample_task(task, idle_rate); - - if (task != kernel_task) { - task_resume_internal(task); - } } for(unsigned int i = 0; i < pet_tasks_count; i++) { diff --git a/osfmk/kperf/task_samplers.c b/osfmk/kperf/task_samplers.c index f976518d4..3d521b782 100644 --- a/osfmk/kperf/task_samplers.c +++ b/osfmk/kperf/task_samplers.c @@ -33,22 +33,15 @@ #include -extern boolean_t memorystatus_proc_is_dirty_unsafe(void *v); +extern void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, + boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit); void -kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn, - struct kperf_context *ctx) +kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn) { - thread_t thread; - task_t task; - BUF_INFO(PERF_TK_SNAP_SAMPLE | DBG_FUNC_START); assert(tksn != NULL); - assert(ctx != NULL); - - thread = ctx->cur_thread; - task = get_threadtask(thread); tksn->kptksn_flags = 0; if (task->effective_policy.tep_darwinbg) { @@ -61,9 +54,17 @@ kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn, tksn->kptksn_flags |= KPERF_TASK_FLAG_BOOSTED; } #if CONFIG_MEMORYSTATUS - if 
(memorystatus_proc_is_dirty_unsafe(task->bsd_info)) { + boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE; + memorystatus_proc_flags_unsafe(task->bsd_info, &dirty, &dirty_tracked, &allow_idle_exit); + if (dirty) { tksn->kptksn_flags |= KPERF_TASK_FLAG_DIRTY; } + if (dirty_tracked) { + tksn->kptksn_flags |= KPERF_TASK_FLAG_DIRTY_TRACKED; + } + if (allow_idle_exit) { + tksn->kptksn_flags |= KPERF_TASK_ALLOW_IDLE_EXIT; + } #endif tksn->kptksn_suspend_count = task->suspend_count; @@ -96,3 +97,11 @@ kperf_task_snapshot_log(struct kperf_task_snapshot *tksn) LOWER_32(tksn->kptksn_system_time_in_terminated_threads)); #endif /* defined(__LP64__) */ } + +void +kperf_task_info_log(struct kperf_context *ctx) +{ + assert(ctx != NULL); + + BUF_DATA(PERF_TK_INFO_DATA, ctx->cur_pid); +} diff --git a/osfmk/kperf/task_samplers.h b/osfmk/kperf/task_samplers.h index ebebeb552..d47b15bca 100644 --- a/osfmk/kperf/task_samplers.h +++ b/osfmk/kperf/task_samplers.h @@ -30,6 +30,7 @@ #define KPERF_TASK_SAMPLERS_H #include +#include struct kperf_task_snapshot { uint64_t kptksn_flags; @@ -46,9 +47,11 @@ struct kperf_task_snapshot { #define KPERF_TASK_FLAG_WQ_FLAGS_VALID (1U << 4) #define KPERF_TASK_FLAG_WQ_EXCEEDED_TOTAL (1U << 5) #define KPERF_TASK_FLAG_WQ_EXCEEDED_CONSTRAINED (1U << 6) +#define KPERF_TASK_FLAG_DIRTY_TRACKED (1U << 7) +#define KPERF_TASK_ALLOW_IDLE_EXIT (1U << 8) -void kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn, - struct kperf_context *ctx); +void kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn); void kperf_task_snapshot_log(struct kperf_task_snapshot *tksn); +void kperf_task_info_log(struct kperf_context *ctx); #endif /* !defined(KPERF_TASK_SAMPLERS_H) */ diff --git a/osfmk/kperf/thread_samplers.c b/osfmk/kperf/thread_samplers.c index 176520f0c..36e2196fe 100644 --- a/osfmk/kperf/thread_samplers.c +++ b/osfmk/kperf/thread_samplers.c @@ -140,8 +140,8 @@ kperf_thread_scheduling_sample(struct 
kperf_thread_scheduling *thsc, BUF_INFO(PERF_TI_SCHEDSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread)); - thsc->kpthsc_user_time = timer_grab(&(thread->user_timer)); - uint64_t system_time = timer_grab(&(thread->system_timer)); + thsc->kpthsc_user_time = timer_grab(&thread->user_timer); + uint64_t system_time = timer_grab(&thread->system_timer); if (thread->precise_user_kernel_time) { thsc->kpthsc_system_time = system_time; @@ -150,12 +150,14 @@ kperf_thread_scheduling_sample(struct kperf_thread_scheduling *thsc, thsc->kpthsc_system_time = 0; } + thsc->kpthsc_runnable_time = timer_grab(&thread->runnable_timer); thsc->kpthsc_state = thread->state; thsc->kpthsc_base_priority = thread->base_pri; thsc->kpthsc_sched_priority = thread->sched_pri; thsc->kpthsc_effective_qos = thread->effective_policy.thep_qos; thsc->kpthsc_requested_qos = thread->requested_policy.thrp_qos; - thsc->kpthsc_requested_qos_override = thread->requested_policy.thrp_qos_override; + thsc->kpthsc_requested_qos_override = MAX(thread->requested_policy.thrp_qos_override, + thread->requested_policy.thrp_qos_workq_override); thsc->kpthsc_requested_qos_promote = thread->requested_policy.thrp_qos_promote; thsc->kpthsc_requested_qos_ipc_override = thread->requested_policy.thrp_qos_ipc_override; thsc->kpthsc_requested_qos_sync_ipc_override = thread->requested_policy.thrp_qos_sync_ipc_override; @@ -183,6 +185,7 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc) | ((uint64_t)thsc->kpthsc_requested_qos_ipc_override << 55) | ((uint64_t)thsc->kpthsc_requested_qos_sync_ipc_override << 52) ); + BUF_DATA(PERF_TI_SCHEDDATA_3, thsc->kpthsc_runnable_time); #else BUF_DATA(PERF_TI_SCHEDDATA1_32, UPPER_32(thsc->kpthsc_user_time), LOWER_32(thsc->kpthsc_user_time), @@ -200,6 +203,8 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc) | ((uint32_t)thsc->kpthsc_requested_qos_ipc_override << 23) | ((uint32_t)thsc->kpthsc_requested_qos_sync_ipc_override << 20) ); + 
BUF_DATA(PERF_TI_SCHEDDATA3_32, UPPER_32(thsc->kpthsc_runnable_time), + LOWER_32(thsc->kpthsc_runnable_time)); #endif /* defined(__LP64__) */ } @@ -282,7 +287,7 @@ kperf_thread_dispatch_sample(struct kperf_thread_dispatch *thdi, BUF_INFO(PERF_TI_DISPSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread)); task_t task = thread->task; - boolean_t task_64 = task_has_64BitAddr(task); + boolean_t task_64 = task_has_64Bit_addr(task); size_t user_addr_size = task_64 ? 8 : 4; assert(thread->task != kernel_task); @@ -364,7 +369,9 @@ kperf_thread_inscyc_log(struct kperf_context *context) BUF_DATA(PERF_TI_INSCYCDATA_32, 0, 0, UPPER_32(counts[MT_CORE_CYCLES]), LOWER_32(counts[MT_CORE_CYCLES])); #endif /* !defined(__LP64__) */ -#else /* MONOTONIC */ + +#else #pragma unused(context) -#endif /* !MONOTONIC */ +#endif /* MONOTONIC */ + } diff --git a/osfmk/kperf/thread_samplers.h b/osfmk/kperf/thread_samplers.h index e5a9eaeff..f443be7dd 100644 --- a/osfmk/kperf/thread_samplers.h +++ b/osfmk/kperf/thread_samplers.h @@ -47,6 +47,7 @@ void kperf_thread_info_log(struct kperf_thread_info *); struct kperf_thread_scheduling { uint64_t kpthsc_user_time; uint64_t kpthsc_system_time; + uint64_t kpthsc_runnable_time; unsigned int kpthsc_state; uint16_t kpthsc_base_priority; uint16_t kpthsc_sched_priority; diff --git a/osfmk/kperf/x86_64/kperf_meminfo.c b/osfmk/kperf/x86_64/kperf_meminfo.c deleted file mode 100644 index 9ed5acc22..000000000 --- a/osfmk/kperf/x86_64/kperf_meminfo.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2015 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include -#include /* task_ledgers */ -#include -#include - -#include - -kern_return_t -kperf_get_phys_footprint(task_t task, uint64_t *phys_footprint_out) -{ - kern_return_t kr; - ledger_amount_t credit, debit; - uint64_t phys_footprint; - - kr = ledger_get_entries(task->ledger, task_ledgers.internal, - &credit, &debit); - if (kr == KERN_SUCCESS) { - phys_footprint = credit - debit; - } else { - return kr; - } - - kr = ledger_get_entries(task->ledger, task_ledgers.internal_compressed, - &credit, &debit); - if (kr == KERN_SUCCESS) { - phys_footprint += credit - debit; - } else { - return kr; - } - - *phys_footprint_out = phys_footprint; - return KERN_SUCCESS; -} - diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index 9c293a5ce..e728f0a4f 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -39,6 +39,7 @@ MIG_DEFS = \ mach_vm.defs \ mach_voucher.defs \ mach_voucher_attr_control.defs \ + memory_entry.defs \ notify.defs \ processor.defs \ processor_set.defs \ @@ -86,6 +87,7 @@ 
MIG_UUHDRS = \ mach_vm.h \ mach_voucher.h \ mach_voucher_attr_control.h \ + memory_entry.h \ memory_object_control.h \ processor.h \ processor_set.h \ @@ -166,7 +168,6 @@ INSTALL_MI_LIST = \ # installed into System.framework's PrivateHeaders/mach subdirectory PRIVATE_DATAFILES = \ bootstrap.h \ - branch_predicates.h \ coalition.h \ coalition_notification.defs \ host_info.h \ @@ -206,7 +207,6 @@ INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = mach EXPORT_MI_LIST = \ - branch_predicates.h \ coalition.h \ mach_interface.h \ resource_monitors.h \ @@ -303,6 +303,7 @@ MIG_KSHDRS = \ mach_vm_server.h \ mach_voucher_server.h \ mach_voucher_attr_control_server.h \ + memory_entry_server.h \ memory_object_control_server.h \ memory_object_default_server.h \ processor_server.h \ @@ -327,6 +328,7 @@ MIG_KSSRC = \ mach_vm_server.c \ mach_voucher_server.c \ mach_voucher_attr_control_server.c \ + memory_entry_server.c \ memory_object_control_server.c \ memory_object_default_server.c \ processor_server.c \ diff --git a/osfmk/mach/arm/_structs.h b/osfmk/mach/arm/_structs.h index b7db3d2b6..239230760 100644 --- a/osfmk/mach/arm/_structs.h +++ b/osfmk/mach/arm/_structs.h @@ -112,11 +112,58 @@ _STRUCT_ARM_THREAD_STATE64 __uint64_t fp; /* Frame pointer x29 */ __uint64_t lr; /* Link register x30 */ __uint64_t sp; /* Stack pointer x31 */ - __uint64_t pc; /* Program counter */ + __uint64_t pc; /* Program counter */ __uint32_t cpsr; /* Current program status register */ __uint32_t __pad; /* Same size for 32-bit or 64-bit clients */ }; #endif /* __DARWIN_UNIX03 */ +#if !defined(KERNEL) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) +#if __DARWIN_UNIX03 +#define __darwin_arm_thread_state64_get_pc(ts) \ + ((ts).__pc) +#define __darwin_arm_thread_state64_get_pc_fptr(ts) \ + ((void*)(uintptr_t)((ts).__pc)) +#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \ + ((ts).__pc = (uintptr_t)(fptr)) +#define __darwin_arm_thread_state64_get_lr(ts) \ + ((ts).__lr) +#define 
__darwin_arm_thread_state64_get_lr_fptr(ts) \ + ((void*)(uintptr_t)((ts).__lr)) +#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \ + ((ts).__lr = (uintptr_t)(fptr)) +#define __darwin_arm_thread_state64_get_sp(ts) \ + ((ts).__sp) +#define __darwin_arm_thread_state64_set_sp(ts, ptr) \ + ((ts).__sp = (uintptr_t)(ptr)) +#define __darwin_arm_thread_state64_get_fp(ts) \ + ((ts).__fp) +#define __darwin_arm_thread_state64_set_fp(ts, ptr) \ + ((ts).__fp = (uintptr_t)(ptr)) +#else /* !__DARWIN_UNIX03 */ +#define __darwin_arm_thread_state64_get_pc(ts) \ + ((ts).pc) +#define __darwin_arm_thread_state64_get_pc_fptr(ts) \ + ((void*)(uintptr_t)((ts).pc)) +#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \ + ((ts).pc = (uintptr_t)(fptr)) +#define __darwin_arm_thread_state64_get_lr(ts) \ + ((ts).lr) +#define __darwin_arm_thread_state64_get_lr_fptr(ts) \ + ((void*)(uintptr_t)((ts).lr)) +#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \ + ((ts).lr = (uintptr_t)(fptr)) +#define __darwin_arm_thread_state64_get_sp(ts) \ + ((ts).sp) +#define __darwin_arm_thread_state64_set_sp(ts, ptr) \ + ((ts).sp = (uintptr_t)(ptr)) +#define __darwin_arm_thread_state64_get_fp(ts) \ + ((ts).fp) +#define __darwin_arm_thread_state64_set_fp(ts, ptr) \ + ((ts).fp = (uintptr_t)(ptr)) +#endif /* __DARWIN_UNIX03 */ +#endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */ +#endif /* !defined(KERNEL) */ #if __DARWIN_UNIX03 #define _STRUCT_ARM_VFP_STATE struct __darwin_arm_vfp_state diff --git a/osfmk/mach/arm/sdt_isa.h b/osfmk/mach/arm/sdt_isa.h index 318134d1f..0751024a2 100644 --- a/osfmk/mach/arm/sdt_isa.h +++ b/osfmk/mach/arm/sdt_isa.h @@ -51,18 +51,20 @@ #define DTRACE_LABEL(p, n) \ ".pushsection __DATA, __data\n\t" \ - ".globl " DTRACE_LAB(p, n) "\n\t" \ - DTRACE_LAB(p, n) ":" ".long 1f""\n\t" \ + ".p2align 2\n\t" \ + ".globl " DTRACE_LAB(p, n) "\n\t" \ + DTRACE_LAB(p, n) ":" ".long 1f""\n\t" \ ".popsection" "\n\t" \ "1:" #else /* __arm64__ */ #define 
DTRACE_LAB(p, n) \ - "__dtrace_probe$" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n) + "__dtrace_probe$" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n) #define DTRACE_LABEL(p, n) \ ".pushsection __DATA, __data\n\t" \ - ".globl " DTRACE_LAB(p, n) "\n\t" \ - DTRACE_LAB(p, n) ":" ".quad 1f""\n\t" \ + ".p2align 3\n\t" \ + ".globl " DTRACE_LAB(p, n) "\n\t" \ + DTRACE_LAB(p, n) ":" ".quad 1f""\n\t" \ ".popsection" "\n\t" \ "1:" #endif diff --git a/osfmk/mach/arm/thread_status.h b/osfmk/mach/arm/thread_status.h index 7f4ac7d04..8bdbe8a9e 100644 --- a/osfmk/mach/arm/thread_status.h +++ b/osfmk/mach/arm/thread_status.h @@ -96,6 +96,31 @@ typedef _STRUCT_ARM_THREAD_STATE arm_thread_state_t; typedef _STRUCT_ARM_THREAD_STATE arm_thread_state32_t; typedef _STRUCT_ARM_THREAD_STATE64 arm_thread_state64_t; +#if !defined(KERNEL) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) +#define arm_thread_state64_get_pc(ts) \ + __darwin_arm_thread_state64_get_pc(ts) +#define arm_thread_state64_get_pc_fptr(ts) \ + __darwin_arm_thread_state64_get_pc_fptr(ts) +#define arm_thread_state64_set_pc_fptr(ts, fptr) \ + __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) +#define arm_thread_state64_get_lr(ts) \ + __darwin_arm_thread_state64_get_lr(ts) +#define arm_thread_state64_get_lr_fptr(ts) \ + __darwin_arm_thread_state64_get_lr_fptr(ts) +#define arm_thread_state64_set_lr_fptr(ts, fptr) \ + __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) +#define arm_thread_state64_get_sp(ts) \ + __darwin_arm_thread_state64_get_sp(ts) +#define arm_thread_state64_set_sp(ts, ptr) \ + __darwin_arm_thread_state64_set_sp(ts, ptr) +#define arm_thread_state64_get_fp(ts) \ + __darwin_arm_thread_state64_get_fp(ts) +#define arm_thread_state64_set_fp(ts, ptr) \ + __darwin_arm_thread_state64_set_fp(ts, ptr) +#endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */ +#endif /* !defined(KERNEL) */ + struct arm_unified_thread_state { arm_state_hdr_t ash; union { @@ -364,12 
+389,12 @@ typedef struct arm_saved_state32_tagged arm_saved_state32_tagged_t; (sizeof (arm_saved_state32_t)/sizeof(unsigned int))) struct arm_saved_state64 { - uint64_t x[29]; /* General purpose registers x0-x28 */ - uint64_t fp; /* Frame pointer x29 */ - uint64_t lr; /* Link register x30 */ - uint64_t sp; /* Stack pointer x31 */ - uint64_t pc; /* Program counter */ - uint32_t cpsr; /* Current program status register */ + uint64_t x[29]; /* General purpose registers x0-x28 */ + uint64_t fp; /* Frame pointer x29 */ + uint64_t lr; /* Link register x30 */ + uint64_t sp; /* Stack pointer x31 */ + uint64_t pc; /* Program counter */ + uint32_t cpsr; /* Current program status register */ uint32_t reserved; /* Reserved padding */ uint64_t far; /* Virtual fault address */ uint32_t esr; /* Exception syndrome register */ diff --git a/osfmk/mach/arm/vm_param.h b/osfmk/mach/arm/vm_param.h index 915e237e6..c0ed53a8e 100644 --- a/osfmk/mach/arm/vm_param.h +++ b/osfmk/mach/arm/vm_param.h @@ -166,6 +166,7 @@ extern unsigned PAGE_SHIFT_CONST; #ifdef KERNEL #if defined (__arm__) +#define VM_KERNEL_POINTER_SIGNIFICANT_BITS 32 #define VM_MIN_KERNEL_ADDRESS ((vm_address_t) 0x80000000) #define VM_MAX_KERNEL_ADDRESS ((vm_address_t) 0xFFFEFFFF) #define VM_HIGH_KERNEL_WINDOW ((vm_address_t) 0xFFFE0000) @@ -174,6 +175,7 @@ extern unsigned PAGE_SHIFT_CONST; * The minimum and maximum kernel address; some configurations may * constrain the address space further. 
*/ +#define VM_KERNEL_POINTER_SIGNIFICANT_BITS 37 #define VM_MIN_KERNEL_ADDRESS ((vm_address_t) 0xffffffe000000000ULL) #define VM_MAX_KERNEL_ADDRESS ((vm_address_t) 0xfffffff3ffffffffULL) #else @@ -183,8 +185,11 @@ extern unsigned PAGE_SHIFT_CONST; #define VM_MIN_KERNEL_AND_KEXT_ADDRESS \ VM_MIN_KERNEL_ADDRESS -#define VM_KERNEL_ADDRESS(va) ((((vm_address_t)(va))>=VM_MIN_KERNEL_ADDRESS) && \ - (((vm_address_t)(va))<=VM_MAX_KERNEL_ADDRESS)) +#define VM_KERNEL_STRIP_PTR(_v) (_v) + +#define VM_KERNEL_ADDRESS(_va) \ + ((((vm_address_t)VM_KERNEL_STRIP_PTR(_va)) >= VM_MIN_KERNEL_ADDRESS) && \ + (((vm_address_t)VM_KERNEL_STRIP_PTR(_va)) <= VM_MAX_KERNEL_ADDRESS)) #ifdef MACH_KERNEL_PRIVATE /* @@ -193,22 +198,41 @@ extern unsigned PAGE_SHIFT_CONST; extern unsigned long gVirtBase, gPhysBase, gPhysSize; #define isphysmem(a) (((vm_address_t)(a) - gPhysBase) < gPhysSize) -#define phystokv(a) ((vm_address_t)(a) - gPhysBase + gVirtBase) #if KASAN /* Increase the stack sizes to account for the redzones that get added to every * stack object. */ # define KERNEL_STACK_SIZE (4*4*4096) -# define INTSTACK_SIZE (4*4*4096) #else # define KERNEL_STACK_SIZE (4*4096) -# define INTSTACK_SIZE (4*4096) +#endif + +#define INTSTACK_SIZE (4*4096) + +#ifdef __arm64__ +#define EXCEPSTACK_SIZE (4*4096) +#else +#define FIQSTACK_SIZE (4096) #endif #if defined (__arm__) #define HIGH_EXC_VECTORS ((vm_address_t) 0xFFFF0000) #endif +/* + * TODO: We're hardcoding the expected virtual TEXT base here; + * that gives us an ugly dependency on a linker argument in + * the make files. Clean this up, so we don't hardcode it + * twice; this is nothing but trouble. 
+ */ +#if defined (__arm__) +#define VM_KERNEL_LINK_ADDRESS ((vm_address_t) 0x80000000) +#elif defined (__arm64__) +#define VM_KERNEL_LINK_ADDRESS ((vm_address_t) 0xFFFFFFF007004000) +#else +#error architecture not supported +#endif + #endif /* MACH_KERNEL_PRIVATE */ #endif /* KERNEL */ diff --git a/osfmk/mach/exc.defs b/osfmk/mach/exc.defs index c412d3192..734e7408f 100644 --- a/osfmk/mach/exc.defs +++ b/osfmk/mach/exc.defs @@ -61,6 +61,10 @@ */ subsystem +#if KERNEL_SERVER + KernelServer +#endif /* KERNEL_SERVER */ + #if KERNEL_USER KernelUser #endif @@ -80,6 +84,14 @@ routine exception_raise( task : mach_port_t; exception : exception_type_t; code : exception_data_t +#if EXC_SERVER_SECTOKEN + ; + ServerSecToken stoken : security_token_t +#endif +#if EXC_SERVER_AUDITTOKEN + ; + ServerAuditToken atoken: audit_token_t +#endif ); routine exception_raise_state( @@ -88,7 +100,16 @@ routine exception_raise_state( code : exception_data_t, const; inout flavor : int; old_state : thread_state_t, const; - out new_state : thread_state_t); + out new_state : thread_state_t +#if EXC_SERVER_SECTOKEN + ; + ServerSecToken stoken : security_token_t +#endif +#if EXC_SERVER_AUDITTOKEN + ; + ServerAuditToken atoken: audit_token_t +#endif + ); routine exception_raise_state_identity( exception_port : mach_port_t; @@ -98,6 +119,15 @@ routine exception_raise_state_identity( code : exception_data_t; inout flavor : int; old_state : thread_state_t; - out new_state : thread_state_t); + out new_state : thread_state_t +#if EXC_SERVER_SECTOKEN + ; + ServerSecToken stoken : security_token_t +#endif +#if EXC_SERVER_AUDITTOKEN + ; + ServerAuditToken atoken: audit_token_t +#endif + ); /* vim: set ft=c : */ diff --git a/osfmk/mach/host_info.h b/osfmk/mach/host_info.h index 9339fad37..716c17960 100644 --- a/osfmk/mach/host_info.h +++ b/osfmk/mach/host_info.h @@ -99,7 +99,8 @@ typedef integer_t host_flavor_t; #define HOST_MACH_MSG_TRAP 8 /* Has mach_msg_trap */ #define HOST_VM_PURGABLE 9 /* purg'e'able 
memory info */ #define HOST_DEBUG_INFO_INTERNAL 10 /* Used for kernel internal development tests only */ -#define HOST_CAN_HAS_DEBUGGER 11 +#define HOST_CAN_HAS_DEBUGGER 11 +#define HOST_PREFERRED_USER_ARCH 12 /* Get the preferred user-space architecture */ #ifdef MACH_KERNEL_PRIVATE struct host_basic_info_old { @@ -260,6 +261,16 @@ typedef struct host_cpu_load_info *host_cpu_load_info_t; #define HOST_CPU_LOAD_INFO_COUNT ((mach_msg_type_number_t) \ (sizeof (host_cpu_load_info_data_t) / sizeof (integer_t))) +struct host_preferred_user_arch { + cpu_type_t cpu_type; /* Preferred user-space cpu type */ + cpu_subtype_t cpu_subtype; /* Preferred user-space cpu subtype */ +}; + +typedef struct host_preferred_user_arch host_preferred_user_arch_data_t; +typedef struct host_preferred_user_arch *host_preferred_user_arch_t; +#define HOST_PREFERRED_USER_ARCH_COUNT ((mach_msg_type_number_t) \ + (sizeof(host_preferred_user_arch_data_t)/sizeof(integer_t))) + #ifdef PRIVATE /* * CPU Statistics information diff --git a/osfmk/mach/host_special_ports.h b/osfmk/mach/host_special_ports.h index 82ed8d003..8c97b882e 100644 --- a/osfmk/mach/host_special_ports.h +++ b/osfmk/mach/host_special_ports.h @@ -69,6 +69,8 @@ */ #define HOST_SECURITY_PORT 0 +#define HOST_MIN_SPECIAL_PORT HOST_SECURITY_PORT + /* * Always provided by kernel (cannot be set from user-space). 
*/ @@ -77,6 +79,8 @@ #define HOST_IO_MASTER_PORT 3 #define HOST_MAX_SPECIAL_KERNEL_PORT 7 /* room to grow */ +#define HOST_LAST_SPECIAL_KERNEL_PORT HOST_IO_MASTER_PORT + /* * Not provided by kernel */ @@ -101,9 +105,10 @@ #define HOST_NODE_PORT (19 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_RESOURCE_NOTIFY_PORT (20 + HOST_MAX_SPECIAL_KERNEL_PORT) #define HOST_CLOSURED_PORT (21 + HOST_MAX_SPECIAL_KERNEL_PORT) +#define HOST_SYSPOLICYD_PORT (22 + HOST_MAX_SPECIAL_KERNEL_PORT) -#define HOST_MAX_SPECIAL_PORT HOST_CLOSURED_PORT - /* MAX = last since rdar://19421223 */ +#define HOST_MAX_SPECIAL_PORT HOST_SYSPOLICYD_PORT + /* MAX = last since rdar://35861175 */ /* obsolete name */ #define HOST_CHUD_PORT HOST_LAUNCHCTL_PORT @@ -249,6 +254,12 @@ #define host_set_closured_port(host, port) \ (host_set_special_port((host), HOST_CLOSURED_PORT, (port))) +#define host_get_syspolicyd_port(host, port) \ + (host_get_special_port((host), \ + HOST_LOCAL_NODE, HOST_SYSPOLICYD_PORT, (port))) +#define host_set_syspolicyd_port(host, port) \ + (host_set_special_port((host), HOST_SYSPOLICYD_PORT, (port))) + /* HOST_RESOURCE_NOTIFY_PORT doesn't #defines these conveniences. 
All lookups go through send_resource_violation() */ diff --git a/osfmk/mach/i386/vm_param.h b/osfmk/mach/i386/vm_param.h index 040472dfe..18a8fcab8 100644 --- a/osfmk/mach/i386/vm_param.h +++ b/osfmk/mach/i386/vm_param.h @@ -194,6 +194,7 @@ #define KERNEL_IMAGE_TO_PHYS(x) (x) +#define VM_KERNEL_POINTER_SIGNIFICANT_BITS 39 #define VM_MIN_KERNEL_ADDRESS ((vm_offset_t) 0xFFFFFF8000000000UL) #define VM_MIN_KERNEL_PAGE ((ppnum_t)0) #define VM_MIN_KERNEL_AND_KEXT_ADDRESS (VM_MIN_KERNEL_ADDRESS - 0x80000000ULL) @@ -203,6 +204,8 @@ #define KEXT_ALLOC_BASE(x) ((x) - KEXT_ALLOC_MAX_OFFSET) #define KEXT_ALLOC_SIZE(x) (KEXT_ALLOC_MAX_OFFSET - (x)) +#define VM_KERNEL_STRIP_PTR(_v) (_v) + #define VM_KERNEL_ADDRESS(va) ((((vm_address_t)(va))>=VM_MIN_KERNEL_AND_KEXT_ADDRESS) && \ (((vm_address_t)(va))<=VM_MAX_KERNEL_ADDRESS)) @@ -215,6 +218,9 @@ * stack object. */ # define INTSTACK_SIZE (I386_PGBYTES*4*4) # define KERNEL_STACK_SIZE (I386_PGBYTES*4*4) +#elif DEBUG +# define INTSTACK_SIZE (I386_PGBYTES*4) +# define KERNEL_STACK_SIZE (I386_PGBYTES*6) #else # define INTSTACK_SIZE (I386_PGBYTES*4) # define KERNEL_STACK_SIZE (I386_PGBYTES*4) @@ -277,7 +283,7 @@ MACRO_END #define IS_USERADDR64_CANONICAL(addr) \ - ((addr) < (VM_MAX_USER_PAGE_ADDRESS + PAGE_SIZE)) + ((addr) < (VM_MAX_USER_PAGE_ADDRESS)) #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/mach/kmod.h b/osfmk/mach/kmod.h index 99449b7bf..412246592 100644 --- a/osfmk/mach/kmod.h +++ b/osfmk/mach/kmod.h @@ -175,6 +175,7 @@ extern void kmod_panic_dump(vm_offset_t * addr, unsigned int dump_cnt); * flag overrides system mode in dtrace_modload(). 
*/ #define KMOD_DTRACE_FORCE_INIT 0x01 +#define KMOD_DTRACE_STATIC_KEXT 0x02 #endif /* CONFIG_DTRACE */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/mach/mach_exc.defs b/osfmk/mach/mach_exc.defs index a2a7669da..5ce6427bc 100644 --- a/osfmk/mach/mach_exc.defs +++ b/osfmk/mach/mach_exc.defs @@ -64,6 +64,10 @@ subsystem #if KERNEL_USER KernelUser #endif +#if KERNEL_SERVER + KernelServer +#endif /* KERNEL_SERVER */ + mach_exc 2405; #include @@ -80,6 +84,14 @@ routine mach_exception_raise( task : mach_port_t; exception : exception_type_t; code : mach_exception_data_t +#if MACH_EXC_SERVER_SECTOKEN + ; + ServerSecToken stoken : security_token_t +#endif +#if MACH_EXC_SERVER_AUDITTOKEN + ; + ServerAuditToken atoken: audit_token_t +#endif ); routine mach_exception_raise_state( @@ -88,7 +100,16 @@ routine mach_exception_raise_state( code : mach_exception_data_t, const; inout flavor : int; old_state : thread_state_t, const; - out new_state : thread_state_t); + out new_state : thread_state_t +#if MACH_EXC_SERVER_SECTOKEN + ; + ServerSecToken stoken : security_token_t +#endif +#if MACH_EXC_SERVER_AUDITTOKEN + ; + ServerAuditToken atoken: audit_token_t +#endif + ); routine mach_exception_raise_state_identity( exception_port : mach_port_t; @@ -98,6 +119,15 @@ routine mach_exception_raise_state_identity( code : mach_exception_data_t; inout flavor : int; old_state : thread_state_t; - out new_state : thread_state_t); + out new_state : thread_state_t +#if MACH_EXC_SERVER_SECTOKEN + ; + ServerSecToken stoken : security_token_t +#endif +#if MACH_EXC_SERVER_AUDITTOKEN + ; + ServerAuditToken atoken: audit_token_t +#endif + ); /* vim: set ft=c : */ diff --git a/osfmk/mach/mach_host.defs b/osfmk/mach/mach_host.defs index 5ca0e125e..83d485388 100644 --- a/osfmk/mach/mach_host.defs +++ b/osfmk/mach/mach_host.defs @@ -375,4 +375,31 @@ routine mach_zone_info_for_largest_zone( skip; #endif +#ifdef PRIVATE +/* + * Returns names of zones that have zlog enabled. 
+ */ +routine mach_zone_get_zlog_zones( + host : host_priv_t; + out names : mach_zone_name_array_t, + Dealloc); +#else +skip; +#endif + +#ifdef PRIVATE +/* + * Returns BTLog records for a specific zone. + * The zone name is passed in via the argument name, + * recs returns an array of zone_btrecord_t's. + */ +routine mach_zone_get_btlog_records( + host : host_priv_t; + name : mach_zone_name_t; + out recs : zone_btrecord_array_t, + Dealloc); +#else +skip; +#endif + /* vim: set ft=c : */ diff --git a/osfmk/mach/mach_port.defs b/osfmk/mach/mach_port.defs index dcb43c76e..5bc503421 100644 --- a/osfmk/mach/mach_port.defs +++ b/osfmk/mach/mach_port.defs @@ -611,4 +611,16 @@ routine mach_port_space_basic_info( task : ipc_space_inspect_t; out basic_info : ipc_info_space_basic_t); +#if KERNEL || !LIBSYSCALL_INTERFACE +/* + * Returns sync ipc turnstile link status + * for special reply ports. + */ +routine mach_port_special_reply_port_reset_link( + task : ipc_space_t; + name : mach_port_name_t; + out srp_lost_link : boolean_t); +#else +skip; +#endif /* vim: set ft=c : */ diff --git a/osfmk/mach/mach_traps.h b/osfmk/mach/mach_traps.h index 462d7972f..9e712b9b1 100644 --- a/osfmk/mach/mach_traps.h +++ b/osfmk/mach/mach_traps.h @@ -222,6 +222,14 @@ extern kern_return_t _kernelrpc_mach_port_insert_right_trap( mach_msg_type_name_t polyPoly ); +extern kern_return_t _kernelrpc_mach_port_get_attributes_trap( + mach_port_name_t target, + mach_port_name_t name, + mach_port_flavor_t flavor, + mach_port_info_t port_info_out, + mach_msg_type_number_t *port_info_outCnt +); + extern kern_return_t _kernelrpc_mach_port_insert_member_trap( mach_port_name_t target, mach_port_name_t name, @@ -714,6 +722,16 @@ struct _kernelrpc_mach_port_insert_right_args { extern kern_return_t _kernelrpc_mach_port_insert_right_trap( struct _kernelrpc_mach_port_insert_right_args *args); +struct _kernelrpc_mach_port_get_attributes_args { + PAD_ARG_(mach_port_name_t, target); + PAD_ARG_(mach_port_name_t, name); + 
PAD_ARG_(mach_port_flavor_t, flavor); + PAD_ARG_(user_addr_t, info); + PAD_ARG_(user_addr_t, count); +}; +extern kern_return_t _kernelrpc_mach_port_get_attributes_trap( + struct _kernelrpc_mach_port_get_attributes_args *args); + struct _kernelrpc_mach_port_insert_member_args { PAD_ARG_(mach_port_name_t, target); PAD_ARG_(mach_port_name_t, name); diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index 87c1ba64f..b7a5d36b6 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -127,8 +127,8 @@ __END_DECLS /* * Capability bits used in the definition of cpu_type. */ -#define CPU_ARCH_MASK 0xff000000 /* mask for architecture bits */ -#define CPU_ARCH_ABI64 0x01000000 /* 64 bit ABI */ +#define CPU_ARCH_MASK 0xff000000 /* mask for architecture bits */ +#define CPU_ARCH_ABI64 0x01000000 /* 64 bit ABI */ /* * Machine types known by all. @@ -151,7 +151,7 @@ __END_DECLS #define CPU_TYPE_MC98000 ((cpu_type_t) 10) #define CPU_TYPE_HPPA ((cpu_type_t) 11) #define CPU_TYPE_ARM ((cpu_type_t) 12) -#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) +#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) #define CPU_TYPE_MC88000 ((cpu_type_t) 13) #define CPU_TYPE_SPARC ((cpu_type_t) 14) #define CPU_TYPE_I860 ((cpu_type_t) 15) @@ -368,6 +368,7 @@ __END_DECLS #define CPU_SUBTYPE_ARM64_ALL ((cpu_subtype_t) 0) #define CPU_SUBTYPE_ARM64_V8 ((cpu_subtype_t) 1) + #endif /* !__ASSEMBLER__ */ /* @@ -407,6 +408,7 @@ __END_DECLS #define CPUFAMILY_ARM_TYPHOON 0x2c91a47e #define CPUFAMILY_ARM_TWISTER 0x92fb37c8 #define CPUFAMILY_ARM_HURRICANE 0x67ceee93 +#define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6 /* The following synonyms are deprecated: */ #define CPUFAMILY_INTEL_6_23 CPUFAMILY_INTEL_PENRYN diff --git a/osfmk/prng/YarrowCoreLib/src/entropysources.h b/osfmk/mach/memory_entry.defs similarity index 66% rename from osfmk/prng/YarrowCoreLib/src/entropysources.h rename to osfmk/mach/memory_entry.defs index 1821acc2d..07e8fa454 100644 --- 
a/osfmk/prng/YarrowCoreLib/src/entropysources.h +++ b/osfmk/mach/memory_entry.defs @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. + * Copyright (c) 2017 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,28 +26,27 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* entropysources.h */ -/* This files contain the defination of the entropy sources */ +subsystem +#if KERNEL_SERVER + KernelServer +#endif /* KERNEL_SERVER */ + memory_entry 4900; -#ifndef __YARROW_ENTROPY_SOURCES_H__ -#define __YARROW_ENTROPY_SOURCES_H__ - -#if defined(macintosh) || defined(__APPLE__) -/* - * In our implementation, all sources are user sources. - */ -enum entropy_sources { - ENTROPY_SOURCES = 0 -}; -#else -enum entropy_sources { - KEYTIMESOURCE = 0, - MOUSETIMESOURCE, - MOUSEMOVESOURCE, - SLOWPOLLSOURCE, - ENTROPY_SOURCES, /* Leave as second to last source */ - MSG_CLOSE_PIPE /* Leave as last source */ -}; +#if !KERNEL && !LIBSYSCALL_INTERFACE + UserPrefix _kernelrpc_; #endif -#endif +#include +#include +#include + +routine mach_memory_entry_purgable_control( + mem_entry : mem_entry_name_port_t; + control : vm_purgable_t; + inout state : int); + +routine mach_memory_entry_access_tracking( + mem_entry : mem_entry_name_port_t; + inout access_tracking : int; + out access_tracking_reads : uint32_t; + out access_tracking_writes : uint32_t); diff --git a/osfmk/mach/message.h b/osfmk/mach/message.h index 13481e1bd..edc862503 100644 --- a/osfmk/mach/message.h +++ b/osfmk/mach/message.h @@ -721,7 +721,9 @@ typedef integer_t mach_msg_option_t; #define MACH_SEND_NODENAP MACH_SEND_NOIMPORTANCE #define MACH_SEND_IMPORTANCE 0x00080000 /* msg carries importance - kernel only */ #define MACH_SEND_SYNC_OVERRIDE 0x00100000 /* msg should do sync ipc override */ - +#define MACH_SEND_PROPAGATE_QOS 0x00200000 /* IPC should propagate the caller's QoS */ +#define MACH_SEND_SYNC_USE_THRPRI MACH_SEND_PROPAGATE_QOS /* obsolete name */ +#define 
MACH_SEND_KERNEL 0x00400000 /* full send from kernel space - kernel only */ #define MACH_RCV_TIMEOUT 0x00000100 /* timeout value applies to receive */ #define MACH_RCV_NOTIFY 0x00000200 /* reserved - legacy */ @@ -740,7 +742,7 @@ typedef integer_t mach_msg_option_t; * If more than one thread attempts to MACH_PEEK_MSG on a port or set, one of * the threads may miss messages (in fact, it may never wake up). */ -#define MACH_PEEK_MSG 0x00100000 /* receive, but leave msgs queued */ +#define MACH_PEEK_MSG 0x80000000 /* receive, but leave msgs queued */ #endif @@ -772,7 +774,7 @@ typedef integer_t mach_msg_option_t; #define MACH_SEND_USER (MACH_SEND_MSG | MACH_SEND_TIMEOUT | \ MACH_SEND_NOTIFY | MACH_SEND_OVERRIDE | \ MACH_SEND_TRAILER | MACH_SEND_NOIMPORTANCE | \ - MACH_SEND_SYNC_OVERRIDE) + MACH_SEND_SYNC_OVERRIDE | MACH_SEND_PROPAGATE_QOS) #define MACH_RCV_USER (MACH_RCV_MSG | MACH_RCV_TIMEOUT | \ MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \ @@ -1016,8 +1018,6 @@ extern kern_return_t mach_voucher_deallocate( extern mach_msg_return_t mach_msg_receive_results(mach_msg_size_t *size); -extern mach_msg_priority_t mach_msg_priority_combine(mach_msg_priority_t msg_qos, - mach_msg_priority_t recv_qos); #endif /* KERNEL */ __END_DECLS diff --git a/osfmk/mach/port.h b/osfmk/mach/port.h index 30b55a1b0..763e6f94f 100644 --- a/osfmk/mach/port.h +++ b/osfmk/mach/port.h @@ -405,11 +405,25 @@ typedef mach_port_options_t *mach_port_options_ptr_t; /* Reasons for exception for a guarded mach port */ enum mach_port_guard_exception_codes { - kGUARD_EXC_DESTROY = 1u << 0, - kGUARD_EXC_MOD_REFS = 1u << 1, - kGUARD_EXC_SET_CONTEXT = 1u << 2, - kGUARD_EXC_UNGUARDED = 1u << 3, - kGUARD_EXC_INCORRECT_GUARD = 1u << 4 + kGUARD_EXC_DESTROY = 1u << 0, + kGUARD_EXC_MOD_REFS = 1u << 1, + kGUARD_EXC_SET_CONTEXT = 1u << 2, + kGUARD_EXC_UNGUARDED = 1u << 3, + kGUARD_EXC_INCORRECT_GUARD = 1u << 4, + /* start of non-fatal guards */ + kGUARD_EXC_INVALID_RIGHT = 1u << 8, + kGUARD_EXC_INVALID_NAME = 1u << 
9, + kGUARD_EXC_INVALID_VALUE = 1u << 10, + kGUARD_EXC_INVALID_ARGUMENT = 1u << 11, + kGUARD_EXC_RIGHT_EXISTS = 1u << 12, + kGUARD_EXC_KERN_NO_SPACE = 1u << 13, + kGUARD_EXC_KERN_FAILURE = 1u << 14, + kGUARD_EXC_KERN_RESOURCE = 1u << 15, + kGUARD_EXC_SEND_INVALID_REPLY = 1u << 16, + kGUARD_EXC_SEND_INVALID_VOUCHER = 1u << 16, + kGUARD_EXC_SEND_INVALID_RIGHT = 1u << 17, + kGUARD_EXC_RCV_INVALID_NAME = 1u << 18, + kGUARD_EXC_RCV_INVALID_NOTIFY = 1u << 19 }; #if !__DARWIN_UNIX03 && !defined(_NO_PORT_T_FROM_MACH) diff --git a/osfmk/mach/shared_region.h b/osfmk/mach/shared_region.h index 19351b07f..d5799dc2e 100644 --- a/osfmk/mach/shared_region.h +++ b/osfmk/mach/shared_region.h @@ -70,13 +70,14 @@ #define SHARED_REGION_NESTING_MIN_ARM ? #define SHARED_REGION_NESTING_MAX_ARM ? + #ifdef XNU_KERNEL_PRIVATE /* ARM64_TODO: move to higher memory */ #endif #define SHARED_REGION_BASE_ARM64 0x180000000ULL -#define SHARED_REGION_SIZE_ARM64 0x40000000ULL +#define SHARED_REGION_SIZE_ARM64 0x100000000ULL #define SHARED_REGION_NESTING_BASE_ARM64 0x180000000ULL -#define SHARED_REGION_NESTING_SIZE_ARM64 0x40000000ULL +#define SHARED_REGION_NESTING_SIZE_ARM64 0x100000000ULL #define SHARED_REGION_NESTING_MIN_ARM64 ? #define SHARED_REGION_NESTING_MAX_ARM64 ? 
@@ -101,7 +102,7 @@ #define SHARED_REGION_NESTING_SIZE SHARED_REGION_NESTING_SIZE_ARM #define SHARED_REGION_NESTING_MIN SHARED_REGION_NESTING_MIN_ARM #define SHARED_REGION_NESTING_MAX SHARED_REGION_NESTING_MAX_ARM -#elif defined(__arm64__) +#elif defined(__arm64__) && defined(__LP64__) #define SHARED_REGION_BASE SHARED_REGION_BASE_ARM64 #define SHARED_REGION_SIZE SHARED_REGION_SIZE_ARM64 #define SHARED_REGION_NESTING_BASE SHARED_REGION_NESTING_BASE_ARM64 diff --git a/osfmk/mach/sync_policy.h b/osfmk/mach/sync_policy.h index 11277d0da..239d11baf 100644 --- a/osfmk/mach/sync_policy.h +++ b/osfmk/mach/sync_policy.h @@ -49,8 +49,15 @@ typedef int sync_policy_t; * These options provide addition (kernel-private) behaviors */ -#define SYNC_POLICY_PREPOST 0x4 -#define SYNC_POLICY_DISABLE_IRQ 0x8 +#define SYNC_POLICY_PREPOST 0x4 +#define SYNC_POLICY_DISABLE_IRQ 0x8 + +/* + * If the waitq is IRQ safe, 0x10 suggests it's a waitq embedded in turnstile. + * If the waitq is not IRQ safe, 0x10 suggests it's a waitq of a port and should use its turnstile safeq.
+ */ +#define SYNC_POLICY_TURNSTILE 0x10 +#define SYNC_POLICY_PORT 0x10 #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/mach/syscall_sw.h b/osfmk/mach/syscall_sw.h index 381bfc510..b8c1c4d32 100644 --- a/osfmk/mach/syscall_sw.h +++ b/osfmk/mach/syscall_sw.h @@ -114,6 +114,7 @@ kernel_trap(semaphore_wait_signal_trap,-37,2) kernel_trap(semaphore_timedwait_trap,-38,3) kernel_trap(semaphore_timedwait_signal_trap,-39,4) +kernel_trap(_kernelrpc_mach_port_get_attributes_trap,-40,5) kernel_trap(_kernelrpc_mach_port_guard_trap,-41,5) kernel_trap(_kernelrpc_mach_port_unguard_trap,-42,4) kernel_trap(mach_generate_activity_id, -43, 3) diff --git a/osfmk/mach/task_info.h b/osfmk/mach/task_info.h index 58e4e7554..62824b760 100644 --- a/osfmk/mach/task_info.h +++ b/osfmk/mach/task_info.h @@ -479,7 +479,8 @@ typedef struct task_flags_info * task_flags_info_t; #define TASK_FLAGS_INFO_COUNT ((mach_msg_type_number_t) \ (sizeof(task_flags_info_data_t) / sizeof (natural_t))) -#define TF_LP64 0x00000001 /* task has 64-bit addressing */ +#define TF_LP64 0x00000001 /* task has 64-bit addressing */ +#define TF_64B_DATA 0x00000002 /* task has 64-bit data registers */ #define TASK_DEBUG_INFO_INTERNAL 29 /* Used for kernel internal development tests. 
*/ diff --git a/osfmk/mach/task_policy.h b/osfmk/mach/task_policy.h index 9ad6d0798..1c58d6067 100644 --- a/osfmk/mach/task_policy.h +++ b/osfmk/mach/task_policy.h @@ -112,15 +112,16 @@ kern_return_t task_policy_get( enum task_role { - TASK_RENICED = -1, - TASK_UNSPECIFIED = 0, - TASK_FOREGROUND_APPLICATION, - TASK_BACKGROUND_APPLICATION, - TASK_CONTROL_APPLICATION, - TASK_GRAPHICS_SERVER, - TASK_THROTTLE_APPLICATION, - TASK_NONUI_APPLICATION, - TASK_DEFAULT_APPLICATION + TASK_RENICED = -1, + TASK_UNSPECIFIED = 0, + TASK_FOREGROUND_APPLICATION = 1, + TASK_BACKGROUND_APPLICATION = 2, + TASK_CONTROL_APPLICATION = 3, + TASK_GRAPHICS_SERVER = 4, + TASK_THROTTLE_APPLICATION = 5, + TASK_NONUI_APPLICATION = 6, + TASK_DEFAULT_APPLICATION = 7, + TASK_DARWINBG_APPLICATION = 8, }; typedef integer_t task_role_t; @@ -193,7 +194,7 @@ typedef struct task_qos_policy *task_qos_policy_t; * When they do, we will update TASK_POLICY_INTERNAL_STRUCT_VERSION. */ -#define TASK_POLICY_INTERNAL_STRUCT_VERSION 1 +#define TASK_POLICY_INTERNAL_STRUCT_VERSION 2 struct task_requested_policy { uint64_t trp_int_darwinbg :1, /* marked as darwinbg via setpriority */ @@ -209,7 +210,7 @@ struct task_requested_policy { trp_apptype :3, /* What apptype did launchd tell us this was (inherited) */ trp_boosted :1, /* Has a non-zero importance assertion count */ - trp_role :3, /* task's system role */ + trp_role :4, /* task's system role */ trp_tal_enabled :1, /* TAL mode is enabled */ trp_over_latency_qos :3, /* Timer latency QoS override */ trp_over_through_qos :3, /* Computation throughput QoS override */ @@ -225,7 +226,7 @@ struct task_requested_policy { trp_sup_cpu :1, /* Wants suppressed CPU priority (MAXPRI_SUPPRESSED) */ trp_sup_bg_sockets :1, /* Wants background sockets */ - trp_reserved :18; + trp_reserved :17; }; struct task_effective_policy { @@ -244,14 +245,14 @@ struct task_effective_policy { tep_tal_engaged :1, /* TAL mode is in effect */ tep_watchers_bg :1, /* watchers are BG-ed */ 
tep_sup_active :1, /* suppression behaviors are in effect */ - tep_role :3, /* task's system role */ + tep_role :4, /* task's system role */ tep_suppressed_cpu :1, /* cpu priority == MAXPRI_SUPPRESSED (trumped by lowpri_cpu) */ tep_sfi_managed :1, /* SFI Managed task */ tep_live_donor :1, /* task is a live importance boost donor */ tep_qos_clamp :3, /* task qos clamp (applies to qos-disabled threads too) */ tep_qos_ceiling :3, /* task qos ceiling (applies to only qos-participating threads) */ - tep_reserved :32; + tep_reserved :31; }; #endif /* PRIVATE */ diff --git a/osfmk/mach/task_special_ports.h b/osfmk/mach/task_special_ports.h index 66fd7ed05..9080a451e 100644 --- a/osfmk/mach/task_special_ports.h +++ b/osfmk/mach/task_special_ports.h @@ -92,6 +92,8 @@ typedef int task_special_port_t; #define TASK_RESOURCE_NOTIFY_PORT 11 /* overrides host special RN port */ +#define TASK_MAX_SPECIAL_PORT TASK_RESOURCE_NOTIFY_PORT + /* * Definitions for ease of use */ diff --git a/osfmk/mach/thread_act.defs b/osfmk/mach/thread_act.defs index 716026ac5..205fff541 100644 --- a/osfmk/mach/thread_act.defs +++ b/osfmk/mach/thread_act.defs @@ -96,7 +96,12 @@ routine thread_terminate( * may be stale. [Flavor THREAD_STATE_FLAVOR_LIST provides a * list of valid flavors for the target thread.] */ -routine act_get_state( +routine +#ifdef KERNEL_SERVER +act_get_state_to_user( +#else +act_get_state( +#endif target_act : thread_act_t; flavor : int; out old_state : thread_state_t, CountInOut); @@ -125,7 +130,12 @@ act_set_state( * may be stale. [Flavor THREAD_STATE_FLAVOR_LIST provides a * list of valid flavors for the target thr_act.] 
*/ -routine thread_get_state( +routine +#ifdef KERNEL_SERVER +thread_get_state_to_user( +#else +thread_get_state( +#endif target_act : thread_act_t; flavor : thread_state_flavor_t; out old_state : thread_state_t, CountInOut); @@ -191,7 +201,12 @@ routine thread_abort_safely( target_act : thread_act_t); -routine thread_depress_abort( +routine +#ifdef KERNEL_SERVER +thread_depress_abort_from_user( +#else +thread_depress_abort( +#endif thread : thread_act_t); diff --git a/osfmk/mach/thread_policy.h b/osfmk/mach/thread_policy.h index 915425333..626e0d3f1 100644 --- a/osfmk/mach/thread_policy.h +++ b/osfmk/mach/thread_policy.h @@ -295,6 +295,7 @@ typedef struct thread_policy_state *thread_policy_state_t; #define THREAD_QOS_POLICY 9 #define THREAD_QOS_POLICY_OVERRIDE 10 +typedef uint8_t thread_qos_t; #define THREAD_QOS_UNSPECIFIED 0 #define THREAD_QOS_DEFAULT THREAD_QOS_UNSPECIFIED /* Temporary rename */ #define THREAD_QOS_MAINTENANCE 1 @@ -336,18 +337,6 @@ typedef struct thread_policy_state *thread_policy_state_t; * either be a memory allocation in userspace, or the pthread_t of the * overrider if no allocation was used. * - * THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE are used to - * override the QoS of a thread currently draining a serial dispatch - * queue, so that it can get to a block of higher QoS than its - * predecessors. The override is applied by a thread enqueueing work - * with resource=&queue, and reset by the thread that was overriden - * once it has drained the queue. Since the ++ and reset are - * asynchronous, there is the possibility of a ++ after the target - * thread has issued a reset, in which case the workqueue thread may - * issue a reset-all in its outermost scope before deciding whether it - * should return to dequeueing work from the global concurrent queues, - * or return to the kernel. - * * THREAD_QOS_OVERRIDE_TYPE_WILDCARD is a catch-all which will reset every * resource matching the resource value. 
Passing * THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD as well will reset everything. @@ -357,7 +346,6 @@ typedef struct thread_policy_state *thread_policy_state_t; #define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX (1) #define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_RWLOCK (2) #define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE (3) -#define THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE (4) #define THREAD_QOS_OVERRIDE_TYPE_WILDCARD (5) /* A special resource value to indicate a resource wildcard */ @@ -385,7 +373,7 @@ typedef struct thread_qos_policy *thread_qos_policy_t; * When they do, we will update THREAD_POLICY_INTERNAL_STRUCT_VERSION. */ -#define THREAD_POLICY_INTERNAL_STRUCT_VERSION 4 +#define THREAD_POLICY_INTERNAL_STRUCT_VERSION 5 struct thread_requested_policy { uint64_t thrp_int_darwinbg :1, /* marked as darwinbg via setpriority */ @@ -404,9 +392,10 @@ struct thread_requested_policy { thrp_qos_promote :3, /* thread qos class from promotion */ thrp_qos_ipc_override :3, /* thread qos class from ipc override */ thrp_terminated :1, /* heading for termination */ - thrp_qos_sync_ipc_override:3, /* thread qos class from sync ipc override */ + thrp_qos_sync_ipc_override:3, /* now unused */ + thrp_qos_workq_override :3, /* thread qos class override (workq) */ - thrp_reserved :29; + thrp_reserved :26; }; struct thread_effective_policy { diff --git a/osfmk/mach/vm_param.h b/osfmk/mach/vm_param.h index 96bd1f445..f6709419a 100644 --- a/osfmk/mach/vm_param.h +++ b/osfmk/mach/vm_param.h @@ -269,12 +269,14 @@ extern vm_offset_t vm_kernel_base; extern vm_offset_t vm_kernel_top; extern vm_offset_t vm_hib_base; -#define VM_KERNEL_IS_SLID(_o) \ - (((vm_offset_t)(_o) >= vm_kernel_slid_base) && \ - ((vm_offset_t)(_o) < vm_kernel_slid_top)) +extern vm_offset_t vm_kernel_builtinkmod_text; +extern vm_offset_t vm_kernel_builtinkmod_text_end; -#define VM_KERNEL_SLIDE(_u) \ - ((vm_offset_t)(_u) + vm_kernel_slide) +#define VM_KERNEL_IS_SLID(_o) \ + 
(((vm_offset_t)VM_KERNEL_STRIP_PTR(_o) >= vm_kernel_slid_base) && \ + ((vm_offset_t)VM_KERNEL_STRIP_PTR(_o) < vm_kernel_slid_top)) + +#define VM_KERNEL_SLIDE(_u) ((vm_offset_t)(_u) + vm_kernel_slide) /* * The following macros are to be used when exposing kernel addresses to @@ -319,20 +321,20 @@ __BEGIN_DECLS extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr); __END_DECLS -#define __DO_UNSLIDE(_v) ((vm_offset_t)(_v) - vm_kernel_slide) +#define __DO_UNSLIDE(_v) ((vm_offset_t)VM_KERNEL_STRIP_PTR(_v) - vm_kernel_slide) #if DEBUG || DEVELOPMENT -# define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)(_v)) +#define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)VM_KERNEL_STRIP_PTR(_v)) #else -# define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)0) -#endif +#define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)0) +#endif /* DEBUG || DEVELOPMENT */ #define VM_KERNEL_ADDRHASH(_v) vm_kernel_addrhash((vm_offset_t)(_v)) #define VM_KERNEL_UNSLIDE_OR_PERM(_v) ({ \ VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : \ - VM_KERNEL_ADDRESS(_v) ? ((vm_offset_t)(_v) + vm_kernel_addrperm) : \ - (vm_offset_t)(_v); \ + VM_KERNEL_ADDRESS(_v) ? 
((vm_offset_t)VM_KERNEL_STRIP_PTR(_v) + vm_kernel_addrperm) : \ + (vm_offset_t)VM_KERNEL_STRIP_PTR(_v); \ }) #define VM_KERNEL_UNSLIDE(_v) ({ \ diff --git a/osfmk/mach/vm_statistics.h b/osfmk/mach/vm_statistics.h index 04fafe640..e4552b60f 100644 --- a/osfmk/mach/vm_statistics.h +++ b/osfmk/mach/vm_statistics.h @@ -350,6 +350,15 @@ typedef struct pmap_statistics *pmap_statistics_t; #define VM_FLAGS_SUPERPAGE_SIZE_2MB (SUPERPAGE_SIZE_2MB< +#define _os_atomic_c11_atomic(p) \ + ((typeof(*(p)) _Atomic *)(p)) + +#define _os_atomic_basetypeof(p) \ + typeof(atomic_load(((typeof(*(p)) _Atomic *)(p)))) + +#define _os_atomic_c11_op_orig(p, v, m, o) \ + atomic_##o##_explicit(_os_atomic_c11_atomic(p), v, \ + memory_order_##m) + +#define _os_atomic_c11_op(p, v, m, o, op) \ + ({ typeof(v) _v = (v); /* evaluate v only once */ _os_atomic_c11_op_orig(p, _v, m, o) op _v; }) + +#define os_atomic_thread_fence(m) atomic_thread_fence(memory_order_##m) + +#define os_atomic_load(p, m) \ + atomic_load_explicit(_os_atomic_c11_atomic(p), memory_order_##m) +#define os_atomic_store(p, v, m) _os_atomic_c11_op_orig(p, v, m, store) + +#define os_atomic_add_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_add) +#define os_atomic_add(p, v, m) _os_atomic_c11_op(p, v, m, fetch_add, +) + +#define os_atomic_inc_orig(p, m) _os_atomic_c11_op_orig(p, 1, m, fetch_add) +#define os_atomic_inc(p, m) _os_atomic_c11_op(p, 1, m, fetch_add, +) + +#define os_atomic_sub_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_sub) +#define os_atomic_sub(p, v, m) _os_atomic_c11_op(p, v, m, fetch_sub, -) + +#define os_atomic_dec_orig(p, m) _os_atomic_c11_op_orig(p, 1, m, fetch_sub) +#define os_atomic_dec(p, m) _os_atomic_c11_op(p, 1, m, fetch_sub, -) + +#define os_atomic_and_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_and) +#define os_atomic_and(p, v, m) _os_atomic_c11_op(p, v, m, fetch_and, &) + +#define os_atomic_or_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_or) +#define os_atomic_or(p, v, m) _os_atomic_c11_op(p, v, m,
fetch_or, |) + +#define os_atomic_xor_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_xor) +#define os_atomic_xor(p, v, m) _os_atomic_c11_op(p, v, m, fetch_xor, ^) + +#define os_atomic_xchg(p, v, m) _os_atomic_c11_op_orig(p, v, m, exchange) + +#define os_atomic_cmpxchg(p, e, v, m) \ + ({ _os_atomic_basetypeof(p) _r = (e); \ + atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \ + &_r, v, memory_order_##m, memory_order_relaxed); }) +#define os_atomic_cmpxchgv(p, e, v, g, m) \ + ({ _os_atomic_basetypeof(p) _r = (e); int _b = \ + atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \ + &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r; _b; }) +#define os_atomic_cmpxchgvw(p, e, v, g, m) \ + ({ _os_atomic_basetypeof(p) _r = (e); int _b = \ + atomic_compare_exchange_weak_explicit(_os_atomic_c11_atomic(p), \ + &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r; _b; }) + +#define os_atomic_rmw_loop(p, ov, nv, m, ...) ({ \ + bool _result = false; \ + typeof(p) _p = (p); \ + ov = os_atomic_load(_p, relaxed); \ + do { \ + __VA_ARGS__; \ + _result = os_atomic_cmpxchgvw(_p, ov, nv, &ov, m); \ + } while (!_result); \ + _result; \ + }) + +#define os_atomic_rmw_loop_give_up_with_fence(m, expr) \ + ({ os_atomic_thread_fence(m); expr; __builtin_unreachable(); }) +#define os_atomic_rmw_loop_give_up(expr) \ + os_atomic_rmw_loop_give_up_with_fence(relaxed, expr) + +#define os_atomic_force_dependency_on(p, e) (p) +#define os_atomic_load_with_dependency_on(p, e) \ + os_atomic_load(os_atomic_force_dependency_on(p, e), relaxed) + #if defined (__x86_64__) #include "i386/atomic.h" #elif defined (__arm__) || defined (__arm64__) diff --git a/osfmk/machine/monotonic.h b/osfmk/machine/monotonic.h index b3e75c8d3..9de057044 100644 --- a/osfmk/machine/monotonic.h +++ b/osfmk/machine/monotonic.h @@ -67,4 +67,12 @@ uint64_t mt_core_snap(unsigned int ctr); void mt_core_set_snap(unsigned int ctr, uint64_t snap); void mt_mtc_set_snap(struct mt_cpu *mtc, 
unsigned int ctr, uint64_t snap); +typedef void (*mt_pmi_fn)(bool user_mode, void *ctx); +extern bool mt_microstackshots; +extern unsigned int mt_microstackshot_ctr; +extern mt_pmi_fn mt_microstackshot_pmi_handler; +extern void *mt_microstackshot_ctx; +extern uint64_t mt_core_reset_values[MT_CORE_NFIXED]; +int mt_microstackshot_start_arch(uint64_t period); + #endif /* !defined(MACHINE_MONOTONIC_H) */ diff --git a/osfmk/prng/YarrowCoreLib/include/WindowsTypesForMac.h b/osfmk/prng/YarrowCoreLib/include/WindowsTypesForMac.h deleted file mode 100644 index 06f6a5bf0..000000000 --- a/osfmk/prng/YarrowCoreLib/include/WindowsTypesForMac.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: WindowsTypesForMac.h - - Contains: Define common Windows data types in mac terms. - - Written by: Doug Mitchell - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/99 dpm Created. - -*/ - -#ifndef _WINDOWS_TYPES_FOR_MAC_H_ -#define _WINDOWS_TYPES_FOR_MAC_H_ - -#include - -typedef u_int8_t UCHAR; -typedef int8_t CHAR; -typedef u_int8_t BYTE; -typedef char TCHAR; -typedef int16_t WORD; -typedef int32_t DWORD; -typedef u_int16_t USHORT; -typedef u_int32_t ULONG; -typedef int32_t LONG; -typedef u_int32_t UINT; -typedef int64_t LONGLONG; -typedef u_int8_t *LPBYTE; -typedef int8_t *LPSTR; -typedef int16_t *LPWORD; -typedef int8_t *LPCTSTR; /* ??? */ -typedef int8_t *LPCSTR; /* ??? */ -typedef void *LPVOID; -typedef void *HINSTANCE; -typedef void *HANDLE; - -#define WINAPI - -#endif /* _WINDOWS_TYPES_FOR_MAC_H_*/ - diff --git a/osfmk/prng/YarrowCoreLib/include/yarrow.h b/osfmk/prng/YarrowCoreLib/include/yarrow.h deleted file mode 100644 index 282da7631..000000000 --- a/osfmk/prng/YarrowCoreLib/include/yarrow.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: yarrow.h - - Contains: Public header file for Counterpane's Yarrow Pseudo-random - number generator. - - Written by: Counterpane, Inc. - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/99 dpm Created, based on Counterpane source. - -*/ -/* - yarrow.h - - Main header file for Counterpane's Yarrow Pseudo-random number generator. 
-*/ - -#ifndef __YARROW_H__ -#define __YARROW_H__ - -#if defined(macintosh) || defined(__APPLE__) -#include "WindowsTypesForMac.h" -#endif - -#if defined(__cplusplus) -extern "C" { -#endif - -/* Error Codes */ -typedef enum prng_error_status { - PRNG_SUCCESS = 0, - PRNG_ERR_REINIT, - PRNG_ERR_WRONG_CALLER, - PRNG_ERR_NOT_READY, - PRNG_ERR_NULL_POINTER, - PRNG_ERR_LOW_MEMORY, - PRNG_ERR_OUT_OF_BOUNDS, - PRNG_ERR_COMPRESSION, - PRNG_ERR_NOT_ENOUGH_ENTROPY, - PRNG_ERR_MUTEX, - PRNG_ERR_TIMEOUT, - PRNG_ERR_PROGRAM_FLOW -} prng_error_status; - -/* - * Entropy sources - */ -enum user_sources { - CLIENT_SOURCE = 0, - ENTROPY_FILE_SOURCE, - SYSTEM_SOURCE, - USER_SOURCES /* Leave as last source */ -}; - - -/* Declare YARROWAPI as __declspec(dllexport) before - including this file in the actual DLL */ -#ifndef YARROWAPI -#if defined(macintosh) || defined(__APPLE__) -#define YARROWAPI -#else -#define YARROWAPI __declspec(dllimport) -#endif -#endif - -/* Public function forward declarations */ - -#if defined(macintosh) || defined(__APPLE__) -/* - * Mac changes: - * 1. PrngRef context for all functions. Thus no global variables. - * 2. Strong enum typing (prng_error_status instead of int return). 
- */ -struct PRNG; -typedef struct PRNG *PrngRef; - -YARROWAPI prng_error_status -prngInitialize( - PrngRef *prng); -YARROWAPI prng_error_status -prngDestroy( - PrngRef prng); -YARROWAPI prng_error_status -prngOutput( - PrngRef prng, - BYTE *outbuf, - UINT outbuflen); -/* this one has no context */ -YARROWAPI prng_error_status -prngStretch( - BYTE *inbuf, - UINT inbuflen, - BYTE *outbuf, - UINT outbuflen); -YARROWAPI prng_error_status -prngInput( - PrngRef prng, - BYTE *inbuf, - UINT inbuflen, - UINT poolnum, - UINT estbits); -YARROWAPI prng_error_status -prngForceReseed( - PrngRef prng, - LONGLONG ticks); -YARROWAPI prng_error_status -prngAllowReseed( - PrngRef prng, - LONGLONG ticks); -YARROWAPI prng_error_status -prngProcessSeedBuffer( - PrngRef prng, - BYTE *buf, - LONGLONG ticks); -YARROWAPI prng_error_status -prngSlowPoll( - PrngRef prng, - UINT pollsize); -#else -/* original Counterpane API */ -YARROWAPI int prngOutput(BYTE *outbuf,UINT outbuflen); -YARROWAPI int prngStretch(BYTE *inbuf,UINT inbuflen,BYTE *outbuf,UINT outbuflen); -YARROWAPI int prngInput(BYTE *inbuf,UINT inbuflen,UINT poolnum,UINT estbits); -YARROWAPI int prngForceReseed(LONGLONG ticks); -YARROWAPI int prngAllowReseed(LONGLONG ticks); -YARROWAPI int prngProcessSeedBuffer(BYTE *buf,LONGLONG ticks); -YARROWAPI int prngSlowPoll(UINT pollsize); -#endif - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/osfmk/prng/YarrowCoreLib/include/yarrowUtils.h b/osfmk/prng/YarrowCoreLib/include/yarrowUtils.h deleted file mode 100644 index 95a43f5d0..000000000 --- a/osfmk/prng/YarrowCoreLib/include/yarrowUtils.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: yarrowUtils.h - - Contains: Misc. utility functions. - - Written by: Doug Mitchell - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/29/00 dpm Created. - -*/ - -#ifndef _YARROW_UTILS_H_ -#define _YARROW_UTILS_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Wipe a piece of memory clean. - */ -void trashMemory(void* mem, int len); - -#ifdef __cplusplus -} -#endif - -#endif /* _YARROW_UTILS_H_*/ diff --git a/osfmk/prng/YarrowCoreLib/src/assertverify.h b/osfmk/prng/YarrowCoreLib/src/assertverify.h deleted file mode 100644 index 7f2c35a90..000000000 --- a/osfmk/prng/YarrowCoreLib/src/assertverify.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef ASSERT_VERIFY_H -#define ASSERT_VERIFY_H - -/****************************************************************************** -Written by: Jeffrey Richter -Notices: Copyright (c) 1995-1997 Jeffrey Richter -Purpose: Common header file containing handy macros and definitions used - throughout all the applications in the book. 
-******************************************************************************/ - -/* These header functions were copied from the cmnhdr.h file that accompanies - Advanced Windows 3rd Edition by Jeffrey Richter */ - -//////////////////////////// Assert/Verify Macros ///////////////////////////// - -#if defined(macintosh) || defined(__APPLE__) -/* TBD */ -#define chFAIL(szMSG) -#define chASSERTFAIL(file,line,expr) -#else -#define chFAIL(szMSG) { \ - MessageBox(GetActiveWindow(), szMSG, \ - __TEXT("Assertion Failed"), MB_OK | MB_ICONERROR); \ - DebugBreak(); \ - } - -/* Put up an assertion failure message box. */ -#define chASSERTFAIL(file,line,expr) { \ - TCHAR sz[128]; \ - wsprintf(sz, __TEXT("File %hs, line %d : %hs"), file, line, expr); \ - chFAIL(sz); \ - } - -#endif /* macintosh */ - -/* Put up a message box if an assertion fails in a debug build. */ -#ifdef _DEBUG -#define chASSERT(x) if (!(x)) chASSERTFAIL(__FILE__, __LINE__, #x) -#else -#define chASSERT(x) -#endif - -/* Assert in debug builds, but don't remove the code in retail builds. */ -#ifdef _DEBUG -#define chVERIFY(x) chASSERT(x) -#else -#define chVERIFY(x) (x) -#endif - -#endif /* ASSERT_VERIFY_H */ diff --git a/osfmk/prng/YarrowCoreLib/src/comp.c b/osfmk/prng/YarrowCoreLib/src/comp.c deleted file mode 100644 index 28f3c07c2..000000000 --- a/osfmk/prng/YarrowCoreLib/src/comp.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: comp.c - - Contains: NULL compression. Kernel version of Yarrow assumes - incoming seed data is truly random. 
-*/ -#include "prng/YarrowCoreLib/include/WindowsTypesForMac.h" -#include "comp.h" - -#ifdef YARROW_KERNEL - -/* null compression */ -comp_error_status comp_init(__unused COMP_CTX* ctx) -{ - return COMP_SUCCESS; -} - - -comp_error_status comp_add_data( __unused COMP_CTX* ctx, - __unused Bytef* inp, - __unused uInt inplen ) -{ - return COMP_SUCCESS; -} - -comp_error_status comp_get_ratio( __unused COMP_CTX* ctx,float* out ) -{ - *out = 1.0; - return COMP_SUCCESS; -} - -comp_error_status comp_end( __unused COMP_CTX* ctx ) -{ - return COMP_SUCCESS; -} - -#else - -/* original Yarrow compression, must be linked with zlib */ - -#if defined(macintosh) || defined(__APPLE__) -#include "WindowsTypesForMac.h" -#include "yarrowUtils.h" -#include -#include -#else -#include -#endif -#include -#include "comp.h" - -/* Check that the pointer is not NULL */ -#define PCHECK(ptr) if(ptr==NULL) {return COMP_ERR_NULL_POINTER;} -#define MMPCHECK(mmptr) if(mmptr==MM_NULL) {return COMP_ERR_NULL_POINTER;} -/* Check that the important parts of the context are ok */ -#define CTXCHECK(ctx) \ -PCHECK(ctx) \ -MMPCHECK(ctx->buf) - -/* Might want to vary these by context */ -#define BUFSIZE 16384 /* 16K */ -#define OUTBUFSIZE 16800 /* = inbufsize*1.01 + 12 (See zlib docs) */ -#define SHIFTSIZE 4096 /* BUFSIZE/4 */ - -#define _MIN(a,b) (((a)<(b))?(a):(b)) - - -/* Initialize these routines */ -comp_error_status comp_init(COMP_CTX* ctx) -{ - ctx->buf = mmMalloc(BUFSIZE); - if(ctx->buf == MM_NULL) {goto cleanup_comp_init;} - ctx->spaceused = 0; - - return COMP_SUCCESS; - -cleanup_comp_init: - mmFree(ctx->buf); - - return COMP_ERR_LOW_MEMORY; -} - - -comp_error_status comp_add_data(COMP_CTX* ctx,Bytef* inp,uInt inplen) -{ - uInt shifts; - uInt blocksize; - BYTE* buf; - - CTXCHECK(ctx); - PCHECK(inp); - - buf = (BYTE*)mmGetPtr(ctx->buf); - - if(inplen+SHIFTSIZE>BUFSIZE) - { - blocksize = _MIN(inplen,BUFSIZE); - memmove(buf,inp,blocksize); - ctx->spaceused = blocksize; - } - else - { - 
if(inplen+ctx->spaceused>BUFSIZE) - { - shifts = (uInt)ceil((inplen+ctx->spaceused-BUFSIZE)/(float)SHIFTSIZE); - blocksize = _MIN(shifts*SHIFTSIZE,ctx->spaceused); - memmove(buf,buf+blocksize,BUFSIZE-blocksize); - ctx->spaceused = ctx->spaceused - blocksize; - } - memmove(buf+ctx->spaceused,inp,inplen); - ctx->spaceused += inplen; - } - - return COMP_SUCCESS; -} - -comp_error_status comp_get_ratio(COMP_CTX* ctx,float* out) -{ - Bytef *inbuf,*outbuf; - uLong insize,outsize; - int resp; - - *out = 0; - - CTXCHECK(ctx); - PCHECK(out); - - if(ctx->spaceused == 0) {return COMP_SUCCESS;} - - inbuf = (Bytef*)mmGetPtr(ctx->buf); - outbuf = (Bytef*)malloc(OUTBUFSIZE); - if(outbuf==NULL) {return COMP_ERR_LOW_MEMORY;} - - insize = ctx->spaceused; - outsize = OUTBUFSIZE; - - resp = compress(outbuf,&outsize,inbuf,insize); - if(resp==Z_MEM_ERROR) {return COMP_ERR_LOW_MEMORY;} - if(resp==Z_BUF_ERROR) {return COMP_ERR_LIB;} - - *out = (float)outsize/(float)insize; - - /* Thrash the memory and free it */ - trashMemory(outbuf, OUTBUFSIZE); - free(outbuf); - - return COMP_SUCCESS; -} - -comp_error_status comp_end(COMP_CTX* ctx) -{ - if(ctx == NULL) {return COMP_SUCCESS;} /* Since nothing is left undone */ - - mmFree(ctx->buf); - ctx->buf = MM_NULL; - - return COMP_SUCCESS; -} - -#endif /* YARROW_KERNEL */ - diff --git a/osfmk/prng/YarrowCoreLib/src/comp.h b/osfmk/prng/YarrowCoreLib/src/comp.h deleted file mode 100644 index 0cbeca32f..000000000 --- a/osfmk/prng/YarrowCoreLib/src/comp.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: comp.h - - Contains: Glue between core prng code to the Zlib library. - - Written by: Counterpane, Inc. - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/99 dpm Created, based on Counterpane source. - -*/ -/* comp.h - - Header for the compression routines added to the Counterpane PRNG. -*/ - -#ifndef __YARROW_COMP_H__ -#define __YARROW_COMP_H__ - -#include "smf.h" - -/* - * Kernel version does NULL compression.... 
- */ -#define YARROW_KERNEL - -#ifdef YARROW_KERNEL -/* - * Shrink this down to almost nothing to simplify kernel port; - * with additional hacking on prng.c, this could go away entirely - */ -typedef char COMP_CTX; - -/* and define some type3s normally picked up from zlib */ -typedef unsigned char Bytef; -typedef unsigned uInt; - -#else - -#include "zlib.h" - -/* Top level compression context */ -typedef struct{ - MMPTR buf; - uInt spaceused; -} COMP_CTX; -#endif /* YARROW_KERNEL */ - -typedef enum comp_error_status { - COMP_SUCCESS = 0, - COMP_ERR_NULL_POINTER, - COMP_ERR_LOW_MEMORY, - COMP_ERR_LIB -} comp_error_status; - -/* Exported functions from compress.c */ -comp_error_status comp_init(COMP_CTX* ctx); -comp_error_status comp_add_data(COMP_CTX* ctx,Bytef* inp,uInt inplen); -comp_error_status comp_end(COMP_CTX* ctx); -comp_error_status comp_get_ratio(COMP_CTX* ctx,float* out); - -#endif diff --git a/osfmk/prng/YarrowCoreLib/src/prng.c b/osfmk/prng/YarrowCoreLib/src/prng.c deleted file mode 100644 index 5c1d6ad40..000000000 --- a/osfmk/prng/YarrowCoreLib/src/prng.c +++ /dev/null @@ -1,638 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2016 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: prng.c - - Contains: Core routines for the Counterpane Yarrow PRNG. - - Written by: Counterpane, Inc. - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/99 dpm Created, based on Counterpane source. - -*/ -/* - prng.c - - Core routines for the Counterpane PRNG -*/ -#include "userdefines.h" -#include "assertverify.h" -#include "prng/YarrowCoreLib/include/yarrowUtils.h" - -#if defined(macintosh) || defined(__APPLE__) -/* FIXME - this file needs to be in a platform-independent place */ - -#include "macOnly.h" -#endif /* macintosh */ -#include "smf.h" -#include "sha1mod.h" -#include "entropysources.h" -#include "comp.h" -#include "prng/YarrowCoreLib/include/yarrow.h" -#include "prng.h" -#include "prngpriv.h" - - -#define _MAX(a,b) (((a)>(b))?(a):(b)) -#define _MIN(a,b) (((a)<(b))?(a):(b)) - -#if defined(macintosh) || defined(__APPLE__) -/* - * No mutexes in this module for Macintosh/OSX. We handle the - * required locking elsewhere. - */ -#define MUTEX_ENABLE 0 - -#include /* memcpy, etc. 
*/ -#if TARGET_API_MAC_OSX - #include /* for timespec */ -#elif TARGET_API_MAC_CARBON - #include /* Microseconds */ - #include -#elif KERNEL_BUILD - #include -#elif MACH_KERNEL_PRIVATE - #include - #include -#else - #error Unknown TARGET_API -#endif /* TARGET_API */ -#else -#define MUTEX_ENABLE 1 -#endif /* macintosh */ - -#if MUTEX_ENABLE -static HANDLE Statmutex = NULL; -static DWORD mutexCreatorId = 0; -#endif - -#if 0 -#pragma mark - -#pragma mark * * * Static Utility functions * * * -#endif - -/* All error checking should be done in the function that calls these */ - -/* - * out := SHA1(IV | out) - */ -static void -prng_do_SHA1(GEN_CTX *ctx) -{ - YSHA1_CTX sha; - - YSHA1Init(&sha); - YSHA1Update(&sha,ctx->IV,20); - YSHA1Update(&sha,ctx->out,20); - YSHA1Final(ctx->out,&sha); - ctx->index = 0; -} - -/* - * IV := newState - * out := SHA1(IV) - * - * Called from init, prngForceReseed(), and prngOutput() - * as anti-backtracking mechanism. - */ -static void -prng_make_new_state(GEN_CTX *ctx,BYTE *newState) -{ - YSHA1_CTX sha; - - memcpy(ctx->IV,newState,20); - YSHA1Init(&sha); - YSHA1Update(&sha,ctx->IV,20); - YSHA1Final(ctx->out,&sha); - ctx->numout = 0; - ctx->index = 0; -} - -#if SLOW_POLL_ENABLE - - -/* Initialize the secret state with a slow poll */ -/* Currently only called from prngInitialize */ - -#define SPLEN 65536 /* 64K */ - -static void -prng_slow_init(PRNG *p) -/* This fails silently and must be fixed. 
*/ -{ - YSHA1_CTX* ctx = NULL; - MMPTR mmctx = MM_NULL; - BYTE* bigbuf = NULL; - MMPTR mmbigbuf = MM_NULL; - BYTE* buf = NULL; - MMPTR mmbuf = MM_NULL; - DWORD polllength; - - mmbigbuf = mmMalloc(SPLEN); - if(mmbigbuf == MM_NULL) {goto cleanup_slow_init;} - bigbuf = (BYTE*)mmGetPtr(mmbigbuf); - - mmbuf = mmMalloc(20); - if(mmbuf == MM_NULL) {goto cleanup_slow_init;} - buf = (BYTE*)mmGetPtr(mmbuf); - - mmctx = mmMalloc(sizeof(YSHA1_CTX)); - if(mmctx == MM_NULL) {goto cleanup_slow_init;} - ctx = (YSHA1_CTX*)mmGetPtr(mmctx); - - - /* Initialize the secret state. */ - /* Init entropy pool */ - YSHA1Init(&p->pool); - /* Init output generator */ - polllength = prng_slow_poll(bigbuf,SPLEN); - YSHA1Init(ctx); - YSHA1Update(ctx,bigbuf,polllength); - YSHA1Final(buf,ctx); - prng_make_new_state(&p->outstate, buf); - -cleanup_slow_init: - mmFree(mmctx); - mmFree(mmbigbuf); - mmFree(mmbuf); - - return; -} - -#endif /* SLOW_POLL_ENABLE */ - -/* In-place modifed bubble sort */ -static void -bubbleSort( UINT *data, LONG len ) -{ - LONG i,last,newlast; - UINT temp; - - last = len-1; - while(last!=-1) - { - newlast = -1; - for(i=0;i data[i]) - { - newlast = i; - temp = data[i]; - data[i] = data[i+1]; - data[i+1] = temp; - } - } - last = newlast; - } -} - -#if 0 -#pragma mark - -#pragma mark * * * Public functions * * * -#endif - -/* Set up the PRNG */ -prng_error_status -prngInitialize(PrngRef *prng) -{ - UINT i; - comp_error_status resp; - prng_error_status retval = PRNG_ERR_LOW_MEMORY; - MMPTR mmp; - PRNG *p; - - mmInit(); - - #if MUTEX_ENABLE - /* Create the mutex */ - /* NOTE: on return the mutex should bve held, since our caller (prngInitialize) - * will release it. 
- */ - if(mutexCreatorId!=0) {return PRNG_ERR_REINIT;} - Statmutex = CreateMutex(NULL,TRUE,NULL); - if(Statmutex == NULL) {mutexCreatorId = 0; return PRNG_ERR_MUTEX;} - DuplicateHandle(GetCurrentProcess(),Statmutex,GetCurrentProcess(),&mutex,SYNCHRONIZE,FALSE,0); - mutexCreatorId = GetCurrentProcessId(); - #endif /* MUTEX_ENABLE */ - - /* Assign memory */ - mmp = mmMalloc(sizeof(PRNG)); - if(mmp==MM_NULL) - { - goto cleanup_init; - } - else - { - p = (PRNG*)mmGetPtr(mmp); - memset(p, 0, sizeof(PRNG)); - } - - /* Initialize Variables */ - for(i=0;ipoolSize[i] = 0; - p->poolEstBits[i] = 0; - } - -#ifdef WIN_NT - /* Setup security on the registry so that remote users cannot predict the slow pool */ - prng_set_NT_security(); -#endif - - /* Initialize the secret state. */ - /* FIXME - might want to make this an option here and have the caller - * do it after we return....? */ - YSHA1Init(&p->pool); -#if SLOW_POLL_ENABLE - prng_slow_init(p); /* Does a slow poll and then calls prng_make_state(...) 
*/ -#else - /* NULL init */ - prng_do_SHA1(&p->outstate); - prng_make_new_state(&p->outstate, p->outstate.out); -#endif /* SLOW_POLL_ENABLE */ - - /* Initialize compression routines */ - for(i=0;icomp_state)+i); - if(resp!=COMP_SUCCESS) {retval = PRNG_ERR_COMPRESSION; goto cleanup_init;} - } - - p->ready = PRNG_READY; - *prng = (PrngRef)p; - - return PRNG_SUCCESS; - -cleanup_init: - /* Program failed on one of the mmmallocs */ - mmFree(mmp); - mmp = MM_NULL; - - #if MUTEX_ENABLE - CloseHandle(Statmutex); - Statmutex = NULL; - mutexCreatorId = 0; - #endif - - return retval; /* default PRNG_ERR_LOW_MEMORY */ -} - -/* Provide output */ -prng_error_status -prngOutput(PRNG *p, BYTE *outbuf,UINT outbuflen) -{ - UINT i; - GEN_CTX *ctx = &p->outstate; - - CHECKSTATE(p); - GENCHECK(p); - PCHECK(outbuf); - chASSERT(BACKTRACKLIMIT > 0); - - for(i=0;iindex++,ctx->numout++) - { - /* Check backtracklimit */ - if(ctx->numout > BACKTRACKLIMIT) - { - prng_do_SHA1(ctx); - prng_make_new_state(ctx, ctx->out); - } - /* Check position in IV */ - if(ctx->index>=20) - { - prng_do_SHA1(ctx); - } - /* Output data */ - outbuf[i] = (ctx->out)[ctx->index]; - } - - return PRNG_SUCCESS; -} - - -/* Cause the PRNG to reseed now regardless of entropy pool */ -/* Should this be public? 
*/ -prng_error_status -prngForceReseed(PRNG *p, LONGLONG ticks) -{ - int i; -#ifdef WIN_NT - FILETIME a,b,c,usertime; -#endif - BYTE buf[64]; - BYTE dig[20]; -#if defined(macintosh) || defined(__APPLE__) - #if (defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD)) - struct timeval tv; - int64_t endTime, curTime; - #elif defined(MACH_KERNEL_PRIVATE) - int64_t endTime, curTime; - #else /* TARGET_API_MAC_CARBON */ - UnsignedWide uwide; /* struct needed for Microseconds() */ - LONGLONG start; - LONGLONG now; - #endif -#endif - - CHECKSTATE(p); - POOLCHECK(p); - ZCHECK(ticks); - - /* Set up start and end times */ - #if defined(macintosh) || defined(__APPLE__) - #if (defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD)) - /* note we can't loop for more than a million microseconds */ - #ifdef KERNEL_BUILD - microuptime (&tv); - #else - gettimeofday(&tv, NULL); - #endif - endTime = (int64_t)tv.tv_sec*1000000LL + (int64_t)tv.tv_usec + ticks; - #elif defined(MACH_KERNEL_PRIVATE) - endTime = mach_absolute_time() + (ticks*NSEC_PER_USEC); - #else /* TARGET_API_MAC_OSX */ - Microseconds(&uwide); - start = UnsignedWideToUInt64(uwide); - #endif /* TARGET_API_xxx */ - #endif /* macintosh */ - do - { - /* Do a couple of iterations between time checks */ - prngOutput(p, buf,64); - YSHA1Update(&p->pool,buf,64); - prngOutput(p, buf,64); - YSHA1Update(&p->pool,buf,64); - prngOutput(p, buf,64); - YSHA1Update(&p->pool,buf,64); - prngOutput(p, buf,64); - YSHA1Update(&p->pool,buf,64); - prngOutput(p, buf,64); - YSHA1Update(&p->pool,buf,64); - -#if defined(macintosh) || defined(__APPLE__) - #if defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD) - #ifdef TARGET_API_MAC_OSX - gettimeofday(&tv, NULL); - #else - microuptime (&tv); - curTime = (int64_t)tv.tv_sec*1000000LL + (int64_t)tv.tv_usec; - #endif - } while(curTime < endTime); - #elif defined(MACH_KERNEL_PRIVATE) - curTime = mach_absolute_time(); - } while(curTime < endTime); - #else - Microseconds(&uwide); - now = 
UnsignedWideToUInt64(uwide); - } while ( (now-start) < ticks) ; - #endif -#else - } while ( (now-start) < ticks) ; -#endif - YSHA1Final(dig,&p->pool); - YSHA1Update(&p->pool,dig,20); - YSHA1Final(dig,&p->pool); - - /* Reset secret state */ - YSHA1Init(&p->pool); - prng_make_new_state(&p->outstate,dig); - - /* Clear counter variables */ - for(i=0;ipoolSize[i] = 0; - p->poolEstBits[i] = 0; - } - - /* Cleanup memory */ - trashMemory(dig,20*sizeof(char)); - trashMemory(buf,64*sizeof(char)); - - return PRNG_SUCCESS; -} - - -/* Input a state into the PRNG */ -prng_error_status -prngProcessSeedBuffer(PRNG *p, BYTE *buf,LONGLONG ticks) -{ - CHECKSTATE(p); - GENCHECK(p); - PCHECK(buf); - - /* Put the data into the entropy, add some data from the unknown state, reseed */ - YSHA1Update(&p->pool,buf,20); /* Put it into the entropy pool */ - prng_do_SHA1(&p->outstate); /* Output 20 more bytes and */ - YSHA1Update(&p->pool,p->outstate.out,20);/* add it to the pool as well. */ - prngForceReseed(p, ticks); /* Do a reseed */ - return prngOutput(p, buf,20); /* Return the first 20 bytes of output in buf */ -} - - -/* Take some "random" data and make more "random-looking" data from it */ -/* note: this routine has no context, no mutex wrapper */ -prng_error_status -prngStretch(BYTE *inbuf,UINT inbuflen,BYTE *outbuf,UINT outbuflen) { - long int left,prev; - YSHA1_CTX ctx; - BYTE dig[20]; - - PCHECK(inbuf); - PCHECK(outbuf); - - if(inbuflen >= outbuflen) - { - memcpy(outbuf,inbuf,outbuflen); - return PRNG_SUCCESS; - } - else /* Extend using SHA1 hash of inbuf */ - { - YSHA1Init(&ctx); - YSHA1Update(&ctx,inbuf,inbuflen); - YSHA1Final(dig,&ctx); - for(prev=0,left=outbuflen;left>0;prev+=20,left-=20) - { - YSHA1Update(&ctx,dig,20); - YSHA1Final(dig,&ctx); - memcpy(outbuf+prev,dig,(left>20)?20:left); - } - trashMemory(dig,20*sizeof(BYTE)); - - return PRNG_SUCCESS; - } -} - - -/* Add entropy to the PRNG from a source */ -prng_error_status -prngInput(PRNG *p, BYTE *inbuf,UINT inbuflen,UINT 
poolnum, __unused UINT estbits) -{ - #ifndef YARROW_KERNEL - comp_error_status resp; - #endif - - CHECKSTATE(p); - POOLCHECK(p); - PCHECK(inbuf); - if(poolnum >= TOTAL_SOURCES) {return PRNG_ERR_OUT_OF_BOUNDS;} - - /* Add to entropy pool */ - YSHA1Update(&p->pool,inbuf,inbuflen); - - #ifndef YARROW_KERNEL - /* skip this step for the kernel */ - - /* Update pool size, pool user estimate and pool compression context */ - p->poolSize[poolnum] += inbuflen; - p->poolEstBits[poolnum] += estbits; - if(poolnumcomp_state)+poolnum,inbuf,inbuflen); - if(resp!=COMP_SUCCESS) {return PRNG_ERR_COMPRESSION;} - } - #endif /* YARROW_KERNEL */ - - return PRNG_SUCCESS; -} - - - -/* If we have enough entropy, allow a reseed of the system */ -prng_error_status -prngAllowReseed(PRNG *p, LONGLONG ticks) -{ - UINT temp[TOTAL_SOURCES]; - LONG i; - UINT sum; -#ifndef KERNEL_BUILD - float ratio; -#endif - -#ifndef KERNEL_BUILD - comp_error_status resp; -#endif - - CHECKSTATE(p); - - for(i=0;icomp_state)+i,&ratio); - if(resp!=COMP_SUCCESS) {return PRNG_ERR_COMPRESSION;} - /* Use 4 instead of 8 to half compression estimate */ - temp[i] = (int)(ratio*p->poolSize[i]*4); -#else - temp[i] = p->poolSize[i] * 4; -#endif - - } - /* Use minumum of user and compression estimate for compressed sources */ - for(i=ENTROPY_SOURCES;icomp_state)+i,&ratio); - if(resp!=COMP_SUCCESS) {return PRNG_ERR_COMPRESSION;} - /* Use 4 instead of 8 to half compression estimate */ - temp[i] = _MIN((int)(ratio*p->poolSize[i]*4),(int)p->poolEstBits[i]); -#else - temp[i] = _MIN (p->poolSize[i] * 4, p->poolEstBits[i]); -#endif - - } - /* Use user estimate for remaining sources */ - for(i=COMP_SOURCES;ipoolEstBits[i];} - - if(K > 0) { - /* pointless if we're not ignoring any sources */ - bubbleSort(temp,TOTAL_SOURCES); - } - for(i=K,sum=0;iTHRESHOLD) - return prngForceReseed(p, ticks); - else - return PRNG_ERR_NOT_ENOUGH_ENTROPY; -} - -#if SLOW_POLL_ENABLE -/* Call a slow poll and insert the data into the entropy pool */ -static 
prng_error_status -prngSlowPoll(PRNG *p, UINT pollsize) -{ - BYTE *buf; - DWORD len; - prng_error_status retval; - - CHECKSTATE(p); - - buf = (BYTE*)malloc(pollsize); - if(buf==NULL) {return PRNG_ERR_LOW_MEMORY;} - len = prng_slow_poll(buf,pollsize); /* OS specific call */ - retval = prngInput(p, buf,len,SLOWPOLLSOURCE, len * 8); - trashMemory(buf,pollsize); - free(buf); - - return retval; -} -#endif /* SLOW_POLL_ENABLE */ - - -/* Delete the PRNG */ -prng_error_status -prngDestroy(PRNG *p) -{ - UINT i; - - #if MUTEX_ENABLE - if(GetCurrentProcessId()!=mutexCreatorId) {return PRNG_ERR_WRONG_CALLER;} - #endif - if(p==NULL) {return PRNG_SUCCESS;} /* Well, there is nothing to destroy... */ - - p->ready = PRNG_NOT_READY; - - for(i=0;icomp_state)+i); - } - - #if MUTEX_ENABLE - CloseHandle(Statmutex); - Statmutex = NULL; - mutexCreatorId = 0; - #endif - - return PRNG_SUCCESS; -} - - diff --git a/osfmk/prng/YarrowCoreLib/src/prng.h b/osfmk/prng/YarrowCoreLib/src/prng.h deleted file mode 100644 index 7d80758c1..000000000 --- a/osfmk/prng/YarrowCoreLib/src/prng.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: prng.h - - Contains: Core routines for the Counterpane Yarrow PRNG. - - Written by: Counterpane, Inc. - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/99 dpm Created, based on Counterpane source. - -*/ -/* - prng.h - - Main private header for the Counterpane PRNG. Use this to be able access the - initialization and destruction routines from the DLL. -*/ - -#ifndef __YARROW_PRNG_H__ -#define __YARROW_PRNG_H__ - -#if defined(macintosh) || defined(__APPLE__) -#include "prng/YarrowCoreLib/include/yarrow.h" -/* Private function forward declarations */ -// this is in yarrow.h...YARROWAPI prng_error_status prngInitialize(void); -// ditto.... 
YARROWAPI prng_error_status prngDestroy(void); -YARROWAPI prng_error_status prngInputEntropy(PrngRef prng, BYTE *inbuf,UINT inbuflen,UINT poolnum); -#else /* original yarrow code */ -/* Declare YARROWAPI as __declspec(dllexport) before - including this file in the actual DLL */ -#ifndef YARROWAPI -#define YARROWAPI __declspec(dllimport) -#endif - -/* Private function forward declarations */ -YARROWAPI int prngInitialize(void); -YARROWAPI int prngDestroy(void); -YARROWAPI int prngInputEntropy(BYTE *inbuf,UINT inbuflen,UINT poolnum); - -#endif /* macintosh */ -#endif /* __YARROW_PRNG_H__ */ diff --git a/osfmk/prng/YarrowCoreLib/src/prngpriv.h b/osfmk/prng/YarrowCoreLib/src/prngpriv.h deleted file mode 100644 index 3014b4f6e..000000000 --- a/osfmk/prng/YarrowCoreLib/src/prngpriv.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: prngpriv.h - - Contains: Private typedefs and #defines for Counterpane Yarrow PRNG. - - Written by: Counterpane, Inc. - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/99 dpm Created, based on Counterpane source. - -*/ -/* - prngpriv.h - - Completely private header for the Counterpane PRNG. Should only be included by prng.c -*/ - -#ifndef __YARROW_PRNG_PRIV_H__ -#define __YARROW_PRNG_PRIV_H__ - -#include "userdefines.h" -#include "prng/YarrowCoreLib/include/yarrow.h" -#include "entropysources.h" -#include "comp.h" -#include "sha1mod.h" -#include "smf.h" - -#define TOTAL_SOURCES ENTROPY_SOURCES+USER_SOURCES - -#ifdef COMPRESSION_ON -#define COMP_SOURCES TOTAL_SOURCES -#else -#define COMP_SOURCES ENTROPY_SOURCES -#endif - -/* Error numbers */ -typedef enum prng_ready_status { - PRNG_READY = 33, /* Compiler will initialize to either 0 or random if allowed to */ - PRNG_NOT_READY = 0 -} prng_ready_status; - -/* Top level output state */ -typedef struct{ - BYTE IV[20]; - BYTE out[20]; - UINT index; /* current byte to output */ - UINT numout; /* bytes since last prng_make_new_state */ -} GEN_CTX; - -/* PRNG state structure */ -struct PRNG { - /* Output State */ - GEN_CTX outstate; - - /* Entropy Pools (somewhat unlike a gene pool) */ - YSHA1_CTX pool; - UINT poolSize[TOTAL_SOURCES]; /* Note that size is in bytes and est in bits */ - UINT poolEstBits[TOTAL_SOURCES]; - 
COMP_CTX comp_state[COMP_SOURCES]; - - /* Status Flags */ - prng_ready_status ready; -}; - -/* - * Clients see an opaque PrngRef; internal code uses the - * following typedef. - */ -typedef struct PRNG PRNG; - - -/* Test Macros */ -#define CHECKSTATE(p) \ -if(p==NULL) {return PRNG_ERR_NOT_READY;} /* Does the state exist? */ \ -if(p->ready != PRNG_READY) {return PRNG_ERR_NOT_READY;} /* Set error state and return */ -/* To make sure that a pointer isn't NULL */ -#define PCHECK(ptr) if(ptr==NULL) {return PRNG_ERR_NULL_POINTER;} -/* To make sure that malloc returned a valid value */ -#define MCHECK(ptr) if(ptr==NULL) {return PRNG_ERR_LOW_MEMORY;} -/* To make sure that a given value is non-negative */ -#if defined(macintosh) || defined(__APPLE__) -/* original looks like a bogon */ -#define ZCHECK(val) if(val<0) {return PRNG_ERR_OUT_OF_BOUNDS;} -#else -#define ZCHECK(val) if(p<0) {return PRNG_ERR_OUT_OF_BOUNDS;} -#endif /* macintosh */ -/* To make sure that the generator state is valid */ -#define GENCHECK(p) if(p->outstate.index>20) {return PRNG_ERR_OUT_OF_BOUNDS;} /* index is unsigned */ -/* To make sure that the entropy pool is valid */ -#define POOLCHECK(p) /* */ - - -#endif diff --git a/osfmk/prng/YarrowCoreLib/src/readme-prnguser.txt b/osfmk/prng/YarrowCoreLib/src/readme-prnguser.txt deleted file mode 100644 index c7b41fff8..000000000 --- a/osfmk/prng/YarrowCoreLib/src/readme-prnguser.txt +++ /dev/null @@ -1,90 +0,0 @@ -12345678901234567890123456789012345678901234567890123456789012345678901234567890 - -Description of User Routines in Prngcore ----------------------------------------------- - -This files describes routines in prngcore that are designed to be called by the -user (ie client apps). Those interested in the details of the library are -directed to readme-prngcoder. - -Files of interest in this directory ------------------------------------ - -yarrow.h - -Main header file (and the only one needed) for client apps. 
- -userdefines.h - -Header file with macros that can be defined to specify the system that this -code is being compiled on, as well as other details of the prng operation. - -usersources.h - -Header file containing the names of the various user sources of entropic data. -You can add/delete/rename sources by altering the entries in the enumeration. - - -PRNG Client Routines --------------------- - -All major routines return the success/error value for their operation. - -prngOutput(outbuf,outbuflen) - -Writes outbuflen worth of "random" data to outbuf. This routine has -backtracking protection, but you should call prngAllowReseed whenever you can -spare the cycles to guarantee good output. - -prngStretch(inbuf,inbuflen,outbuf,outbuflen) - -Takes inbuflen bytes of data from inbuf and turns it into outbuflen bytes of -data stored in outbuf. - -prngInput(inbuf,inbuflen,poolnum,estbits) - -Takes inbuflen bytes of data from inbuf and places it in entropy pool poolnum. -The user entropy pool names can be found in usersources.h (see above). - -prngForceReseed(ticks) - -Forces a reseed that lasts about ticks ticks long. Be very careful when using -this function to ensure that you do not produce a poor output state. It is -suggested that you instead use prngAllowReseed. - -prngAllowReseed(ticks) - -Will force a reseed if there is enough entropy. A reseed (of length ticks) -will be done if the total entropy estimate, ignoring the K greatest sources, -is greater than THRESHOLD. Currently, K = 0 (a bad idea) and THRESHOLD = 100 -(likely to remain so). These values can be found and edited in userdefines.h. -Will return PRNG_ERR_NOT_ENOUGH_ENTROPY if there is not enough entropy in the -pool at this time. - -prngProcessSeedBuffer(buf,ticks) - -Takes 20 bytes of data from buf and churns it into the entropy pool, and then -forces a reseed of length ticks. The first 20 bytes of output are then -returned in buf for future use with this function. 
It is recommended that data -used with this function be stored very securely. - -prngSlowPoll(pollsize) - -Does a slow poll to collect a large amount of vaguely random data from the OS -itself. The poll with collect at most pollsize bytes, and this parameter can -be used to control (approximately) the length of the poll. The collected data -is fed into the entropy pool. After calling this function you may call either -allow (recommended) or force a reseed if desired. - --------- - -Any questions can be directed to the programmer (me), Ari Benbasat, at -pigsfly@unixg.ubc.ca. Comments would be greatly appreciated. Please cc: all -e-mail to Bruce Schneier, John Kelsey and Chris Hall -{schneier,kelsey,hall}@counterpane.com. - -Thank you. - - - -i diff --git a/osfmk/prng/YarrowCoreLib/src/sha1mod.c b/osfmk/prng/YarrowCoreLib/src/sha1mod.c deleted file mode 100644 index 3f308d952..000000000 --- a/osfmk/prng/YarrowCoreLib/src/sha1mod.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* -SHA-1 in C -By Steve Reid -100% Public Domain -*/ -/* Header portion split from main code for convenience (AYB 3/02/98) */ -#include "sha1mod.h" -#ifdef SHA1HANDSOFF -#include -#endif - -#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits)))) - -/* - * Apple change... - */ -#if defined(macintosh) || defined (__APPLE__) -#undef LITTLE_ENDIAN -#endif - -/* blk0() and blk() perform the initial expand. */ -/* I got the idea of expanding during the round function from SSLeay */ -#ifdef LITTLE_ENDIAN -#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \ - |(rol(block->l[i],8)&0x00FF00FF)) -#else -#define blk0(i) block->l[i] -#endif -#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \ - ^block->l[(i+2)&15]^block->l[i&15],1)) - -/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ -#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30); -#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30); -#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30); -#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30); -#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30); - - -/* Hash a single 512-bit block. This is the core of the algorithm. 
*/ - -__private_extern__ void -YSHA1Transform(u_int32_t state[5], const unsigned char buffer[64]) -{ -u_int32_t a, b, c, d, e; -typedef union { - unsigned char c[64]; - u_int32_t l[16]; -} CHAR64LONG16; -CHAR64LONG16* block; -#ifdef SHA1HANDSOFF -static unsigned char workspace[64]; - block = (CHAR64LONG16*)workspace; - memcpy(block, buffer, 64); -#else - block = (CHAR64LONG16*)buffer; -#endif - /* Copy context->state[] to working vars */ - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - /* 4 rounds of 20 operations each. Loop unrolled. */ - R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); - R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); - R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); - R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); - R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); - R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); - R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); - R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); - R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); - R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); - R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); - R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); - R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); - R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); - R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); - R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); - R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); - R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); - R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); - 
R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); - /* Add the working vars back into context.state[] */ - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - /* Wipe variables */ - a = b = c = d = e = 0; -} - - -/* YSHA1Init - Initialize new context */ - -__private_extern__ void -YSHA1Init(YSHA1_CTX* context) -{ - /* SHA1 initialization constants */ - context->state[0] = 0x67452301; - context->state[1] = 0xEFCDAB89; - context->state[2] = 0x98BADCFE; - context->state[3] = 0x10325476; - context->state[4] = 0xC3D2E1F0; - context->count[0] = context->count[1] = 0; -} - - -/* Run your data through this. */ - -__private_extern__ void -YSHA1Update(YSHA1_CTX* context, const unsigned char* data, unsigned int len) -{ -unsigned int i, j; - - j = (context->count[0] >> 3) & 63; - if ((context->count[0] += len << 3) < (len << 3)) context->count[1]++; - context->count[1] += (len >> 29); - if ((j + len) > 63) { - memcpy(&context->buffer[j], data, (i = 64-j)); - YSHA1Transform(context->state, context->buffer); - for ( ; i + 63 < len; i += 64) { - YSHA1Transform(context->state, &data[i]); - } - j = 0; - } - else i = 0; - memcpy(&context->buffer[j], &data[i], len - i); -} - - -/* Add padding and return the message digest. */ - -__private_extern__ void -YSHA1Final(unsigned char digest[20], YSHA1_CTX* context) -{ -u_int32_t i, j; -unsigned char finalcount[8]; - - for (i = 0; i < 8; i++) { - finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 
0 : 1)] - >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */ - } - YSHA1Update(context, (const unsigned char *)"\200", 1); - while ((context->count[0] & 504) != 448) { - YSHA1Update(context, (const unsigned char *)"\0", 1); - } - YSHA1Update(context, finalcount, 8); /* Should cause a YSHA1Transform() */ - for (i = 0; i < 20; i++) { - digest[i] = (unsigned char) - ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255); - } - /* Wipe variables */ - i = j = 0; - memset(context->buffer, 0, 64); - memset(context->state, 0, 20); - memset(context->count, 0, 8); - memset(finalcount, 0, 8); -#ifdef SHA1HANDSOFF /* make YSHA1Transform overwrite it's own static vars */ - YSHA1Transform(context->state, context->buffer); -#endif -} - - -/*************************************************************/ - -/* Test Code */ - -#if 0 - -int main(int argc, char** argv) -{ -int i, j; -YSHA1_CTX context; -unsigned char digest[20], buffer[16384]; -FILE* file; - - if (argc > 2) { - puts("Public domain SHA-1 implementation - by Steve Reid "); - puts("Produces the SHA-1 hash of a file, or stdin if no file is specified."); - exit(0); - } - if (argc < 2) { - file = stdin; - } - else { - if (!(file = fopen(argv[1], "rb"))) { - fputs("Unable to open file.", stderr); - exit(-1); - } - } - YSHA1Init(&context); - while (!feof(file)) { /* note: what if ferror(file) */ - i = fread(buffer, 1, 16384, file); - YSHA1Update(&context, buffer, i); - } - YSHA1Final(digest, &context); - fclose(file); - for (i = 0; i < 5; i++) { - for (j = 0; j < 4; j++) { - printf("%02X", digest[i*4+j]); - } - putchar(' '); - } - putchar('\n'); - exit(0); -} - -#endif diff --git a/osfmk/prng/YarrowCoreLib/src/sha1mod.h b/osfmk/prng/YarrowCoreLib/src/sha1mod.h deleted file mode 100644 index d969f2c27..000000000 --- a/osfmk/prng/YarrowCoreLib/src/sha1mod.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* -SHA-1 in C -By Steve Reid -100% Public Domain -*/ -/* Header portion split from main code for convenience (AYB 3/02/98) */ - -#ifndef __SHA1_H__ - -#define __SHA1_H__ - -#include - -/* -Test Vectors (from FIPS PUB 180-1) -"abc" - A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D -"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" - 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 -A million repetitions of "a" - 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F -*/ - -/* Apple change - define this in the source file which uses it */ -/* #define LITTLE_ENDIAN This should be #define'd if true. */ -#define SHA1HANDSOFF /* Copies data before messing with it. 
*/ - -//Context declaration -typedef struct { - u_int32_t state[5]; - u_int32_t count[2]; - unsigned char buffer[64]; -} YSHA1_CTX; - -//Function forward declerations -__private_extern__ void YSHA1Transform(u_int32_t state[5], - const unsigned char buffer[64]); -__private_extern__ void YSHA1Init(YSHA1_CTX* context); -__private_extern__ void YSHA1Update(YSHA1_CTX* context, - const unsigned char* data, unsigned int len); -__private_extern__ void YSHA1Final(unsigned char digest[20], - YSHA1_CTX* context); - -#endif /* __SHA1_H__ */ diff --git a/osfmk/prng/YarrowCoreLib/src/smf.h b/osfmk/prng/YarrowCoreLib/src/smf.h deleted file mode 100644 index b152732b2..000000000 --- a/osfmk/prng/YarrowCoreLib/src/smf.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 1999-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: smf.h - - Contains: Secure malloc/free API. - - Written by: Doug Mitchell - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/10/00 dpm Created, based on Counterpane's Yarrow code. - -*/ - -#ifndef _YARROW_SMF_H_ -#define _YARROW_SMF_H_ - -#if defined(__cplusplus) -extern "C" { -#endif - -/* smf.h */ - - /* - Header file for secure malloc and free routines used by the Counterpane - PRNG. Use this code to set up a memory-mapped file out of the system - paging file, allocate and free memory from it, and then return - the memory to the system registry after having securely overwritten it. - Details of the secure overwrite can be found in Gutmann 1996 (Usenix). - Trying to explain it here will cause my head to begin to hurt. - Ari Benbasat (pigsfly@unixg.ubc.ca) - */ - - - -#if defined(macintosh) || defined(__APPLE__) -#include "macOnly.h" -#define MMPTR void * - -#ifndef SMFAPI -#define SMFAPI -#endif - -#else /* original Yarrow */ - -/* Declare HOOKSAPI as __declspec(dllexport) before - including this file in the actual DLL */ -#ifndef SMFAPI -#define SMFAPI __declspec(dllimport) -#endif -#define MMPTR BYTE - -#endif /* macintosh */ - - -#define MM_NULL ((void *)0) - -/* Function forward declarations */ -SMFAPI void mmInit( void ); -SMFAPI MMPTR mmMalloc(DWORD request); -SMFAPI void mmFree(MMPTR ptrnum); -SMFAPI LPVOID mmGetPtr(MMPTR ptrnum); -SMFAPI void mmReturnPtr(MMPTR ptrnum); -#if 0 -SMFAPI void mmFreePtr(LPVOID ptr); -#endif - -#if defined(__cplusplus) -} -#endif - -#endif /* _YARROW_SMF_H_*/ diff --git a/osfmk/prng/YarrowCoreLib/src/userdefines.h b/osfmk/prng/YarrowCoreLib/src/userdefines.h deleted file mode 100644 index 3d76b4b7a..000000000 --- a/osfmk/prng/YarrowCoreLib/src/userdefines.h +++ /dev/null @@ -1,53 +0,0 @@ -/* 
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - userdefines.h - - Header file that contains the major user-defineable quantities for the Counterpane PRNG. -*/ -#ifndef __YARROW_USER_DEFINES_H__ -#define __YARROW_USER_DEFINES_H__ - -/* User-alterable define statements */ -#define STRICT /* Define to force strict type checking */ -#define K 0 /* How many sources should we ignore when calculating total entropy? 
*/ -#define THRESHOLD 100 /* Minimum amount of entropy for a reseed */ -#define BACKTRACKLIMIT 500 /* Number of outputed bytes after which to generate a new state */ -#define COMPRESSION_ON /* Define this variable to add on-the-fly compression (recommended) */ - /* for user sources */ -#if !defined(macintosh) && !defined(__APPLE__) -#define WIN_95 /* Choose an OS: WIN_95, WIN_NT */ -#endif - -/* Setup Microsoft flag for NT4.0 */ -#ifdef WIN_NT -#define _WIN32_WINNT 0x0400 -#endif - -#endif /* __YARROW_USER_DEFINES_H__ */ diff --git a/osfmk/prng/YarrowCoreLib/src/yarrowUtils.c b/osfmk/prng/YarrowCoreLib/src/yarrowUtils.c deleted file mode 100644 index 68b6ce0e3..000000000 --- a/osfmk/prng/YarrowCoreLib/src/yarrowUtils.c +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
- * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - File: yarrowUtils.c - - Contains: Misc. utility functions. - - Written by: Doug Mitchell - - Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved. - - Change History (most recent first): - - 02/29/00 dpm Created. - -*/ - -#include "prng/YarrowCoreLib/include/yarrowUtils.h" -#include - -void -trashMemory(void* mem, int len) -/* This function should only be used on data in RAM */ -{ - if(len == 0) { - /* some memsets really don't like this */ - return; - } - - /* Cycle a bit just in case it is one of those weird memory units */ - /* No, I don't know which units those would be */ - memset(mem,0x00,len); - memset(mem,0xFF,len); - memset(mem,0x00,len); -} - - diff --git a/osfmk/prng/fips_sha1.c b/osfmk/prng/fips_sha1.c deleted file mode 100644 index 93a006804..000000000 --- a/osfmk/prng/fips_sha1.c +++ /dev/null @@ -1,386 +0,0 @@ -/* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * This SHA1 code is based on the basic framework from the reference - * implementation for MD5. That implementation is Copyright (C) - * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. - * - * License to copy and use this software is granted provided that it - * is identified as the "RSA Data Security, Inc. MD5 Message-Digest - * Algorithm" in all material mentioning or referencing this software - * or this function. - * - * License is also granted to make and use derivative works provided - * that such works are identified as "derived from the RSA Data - * Security, Inc. MD5 Message-Digest Algorithm" in all material - * mentioning or referencing the derived work. - * - * RSA Data Security, Inc. makes no representations concerning either - * the merchantability of this software or the suitability of this - * software for any particular purpose. It is provided "as is" - * without express or implied warranty of any kind. - * - * These notices must be retained in any copies of any part of this - * documentation and/or software. - * - * Based on the FIPS 180-1: Secure Hash Algorithm (SHA-1) available at - * http://www.itl.nist.gov/div897/pubs/fip180-1.htm - */ - -/* - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! - - THIS FILE IS NEEDED TO PASS FIPS ACCEPTANCE FOR THE RANDOM NUMBER GENERATOR. 
- IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THOUGH FIPS ACCEPTANCE AGAIN, - AN OPERATION THAT IS VERY EXPENSIVE AND TIME CONSUMING. IN OTHER WORDS, - DON'T MESS WITH THIS FILE. - - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! -*/ - -#include -#include - -#include "fips_sha1.h" - -typedef int Boolean; - -/* Internal mappings to the legacy sha1_ctxt structure. */ -#define state h.b32 -#define bcount c.b32 -#define buffer m.b8 - -/* - * The digest algorithm interprets the input message as a sequence of 32-bit - * big-endian words. We must reverse bytes in each word on x86/64 platforms, - * but not on big-endian ones such as PPC. For performance, we take advantage - * of the bswap instruction on x86/64 to perform byte-reversal. On PPC, we - * could do 4-byte load if the address is 4-byte aligned which should further - * improve the performance. But for code simplicity, we punt and do 1-byte - * loads instead. - */ -#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) -#define FETCH_32(p) ({ \ - u_int32_t l = (u_int32_t)*((const u_int32_t *)(p)); \ - __asm__ __volatile__("bswap %0" : "=r" (l) : "0" (l)); \ - l; \ -}) -#else -#define FETCH_32(p) \ - (((u_int32_t)*((const u_int8_t *)(p) + 3)) | \ - (((u_int32_t)*((const u_int8_t *)(p) + 2)) << 8) | \ - (((u_int32_t)*((const u_int8_t *)(p) + 1)) << 16) | \ - (((u_int32_t)*((const u_int8_t *)(p))) << 24)) -#endif /* __i386__ || __x86_64__ */ - -/* - * Encodes input (u_int32_t) into output (unsigned char). Assumes len is - * a multiple of 4. This is not compatible with memcpy(). 
- */ -static void -Encode(unsigned char *output, u_int32_t *input, unsigned int len) -{ - unsigned int i, j; - - for (i = 0, j = 0; j < len; i++, j += 4) { - output[j + 3] = input[i] & 0xff; - output[j + 2] = (input[i] >> 8) & 0xff; - output[j + 1] = (input[i] >> 16) & 0xff; - output[j] = (input[i] >> 24) & 0xff; - } -} - -static unsigned char PADDING[64] = { 0x80, /* zeros */ }; - -/* Constants from FIPS 180-1 */ -#define K_00_19 0x5a827999UL -#define K_20_39 0x6ed9eba1UL -#define K_40_59 0x8f1bbcdcUL -#define K_60_79 0xca62c1d6UL - -/* F, G, H and I are basic SHA1 functions. */ -#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) -#define G(b, c, d) ((b) ^ (c) ^ (d)) -#define H(b, c, d) (((b) & (c)) | (((b) | (c)) & (d))) - -/* ROTATE_LEFT rotates x left n bits. */ -#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) - -/* R, R1-R4 are macros used during each transformation round. */ -#define R(f, k, v, w, x, y, z, i) { \ - (v) = ROTATE_LEFT(w, 5) + f(x, y, z) + (v) + (i) + (k); \ - (x) = ROTATE_LEFT(x, 30); \ -} - -#define R1(v, w, x, y, z, i) R(F, K_00_19, v, w, x, y, z, i) -#define R2(v, w, x, y, z, i) R(G, K_20_39, v, w, x, y, z, i) -#define R3(v, w, x, y, z, i) R(H, K_40_59, v, w, x, y, z, i) -#define R4(v, w, x, y, z, i) R(G, K_60_79, v, w, x, y, z, i) - -/* WUPDATE represents Wt variable that gets updated for steps 16-79 */ -#define WUPDATE(p, q, r, s) { \ - (p) = ((q) ^ (r) ^ (s) ^ (p)); \ - (p) = ROTATE_LEFT(p, 1); \ -} - -static void SHA1Transform(u_int32_t, u_int32_t, u_int32_t, u_int32_t, - u_int32_t, const u_int8_t *, SHA1_CTX *); - -/* - * SHA1 initialization. Begins a SHA1 operation, writing a new context. - */ -void -FIPS_SHA1Init(SHA1_CTX *context) -{ - context->bcount[0] = context->bcount[1] = 0; - context->count = 0; - - /* Load magic initialization constants. 
*/ - context->state[0] = 0x67452301UL; - context->state[1] = 0xefcdab89UL; - context->state[2] = 0x98badcfeUL; - context->state[3] = 0x10325476UL; - context->state[4] = 0xc3d2e1f0UL; -} - -/* - * SHA1 block update operation. Continues a SHA1 message-digest - * operation, processing another message block, and updating the - * context. - */ -void FIPS_SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen) -{ - u_int32_t i, index, partLen; - const unsigned char *input = (const unsigned char *)inpp; - - if (inputLen == 0) - return; - - /* Compute number of bytes mod 64 */ - index = (context->bcount[1] >> 3) & 0x3F; - - /* Update number of bits */ - if ((context->bcount[1] += (inputLen << 3)) < (inputLen << 3)) - context->bcount[0]++; - context->bcount[0] += (inputLen >> 29); - - partLen = 64 - index; - - /* Transform as many times as possible. */ - i = 0; - if (inputLen >= partLen) { - if (index != 0) { - memcpy(&context->buffer[index], input, partLen); - SHA1Transform(context->state[0], context->state[1], - context->state[2], context->state[3], - context->state[4], context->buffer, context); - i = partLen; - } - - for (; i + 63 < inputLen; i += 64) - SHA1Transform(context->state[0], context->state[1], - context->state[2], context->state[3], - context->state[4], &input[i], context); - - if (inputLen == i) - return; - - index = 0; - } - - /* Buffer remaining input */ - memcpy(&context->buffer[index], &input[i], inputLen - i); -} - - - - -/* - * This is function is only called in from the pagefault path or from page_copy(). - * So we assume that we can safely convert the virtual address to the physical address and use it. - * Assumptions: The passed in address(inpp) is a kernel virtual address - * and a physical page has been faulted in. - * The inputLen passed in should always be less than or equal to a page size (4096) - * and inpp should be on a page boundary. - * "performSHA1WithinKernelOnly" is initialized only when the hardware driver exists and is ready. 
- */ - - - -/* - * SHA1 finalization. Ends an SHA1 message-digest operation, writing the - * the message digest and zeroizing the context. - */ -void -FIPS_SHA1Final(void *digest, SHA1_CTX *context) -{ - unsigned char bits[8]; - u_int32_t index = (context->bcount[1] >> 3) & 0x3f; - - /* Save number of bits */ - Encode(bits, context->bcount, 8); - - /* Pad out to 56 mod 64. */ - FIPS_SHA1Update(context, PADDING, ((index < 56) ? 56 : 120) - index); - - /* Append length (before padding) */ - FIPS_SHA1Update(context, bits, 8); - - /* Store state in digest */ - Encode(digest, context->state, 20); - - /* Zeroize sensitive information. */ - memset(context, 0, sizeof (*context)); -} - -/* - * SHA1 basic transformation. Transforms state based on block. - */ -static void -SHA1Transform(u_int32_t a, u_int32_t b, u_int32_t c, u_int32_t d, - u_int32_t e, const u_int8_t block[64], SHA1_CTX *context) -{ - /* Register (instead of array) is a win in most cases */ - u_int32_t w0, w1, w2, w3, w4, w5, w6, w7; - u_int32_t w8, w9, w10, w11, w12, w13, w14, w15; - - w15 = FETCH_32(block + 60); - w14 = FETCH_32(block + 56); - w13 = FETCH_32(block + 52); - w12 = FETCH_32(block + 48); - w11 = FETCH_32(block + 44); - w10 = FETCH_32(block + 40); - w9 = FETCH_32(block + 36); - w8 = FETCH_32(block + 32); - w7 = FETCH_32(block + 28); - w6 = FETCH_32(block + 24); - w5 = FETCH_32(block + 20); - w4 = FETCH_32(block + 16); - w3 = FETCH_32(block + 12); - w2 = FETCH_32(block + 8); - w1 = FETCH_32(block + 4); - w0 = FETCH_32(block + 0); - - /* Round 1 */ - R1(e, a, b, c, d, w0); /* 0 */ - R1(d, e, a, b, c, w1); /* 1 */ - R1(c, d, e, a, b, w2); /* 2 */ - R1(b, c, d, e, a, w3); /* 3 */ - R1(a, b, c, d, e, w4); /* 4 */ - R1(e, a, b, c, d, w5); /* 5 */ - R1(d, e, a, b, c, w6); /* 6 */ - R1(c, d, e, a, b, w7); /* 7 */ - R1(b, c, d, e, a, w8); /* 8 */ - R1(a, b, c, d, e, w9); /* 9 */ - R1(e, a, b, c, d, w10); /* 10 */ - R1(d, e, a, b, c, w11); /* 11 */ - R1(c, d, e, a, b, w12); /* 12 */ - R1(b, c, d, e, a, 
w13); /* 13 */ - R1(a, b, c, d, e, w14); /* 14 */ - R1(e, a, b, c, d, w15); /* 15 */ - WUPDATE( w0, w13, w8, w2); R1(d, e, a, b, c, w0); /* 16 */ - WUPDATE( w1, w14, w9, w3); R1(c, d, e, a, b, w1); /* 17 */ - WUPDATE( w2, w15, w10, w4); R1(b, c, d, e, a, w2); /* 18 */ - WUPDATE( w3, w0, w11, w5); R1(a, b, c, d, e, w3); /* 19 */ - - /* Round 2 */ - WUPDATE( w4, w1, w12, w6); R2(e, a, b, c, d, w4); /* 20 */ - WUPDATE( w5, w2, w13, w7); R2(d, e, a, b, c, w5); /* 21 */ - WUPDATE( w6, w3, w14, w8); R2(c, d, e, a, b, w6); /* 22 */ - WUPDATE( w7, w4, w15, w9); R2(b, c, d, e, a, w7); /* 23 */ - WUPDATE( w8, w5, w0, w10); R2(a, b, c, d, e, w8); /* 24 */ - WUPDATE( w9, w6, w1, w11); R2(e, a, b, c, d, w9); /* 25 */ - WUPDATE(w10, w7, w2, w12); R2(d, e, a, b, c, w10); /* 26 */ - WUPDATE(w11, w8, w3, w13); R2(c, d, e, a, b, w11); /* 27 */ - WUPDATE(w12, w9, w4, w14); R2(b, c, d, e, a, w12); /* 28 */ - WUPDATE(w13, w10, w5, w15); R2(a, b, c, d, e, w13); /* 29 */ - WUPDATE(w14, w11, w6, w0); R2(e, a, b, c, d, w14); /* 30 */ - WUPDATE(w15, w12, w7, w1); R2(d, e, a, b, c, w15); /* 31 */ - WUPDATE( w0, w13, w8, w2); R2(c, d, e, a, b, w0); /* 32 */ - WUPDATE( w1, w14, w9, w3); R2(b, c, d, e, a, w1); /* 33 */ - WUPDATE( w2, w15, w10, w4); R2(a, b, c, d, e, w2); /* 34 */ - WUPDATE( w3, w0, w11, w5); R2(e, a, b, c, d, w3); /* 35 */ - WUPDATE( w4, w1, w12, w6); R2(d, e, a, b, c, w4); /* 36 */ - WUPDATE( w5, w2, w13, w7); R2(c, d, e, a, b, w5); /* 37 */ - WUPDATE( w6, w3, w14, w8); R2(b, c, d, e, a, w6); /* 38 */ - WUPDATE( w7, w4, w15, w9); R2(a, b, c, d, e, w7); /* 39 */ - - /* Round 3 */ - WUPDATE( w8, w5, w0, w10); R3(e, a, b, c, d, w8); /* 40 */ - WUPDATE( w9, w6, w1, w11); R3(d, e, a, b, c, w9); /* 41 */ - WUPDATE(w10, w7, w2, w12); R3(c, d, e, a, b, w10); /* 42 */ - WUPDATE(w11, w8, w3, w13); R3(b, c, d, e, a, w11); /* 43 */ - WUPDATE(w12, w9, w4, w14); R3(a, b, c, d, e, w12); /* 44 */ - WUPDATE(w13, w10, w5, w15); R3(e, a, b, c, d, w13); /* 45 */ - WUPDATE(w14, w11, w6, w0); R3(d, 
e, a, b, c, w14); /* 46 */ - WUPDATE(w15, w12, w7, w1); R3(c, d, e, a, b, w15); /* 47 */ - WUPDATE( w0, w13, w8, w2); R3(b, c, d, e, a, w0); /* 48 */ - WUPDATE( w1, w14, w9, w3); R3(a, b, c, d, e, w1); /* 49 */ - WUPDATE( w2, w15, w10, w4); R3(e, a, b, c, d, w2); /* 50 */ - WUPDATE( w3, w0, w11, w5); R3(d, e, a, b, c, w3); /* 51 */ - WUPDATE( w4, w1, w12, w6); R3(c, d, e, a, b, w4); /* 52 */ - WUPDATE( w5, w2, w13, w7); R3(b, c, d, e, a, w5); /* 53 */ - WUPDATE( w6, w3, w14, w8); R3(a, b, c, d, e, w6); /* 54 */ - WUPDATE( w7, w4, w15, w9); R3(e, a, b, c, d, w7); /* 55 */ - WUPDATE( w8, w5, w0, w10); R3(d, e, a, b, c, w8); /* 56 */ - WUPDATE( w9, w6, w1, w11); R3(c, d, e, a, b, w9); /* 57 */ - WUPDATE(w10, w7, w2, w12); R3(b, c, d, e, a, w10); /* 58 */ - WUPDATE(w11, w8, w3, w13); R3(a, b, c, d, e, w11); /* 59 */ - - WUPDATE(w12, w9, w4, w14); R4(e, a, b, c, d, w12); /* 60 */ - WUPDATE(w13, w10, w5, w15); R4(d, e, a, b, c, w13); /* 61 */ - WUPDATE(w14, w11, w6, w0); R4(c, d, e, a, b, w14); /* 62 */ - WUPDATE(w15, w12, w7, w1); R4(b, c, d, e, a, w15); /* 63 */ - WUPDATE( w0, w13, w8, w2); R4(a, b, c, d, e, w0); /* 64 */ - WUPDATE( w1, w14, w9, w3); R4(e, a, b, c, d, w1); /* 65 */ - WUPDATE( w2, w15, w10, w4); R4(d, e, a, b, c, w2); /* 66 */ - WUPDATE( w3, w0, w11, w5); R4(c, d, e, a, b, w3); /* 67 */ - WUPDATE( w4, w1, w12, w6); R4(b, c, d, e, a, w4); /* 68 */ - WUPDATE( w5, w2, w13, w7); R4(a, b, c, d, e, w5); /* 69 */ - WUPDATE( w6, w3, w14, w8); R4(e, a, b, c, d, w6); /* 70 */ - WUPDATE( w7, w4, w15, w9); R4(d, e, a, b, c, w7); /* 71 */ - WUPDATE( w8, w5, w0, w10); R4(c, d, e, a, b, w8); /* 72 */ - WUPDATE( w9, w6, w1, w11); R4(b, c, d, e, a, w9); /* 73 */ - WUPDATE(w10, w7, w2, w12); R4(a, b, c, d, e, w10); /* 74 */ - WUPDATE(w11, w8, w3, w13); R4(e, a, b, c, d, w11); /* 75 */ - WUPDATE(w12, w9, w4, w14); R4(d, e, a, b, c, w12); /* 76 */ - WUPDATE(w13, w10, w5, w15); R4(c, d, e, a, b, w13); /* 77 */ - WUPDATE(w14, w11, w6, w0); R4(b, c, d, e, a, w14); /* 78 */ - 
WUPDATE(w15, w12, w7, w1); R4(a, b, c, d, e, w15); /* 79 */ - - context->state[0] += a; - context->state[1] += b; - context->state[2] += c; - context->state[3] += d; - context->state[4] += e; - - /* Zeroize sensitive information. */ - w15 = w14 = w13 = w12 = w11 = w10 = w9 = w8 = 0; - w7 = w6 = w5 = w4 = w3 = w2 = w1 = w0 = 0; -} diff --git a/osfmk/prng/fips_sha1.h b/osfmk/prng/fips_sha1.h deleted file mode 100644 index 092c48b36..000000000 --- a/osfmk/prng/fips_sha1.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2000-2013 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! 
- - THIS FILE IS NEEDED TO PASS FIPS ACCEPTANCE FOR THE RANDOM NUMBER GENERATOR. - IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THOUGH FIPS ACCEPTANCE AGAIN, - AN OPERATION THAT IS VERY EXPENSIVE AND TIME CONSUMING. IN OTHER WORDS, - DON'T MESS WITH THIS FILE. - - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! -*/ - -#ifndef _CRYPTO_FIPS_SHA1_H_ -#define _CRYPTO_FIPS_SHA1_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include - -#define SHA_DIGEST_LENGTH 20 -#define SHA1_RESULTLEN SHA_DIGEST_LENGTH - -typedef struct sha1_ctxt { - union { - u_int8_t b8[20]; - u_int32_t b32[5]; /* state (ABCDE) */ - } h; - union { - u_int8_t b8[8]; - u_int32_t b32[2]; - u_int64_t b64[1]; /* # of bits, modulo 2^64 (msb first) */ - } c; - union { - u_int8_t b8[64]; - u_int32_t b32[16]; /* input buffer */ - } m; - u_int8_t count; /* unused; for compatibility only */ -} SHA1_CTX; - -extern void FIPS_SHA1Init(SHA1_CTX *); -extern void FIPS_SHA1Update(SHA1_CTX *, const void *, size_t); -extern void FIPS_SHA1Final(void *, SHA1_CTX *); - -#ifdef __cplusplus -} -#endif - -#endif /*_CRYPTO_SHA1_H_*/ diff --git a/osfmk/prng/random.c b/osfmk/prng/prng_random.c similarity index 59% rename from osfmk/prng/random.c rename to osfmk/prng/prng_random.c index 5dc056f8a..ec90cc789 100644 --- a/osfmk/prng/random.c +++ b/osfmk/prng/prng_random.c @@ -2,7 +2,7 @@ * Copyright (c) 2013 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. 
- * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,117 +22,131 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#include -#include -#include #include #include #include #include #include -#include +#include #include +#include #include #include +#include +#include #include #include -#include -#include +#include #include #include +#include +#include -#include +#include #include +#include #include -#include #include +#include -#include -#include #include +#include +#include + +#include + +#if defined(__arm__) || defined(__arm64__) +#include // For MAX_CPUS +#endif #if defined(__x86_64__) #include -static int rdseed_step(uint64_t *seed) +static int +rdseed_step(uint64_t * seed) { uint8_t ok; - - asm volatile ("rdseed %0; setc %1" : "=r" (*seed), "=qm" (ok)); - - return (int) ok; + + asm volatile("rdseed %0; setc %1" : "=r"(*seed), "=qm"(ok)); + + return (int)ok; } -static int rdseed_retry(uint64_t *seed, size_t nretries) +static int +rdseed_retry(uint64_t * seed, size_t nretries) { size_t i; - + for (i = 0; i < nretries; i += 1) { if (rdseed_step(seed)) { return 1; } else { - asm volatile ("pause"); + asm volatile("pause"); } } - + return 0; } -static size_t rdseed_seed(void *buf, size_t nwords) +static size_t +rdseed_seed(void * buf, size_t nwords) { - uint64_t *buf_words; + uint64_t * buf_words; size_t i; - + if (nwords > 8) { nwords = 8; } - + buf_words = buf; for (i = 0; i < nwords; i += 1) { if (!rdseed_retry(buf_words + i, 10)) { return i; } } - + return nwords; } -static int 
rdrand_step(uint64_t *rand) +static int +rdrand_step(uint64_t * rand) { uint8_t ok; - - asm volatile ("rdrand %0; setc %1" : "=r" (*rand), "=qm" (ok)); - - return (int) ok; + + asm volatile("rdrand %0; setc %1" : "=r"(*rand), "=qm"(ok)); + + return (int)ok; } -static int rdrand_retry(uint64_t *rand, size_t nretries) +static int +rdrand_retry(uint64_t * rand, size_t nretries) { size_t i; - + for (i = 0; i < nretries; i += 1) { if (rdrand_step(rand)) { return 1; } } - + return 0; } -static size_t rdrand_seed(void *buf, size_t nwords) +static size_t +rdrand_seed(void * buf, size_t nwords) { size_t i; uint64_t w; uint8_t hash[CCSHA256_OUTPUT_SIZE]; - const struct ccdigest_info *di = &ccsha256_ltc_di; - + const struct ccdigest_info * di = &ccsha256_ltc_di; + ccdigest_di_decl(di, ctx); ccdigest_init(di, ctx); - + for (i = 0; i < 1023; i += 1) { if (!rdrand_retry(&w, 10)) { nwords = 0; @@ -140,61 +154,65 @@ static size_t rdrand_seed(void *buf, size_t nwords) } ccdigest_update(di, ctx, sizeof w, &w); } - + ccdigest_final(di, ctx, hash); - + if (nwords > 2) { nwords = 2; } - - memcpy(buf, hash, nwords * sizeof (uint64_t)); - + + memcpy(buf, hash, nwords * sizeof(uint64_t)); + out: ccdigest_di_clear(di, ctx); bzero(hash, sizeof hash); bzero(&w, sizeof w); - + return nwords; } -static void intel_entropysource(void *buf, size_t *nbytes) +static void +intel_entropysource(void * buf, size_t * nbytes) { size_t nwords; - + /* only handle complete words */ - assert(*nbytes % sizeof (uint64_t) == 0); - - nwords = (*nbytes) / sizeof (uint64_t); + assert(*nbytes % sizeof(uint64_t) == 0); + + nwords = (*nbytes) / sizeof(uint64_t); if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED) { - nwords = rdseed_seed(buf, nwords); - *nbytes = nwords * sizeof (uint64_t); + nwords = rdseed_seed(buf, nwords); + *nbytes = nwords * sizeof(uint64_t); } else if (cpuid_features() & CPUID_FEATURE_RDRAND) { - nwords = rdrand_seed(buf, nwords); - *nbytes = nwords * sizeof (uint64_t); + nwords = 
rdrand_seed(buf, nwords); + *nbytes = nwords * sizeof(uint64_t); } else { *nbytes = 0; } } -#endif +#endif /* defined(__x86_64__) */ + +void entropy_buffer_read(void * buffer, size_t * count); -typedef void (*entropysource)(void *buf, size_t *nbytes); +typedef void (*entropysource)(void * buf, size_t * nbytes); static const entropysource entropysources[] = { - entropy_buffer_read, + entropy_buffer_read, #if defined(__x86_64__) - intel_entropysource, + intel_entropysource, #endif }; static const size_t nsources = sizeof entropysources / sizeof entropysources[0]; -static size_t entropy_readall(void *buf, size_t nbytes_persource) +static size_t +entropy_readall(void * buf, size_t nbytes_persource) { - uint8_t *buf_bytes = buf; + uint8_t * buf_bytes = buf; size_t i; size_t nbytes_total = 0; - + for (i = 0; i < nsources; i += 1) { size_t nbytes = nbytes_persource; entropysources[i](buf_bytes, &nbytes); @@ -202,48 +220,84 @@ static size_t entropy_readall(void *buf, size_t nbytes_persource) nbytes_total += nbytes; buf_bytes += nbytes_persource; } - + return nbytes_total; } static struct { - lck_grp_t *group; - lck_attr_t *attrs; - lck_grp_attr_t *group_attrs; - lck_mtx_t *mutex; -} lock; - -typedef struct prngContext { - struct ccdrbg_info *infop; - struct ccdrbg_state *statep; - uint64_t bytes_generated; - uint64_t bytes_reseeded; -} *prngContextp; - -ccdrbg_factory_t prng_ccdrbg_factory = NULL; - -entropy_data_t EntropyData = { - .index_ptr = EntropyData.buffer -}; + struct cckprng_ctx ctx; + struct { + lck_grp_t * group; + lck_attr_t * attrs; + lck_grp_attr_t * group_attrs; + lck_mtx_t * mutex; + } lock; +} prng; + +static SECURITY_READ_ONLY_LATE(prng_fns_t) prng_fns = NULL; + +static int +prng_init(cckprng_ctx_t ctx, size_t nbytes, const void * seed) +{ + int err = prng_fns->init(ctx, nbytes, seed); + if (err == CCKPRNG_ABORT) { + panic("prng_init"); + } + return err; +} + +#define PERMIT_WRITE_RANDOM 0 + +#if PERMIT_WRITE_RANDOM +static int +prng_reseed(cckprng_ctx_t 
ctx, size_t nbytes, const void * seed) +{ + int err = prng_fns->reseed(ctx, nbytes, seed); + if (err == CCKPRNG_ABORT) { + panic("prng_reseed"); + } + return err; +} +#endif + +static int +prng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void * entropy) +{ + int err = prng_fns->addentropy(ctx, nbytes, entropy); + if (err == CCKPRNG_ABORT) { + panic("prng_addentropy"); + } + return err; +} + +static int +prng_generate(cckprng_ctx_t ctx, size_t nbytes, void * out) +{ + int err = prng_fns->generate(ctx, nbytes, out); + if (err == CCKPRNG_ABORT) { + panic("prng_generate"); + } + return err; +} + +entropy_data_t EntropyData = {.index_ptr = EntropyData.buffer}; static struct { uint8_t seed[nsources][EARLY_RANDOM_SEED_SIZE]; - size_t seedset; + int seedset; uint8_t master_drbg_state[EARLY_RANDOM_STATE_STATIC_SIZE]; - struct ccdrbg_state *drbg_states[MAX_CPUS]; + struct ccdrbg_state * drbg_states[MAX_CPUS]; struct ccdrbg_info drbg_info; const struct ccdrbg_nisthmac_custom drbg_custom; -} erandom = { - .drbg_custom = { - .di = &ccsha1_eay_di, - .strictFIPS = 0, - } -}; +} erandom = {.drbg_custom = { + .di = &ccsha1_eay_di, + .strictFIPS = 0, + }}; -static void read_erandom(void *buf, uint32_t nbytes); +static void read_erandom(void * buf, uint32_t nbytes); -void -entropy_buffer_read(void *buffer, size_t *count) +void +entropy_buffer_read(void * buffer, size_t * count) { boolean_t current_state; unsigned int i, j; @@ -264,19 +318,18 @@ entropy_buffer_read(void *buffer, size_t *count) for (i = 0, j = (ENTROPY_BUFFER_SIZE - 1); i < ENTROPY_BUFFER_SIZE; j = i, i++) EntropyData.buffer[i] = EntropyData.buffer[i] ^ EntropyData.buffer[j]; - (void) ml_set_interrupts_enabled(current_state); + (void)ml_set_interrupts_enabled(current_state); #if DEVELOPMENT || DEBUG - uint32_t *word = buffer; + uint32_t * word = buffer; /* Good for both 32-bit and 64-bit kernels. 
*/ for (i = 0; i < ENTROPY_BUFFER_SIZE; i += 4) /* * We use "EARLY" here so that we can grab early entropy on * ARM, where tracing is not started until after PRNG is * initialized. - */ - KERNEL_DEBUG_EARLY(ENTROPY_READ(i/4), - word[i+0], word[i+1], word[i+2], word[i+3]); + */ + KERNEL_DEBUG_EARLY(ENTROPY_READ(i / 4), word[i + 0], word[i + 1], word[i + 2], word[i + 3]); #endif } @@ -317,33 +370,31 @@ entropy_buffer_read(void *buffer, size_t *count) uint64_t early_random(void) { - uint32_t cnt = 0; - uint64_t result; - uint64_t nonce; - int rc; - int ps; - struct ccdrbg_state *state; + uint32_t cnt = 0; + uint64_t result; + uint64_t nonce; + int rc; + int ps; + struct ccdrbg_state * state; if (!erandom.seedset) { erandom.seedset = 1; - cnt = PE_get_random_seed((unsigned char *) EntropyData.buffer, - sizeof(EntropyData.buffer)); + cnt = PE_get_random_seed((unsigned char *)EntropyData.buffer, sizeof(EntropyData.buffer)); if (cnt < sizeof(EntropyData.buffer)) { /* * Insufficient entropy is fatal. We must fill the * entire entropy buffer during initializaton. 
*/ - panic("EntropyData needed %lu bytes, but got %u.\n", - sizeof(EntropyData.buffer), cnt); - } + panic("EntropyData needed %lu bytes, but got %u.\n", sizeof(EntropyData.buffer), cnt); + } entropy_readall(&erandom.seed, EARLY_RANDOM_SEED_SIZE); /* Init DRBG for NIST HMAC */ ccdrbg_factory_nisthmac(&erandom.drbg_info, &erandom.drbg_custom); assert(erandom.drbg_info.size <= sizeof(erandom.master_drbg_state)); - state = (struct ccdrbg_state *) erandom.master_drbg_state; + state = (struct ccdrbg_state *)erandom.master_drbg_state; erandom.drbg_states[master_cpu] = state; /* @@ -352,22 +403,17 @@ early_random(void) */ assert(sizeof(erandom.seed) > sizeof(nonce)); nonce = ml_get_timebase(); - ps = 0; /* boot cpu */ - rc = ccdrbg_init(&erandom.drbg_info, state, - sizeof(erandom.seed), erandom.seed, - sizeof(nonce), &nonce, - sizeof(ps), &ps); + ps = 0; /* boot cpu */ + rc = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(ps), &ps); cc_clear(sizeof(nonce), &nonce); if (rc != CCDRBG_STATUS_OK) panic("ccdrbg_init() returned %d", rc); /* Generate output */ - rc = ccdrbg_generate(&erandom.drbg_info, state, - sizeof(result), &result, - 0, NULL); + rc = ccdrbg_generate(&erandom.drbg_info, state, sizeof(result), &result, 0, NULL); if (rc != CCDRBG_STATUS_OK) panic("ccdrbg_generate() returned %d", rc); - + return result; }; @@ -377,31 +423,27 @@ early_random(void) } static void -read_erandom(void *buffer, u_int numBytes) +read_erandom(void * buffer, u_int numBytes) { - int cpu; - int rc; + int cpu; + int rc; size_t nbytes; - struct ccdrbg_state *state; + struct ccdrbg_state * state; mp_disable_preemption(); - cpu = cpu_number(); + cpu = cpu_number(); state = erandom.drbg_states[cpu]; assert(state); for (;;) { /* Generate output */ - rc = ccdrbg_generate(&erandom.drbg_info, state, - numBytes, buffer, - 0, NULL); + rc = ccdrbg_generate(&erandom.drbg_info, state, numBytes, buffer, 0, NULL); if (rc == CCDRBG_STATUS_OK) break; 
if (rc == CCDRBG_STATUS_NEED_RESEED) { /* It's time to reseed. Get more entropy */ nbytes = entropy_readall(erandom.seed, EARLY_RANDOM_SEED_SIZE); assert(nbytes >= EARLY_RANDOM_SEED_SIZE); - rc = ccdrbg_reseed(&erandom.drbg_info, state, - sizeof(erandom.seed), erandom.seed, - 0, NULL); + rc = ccdrbg_reseed(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, 0, NULL); cc_clear(sizeof(erandom.seed), erandom.seed); if (rc == CCDRBG_STATUS_OK) continue; @@ -413,9 +455,9 @@ read_erandom(void *buffer, u_int numBytes) } void -read_frandom(void *buffer, u_int numBytes) +read_frandom(void * buffer, u_int numBytes) { - uint8_t *buffer_bytes = buffer; + uint8_t * buffer_bytes = buffer; int nbytes; /* @@ -431,203 +473,114 @@ read_frandom(void *buffer, u_int numBytes) } } -/* - * Register a DRBG factory routine to e used in constructing the kernel PRNG. - * XXX to be called from the corecrypto kext. - */ void -prng_factory_register(ccdrbg_factory_t factory) +early_random_cpu_init(int cpu) { - prng_ccdrbg_factory = factory; - thread_wakeup((event_t) &prng_ccdrbg_factory); -} - -void -prng_cpu_init(int cpu) -{ - uint64_t nonce; - int rc; - struct ccdrbg_state *state; - prngContextp pp; + uint64_t nonce; + int rc; + struct ccdrbg_state * state; /* * Allocate state and initialize DBRG state for early_random() - * for this processor, if necessary. + * for this processor. */ - if (erandom.drbg_states[cpu] == NULL) { - - state = kalloc(erandom.drbg_info.size); - if (state == NULL) { - panic("prng_init kalloc failed\n"); - } - erandom.drbg_states[cpu] = state; - - /* - * Init our DBRG from boot entropy, nonce as timestamp - * and use the cpu number as the personalization parameter. 
- */ - nonce = ml_get_timebase(); - rc = ccdrbg_init(&erandom.drbg_info, state, - sizeof(erandom.seed), erandom.seed, - sizeof(nonce), &nonce, - sizeof(cpu), &cpu); - cc_clear(sizeof(nonce), &nonce); - if (rc != CCDRBG_STATUS_OK) - panic("ccdrbg_init() returned %d", rc); - } + assert(cpu != master_cpu); + assert(erandom.drbg_states[cpu] == NULL); - /* Non-boot cpus use the master cpu's global context */ - if (cpu != master_cpu) { - cpu_datap(cpu)->cpu_prng = master_prng_context(); - return; + state = kalloc(erandom.drbg_info.size); + if (state == NULL) { + panic("prng_init kalloc failed\n"); } + erandom.drbg_states[cpu] = state; - assert(lock.mutex == NULL); /* Once only, please */ - - /* make a mutex to control access */ - lock.group_attrs = lck_grp_attr_alloc_init(); - lock.group = lck_grp_alloc_init("random", lock.group_attrs); - lock.attrs = lck_attr_alloc_init(); - lock.mutex = lck_mtx_alloc_init(lock.group, lock.attrs); - - pp = kalloc(sizeof(*pp)); - if (pp == NULL) - panic("Unable to allocate prng context"); - pp->bytes_generated = 0; - pp->bytes_reseeded = 0; - pp->infop = NULL; - - /* XXX Temporary registration */ - prng_factory_register(ccdrbg_factory_yarrow); - - master_prng_context() = pp; + /* + * Init our DBRG from boot entropy, nonce as timestamp + * and use the cpu number as the personalization parameter. 
+ */ + nonce = ml_get_timebase(); + rc = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(cpu), &cpu); + cc_clear(sizeof(nonce), &nonce); + if (rc != CCDRBG_STATUS_OK) + panic("ccdrbg_init() returned %d", rc); } -static struct ccdrbg_info * -prng_infop(prngContextp pp) +void +register_and_init_prng(prng_fns_t fns) { uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE]; size_t nbytes; - - lck_mtx_assert(lock.mutex, LCK_MTX_ASSERT_OWNED); - - /* Usual case: the info is all set */ - if (pp->infop) - return pp->infop; - - /* - * Possibly wait for the CCDRBG factory routune to be registered - * by corecypto. But panic after waiting for more than 10 seconds. - */ - while (prng_ccdrbg_factory == NULL ) { - wait_result_t wait_result; - assert_wait_timeout((event_t) &prng_ccdrbg_factory, TRUE, - 10, NSEC_PER_USEC); - lck_mtx_unlock(lock.mutex); - wait_result = thread_block(THREAD_CONTINUE_NULL); - if (wait_result == THREAD_TIMED_OUT) - panic("prng_ccdrbg_factory registration timeout"); - lck_mtx_lock(lock.mutex); - } - /* Check we didn't lose the set-up race */ - if (pp->infop) - return pp->infop; - pp->infop = (struct ccdrbg_info *) kalloc(sizeof(struct ccdrbg_info)); - if (pp->infop == NULL) - panic("Unable to allocate prng info"); + assert(cpu_number() == master_cpu); + assert(prng_fns == NULL); - prng_ccdrbg_factory(pp->infop, NULL); + prng_fns = fns; - pp->statep = kalloc(pp->infop->size); - if (pp->statep == NULL) - panic("Unable to allocate prng state"); + /* make a mutex to control access */ + prng.lock.group_attrs = lck_grp_attr_alloc_init(); + prng.lock.group = lck_grp_alloc_init("random", prng.lock.group_attrs); + prng.lock.attrs = lck_attr_alloc_init(); + prng.lock.mutex = lck_mtx_alloc_init(prng.lock.group, prng.lock.attrs); nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE); - - (void) ccdrbg_init(pp->infop, pp->statep, - nbytes, buf, - 0, NULL, - 0, NULL); - cc_clear(sizeof (buf), buf); - return 
pp->infop; + (void)prng_init(&prng.ctx, nbytes, buf); + cc_clear(sizeof(buf), buf); } static void -Reseed(prngContextp pp) +Reseed(void) { uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE]; size_t nbytes; - - nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE); - PRNG_CCDRBG((void) ccdrbg_reseed(pp->infop, pp->statep, - nbytes, buf, - 0, NULL)); + lck_mtx_assert(prng.lock.mutex, LCK_MTX_ASSERT_OWNED); - cc_clear(sizeof (buf), buf); - pp->bytes_reseeded = pp->bytes_generated; + nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE); + PRNG_CCKPRNG((void)prng_addentropy(&prng.ctx, nbytes, buf)); + cc_clear(sizeof(buf), buf); } - /* export good random numbers to the rest of the kernel */ void -read_random(void* buffer, u_int numbytes) +read_random(void * buffer, u_int numbytes) { - prngContextp pp; - struct ccdrbg_info *infop; - int ccdrbg_err; + int err; - lck_mtx_lock(lock.mutex); - - pp = current_prng_context(); - infop = prng_infop(pp); + lck_mtx_lock(prng.lock.mutex); /* - * Call DRBG, reseeding and retrying if requested. + * Call PRNG, reseeding and retrying if requested. 
*/ for (;;) { - PRNG_CCDRBG( - ccdrbg_err = ccdrbg_generate(infop, pp->statep, - numbytes, buffer, - 0, NULL)); - if (ccdrbg_err == CCDRBG_STATUS_OK) + PRNG_CCKPRNG(err = prng_generate(&prng.ctx, numbytes, buffer)); + if (err == CCKPRNG_OK) break; - if (ccdrbg_err == CCDRBG_STATUS_NEED_RESEED) { - Reseed(pp); + if (err == CCKPRNG_NEED_ENTROPY) { + Reseed(); continue; } - panic("read_random ccdrbg error %d\n", ccdrbg_err); + panic("read_random() error %d\n", err); } - pp->bytes_generated += numbytes; - lck_mtx_unlock(lock.mutex); + lck_mtx_unlock(prng.lock.mutex); } int -write_random(void* buffer, u_int numbytes) +write_random(void * buffer, u_int numbytes) { -#if 0 - int retval = 0; - prngContextp pp; - - lck_mtx_lock(lock.mutex); +#if PERMIT_WRITE_RANDOM + int err; - pp = current_prng_context(); + lck_mtx_lock(prng.lock.mutex); + err = prng_reseed(&prng.ctx, numbytes, buffer); + lck_mtx_unlock(prng.lock.mutex); - if (ccdrbg_reseed(prng_infop(pp), pp->statep, - bytesToInput, rdBuffer, 0, NULL) != 0) - retval = EIO; - - lck_mtx_unlock(lock.mutex); - return retval; + return err ? EIO : 0; #else #pragma unused(buffer, numbytes) return 0; #endif } - /* * Boolean PRNG for generating booleans to randomize order of elements * in certain kernel data structures. The algorithm is a @@ -639,7 +592,8 @@ write_random(void* buffer, u_int numbytes) */ /* Initialize the PRNG structures. */ -void random_bool_init(struct bool_gen *bg) +void +random_bool_init(struct bool_gen * bg) { /* Seed the random boolean generator */ for (int i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) { @@ -650,21 +604,18 @@ void random_bool_init(struct bool_gen *bg) } /* Generate random bits and add them to an entropy pool. 
*/ -void random_bool_gen_entropy( - struct bool_gen *bg, - unsigned int *buffer, - int count) +void +random_bool_gen_entropy(struct bool_gen * bg, unsigned int * buffer, int count) { - simple_lock(&bg->lock); int i, t; for (i = 0; i < count; i++) { bg->seed[1] ^= (bg->seed[1] << 5); bg->seed[1] ^= (bg->seed[1] >> 7); bg->seed[1] ^= (bg->seed[1] << 22); - t = bg->seed[2] + bg->seed[3] + bg->state; + t = bg->seed[2] + bg->seed[3] + bg->state; bg->seed[2] = bg->seed[3]; - bg->state = t < 0; + bg->state = t < 0; bg->seed[3] = t & 2147483647; bg->seed[0] += 1411392427; buffer[i] = (bg->seed[0] + bg->seed[1] + bg->seed[3]); @@ -673,11 +624,8 @@ void random_bool_gen_entropy( } /* Get some number of bits from the entropy pool, refilling if necessary. */ -unsigned int random_bool_gen_bits( - struct bool_gen *bg, - unsigned int *buffer, - unsigned int count, - unsigned int numbits) +unsigned int +random_bool_gen_bits(struct bool_gen * bg, unsigned int * buffer, unsigned int count, unsigned int numbits) { unsigned int index = 0; unsigned int rbits = 0; @@ -697,8 +645,8 @@ unsigned int random_bool_gen_bits( /* Collect-a-bit */ unsigned int bit = buffer[index] & 1; - buffer[index] = buffer[index] >> 1; - rbits = bit | (rbits << 1); + buffer[index] = buffer[index] >> 1; + rbits = bit | (rbits << 1); } return rbits; } diff --git a/osfmk/prng/prng_yarrow.c b/osfmk/prng/prng_yarrow.c deleted file mode 100644 index b5f414468..000000000 --- a/osfmk/prng/prng_yarrow.c +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 1999-2013 Apple, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include - -#include - -#include - -#include "fips_sha1.h" - - -/* - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! - - THIS FILE IS NEEDED TO PASS FIPS ACCEPTANCE FOR THE RANDOM NUMBER GENERATOR. - IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THOUGH FIPS ACCEPTANCE AGAIN, - AN OPERATION THAT IS VERY EXPENSIVE AND TIME CONSUMING. IN OTHER WORDS, - DON'T MESS WITH THIS FILE. - - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! -*/ -/* - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! - - ANY CODE PROTECTED UNDER "#ifdef __arm__" IS SERIOUSLY SUPPOSED TO BE THERE! - IF YOU REMOVE ARM CODE, RANDOM WILL NOT MEAN ANYTHING FOR iPHONES ALL OVER. - PLEASE DON'T TOUCH __arm__ CODE IN THIS FILE! - - WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! 
-*/ - - -#define RESEED_TICKS 50 /* how long a reseed operation can take */ - - -typedef u_int8_t BlockWord; -enum {kBSize = 20}; -typedef BlockWord Block[kBSize]; -enum {kBlockSize = sizeof(Block)}; - -struct YarrowContext { - PrngRef PrngRef; - Block xkey; - Block random_data; - int bytes_used; - unsigned char SelfTestInitialized; - u_int32_t LastBlockChecksum; - uint64_t bytes_since_reseed; -}; -typedef struct YarrowContext *YarrowContextp; - -/* define prototypes to keep the compiler happy... */ - -void add_blocks(Block a, Block b, BlockWord carry); -void fips_initialize(YarrowContextp yp); -void random_block(YarrowContextp yp, Block b, int addOptional); -u_int32_t CalculateCRC(u_int8_t* buffer, size_t length); - -/* - * Get 120 bits from yarrow - */ - -/* - * add block b to block a - */ -void -add_blocks(Block a, Block b, BlockWord carry) -{ - int i = kBlockSize - 1; - while (i >= 0) - { - u_int32_t c = (u_int32_t)carry + - (u_int32_t)a[i] + - (u_int32_t)b[i]; - a[i] = c & 0xff; - carry = c >> 8; - i -= 1; - } -} - - - -static char zeros[(512 - kBSize * 8) / 8]; - -static const u_int32_t g_crc_table[] = -{ - 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, - 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, - 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, - 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, - 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, - 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, - 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, - 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, - 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 
0xE8B8D433, - 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, - 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, - 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, - 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, - 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, - 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, - 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, - 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, - 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, - 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, - 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, - 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, - 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79, - 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, - 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, - 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, - 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, - 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, - 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, - 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, - 0xAED16A4A, 0xD9D65ADC, 
0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, - 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, - 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D, -}; - -/* - * Setup for fips compliance - */ - -/* - * calculate a crc-32 checksum - */ -u_int32_t CalculateCRC(u_int8_t* buffer, size_t length) -{ - u_int32_t crc = 0; - - size_t i; - for (i = 0; i < length; ++i) - { - u_int32_t temp = (crc ^ ((u_int32_t) buffer[i])) & 0xFF; - crc = (crc >> 8) ^ g_crc_table[temp]; - } - - return crc; -} - -/* - * get a random block of data per fips 186-2 - */ -void -random_block(YarrowContextp pp, Block b, int addOptional) -{ - SHA1_CTX sha1_ctx; - - int repeatCount = 0; - do - { - // do one iteration - - if (addOptional) - { - // create an xSeed to add. - Block xSeed; - prngOutput (pp->PrngRef, (BYTE*) &xSeed, sizeof (xSeed)); - - // add the seed to the previous value of xkey - add_blocks (pp->xkey, xSeed, 0); - } - - // initialize the value of H - FIPS_SHA1Init(&sha1_ctx); - - // to stay compatible with the FIPS specification, we need to flip the bytes in - // xkey to little endian byte order. In our case, this makes exactly no difference - // (random is random), but we need to do it anyway to keep FIPS happy - - // compute "G" - FIPS_SHA1Update(&sha1_ctx, pp->xkey, kBlockSize); - - // add zeros to fill the internal SHA-1 buffer - FIPS_SHA1Update (&sha1_ctx, (const u_int8_t *)zeros, sizeof (zeros)); - - // we have to do a byte order correction here because the sha1 math is being done internally - // as u_int32_t, not a stream of bytes. 
Since we maintain our data as a byte stream, we need - // to convert - - u_int32_t* finger = (u_int32_t*) b; - - unsigned j; - for (j = 0; j < kBlockSize / sizeof (u_int32_t); ++j) - { - *finger++ = OSSwapHostToBigInt32(sha1_ctx.h.b32[j]); - } - - // calculate the CRC-32 of the block - u_int32_t new_crc = CalculateCRC(sha1_ctx.h.b8, sizeof (Block)); - - // make sure we don't repeat - int cmp = new_crc == pp->LastBlockChecksum; - pp->LastBlockChecksum = new_crc; - if (!pp->SelfTestInitialized) - { - pp->SelfTestInitialized = 1; - return; - } - else if (!cmp) - { - return; - } - - repeatCount += 1; - - // fix up the next value of xkey - add_blocks (pp->xkey, b, 1); - } while (repeatCount < 2); - - /* - * If we got here, three sucessive checksums of the random number - * generator have been the same. Since the odds of this happening are - * 1 in 18,446,744,073,709,551,616, (1 in 18 quintillion) one of the following has - * most likely happened: - * - * 1: There is a significant bug in this code. - * 2: There has been a massive system failure. - * 3: The universe has ceased to exist. - * - * There is no good way to recover from any of these cases. We - * therefore panic. 
- */ - - panic("FIPS random self-test failed."); -} - -const Block kKnownAnswer = {0x92, 0xb4, 0x04, 0xe5, 0x56, 0x58, 0x8c, 0xed, 0x6c, 0x1a, 0xcd, 0x4e, 0xbf, 0x05, 0x3f, 0x68, 0x09, 0xf7, 0x3a, 0x93}; - -void -fips_initialize(YarrowContextp yp) -{ - /* So that we can do the self test, set the seed to zero */ - memset(&yp->xkey, 0, sizeof(yp->xkey)); - - /* other initializations */ - memset (zeros, 0, sizeof (zeros)); - yp->bytes_used = 0; - random_block(yp, yp->random_data, FALSE); - - // check here to see if we got the initial data we were expecting - if (memcmp(kKnownAnswer, yp->random_data, kBlockSize) != 0) - { - panic("FIPS random self test failed"); - } - - // now do the random block again to make sure that userland doesn't get predicatable data - random_block(yp, yp->random_data, TRUE); -} - - -static int -yarrow_init( - const struct ccdrbg_info *info, - struct ccdrbg_state *drbg, - unsigned long entropyLength, const void* entropy, - unsigned long nonceLength, const void* nonce, - unsigned long psLength, const void* ps) -{ -#pragma unused(info) -#pragma unused(nonceLength) -#pragma unused(nonce) -#pragma unused(psLength) -#pragma unused(ps) - YarrowContextp yp = (YarrowContextp) drbg; - prng_error_status perr; - char buffer[16]; - - yp->SelfTestInitialized = 0; - - /* create a Yarrow object */ - perr = prngInitialize(&yp->PrngRef); - if (perr != 0) { - panic("Couldn't initialize Yarrow, /dev/random will not work."); - } - - perr = prngInput(yp->PrngRef, __DECONST(BYTE*, entropy), (UINT) entropyLength, - SYSTEM_SOURCE, (UINT) entropyLength * 8); - if (perr != 0) { - /* an error, complain */ - panic("Couldn't seed Yarrow.\n"); - } - - /* turn the data around */ - perr = prngOutput(yp->PrngRef, (BYTE*) buffer, (UINT) sizeof(buffer)); - - /* and scramble it some more */ - perr = prngForceReseed(yp->PrngRef, RESEED_TICKS); - - fips_initialize(yp); - - yp->bytes_since_reseed = 0; - - return perr; -} - -static int -yarrow_generate( - struct ccdrbg_state *prng, - 
unsigned long outlen, void *out, - unsigned long inlen, const void *in) -{ -#pragma unused(inlen) -#pragma unused(in) - YarrowContextp yp = (YarrowContextp) prng; - int bytes_read = 0; - int bytes_remaining = (int) outlen; - - yp->bytes_since_reseed += outlen; - /* Reseed needed? But allow any length immediately after reseeding. */ - if (yp->bytes_since_reseed != outlen && - yp->bytes_since_reseed > RESEED_BYTES) - return CCDRBG_STATUS_NEED_RESEED; - - while (bytes_remaining > 0) { - int bytes_to_read = MIN(bytes_remaining, - kBlockSize - yp->bytes_used); - if (bytes_to_read == 0) { - random_block(yp, yp->random_data, TRUE); - yp->bytes_used = 0; - bytes_to_read = MIN(bytes_remaining, kBlockSize); - } - - memmove((u_int8_t*) out + bytes_read, - ((u_int8_t*)yp->random_data) + yp->bytes_used, - bytes_to_read); - yp->bytes_used += bytes_to_read; - bytes_read += bytes_to_read; - bytes_remaining -= bytes_to_read; - } - - return CCDRBG_STATUS_OK; -} - -static int -yarrow_reseed( - struct ccdrbg_state *prng, - unsigned long entropylen, const void *entropy, - unsigned long inlen, const void *in) -{ -#pragma unused(inlen) -#pragma unused(in) - YarrowContextp yp = (YarrowContextp) prng; - - (void) prngInput(yp->PrngRef, __DECONST(BYTE*, entropy), (UINT) entropylen, - SYSTEM_SOURCE, (UINT) entropylen * 8); - (void) prngForceReseed(yp->PrngRef, RESEED_TICKS); - - yp->bytes_since_reseed = 0; - - return CCDRBG_STATUS_OK; -} - -static void -yarrow_destroy( - struct ccdrbg_state *prng) -{ -#pragma unused(prng) -} - - -void -ccdrbg_factory_yarrow( - struct ccdrbg_info *info, - const void *custom) -{ - info->size = sizeof(struct YarrowContext); - info->init = yarrow_init; - info->generate = yarrow_generate; - info->reseed = yarrow_reseed; - info->done = yarrow_destroy; - info->custom = custom; -} diff --git a/osfmk/prng/random.h b/osfmk/prng/random.h index 7ba5f00e1..a49b6c730 100644 --- a/osfmk/prng/random.h +++ b/osfmk/prng/random.h @@ -2,7 +2,7 @@ * Copyright (c) 2013 Apple Inc. 
All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,12 +22,12 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _PRNG_RANDOM_H_ -#define _PRNG_RANDOM_H_ +#ifndef _PRNG_RANDOM_H_ +#define _PRNG_RANDOM_H_ __BEGIN_DECLS @@ -42,46 +42,24 @@ typedef struct entropy_data { * TODO: Should index_ptr be volatile? Are we exposed to any races that * we care about if it is not? */ - uint32_t *index_ptr; + uint32_t * index_ptr; uint32_t buffer[ENTROPY_BUFFER_SIZE]; } entropy_data_t; extern entropy_data_t EntropyData; /* Trace codes for DBG_SEC_KERNEL: */ -#define ENTROPY_READ(n) SECURITYDBG_CODE(DBG_SEC_KERNEL, n) /* n: 0 .. 3 */ +#define ENTROPY_READ(n) SECURITYDBG_CODE(DBG_SEC_KERNEL, n) /* n: 0 .. 
3 */ /* * Early_random implementation params: */ -#define EARLY_RANDOM_SEED_SIZE (16) -#define EARLY_RANDOM_STATE_STATIC_SIZE (264) - -#if defined (__x86_64__) -#define current_prng_context() (current_cpu_datap()->cpu_prng) -#define master_prng_context() (cpu_datap(master_cpu)->cpu_prng) -#elif defined (__arm__) || defined(__arm64__) -#include // For MAX_CPUS -#define current_prng_context() (getCpuDatap()->cpu_prng) -#define master_prng_context() (cpu_datap(master_cpu)->cpu_prng) -#else -#error architecture unknown -#endif +#define EARLY_RANDOM_SEED_SIZE (16) +#define EARLY_RANDOM_STATE_STATIC_SIZE (264) -#include -#include - -typedef void (*ccdrbg_factory_t)(struct ccdrbg_info *info, const void *custom); - -extern void ccdrbg_factory_yarrow(struct ccdrbg_info *info, const void *custom); - -void prng_factory_register(ccdrbg_factory_t factory); -void prng_cpu_init(int cpu); - -void entropy_buffer_read(void *buffer, size_t *count); -void entropy_boot_trace(void); +void early_random_cpu_init(int cpu); /* - * Wrapper for requesting a CCDRBG operation. + * Wrapper for requesting a CCKPRNG operation. * This macro makes the DRBG call with pre-emption disabled to ensure that * any attempt to block will cause a panic. And the operation is timed and * cannot exceed 10msec (for development kernels). 
@@ -89,31 +67,39 @@ void entropy_boot_trace(void); */ #define YARROW 1 #if YARROW -#define PRNG_CCDRBG(op) \ -MACRO_BEGIN \ - op; \ -MACRO_END +#define PRNG_CCKPRNG(op) \ + MACRO_BEGIN \ + op; \ + MACRO_END #else -#define PRNG_CCDRBG(op) \ -MACRO_BEGIN \ - uint64_t start; \ - uint64_t stop; \ - disable_preemption(); \ - start = mach_absolute_time(); \ - op; \ - stop = mach_absolute_time(); \ - enable_preemption(); \ - assert(stop - start < 10*NSEC_PER_MSEC || \ - machine_timeout_suspended()); \ - (void) start; \ - (void) stop; \ -MACRO_END +#define PRNG_CCKPRNG(op) \ + MACRO_BEGIN \ + uint64_t start; \ + uint64_t stop; \ + disable_preemption(); \ + start = mach_absolute_time(); \ + op; \ + stop = mach_absolute_time(); \ + enable_preemption(); \ + assert(stop - start < 10 * NSEC_PER_MSEC || machine_timeout_suspended()); \ + (void)start; \ + (void)stop; \ + MACRO_END #endif #endif /* XNU_KERNEL_PRIVATE */ -/* /dev/random's PRNG is reseeded after generating this many bytes: */ -#define RESEED_BYTES (17597) +#include + +/* kernel prng */ +typedef const struct prng_fns { + int (*init)(cckprng_ctx_t ctx, size_t nbytes, const void * seed); + int (*reseed)(cckprng_ctx_t ctx, size_t nbytes, const void * seed); + int (*addentropy)(cckprng_ctx_t ctx, size_t nbytes, const void * entropy); + int (*generate)(cckprng_ctx_t ctx, size_t nbytes, void * out); +} * prng_fns_t; + +void register_and_init_prng(prng_fns_t fns); #include /* Definitions for boolean PRNG */ @@ -124,18 +110,11 @@ struct bool_gen { decl_simple_lock_data(, lock) }; -extern void random_bool_init(struct bool_gen *bg); +extern void random_bool_init(struct bool_gen * bg); -extern void random_bool_gen_entropy( - struct bool_gen *bg, - unsigned int *buffer, - int count); +extern void random_bool_gen_entropy(struct bool_gen * bg, unsigned int * buffer, int count); -extern unsigned int random_bool_gen_bits( - struct bool_gen *bg, - unsigned int *buffer, - unsigned int count, - unsigned int numbits); +extern unsigned 
int random_bool_gen_bits(struct bool_gen * bg, unsigned int * buffer, unsigned int count, unsigned int numbits); __END_DECLS diff --git a/osfmk/tests/Makefile b/osfmk/tests/Makefile new file mode 100644 index 000000000..7e3492e4d --- /dev/null +++ b/osfmk/tests/Makefile @@ -0,0 +1,19 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +EXPORT_FILES = xnupost.h ktest.h + +EXPORT_MI_LIST = ${EXPORT_FILES} + +EXPORT_MI_DIR = tests + +INSTALL_KF_MI_LCL_LIST = +INSTALL_KF_MI_LIST = + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/osfmk/tests/README.md b/osfmk/tests/README.md new file mode 100644 index 000000000..34d753b4b --- /dev/null +++ b/osfmk/tests/README.md @@ -0,0 +1,125 @@ +# Kernel Power On Self Tests (POST) + +The tests directories osfmk/tests and bsd/tests include set of tests that run in kernel at boot-time. The primary objective for these tests is to verify functionality of various subsystems like memory allocators, scheduling, VM, IPC ports etc. Following are some tips and guidelines to creating and running tests. + +## Features: + * Compiled out of RELEASE kernels. + * enabled with boot-arg kernPOST [ 0x1 : for on desk testing, 0x3 for BATs testing] + * Automatically skips tests that are designed to panic kernel for on-desk testing, but run in BATs environment. + * Does not require complete install on device to run. Just kernelcache is enough. + * Ability to check for assertions and panic path as well. + +## How to run kernel POST + + * Start usbterm and setup your target machine/device in iBoot. + * set boot-args to include "```kernPOST=0x1```"" to enable kernel testing on boot. 
+ * load kernelcache using "```usb get /path/to/kc```" + * boot the image "```bootx```" + * watch for nanokdp serial output with tags like "```[KTEST] logs```" + +## How do I configure to run just test #8? + +Kernel POST supports configuring tests through boot-args. For example, if you want to run only your test #8 (say you are tweaking it to do more testing), just set "```kernPOST_config=8```" and only your test will be run. The configuration also takes ranges as follows +``` +-> kernPOST_config=1_3,5_9999 # skip test#4. Will run tests 1,2,3 and 5,6 and onwards. + +-> kernPOST_config=1_3,4_9999 # will not skip anything. lower_upper are both inclusive. + +``` + +## How do I add a new test? +Adding a new kernel POST test is very simple. Here are a few steps and guidelines for adding tests. + + * There are two locations ```osfmk/tests/``` and ```bsd/tests``` where you can add tests based on your area of testing. + * If you wish to add a new *.c* file for your tests, then use ```#include <tests/xnupost.h>``` to include required functions and macros for testing. Remember to add file_name.c in ```osfmk/conf/files``` or ```bsd/conf/files``` as + + ```osfmk/tests/my_tests.c optional config_xnupost``` + * To add a test function just declare a function with prototype as + + ```kern_return_t my_sample_tests(void); ``` + * And add to struct xnupost_test array in osfmk/tests/kernel_tests.c or bsd/tests/bsd_tests.c as + +``` +struct xnupost_test kernel_post_tests[] = { + XNUPOST_TEST_CONFIG_BASIC(my_sample_tests), // simple test + XNUPOST_TEST_CONFIG_TEST_PANIC(panic_test) // test that is expected to panic +}; +``` + * And you are set. Use KERN_SUCCESS to report a successful run and any other error for failure. Here is an example with some available macros.
+ +``` +kern_return_t my_sample_tests() { + uint64_t test_begin_timestamp = 0; + uint64_t cur_timestamp = 0, tmp; + + T_SETUPBEGIN; + test_begin_timestamp = mach_absolute_time(); + T_ASSERT_NOTNULL(test_begin_timestamp, "mach_absolute_time returned 0."); + T_SETUPEND; + + T_LOG("Testing mach_absolute_time for 100 iterations"); + for (int i = 0; i < 100; i++) { + tmp = mach_absolute_time(); + T_EXPECT_TRUE((cur_timestamp <= tmp ), "Time went backwards"); + cur_timestamp = tmp; + } + + T_LOG("Completed mach_absolute_time tests."); + return KERN_SUCCESS; +} +``` + + * There are many ** T_* ** macros available for your convenience. + * **Note**: Please make sure your test does a proper cleanup of state. The kernel is expected to continue to boot after testing. If you are unable to cleanup and require a reboot then use XNUPOST_TEST_CONFIG_TEST_PANIC type and panic at the end of the function. This will make sure the test controller reboots and runs the next test in automation. + +## What is the difference between T_EXPECT and T_ASSERT macros? + + * T_ASSERT macros will check for condition and upon failure return with KERN_FAILURE. This way it ensures that no further execution of test code is done. + * T_EXPECT will just report failure of that test case, but will continue to run further test code. + +## How do I run my tests in BATs? + +Bats has a new test type **kernel_POST** that runs Lean test environment tests. You can run the following command to get POST testing. + +``` +~osdev/tat/dev/bin/bats build -b -t darwinLTE -p xnu: -r +``` + +## How do I test for panic/assertions? + +The xnupost subsystem provides mechanism for setting up a `panic widget`. This widget can check for some conditions and report test case SUCCESS/FAILURE. See xnupost.h for `XT_RET* ` style return values. There are convenience macros for registering for generic panic and for assertion handling. For example if you wish to check for api foo(int arg) { assert(arg > 0); ... 
} then a test case could be like + +``` +kern_return_t test_foo_arg_assertion(void) { + void * assert_retval = NULL; + kern_return_t kr = T_REGISTER_ASSERT_CHECK("arg > 0", &assert_retval); + T_ASSERT(kr == KERN_SUCCESS, "register assertion handler"); + + foo(-1); /* this will cause assert to fire */ + + T_ASSERT(assert_retval == (void *)XT_RET_W_SUCCESS, "verify assertion was hit"); +} + +``` + +## How do XNUPOST panic widgets work? + +On debug/development kernels, the `panic()` code is modified to call out to XNUPOST system `xnupost_process_panic()`. This callout can then determine if testing was enabled and has a widget registered for checking panics. If yes, then the corresponding widget function is called and the return value determines what action is taken. For example, a widget could return any of the following values + + XT_PANIC_UNRELATED /* not related. continue panic */ + XT_RET_W_FAIL /* report FAILURE and return from panic */ + XT_RET_W_SUCCESS /* report SUCCESS and return from panic */ + XT_PANIC_W_FAIL /* report FAILURE and continue to panic */ + XT_PANIC_W_SUCCESS /* report SUCCESS and continue to panic */ + +The panic widget data is saved in an internal data array where each entry is of type: +struct xnupost_panic_widget { + void * xtp_context_p; /* a context pointer for callbacks to track */ + void ** xtp_outval_p; /* an out param for function to return some value to running test */ + const char * xtp_func_name; /* widget name for tracking in serial output */ + xt_panic_widget_func xtp_func; +}; + +There is an example use case in `osfmk/tests/kernel_tests.c :check_panic_test() and panic_test()` for writing a widget. +For a basic assertion check, see the example in `osfmk/tests/kernel_tests.c :kcdata_api_assert_tests()` + diff --git a/osfmk/tests/bitmap_test.c b/osfmk/tests/bitmap_test.c new file mode 100644 index 000000000..121d92ea1 --- /dev/null +++ b/osfmk/tests/bitmap_test.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#if DEVELOPMENT || DEBUG + +#include +#include +#include + +extern void dump_bitmap_next(bitmap_t *map, uint nbits); +extern void dump_bitmap_lsb(bitmap_t *map, uint nbits); +extern void test_bitmap(void); +extern kern_return_t bitmap_post_test(void); + +void +dump_bitmap_next(bitmap_t *map, uint nbits) +{ + for (int i = bitmap_first(map, nbits); i >= 0; i = bitmap_next(map, i)) { + printf(" %d", i); + } + printf("\n"); +} + +void +dump_bitmap_lsb(bitmap_t *map, uint nbits) +{ + for (int i = bitmap_lsb_first(map, nbits); i >= 0; i = bitmap_lsb_next(map, nbits, i)) { + printf(" %d", i); + } + printf("\n"); +} + +#ifdef NOTDEF +#ifdef assert +#undef assert +#endif +#define assert(x) T_ASSERT(x, NULL) +#endif + +void +test_bitmap(void) +{ + uint start = 60; + for (uint nbits = start; nbits <= 192; nbits++) { + bitmap_t *map = bitmap_alloc(nbits); + + for (uint i = 0; i < nbits; i++) { + bitmap_set(map, i); + } + + int expected_result = nbits - 1; + for (int i = bitmap_first(map, nbits); i >= 0; i = bitmap_next(map, i)) { + assert(i == expected_result); + expected_result--; + } + assert(expected_result == -1); + + expected_result = 0; + for (int i = bitmap_lsb_first(map, nbits); i >= 0; i = bitmap_lsb_next(map, nbits, i)) { + assert(i == expected_result); + expected_result++; + } + assert(expected_result == (int)nbits); + + for (uint i = 0; i < nbits; i++) { + bitmap_clear(map, i); + } + assert(bitmap_first(map, nbits) == -1); + assert(bitmap_lsb_first(map, nbits) == -1); + + bitmap_free(map, nbits); + } +} + +kern_return_t +bitmap_post_test(void) +{ + test_bitmap(); + + kern_return_t ret = KERN_SUCCESS; + + T_ASSERT(ret == KERN_SUCCESS, NULL); + + return ret; +} +#endif diff --git a/osfmk/tests/kernel_tests.c b/osfmk/tests/kernel_tests.c new file mode 100644 index 000000000..9bac2abff --- /dev/null +++ b/osfmk/tests/kernel_tests.c @@ -0,0 +1,920 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !(DEVELOPMENT || DEBUG) +#error "Testing is not enabled on RELEASE configurations" +#endif + +#include + +extern boolean_t get_range_bounds(char * c, int64_t * lower, int64_t * upper); +__private_extern__ void qsort(void * a, size_t n, size_t es, int (*cmp)(const void *, const void *)); + +uint32_t total_post_tests_count = 0; +void xnupost_reset_panic_widgets(void); + +/* test declarations */ +kern_return_t zalloc_test(void); +kern_return_t RandomULong_test(void); +kern_return_t kcdata_api_test(void); +kern_return_t priority_queue_test(void); + +#if defined(__arm__) || defined(__arm64__) +kern_return_t pmap_coredump_test(void); +#endif + +extern kern_return_t console_serial_test(void); +extern kern_return_t console_serial_alloc_rel_tests(void); +extern kern_return_t console_serial_parallel_log_tests(void); +extern kern_return_t test_os_log(void); +extern kern_return_t test_os_log_parallel(void); +extern kern_return_t bitmap_post_test(void); + +#ifdef __arm64__ +extern kern_return_t arm64_munger_test(void); +extern kern_return_t ex_cb_test(void); +#if __ARM_PAN_AVAILABLE__ +extern kern_return_t arm64_pan_test(void); +#endif +#endif /* __arm64__ */ + +extern kern_return_t test_thread_call(void); + + +struct xnupost_panic_widget xt_panic_widgets = {NULL, NULL, NULL, NULL}; + +struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test), + XNUPOST_TEST_CONFIG_BASIC(RandomULong_test), + XNUPOST_TEST_CONFIG_BASIC(test_os_log), + XNUPOST_TEST_CONFIG_BASIC(test_os_log_parallel), +#ifdef __arm64__ + XNUPOST_TEST_CONFIG_BASIC(arm64_munger_test), + XNUPOST_TEST_CONFIG_BASIC(ex_cb_test), +#if __ARM_PAN_AVAILABLE__ + XNUPOST_TEST_CONFIG_BASIC(arm64_pan_test), +#endif +#endif /* __arm64__ */ + 
XNUPOST_TEST_CONFIG_BASIC(kcdata_api_test), + XNUPOST_TEST_CONFIG_BASIC(console_serial_test), + XNUPOST_TEST_CONFIG_BASIC(console_serial_alloc_rel_tests), + XNUPOST_TEST_CONFIG_BASIC(console_serial_parallel_log_tests), +#if defined(__arm__) || defined(__arm64__) + XNUPOST_TEST_CONFIG_BASIC(pmap_coredump_test), +#endif + XNUPOST_TEST_CONFIG_BASIC(bitmap_post_test), + //XNUPOST_TEST_CONFIG_TEST_PANIC(kcdata_api_assert_tests) + XNUPOST_TEST_CONFIG_BASIC(test_thread_call), + XNUPOST_TEST_CONFIG_BASIC(priority_queue_test), +}; + +uint32_t kernel_post_tests_count = sizeof(kernel_post_tests) / sizeof(xnupost_test_data_t); + +#define POSTARGS_RUN_TESTS 0x1 +#define POSTARGS_CONTROLLER_AVAILABLE 0x2 +#define POSTARGS_CUSTOM_TEST_RUNLIST 0x4 +uint64_t kernel_post_args = 0x0; + +/* static variables to hold state */ +static kern_return_t parse_config_retval = KERN_INVALID_CAPABILITY; +static char kernel_post_test_configs[256]; +boolean_t xnupost_should_run_test(uint32_t test_num); + +kern_return_t +xnupost_parse_config() +{ + if (parse_config_retval != KERN_INVALID_CAPABILITY) + return parse_config_retval; + PE_parse_boot_argn("kernPOST", &kernel_post_args, sizeof(kernel_post_args)); + + if (PE_parse_boot_argn("kernPOST_config", &kernel_post_test_configs[0], sizeof(kernel_post_test_configs)) == TRUE) { + kernel_post_args |= POSTARGS_CUSTOM_TEST_RUNLIST; + } + + if (kernel_post_args != 0) { + parse_config_retval = KERN_SUCCESS; + goto out; + } + parse_config_retval = KERN_NOT_SUPPORTED; +out: + return parse_config_retval; +} + +boolean_t +xnupost_should_run_test(uint32_t test_num) +{ + if (kernel_post_args & POSTARGS_CUSTOM_TEST_RUNLIST) { + int64_t begin = 0, end = 999999; + char * b = kernel_post_test_configs; + while (*b) { + get_range_bounds(b, &begin, &end); + if (test_num >= begin && test_num <= end) { + return TRUE; + } + + /* skip to the next "," */ + while (*b != ',') { + if (*b == '\0') + return FALSE; + b++; + } + /* skip past the ',' */ + b++; + } + return FALSE; + 
} + return TRUE; +} + +kern_return_t +xnupost_list_tests(xnupost_test_t test_list, uint32_t test_count) +{ + if (KERN_SUCCESS != xnupost_parse_config()) + return KERN_FAILURE; + + xnupost_test_t testp; + for (uint32_t i = 0; i < test_count; i++) { + testp = &test_list[i]; + if (testp->xt_test_num == 0) { + testp->xt_test_num = ++total_post_tests_count; + } + /* make sure the boot-arg based test run list is honored: ignore every test by default, then re-enable the ones the run list selects */ + if (kernel_post_args & POSTARGS_CUSTOM_TEST_RUNLIST) { + testp->xt_config |= XT_CONFIG_IGNORE; + if (xnupost_should_run_test(testp->xt_test_num)) { + testp->xt_config &= ~(XT_CONFIG_IGNORE); + testp->xt_config |= XT_CONFIG_RUN; + printf("\n[TEST] #%u is marked to run", testp->xt_test_num); + } + } + printf("\n[TEST] TOC#%u name: %s expected: %d config: %x\n", testp->xt_test_num, testp->xt_name, testp->xt_expected_retval, + testp->xt_config); + } + + return KERN_SUCCESS; +} + +kern_return_t +xnupost_run_tests(xnupost_test_t test_list, uint32_t test_count) +{ + uint32_t i = 0; + int retval = KERN_SUCCESS; + + if ((kernel_post_args & POSTARGS_RUN_TESTS) == 0) { + printf("No POST boot-arg set.\n"); + return retval; + } + + T_START; + xnupost_test_t testp; + for (; i < test_count; i++) { + xnupost_reset_panic_widgets(); + testp = &test_list[i]; + T_BEGIN(testp->xt_name); + testp->xt_begin_time = mach_absolute_time(); + testp->xt_end_time = testp->xt_begin_time; + + /* + * If test is designed to panic and controller + * is not available then mark as SKIPPED + */ + if ((testp->xt_config & XT_CONFIG_EXPECT_PANIC) && !(kernel_post_args & POSTARGS_CONTROLLER_AVAILABLE)) { + T_SKIP( + "Test expects panic but " + "no controller is present"); + testp->xt_test_actions = XT_ACTION_SKIPPED; + continue; + } + + if ((testp->xt_config & XT_CONFIG_IGNORE)) { + T_SKIP("Test is marked as XT_CONFIG_IGNORE"); + testp->xt_test_actions = XT_ACTION_SKIPPED; + continue; + } + + testp->xt_func(); + T_END; + testp->xt_retval = T_TESTRESULT; + testp->xt_end_time =
mach_absolute_time(); + if (testp->xt_retval == testp->xt_expected_retval) { + testp->xt_test_actions = XT_ACTION_PASSED; + } else { + testp->xt_test_actions = XT_ACTION_FAILED; + } + } + T_FINISH; + return retval; +} + +kern_return_t +kernel_list_tests() +{ + return xnupost_list_tests(kernel_post_tests, kernel_post_tests_count); +} + +kern_return_t +kernel_do_post() +{ + return xnupost_run_tests(kernel_post_tests, kernel_post_tests_count); +} + +kern_return_t +xnupost_register_panic_widget(xt_panic_widget_func funcp, const char * funcname, void * context, void ** outval) +{ + if (xt_panic_widgets.xtp_context_p != NULL || xt_panic_widgets.xtp_func != NULL) + return KERN_RESOURCE_SHORTAGE; + + xt_panic_widgets.xtp_context_p = context; + xt_panic_widgets.xtp_func = funcp; + xt_panic_widgets.xtp_func_name = funcname; + xt_panic_widgets.xtp_outval_p = outval; + + return KERN_SUCCESS; +} + +void +xnupost_reset_panic_widgets() +{ + bzero(&xt_panic_widgets, sizeof(xt_panic_widgets)); +} + +kern_return_t +xnupost_process_kdb_stop(const char * panic_s) +{ + xt_panic_return_t retval = 0; + struct xnupost_panic_widget * pw = &xt_panic_widgets; + const char * name = "unknown"; + if (xt_panic_widgets.xtp_func_name) { + name = xt_panic_widgets.xtp_func_name; + } + + /* bail early on if kernPOST is not set */ + if (kernel_post_args == 0) { + return KERN_INVALID_CAPABILITY; + } + + if (xt_panic_widgets.xtp_func) { + T_LOG("%s: Calling out to widget: %s", __func__, xt_panic_widgets.xtp_func_name); + retval = pw->xtp_func(panic_s, pw->xtp_context_p, pw->xtp_outval_p); + } else { + return KERN_INVALID_CAPABILITY; + } + + switch (retval) { + case XT_RET_W_SUCCESS: + T_EXPECT_EQ_INT(retval, XT_RET_W_SUCCESS, "%s reported successful handling. 
Returning from kdb_stop.", name); + /* KERN_SUCCESS means return from panic/assertion */ + return KERN_SUCCESS; + + case XT_RET_W_FAIL: + T_FAIL("%s reported XT_RET_W_FAIL: Returning from kdb_stop", name); + return KERN_SUCCESS; + + case XT_PANIC_W_FAIL: + T_FAIL("%s reported XT_PANIC_W_FAIL: Continuing to kdb_stop", name); + return KERN_FAILURE; + + case XT_PANIC_W_SUCCESS: + T_EXPECT_EQ_INT(retval, XT_PANIC_W_SUCCESS, "%s reported successful testcase. But continuing to kdb_stop.", name); + return KERN_FAILURE; + + case XT_PANIC_UNRELATED: + default: + T_LOG("UNRELATED: Continuing to kdb_stop."); + return KERN_FAILURE; + } +} + +xt_panic_return_t +_xt_generic_assert_check(const char * s, void * str_to_match, void ** outval) +{ + xt_panic_return_t ret = XT_PANIC_UNRELATED; + + if (NULL != strnstr(__DECONST(char *, s), (char *)str_to_match, strlen(s))) { + T_LOG("%s: kdb_stop string: '%s' MATCHED string: '%s'", __func__, s, (char *)str_to_match); + ret = XT_RET_W_SUCCESS; + } + + if (outval) + *outval = (void *)(uintptr_t)ret; + return ret; +} + +kern_return_t +xnupost_reset_tests(xnupost_test_t test_list, uint32_t test_count) +{ + uint32_t i = 0; + xnupost_test_t testp; + for (; i < test_count; i++) { + testp = &test_list[i]; + testp->xt_begin_time = 0; + testp->xt_end_time = 0; + testp->xt_test_actions = XT_ACTION_NONE; + testp->xt_retval = -1; + } + return KERN_SUCCESS; +} + + +kern_return_t +zalloc_test() +{ + zone_t test_zone; + void * test_ptr; + + T_SETUPBEGIN; + test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_uint64_zone"); + T_ASSERT_NOTNULL(test_zone, NULL); + + T_ASSERT_EQ_INT(zone_free_count(test_zone), 0, NULL); + T_SETUPEND; + + T_ASSERT_NOTNULL(test_ptr = zalloc(test_zone), NULL); + + zfree(test_zone, test_ptr); + + /* A sample report for perfdata */ + T_PERF("num_threads_at_ktest", threads_count, "count", "# of threads in system at zalloc_test"); + + return KERN_SUCCESS; +} + +/* + * Function used for comparison by 
qsort() + * Orders uint64_t elements from smallest to largest. + */ +static int +compare_numbers_ascending(const void * a, const void * b) +{ + const uint64_t x = *(const uint64_t *)a; + const uint64_t y = *(const uint64_t *)b; + if (x < y) { + return -1; + } else if (x > y) { + return 1; + } else { + return 0; + } +} + +/* + * Function used for comparison by qsort() + * Orders uint32_t elements from largest to smallest. Note the element + * width differs from compare_numbers_ascending(), which reads uint64_t. + */ +static int +compare_numbers_descending(const void * a, const void * b) +{ + const uint32_t x = *(const uint32_t *)a; + const uint32_t y = *(const uint32_t *)b; + if (x > y) { + return -1; + } else if (x < y) { + return 1; + } else { + return 0; + } +} + +/* Node structure for the priority queue tests */ +struct priority_queue_test_node { + struct priority_queue_entry link; + priority_queue_key_t node_key; +}; + +/* + * Run the insert / increase / decrease / max / remove_max checks on pq. + * + * type selects how keys are handed to the queue: for + * PRIORITY_QUEUE_GENERIC_KEY the key argument is PRIORITY_QUEUE_KEY_NONE, + * otherwise the priority value itself is passed as the key. cmp_fn is + * forwarded unchanged to every queue operation. + */ +static void +priority_queue_test_queue(struct priority_queue *pq, int type, + priority_queue_compare_fn_t cmp_fn) +{ + /* Configuration for the test */ +#define PRIORITY_QUEUE_NODES 7 + /* static: the list is overwritten and re-sorted later in this function, so those changes persist across calls */ + static uint32_t priority_list[] = { 20, 3, 7, 6, 50, 2, 8}; + uint32_t increase_pri = 100; + uint32_t decrease_pri = 90; + struct priority_queue_test_node *result; + uint32_t key = 0; + boolean_t update_result = false; + + struct priority_queue_test_node *node = NULL; + /* Add all priorities to the priority queue under test */ + for (int i = 0; i < PRIORITY_QUEUE_NODES; i++) { + node = kalloc(sizeof(struct priority_queue_test_node)); + T_ASSERT_NOTNULL(node, NULL); + + priority_queue_entry_init(&(node->link)); + node->node_key = priority_list[i]; + key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? PRIORITY_QUEUE_KEY_NONE : priority_list[i]; + priority_queue_insert(pq, &(node->link), key, cmp_fn); + } + + /* node now points at the last entry inserted */ + T_ASSERT_NOTNULL(node, NULL); + key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? 
node->node_key : priority_queue_entry_key(pq, &(node->link)); + T_ASSERT((key == node->node_key), "verify node stored key correctly"); + + /* Test the priority increase operation by updating the last node added (8) */ + T_ASSERT_NOTNULL(node, NULL); + node->node_key = increase_pri; + key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? PRIORITY_QUEUE_KEY_NONE : node->node_key; + update_result = priority_queue_entry_increase(pq, &node->link, key, cmp_fn); + T_ASSERT((update_result == true), "increase key updated root"); + result = priority_queue_max(pq, struct priority_queue_test_node, link); + T_ASSERT((result->node_key == increase_pri), "verify priority_queue_entry_increase() operation"); + + + /* Test the priority decrease operation by updating the last node added */ + T_ASSERT((result == node), NULL); + node->node_key = decrease_pri; + key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? PRIORITY_QUEUE_KEY_NONE : node->node_key; + update_result = priority_queue_entry_decrease(pq, &node->link, key, cmp_fn); + T_ASSERT((update_result == true), "decrease key updated root"); + result = priority_queue_max(pq, struct priority_queue_test_node, link); + T_ASSERT((result->node_key == decrease_pri), "verify priority_queue_entry_decrease() operation"); + + /* Update our local priority list as well */ + priority_list[PRIORITY_QUEUE_NODES - 1] = decrease_pri; + + /* Sort the local list in descending order */ + qsort(priority_list, PRIORITY_QUEUE_NODES, sizeof(priority_list[0]), compare_numbers_descending); + + /* Test the maximum operation by comparing max node with local list */ + result = priority_queue_max(pq, struct priority_queue_test_node, link); + T_ASSERT((result->node_key == priority_list[0]), "(heap (%u) == qsort (%u)) priority queue max node lookup", + (uint32_t)result->node_key, priority_list[0]); + + /* Remove all remaining elements and verify they match local list */ + for (int i = 0; i < PRIORITY_QUEUE_NODES; i++) { + result = priority_queue_remove_max(pq, struct 
priority_queue_test_node, link, cmp_fn); + T_ASSERT((result->node_key == priority_list[i]), "(heap (%u) == qsort (%u)) priority queue max node removal", + (uint32_t)result->node_key, priority_list[i]); + /* the node has been taken out of the queue, so release it here; nothing else frees it */ + kfree(result, sizeof(struct priority_queue_test_node)); + } + + priority_queue_destroy(pq, struct priority_queue_test_node, link, ^(void *n) { + kfree(n, sizeof(struct priority_queue_test_node)); + }); +} + +/* + * POST entry point for the priority queue tests: runs + * priority_queue_test_queue() once with builtin keys and once with + * generic (node-compared) keys. Always returns KERN_SUCCESS; failures are + * reported through the T_ASSERT macros. + */ +kern_return_t +priority_queue_test(void) +{ + /* + * Initialize two priority queues + * - One which uses the key comparator + * - Other which uses the node comparator + */ + static struct priority_queue pq; + static struct priority_queue pq_nodes; + + T_SETUPBEGIN; + + priority_queue_init(&pq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP); + priority_queue_init(&pq_nodes, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MAX_HEAP); + + T_SETUPEND; + + priority_queue_test_queue(&pq, PRIORITY_QUEUE_BUILTIN_KEY, + PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE); + + priority_queue_test_queue(&pq_nodes, PRIORITY_QUEUE_GENERIC_KEY, + priority_heap_make_comparator(a, b, struct priority_queue_test_node, link, { + return (a->node_key > b->node_key) ? 1 : ((a->node_key == b->node_key) ? 0 : -1); + })); + + return KERN_SUCCESS; +} + +/* + * Function to count number of bits that are set in a number. + * Delegates to the compiler builtin __builtin_popcountll(). + */ +static int +count_bits(uint64_t number) +{ + return __builtin_popcountll(number); +} + +kern_return_t +RandomULong_test() +{ +/* + * Randomness test for RandomULong() + * + * This test verifies that: + * a. RandomULong works + * b. The generated numbers match the following entropy criteria: + * For a thousand iterations, verify: + * 1. mean entropy > 12 bits + * 2. min entropy > 4 bits + * 3. No Duplicate + * 4. No incremental/decremental pattern in a window of 3 + * 5. No Zero + * 6. 
No -1 + * + * Add test to increase code coverage for /dev/random + */ + +#define CONF_MIN_ENTROPY 4 +#define CONF_MEAN_ENTROPY 12 +#define CONF_ITERATIONS 1000 +#define CONF_WINDOW_SIZE 3 +#define CONF_WINDOW_TREND_LIMIT ((CONF_WINDOW_SIZE / 2) + (CONF_WINDOW_SIZE & 1)) >> 0 + + int i; + uint32_t min_bit_entropy, max_bit_entropy, bit_entropy; + uint32_t aggregate_bit_entropy = 0; + uint32_t mean_bit_entropy = 0; + /* CONF_ITERATIONS 64-bit samples, held on the stack */ + uint64_t numbers[CONF_ITERATIONS]; + min_bit_entropy = UINT32_MAX; + max_bit_entropy = 0; + + /* + * TEST 1: Number generation and basic validation + * Fill the sample array from read_random() and assert that no sample + * is zero (no bits set) or -1 (all bits set). + */ + for (i = 0; i < CONF_ITERATIONS; i++) { + read_random(&numbers[i], sizeof(numbers[i])); + if (numbers[i] == 0) { + T_ASSERT_NE_ULLONG(numbers[i], 0, "read_random returned zero value."); + } + if (numbers[i] == UINT64_MAX) { + T_ASSERT_NE_ULLONG(numbers[i], UINT64_MAX, "read_random returned -1."); + } + } + T_PASS("Generated %d non-zero random numbers with atleast one bit reset.", CONF_ITERATIONS); + + /* + * TEST 2: Mean and Min Bit Entropy + * Check the bit entropy and its mean over the generated numbers. 
+ */ + for (i = 1; i < CONF_ITERATIONS; i++) { + bit_entropy = count_bits(numbers[i - 1] ^ numbers[i]); + if (bit_entropy < min_bit_entropy) + min_bit_entropy = bit_entropy; + if (bit_entropy > max_bit_entropy) + max_bit_entropy = bit_entropy; + + if (bit_entropy < CONF_MIN_ENTROPY) { + T_EXPECT_GE_UINT(bit_entropy, CONF_MIN_ENTROPY, + "Number of differing bits in consecutive numbers does not satisfy the min criteria."); + } + + aggregate_bit_entropy += bit_entropy; + } + T_PASS("Passed the min bit entropy expectation of %d bits", CONF_MIN_ENTROPY); + + mean_bit_entropy = aggregate_bit_entropy / CONF_ITERATIONS; + T_EXPECT_GE_UINT(mean_bit_entropy, CONF_MEAN_ENTROPY, "Test criteria for mean number of differing bits."); + T_PASS("Mean bit entropy criteria satisfied (Required %d, Actual: %d).", CONF_MEAN_ENTROPY, mean_bit_entropy); + T_LOG("{PERFORMANCE} iterations: %d, min_bit_entropy: %d, mean_bit_entropy: %d, max_bit_entropy: %d", CONF_ITERATIONS, + min_bit_entropy, mean_bit_entropy, max_bit_entropy); + T_PERF("min_bit_entropy_" T_TOSTRING(CONF_ITERATIONS), min_bit_entropy, "bits", "minimum bit entropy in RNG. High is better"); + T_PERF("mean_bit_entropy_" T_TOSTRING(CONF_ITERATIONS), mean_bit_entropy, "bits", "mean bit entropy in RNG. High is better"); + T_PERF("max_bit_entropy_" T_TOSTRING(CONF_ITERATIONS), max_bit_entropy, "bits", "max bit entropy in RNG. 
High is better"); + + /* + * TEST 3: Incremental Pattern Search + * Check that incremental/decremental pattern does not exist in the given window + */ + int window_start, window_end, trend; + window_start = window_end = trend = 0; + + do { + /* + * Set the window + */ + window_end = window_start + CONF_WINDOW_SIZE - 1; + if (window_end >= CONF_ITERATIONS) + window_end = CONF_ITERATIONS - 1; + + trend = 0; + for (i = window_start; i < window_end; i++) { + if (numbers[i] < numbers[i + 1]) + trend++; + else if (numbers[i] > numbers[i + 1]) + trend--; + } + /* + * Check that there is no increasing or decreasing trend + * i.e. trend <= ceil(window_size/2) + */ + if (trend < 0) { + trend = -trend; + } + if (trend > CONF_WINDOW_TREND_LIMIT) { + T_ASSERT_LE_INT(trend, CONF_WINDOW_TREND_LIMIT, "Found increasing/decreasing trend in random numbers."); + } + + /* + * Move to the next window + */ + window_start++; + + } while (window_start < (CONF_ITERATIONS - 1)); + T_PASS("Did not find increasing/decreasing trends in a window of %d numbers.", CONF_WINDOW_SIZE); + + /* + * TEST 4: Find Duplicates + * Check no duplicate values are generated + */ + qsort(numbers, CONF_ITERATIONS, sizeof(numbers[0]), compare_numbers_ascending); + for (i = 1; i < CONF_ITERATIONS; i++) { + if (numbers[i] == numbers[i - 1]) { + T_ASSERT_NE_ULLONG(numbers[i], numbers[i - 1], "read_random generated duplicate values."); + } + } + T_PASS("Test did not find any duplicates as expected."); + + return KERN_SUCCESS; +} + + +/* KCDATA kernel api tests */ +static struct kcdata_descriptor test_kc_data;//, test_kc_data2; +struct sample_disk_io_stats { + uint64_t disk_reads_count; + uint64_t disk_reads_size; + uint64_t io_priority_count[4]; + uint64_t io_priority_size; +} __attribute__((packed)); + +struct kcdata_subtype_descriptor test_disk_io_stats_def[] = { + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"}, + {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * 
sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"}, + {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"}, + {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"}, +}; + +kern_return_t +kcdata_api_test() +{ + kern_return_t retval = KERN_SUCCESS; + + /* test for NULL input */ + retval = kcdata_memory_static_init(NULL, (mach_vm_address_t)0, KCDATA_BUFFER_BEGIN_STACKSHOT, 100, KCFLAG_USE_MEMCOPY); + T_ASSERT(retval == KERN_INVALID_ARGUMENT, "kcdata_memory_static_init with NULL struct"); + + /* another negative test with buffer size < 32 bytes */ + char data[30] = "sample_disk_io_stats"; + retval = kcdata_memory_static_init(&test_kc_data, (mach_vm_address_t)&data, KCDATA_BUFFER_BEGIN_CRASHINFO, sizeof(data), + KCFLAG_USE_MEMCOPY); + T_ASSERT(retval == KERN_RESOURCE_SHORTAGE, "init with 30 bytes failed as expected with KERN_RESOURCE_SHORTAGE"); + + /* test with COPYOUT for 0x0 address. 
Should return KERN_NO_ACCESS */ + retval = kcdata_memory_static_init(&test_kc_data, (mach_vm_address_t)0, KCDATA_BUFFER_BEGIN_CRASHINFO, PAGE_SIZE, + KCFLAG_USE_COPYOUT); + T_ASSERT(retval == KERN_NO_ACCESS, "writing to 0x0 returned KERN_NO_ACCESS"); + + /* test with successful kcdata_memory_static_init */ + test_kc_data.kcd_length = 0xdeadbeef; + mach_vm_address_t address = (mach_vm_address_t)kalloc(PAGE_SIZE); + T_EXPECT_NOTNULL(address, "kalloc of PAGE_SIZE data."); + + retval = kcdata_memory_static_init(&test_kc_data, (mach_vm_address_t)address, KCDATA_BUFFER_BEGIN_STACKSHOT, PAGE_SIZE, + KCFLAG_USE_MEMCOPY); + + T_ASSERT(retval == KERN_SUCCESS, "successful kcdata_memory_static_init call"); + + T_ASSERT(test_kc_data.kcd_length == PAGE_SIZE, "kcdata length is set correctly to PAGE_SIZE."); + T_LOG("addr_begin 0x%llx and end 0x%llx and address 0x%llx", test_kc_data.kcd_addr_begin, test_kc_data.kcd_addr_end, address); + T_ASSERT(test_kc_data.kcd_addr_begin == address, "kcdata begin address is correct 0x%llx", (uint64_t)address); + + /* verify we have BEGIN and END HEADERS set */ + uint32_t * mem = (uint32_t *)address; + T_ASSERT(mem[0] == KCDATA_BUFFER_BEGIN_STACKSHOT, "buffer does contain KCDATA_BUFFER_BEGIN_STACKSHOT"); + T_ASSERT(mem[4] == KCDATA_TYPE_BUFFER_END, "KCDATA_TYPE_BUFFER_END is appended as expected"); + T_ASSERT(mem[5] == 0, "size of BUFFER_END tag is zero"); + + /* verify kcdata_memory_get_used_bytes() */ + uint64_t bytes_used = 0; + bytes_used = kcdata_memory_get_used_bytes(&test_kc_data); + T_ASSERT(bytes_used == (2 * sizeof(struct kcdata_item)), "bytes_used api returned expected %llu", bytes_used); + + /* test for kcdata_get_memory_addr() */ + + mach_vm_address_t user_addr = 0; + /* negative test for NULL user_addr AND/OR kcdata_descriptor */ + retval = kcdata_get_memory_addr(NULL, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &user_addr); + T_ASSERT(retval == KERN_INVALID_ARGUMENT, "kcdata_get_memory_addr with NULL struct -> 
KERN_INVALID_ARGUMENT"); + + retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), NULL); + T_ASSERT(retval == KERN_INVALID_ARGUMENT, "kcdata_get_memory_addr with NULL user_addr -> KERN_INVALID_ARGUMENT"); + + /* successful case with size 0. Yes this is expected to succeed as just a item type could be used as boolean */ + retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_USECS_SINCE_EPOCH, 0, &user_addr); + T_ASSERT(retval == KERN_SUCCESS, "Successfully got kcdata entry for 0 size data"); + T_ASSERT(user_addr == test_kc_data.kcd_addr_end, "0 sized data did not add any extra buffer space"); + + /* successful case with valid size. */ + user_addr = 0xdeadbeef; + retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &user_addr); + T_ASSERT(retval == KERN_SUCCESS, "kcdata_get_memory_addr with valid values succeeded."); + T_ASSERT(user_addr > test_kc_data.kcd_addr_begin, "user_addr is in range of buffer"); + T_ASSERT(user_addr < test_kc_data.kcd_addr_end, "user_addr is in range of buffer"); + + /* Try creating an item with really large size */ + user_addr = 0xdeadbeef; + bytes_used = kcdata_memory_get_used_bytes(&test_kc_data); + retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, PAGE_SIZE * 4, &user_addr); + T_ASSERT(retval == KERN_RESOURCE_SHORTAGE, "Allocating entry with size > buffer -> KERN_RESOURCE_SHORTAGE"); + T_ASSERT(user_addr == 0xdeadbeef, "user_addr remained unaffected with failed kcdata_get_memory_addr"); + T_ASSERT(bytes_used == kcdata_memory_get_used_bytes(&test_kc_data), "The data structure should be unaffected"); + + /* verify convenience functions for uint32_with_description */ + retval = kcdata_add_uint32_with_description(&test_kc_data, 0xbdc0ffee, "This is bad coffee"); + T_ASSERT(retval == KERN_SUCCESS, "add uint32 with description succeeded."); + + retval = kcdata_add_uint64_with_description(&test_kc_data, 0xf001badc0ffee, 
"another 8 byte no."); + T_ASSERT(retval == KERN_SUCCESS, "add uint64 with desc succeeded."); + + /* verify creating an KCDATA_TYPE_ARRAY here */ + user_addr = 0xdeadbeef; + bytes_used = kcdata_memory_get_used_bytes(&test_kc_data); + /* save memory address where the array will come up */ + struct kcdata_item * item_p = (struct kcdata_item *)test_kc_data.kcd_addr_end; + + retval = kcdata_get_memory_addr_for_array(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), 20, &user_addr); + T_ASSERT(retval == KERN_SUCCESS, "Array of 20 integers should be possible"); + T_ASSERT(user_addr != 0xdeadbeef, "user_addr is updated as expected"); + T_ASSERT((kcdata_memory_get_used_bytes(&test_kc_data) - bytes_used) >= 20 * sizeof(uint64_t), "memory allocation is in range"); + kcdata_iter_t iter = kcdata_iter(item_p, PAGE_SIZE - kcdata_memory_get_used_bytes(&test_kc_data)); + T_ASSERT(kcdata_iter_array_elem_count(iter) == 20, "array count is 20"); + + /* FIXME add tests here for ranges of sizes and counts */ + + T_ASSERT(item_p->flags == (((uint64_t)KCDATA_TYPE_MACH_ABSOLUTE_TIME << 32) | 20), "flags are set correctly"); + + /* test adding of custom type */ + + retval = kcdata_add_type_definition(&test_kc_data, 0x999, data, &test_disk_io_stats_def[0], + sizeof(test_disk_io_stats_def) / sizeof(struct kcdata_subtype_descriptor)); + T_ASSERT(retval == KERN_SUCCESS, "adding custom type succeeded."); + + return KERN_SUCCESS; +} + +/* +kern_return_t +kcdata_api_assert_tests() +{ + kern_return_t retval = 0; + void * assert_check_retval = NULL; + test_kc_data2.kcd_length = 0xdeadbeef; + mach_vm_address_t address = (mach_vm_address_t)kalloc(PAGE_SIZE); + T_EXPECT_NOTNULL(address, "kalloc of PAGE_SIZE data."); + + retval = kcdata_memory_static_init(&test_kc_data2, (mach_vm_address_t)address, KCDATA_BUFFER_BEGIN_STACKSHOT, PAGE_SIZE, + KCFLAG_USE_MEMCOPY); + + T_ASSERT(retval == KERN_SUCCESS, "successful kcdata_memory_static_init call"); + + retval = 
T_REGISTER_ASSERT_CHECK("KCDATA_DESC_MAXLEN", &assert_check_retval); + T_ASSERT(retval == KERN_SUCCESS, "registered assert widget"); + + // this will assert + retval = kcdata_add_uint32_with_description(&test_kc_data2, 0xc0ffee, "really long description string for kcdata"); + T_ASSERT(retval == KERN_INVALID_ARGUMENT, "API param check returned KERN_INVALID_ARGUMENT correctly"); + T_ASSERT(assert_check_retval == (void *)XT_RET_W_SUCCESS, "assertion handler verified that it was hit"); + + return KERN_SUCCESS; +} +*/ + +#if defined(__arm__) || defined(__arm64__) + +#include + +#define MAX_PMAP_OBJECT_ELEMENT 100000 + +extern struct vm_object pmap_object_store; /* store pt pages */ +extern unsigned long gPhysBase, gPhysSize, first_avail; + +/* + * Define macros to traverse the pmap object structures and extract the + * physical page number using only information from the low globals (lowGlo). + * This emulates how Astris extracts information from a coredump. + */ +#if defined(__arm64__) + +/* + * Expand a packed page pointer using only lowGlo fields; a zero input + * yields zero. + */ +static inline uintptr_t +astris_vm_page_unpack_ptr(uintptr_t p) +{ + if (!p) + return ((uintptr_t)0); + + return (p & lowGlo.lgPmapMemFromArrayMask) + ? 
lowGlo.lgPmapMemStartAddr + (p & ~(lowGlo.lgPmapMemFromArrayMask)) * lowGlo.lgPmapMemPagesize + : lowGlo.lgPmapMemPackedBaseAddr + (p << lowGlo.lgPmapMemPackedShift); +} + +// assume next pointer is the first element +#define astris_vm_page_queue_next(qc) (astris_vm_page_unpack_ptr(*((uint32_t *)(qc)))) + +#endif + +#if defined(__arm__) + +// assume next pointer is the first element +#define astris_vm_page_queue_next(qc) *((uintptr_t *)(qc)) + +#endif + +#define astris_vm_page_queue_first(q) astris_vm_page_queue_next(q) + +#define astris_vm_page_queue_end(q, qe) ((q) == (qe)) + +#define astris_vm_page_queue_iterate(head, elt) \ + for ((elt) = (uintptr_t)astris_vm_page_queue_first((head)); !astris_vm_page_queue_end((head), (elt)); \ + (elt) = (uintptr_t)astris_vm_page_queue_next(((elt) + (uintptr_t)lowGlo.lgPmapMemChainOffset))) + +#define astris_ptoa(x) ((vm_address_t)(x) << lowGlo.lgPageShift) + +static inline ppnum_t +astris_vm_page_get_phys_page(uintptr_t m) +{ + return (m >= lowGlo.lgPmapMemStartAddr && m < lowGlo.lgPmapMemEndAddr) + ? 
(ppnum_t)((m - lowGlo.lgPmapMemStartAddr) / lowGlo.lgPmapMemPagesize + lowGlo.lgPmapMemFirstppnum) + : *((ppnum_t *)(m + lowGlo.lgPmapMemPageOffset)); +} + +kern_return_t +pmap_coredump_test(void) +{ + int iter = 0; + uintptr_t p; + + T_LOG("Testing coredump info for PMAP."); + + T_ASSERT_GE_ULONG(lowGlo.lgStaticAddr, gPhysBase, NULL); + T_ASSERT_LE_ULONG(lowGlo.lgStaticAddr + lowGlo.lgStaticSize, first_avail, NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgLayoutMajorVersion, 3, NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgLayoutMinorVersion, 0, NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgLayoutMagic, LOWGLO_LAYOUT_MAGIC, NULL); + + // check the constant values in lowGlo + T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemQ, ((uint64_t) & (pmap_object_store.memq)), NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPageOffset, offsetof(struct vm_page_with_ppnum, vmp_phys_page), NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemChainOffset, offsetof(struct vm_page, vmp_listq), NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPagesize, sizeof(struct vm_page), NULL); + +#if defined(__arm64__) + T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemFromArrayMask, VM_PACKED_FROM_VM_PAGES_ARRAY, NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPackedShift, VM_PACKED_POINTER_SHIFT, NULL); + T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPackedBaseAddr, VM_MIN_KERNEL_AND_KEXT_ADDRESS, NULL); +#endif + + vm_object_lock_shared(&pmap_object_store); + astris_vm_page_queue_iterate(lowGlo.lgPmapMemQ, p) + { + ppnum_t ppnum = astris_vm_page_get_phys_page(p); + pmap_paddr_t pa = (pmap_paddr_t)astris_ptoa(ppnum); + T_ASSERT_GE_ULONG(pa, gPhysBase, NULL); + T_ASSERT_LT_ULONG(pa, gPhysBase + gPhysSize, NULL); + iter++; + T_ASSERT_LT_INT(iter, MAX_PMAP_OBJECT_ELEMENT, NULL); + } + vm_object_unlock(&pmap_object_store); + + T_ASSERT_GT_INT(iter, 0, NULL); + return KERN_SUCCESS; +} +#endif diff --git a/osfmk/tests/ktest.c b/osfmk/tests/ktest.c new file mode 100644 index 000000000..14dcb69d5 --- /dev/null +++ b/osfmk/tests/ktest.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2015 Apple Inc. 
All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +void +ktest_start(void) { + ktest_emit_start(); +} + +void +ktest_finish(void) { + ktest_emit_finish(); +} + +void +ktest_testbegin(const char * test_name) { + ktest_current_time = mach_absolute_time(); + ktest_test_name = test_name; + ktest_emit_testbegin(test_name); +} + +void +ktest_testend() { + ktest_current_time = mach_absolute_time(); + ktest_emit_testend(); + ktest_test_index++; +} + +void +ktest_testskip(const char * msg, ...) { + va_list args; + + ktest_current_time = mach_absolute_time(); + + va_start(args, msg); + ktest_emit_testskip(msg, args); + va_end(args); + +} + +void +ktest_log(const char * msg, ...) 
{ + va_list args; + + ktest_current_time = mach_absolute_time(); + + va_start(args, msg); + ktest_emit_log(msg, args); + va_end(args); +} + +void +ktest_perf(const char * metric, const char * unit, double value, const char * desc) +{ + ktest_current_time = mach_absolute_time(); + ktest_emit_perfdata(metric, unit, value, desc); +} + +void +ktest_testcase(int success) +{ + ktest_current_time = mach_absolute_time(); + + if(success && !ktest_expectfail) { + /* PASS */ + ktest_passcount++; + ktest_testcase_result = T_RESULT_PASS; + } else if(!success && !ktest_expectfail) { + /* FAIL */ + ktest_failcount++; + ktest_testcase_result = T_RESULT_FAIL; + } else if(success && ktest_expectfail) { + /* UXPASS */ + ktest_xpasscount++; + ktest_testcase_result = T_RESULT_UXPASS; + } else if(!success && ktest_expectfail) { + /* XFAIL */ + ktest_xfailcount++; + ktest_testcase_result = T_RESULT_XFAIL; + } + + ktest_update_test_result_state(); + if(ktest_quiet == 0 || + ktest_testcase_result == T_RESULT_FAIL || + ktest_testcase_result == T_RESULT_UXPASS) { + ktest_emit_testcase(); + } + ktest_expression_index++; + + ktest_quiet = 0; + ktest_expectfail = 0; + ktest_output_buf[0] = '\0'; + ktest_current_msg[0] = '\0'; + ktest_current_expr[0] = '\0'; + for(int i = 0; i < KTEST_MAXVARS; i++) { + ktest_current_var_names[i][0] = '\0'; + ktest_current_var_values[i][0] = '\0'; + } + ktest_current_var_index = 0; +} + +void +ktest_update_test_result_state(void) { + ktest_test_result = ktest_test_result_statetab[ktest_test_result] + [ktest_testcase_result] + [ktest_testcase_mode]; +} + +void +ktest_assertion_check(void) { + if (ktest_testcase_result == T_RESULT_FAIL || ktest_testcase_result == T_RESULT_UXPASS) { + ktest_testend(); + panic("XNUPOST: Assertion failed : %s : at %s:%d", ktest_test_name, ktest_current_file, ktest_current_line); + } +} diff --git a/osfmk/tests/ktest.h b/osfmk/tests/ktest.h new file mode 100644 index 000000000..4cc95c3c8 --- /dev/null +++ b/osfmk/tests/ktest.h @@ -0,0 
+1,646 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _TESTS_KTEST_H +#define _TESTS_KTEST_H + +/* Symbol name prefix */ +#define T_SYM(sym) ktest_ ## sym + +#include + +extern unsigned int T_SYM(current_line); +extern const char * T_SYM(current_file); +extern const char * T_SYM(current_func); +extern int T_SYM(testcase_mode); +extern int T_SYM(testcase_result); +extern int T_SYM(test_result); +extern int T_SYM(quiet); + +void T_SYM(start)(void); +void T_SYM(finish)(void); +void T_SYM(testbegin)(const char * test_name); +void T_SYM(testend)(void); +void T_SYM(testskip)(const char * msg, ...); +void T_SYM(testcase)(int expr); +void T_SYM(log)(const char * msg, ...); +void T_SYM(perf)(const char * metric, const char * unit, double value, const char * desc); +void T_SYM(update_test_result_state)(void); +void T_SYM(assertion_check)(void); + +void T_SYM(set_current_msg)(const char * msg, ...); +void T_SYM(set_current_expr)(const char * expr_fmt, ...); +void T_SYM(set_current_var)(const char * name, const char * value_fmt, ...); + +typedef union { + char _char; + unsigned char _uchar; + + short _short; + unsigned short _ushort; + + int _int; + unsigned int _uint; + + long _long; + unsigned long _ulong; + + long long _llong; + unsigned long long _ullong; + + float _float; + + double _double; + + long double _ldouble; + + void* _ptr; +} T_SYM(temp); + +extern T_SYM(temp) T_SYM(temp1), T_SYM(temp2), T_SYM(temp3); + +#define T_SUCCESS 1 +#define T_FAILURE 0 + +/* Testcase modes */ +#define T_MAIN 0 +#define T_SETUP 1 + +/* Testcase result states */ +#define T_RESULT_PASS 0 +#define T_RESULT_FAIL 1 +#define T_RESULT_UXPASS 2 +#define T_RESULT_XFAIL 3 + +/* Test result states */ +#define T_STATE_UNRESOLVED 0 +#define T_STATE_PASS 1 +#define T_STATE_FAIL 2 +#define T_STATE_SETUPFAIL 3 + +/* + * Helpers + */ + +#define T_TOSTRING_HELPER(x) #x +#define T_TOSTRING(x) T_TOSTRING_HELPER(x) + +#define T_SAVEINFO do {\ + T_SYM(current_line) = __LINE__;\ + T_SYM(current_func) 
= (const char *)__func__;\ + T_SYM(current_file) = (const char *)__FILE__;\ +} while(0) + +#define T_SET_AUX_VARS do {\ + /* Only used in userspace lib for now */\ +} while(0) + +#define T_ASSERTION_CHECK do {\ + T_SYM(assertion_check)();\ +} while(0) + +#define T_EXPECT_BLOCK2(type, fmt, cmp, lhs, rhs, msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(temp1).type = (lhs);\ + T_SYM(temp2).type = (rhs);\ + T_SYM(set_current_expr)(T_TOSTRING(lhs) " "\ + T_TOSTRING(cmp) " "\ + T_TOSTRING(rhs));\ + T_SYM(set_current_var)(T_TOSTRING(lhs), fmt, T_SYM(temp1).type);\ + T_SYM(set_current_var)(T_TOSTRING(rhs), fmt, T_SYM(temp2).type);\ + T_SET_AUX_VARS;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(T_SYM(temp1).type cmp T_SYM(temp2).type);\ +} while(0) + +#define T_ASSERT_BLOCK2(type, fmt, cmp, lhs, rhs, msg, ...) do {\ + T_EXPECT_BLOCK2(type, fmt, cmp, lhs, rhs, msg, ## __VA_ARGS__);\ + T_ASSERTION_CHECK;\ +} while(0) + +/* + * Core functions + */ + +/* Denotes start of testing. All prior output is ignored. */ +#define T_START do {\ + T_SAVEINFO;\ + T_SYM(start)();\ +} while(0) + +/* Denotes end of testing. All subsequent output is ignored. */ +#define T_FINISH do {\ + T_SAVEINFO;\ + T_SYM(finish)();\ +} while(0) + +/* Denotes beginning of a test. */ +#define T_BEGIN(name) do {\ + T_SAVEINFO;\ + T_SYM(testbegin)(name);\ +} while(0) + +/* Denotes end of current test. */ +#define T_END do {\ + T_SAVEINFO;\ + T_SYM(testend)();\ +} while(0) + +/* Denotes beginning of a setup section of the current test. */ +#define T_SETUPBEGIN do {\ + T_SYM(testcase_mode) = T_SETUP;\ +} while(0) + +/* Denotes end of the current setup section of the current test. */ +#define T_SETUPEND do {\ + T_SYM(testcase_mode) = T_MAIN;\ +} while(0) + +/* Denotes end of current test because test was skipped. */ +#define T_SKIP(msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(testskip)(msg, ## __VA_ARGS__);\ +} while(0) + +/* Returns result of latest testrun. 
*/ +#define T_TESTRESULT (T_SYM(test_result)) + +/* Return result of latest testcase. */ +#define T_TESTCASERESULT (T_SYM(testcase_result)) + +/* Flags the next testcase as expected failure. */ +#define T_EXPECTFAIL do {\ + T_SYM(expectfail) = 1;\ +} while(0) + +/* Only emit output for next testcase if it is a FAIL or UXPASS. */ +#define T_QUIET do {\ + T_SYM(quiet) = 1;\ +} while(0) + +/* Logs a message. */ +#define T_LOG(msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(log)(msg, ## __VA_ARGS__);\ +} while(0) + +/* Explicit pass. */ +#define T_PASS(msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(T_SUCCESS);\ +} while(0) + +/* Explicit fail. */ +#define T_FAIL(msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(T_FAILURE);\ +} while(0) + +/* Explicit assert fail. */ +#define T_ASSERT_FAIL(msg, ...) do {\ + T_SAVEINFO;\ + T_SET_AUX_VARS;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(T_FAILURE);\ + T_SYM(assertion_fail)();\ +} while(0) + +/* Generic expect. */ +#define T_EXPECT(expr, msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(temp1)._int = (int)(!!(expr));\ + T_SYM(set_current_expr)(T_TOSTRING(expr));\ + T_SET_AUX_VARS;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(T_SYM(temp1)._int);\ +} while(0) + +/* Generic assert. */ +#define T_ASSERT(expr, msg, ...) do {\ + T_EXPECT(expr, msg, ## __VA_ARGS__);\ + T_ASSERTION_CHECK;\ +} while(0) + +/* + * Convenience functions + */ + +/* null */ + +#define T_EXPECT_NOTNULL(expr, msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(temp1)._int = (int)(!!(expr));\ + T_SYM(set_current_expr)(T_TOSTRING(expr) " != NULL");\ + T_SYM(set_current_var)(T_TOSTRING(expr),\ + "%s",\ + T_SYM(temp1)._int ? "" : "NULL");\ + T_SET_AUX_VARS;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(T_SYM(temp1)._int);\ +} while(0) + +#define T_EXPECT_NULL(expr, msg, ...) 
do {\ + T_SAVEINFO;\ + T_SYM(temp1)._int = (int)(!(expr));\ + T_SYM(set_current_expr)(T_TOSTRING(expr) " == NULL");\ + T_SYM(set_current_var)(T_TOSTRING(expr),\ + "%s",\ + T_SYM(temp1)._int ? "NULL" : "");\ + T_SET_AUX_VARS;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(T_SYM(temp1)._int);\ +} while(0) + +#define T_ASSERT_NOTNULL(expr, msg, ...) do {\ + T_EXPECT_NOTNULL(expr, msg, ## __VA_ARGS__);\ + T_ASSERTION_CHECK;\ +} while(0) + +#define T_ASSERT_NULL(expr, msg, ...) do {\ + T_EXPECT_NULL(expr, msg, ## __VA_ARGS__);\ + T_ASSERTION_CHECK;\ +} while(0) + +/* string (operands are compared with strcmp; the recorded expression shows the operator actually tested) */ + +// TODO: check/truncate inputs +#define T_EXPECT_EQ_STR(lhs, rhs, msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(temp1)._ptr = (lhs);\ + T_SYM(temp2)._ptr = (rhs);\ + T_SYM(set_current_expr)(T_TOSTRING(lhs) " == " T_TOSTRING(rhs));\ + T_SYM(set_current_var)(T_TOSTRING(lhs), "%s", T_SYM(temp1)._ptr);\ + T_SYM(set_current_var)(T_TOSTRING(rhs), "%s", T_SYM(temp2)._ptr);\ + T_SET_AUX_VARS;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(strcmp(T_SYM(temp1)._ptr, T_SYM(temp2)._ptr) == 0);\ +} while(0) + +#define T_EXPECT_NE_STR(lhs, rhs, msg, ...) do {\ + T_SAVEINFO;\ + T_SYM(temp1)._ptr = (lhs);\ + T_SYM(temp2)._ptr = (rhs);\ + T_SYM(set_current_expr)(T_TOSTRING(lhs) " != " T_TOSTRING(rhs));\ + T_SYM(set_current_var)(T_TOSTRING(lhs), "%s", T_SYM(temp1)._ptr);\ + T_SYM(set_current_var)(T_TOSTRING(rhs), "%s", T_SYM(temp2)._ptr);\ + T_SET_AUX_VARS;\ + T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\ + T_SYM(testcase)(strcmp(T_SYM(temp1)._ptr, T_SYM(temp2)._ptr) != 0);\ +} while(0) + +#define T_ASSERT_EQ_STR(lhs, rhs, msg, ...) do {\ + T_EXPECT_EQ_STR(lhs, rhs, msg, ## __VA_ARGS__);\ + T_ASSERTION_CHECK;\ +} while(0) + +#define T_ASSERT_NE_STR(lhs, rhs, msg, ...) 
do {\ + T_EXPECT_NE_STR(lhs, rhs, msg, ## __VA_ARGS__);\ + T_ASSERTION_CHECK;\ +} while(0) + +/* char */ + +#define T_EXPECT_EQ_CHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_char, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_CHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_char, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_CHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_char, "%c", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_CHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_char, "%c", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_CHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_char, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_CHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_char, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_CHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_char, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_CHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_char, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_CHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_char, "%c", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_CHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_char, "%c", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_CHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_char, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_CHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_char, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* unsigned char */ + +#define T_EXPECT_EQ_UCHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uchar, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_UCHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uchar, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_UCHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uchar, "%c", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_UCHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uchar, "%c", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_UCHAR(lhs, rhs, msg, ...)\ + 
T_EXPECT_BLOCK2(_uchar, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_UCHAR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uchar, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_UCHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uchar, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_UCHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uchar, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_UCHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uchar, "%c", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_UCHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uchar, "%c", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_UCHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uchar, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_UCHAR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uchar, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* short */ + +#define T_EXPECT_EQ_SHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_short, "%hi", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_SHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_short, "%hi", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_SHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_short, "%hi", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_SHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_short, "%hi", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_SHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_short, "%hi", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_SHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_short, "%hi", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_SHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_short, "%hi", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_SHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_short, "%hi", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_SHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_short, "%hi", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_SHORT(lhs, rhs, msg, ...)\ + 
T_ASSERT_BLOCK2(_short, "%hi", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_SHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_short, "%hi", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_SHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_short, "%hi", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* unsigned short */ + +#define T_EXPECT_EQ_USHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ushort, "%hu", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_USHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ushort, "%hu", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_USHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ushort, "%hu", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_USHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ushort, "%hu", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_USHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ushort, "%hu", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_USHORT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ushort, "%hu", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_USHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ushort, "%hu", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_USHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ushort, "%hu", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_USHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ushort, "%hu", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_USHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ushort, "%hu", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_USHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ushort, "%hu", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_USHORT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ushort, "%hu", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* int */ + +#define T_EXPECT_EQ_INT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_int, "%d", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_INT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_int, "%d", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define 
T_EXPECT_LT_INT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_int, "%d", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_INT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_int, "%d", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_INT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_int, "%d", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_INT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_int, "%d", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_INT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_int, "%d", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_INT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_int, "%d", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_INT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_int, "%d", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_INT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_int, "%d", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_INT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_int, "%d", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_INT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_int, "%d", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* unsigned int */ + +#define T_EXPECT_EQ_UINT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uint, "%u", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_UINT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uint, "%u", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_UINT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uint, "%u", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_UINT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uint, "%u", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_UINT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uint, "%u", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_UINT(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_uint, "%u", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_UINT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uint, "%u", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_UINT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uint, "%u", !=, 
lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_UINT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uint, "%u", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_UINT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uint, "%u", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_UINT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uint, "%u", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_UINT(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_uint, "%u", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* long */ + +#define T_EXPECT_EQ_LONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_long, "%li", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_LONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_long, "%li", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_LONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_long, "%li", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_LONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_long, "%li", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_LONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_long, "%li", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_LONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_long, "%li", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_LONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_long, "%li", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_LONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_long, "%li", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_LONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_long, "%li", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_LONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_long, "%li", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_LONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_long, "%li", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_LONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_long, "%li", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* unsigned long */ + +#define T_EXPECT_EQ_ULONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ulong, "%lu", ==, lhs, rhs, msg, ## 
__VA_ARGS__) +#define T_EXPECT_NE_ULONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ulong, "%lu", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_ULONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ulong, "%lu", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_ULONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ulong, "%lu", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_ULONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ulong, "%lu", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_ULONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ulong, "%lu", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_ULONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ulong, "%lu", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_ULONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ulong, "%lu", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_ULONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ulong, "%lu", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_ULONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ulong, "%lu", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_ULONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ulong, "%lu", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_ULONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ulong, "%lu", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* long long */ + +#define T_EXPECT_EQ_LLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_llong, "%lli", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_LLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_llong, "%lli", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_LLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_llong, "%lli", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_LLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_llong, "%lli", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_LLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_llong, "%lli", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_LLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_llong, "%lli", >=, lhs, rhs, msg, ## 
__VA_ARGS__) + +#define T_ASSERT_EQ_LLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_llong, "%lli", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_LLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_llong, "%lli", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_LLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_llong, "%lli", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_LLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_llong, "%lli", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_LLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_llong, "%lli", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_LLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_llong, "%lli", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* unsigned long long */ + +#define T_EXPECT_EQ_ULLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ullong, "%llu", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_ULLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ullong, "%llu", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_ULLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ullong, "%llu", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_ULLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ullong, "%llu", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_ULLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ullong, "%llu", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_ULLONG(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ullong, "%llu", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_ULLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ullong, "%llu", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_ULLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ullong, "%llu", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_ULLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ullong, "%llu", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_ULLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ullong, "%llu", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_ULLONG(lhs, rhs, msg, ...)\ + 
T_ASSERT_BLOCK2(_ullong, "%llu", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_ULLONG(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ullong, "%llu", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* pointer */ + +#define T_EXPECT_EQ_PTR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ptr, "%p", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_NE_PTR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ptr, "%p", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LT_PTR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ptr, "%p", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GT_PTR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ptr, "%p", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_LE_PTR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ptr, "%p", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_EXPECT_GE_PTR(lhs, rhs, msg, ...)\ + T_EXPECT_BLOCK2(_ptr, "%p", >=, lhs, rhs, msg, ## __VA_ARGS__) + +#define T_ASSERT_EQ_PTR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ptr, "%p", ==, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_NE_PTR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ptr, "%p", !=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LT_PTR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ptr, "%p", <, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GT_PTR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ptr, "%p", >, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_LE_PTR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ptr, "%p", <=, lhs, rhs, msg, ## __VA_ARGS__) +#define T_ASSERT_GE_PTR(lhs, rhs, msg, ...)\ + T_ASSERT_BLOCK2(_ptr, "%p", >=, lhs, rhs, msg, ## __VA_ARGS__) + +/* + * Log a perfdata measurement. 
For example: + * T_PERF("name_of_metric", 3234, "nsec", "time since first test run") + */ +#define T_PERF(metric, value, unit, desc) \ + do { \ + T_SAVEINFO; \ + T_SYM(perf)(metric, unit, value, desc); \ + } while (0) + +#endif /* _TESTS_KTEST_H */ diff --git a/osfmk/prng/YarrowCoreLib/port/smf.c b/osfmk/tests/ktest_accessor.c similarity index 54% rename from osfmk/prng/YarrowCoreLib/port/smf.c rename to osfmk/tests/ktest_accessor.c index 5cb4a3664..ab660d57c 100644 --- a/osfmk/prng/YarrowCoreLib/port/smf.c +++ b/osfmk/tests/ktest_accessor.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 1999-2013 Apple Inc. All rights reserved. + * Copyright (c) 2015 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,52 +22,58 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. 
- * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - File: smf.c - - Contains: platform-dependent malloc/free - -*/ +#include +#include +#include -#include -#include -#include +int vsnprintf(char *, size_t, const char *, va_list); -/* Shim emulating _MALLOC */ +void +ktest_set_current_expr(const char * expr_fmt, ...) { + int ret; + va_list args; -SMFAPI void mmInit( void ) -{ - return; + va_start(args, expr_fmt); + ret = vsnprintf(ktest_current_expr, KTEST_MAXLEN, expr_fmt, args); + va_end(args); } -SMFAPI MMPTR mmMalloc(DWORD request) -{ - void *addr; +void +ktest_set_current_var(const char * name, const char * value_fmt, ...) { + int ret; + va_list args; - addr = (void *) kalloc(request); - if (addr == NULL) - return NULL; - - return (MMPTR) addr; -} + if(ktest_current_var_index >= KTEST_MAXVARS) { + panic("Internal ktest error in %s", __func__); /* __func__ is an identifier, not a string literal, so it is passed as a format argument */ + } -SMFAPI void mmFree(MMPTR ptrnum) -{ - kfree_addr(ptrnum); -} + strlcpy(ktest_current_var_names[ktest_current_var_index], + name, + KTEST_MAXLEN); + + va_start(args, value_fmt); + ret = vsnprintf(ktest_current_var_values[ktest_current_var_index], + KTEST_MAXLEN, + value_fmt, + args); + va_end(args); -SMFAPI LPVOID mmGetPtr(MMPTR ptrnum) -{ - return (LPVOID)ptrnum; + ktest_current_var_index++; } -SMFAPI void mmReturnPtr(__unused MMPTR ptrnum) -{ - /* nothing */ - return; +void +ktest_set_current_msg(const char * msg, ...) { + int ret; + va_list args; + + if(msg == NULL) return; + + va_start(args, msg); + ret = vsnprintf(ktest_current_msg, KTEST_MAXLEN, msg, args); + va_end(args); } diff --git a/osfmk/tests/ktest_emit.c b/osfmk/tests/ktest_emit.c new file mode 100644 index 000000000..088dd386d --- /dev/null +++ b/osfmk/tests/ktest_emit.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include + +#define EMIT(buf,size) do { \ + console_write(buf, size); \ + } while(0) + +/* TODO: intelligently truncate messages if possible */ +#define BOUNDS_CHECK_AND_UPDATE(ret, size) do {\ + if(ret < 0 || ret >= size) {\ + panic("Internal ktest error in %s", __func__);\ + }\ + size -= ret;\ + msg += ret;\ +} while(0) + +int vsnprintf(char *, size_t, const char *, va_list); + +void +ktest_emit_start(void) { + char str[] = "\n[KTEST]\tSTART\t" KTEST_VERSION_STR "\n"; + EMIT((char *)&str[0], sizeof(str)-1); +} + +void +ktest_emit_finish(void) { + char str[] = "\n[KTEST]\tFINISH\n"; + EMIT((char *)&str[0], sizeof(str)-1); +} + +void +ktest_emit_testbegin(const char * test_name) { + char * msg = ktest_output_buf; + int size = sizeof(ktest_output_buf); + int ret; + + /* left trim the file path for readability */ + char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100); + + ret = snprintf(msg, + size, + "\n[KTEST]\t" /* header */ + "TESTBEGIN\t" /* type */ + "%lld\t" /* time */ + "%d\t" /* index */ + "%s\t" /* file */ + "%d\t" /* line */ + "%s\n", /* name */ + ktest_current_time, + ktest_test_index, + fname, + ktest_current_line, + test_name); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + EMIT(ktest_output_buf, (int)(msg - ktest_output_buf)); +} + +void +ktest_emit_testskip(const char * skip_msg, va_list args) { + char * msg = ktest_output_buf; + int size = sizeof(ktest_output_buf); + int ret; + + char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100); + + ret = snprintf(msg, + size, + "\n[KTEST]\t" /* header */ + "TESTSKIP\t" /* type */ + "%lld\t" /* time */ + "%s\t" /* file */ + "%d\t", /* line */ + ktest_current_time, + fname, + ktest_current_line); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + ret = vsnprintf(msg, size, skip_msg, args); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + ret = snprintf(msg, size, "\n"); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + 
EMIT(ktest_output_buf, (int)(msg - ktest_output_buf)); + +} + +void +ktest_emit_testend() { + char * msg = ktest_output_buf; + int size = sizeof(ktest_output_buf); + int ret; + + char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100); + + ret = snprintf(msg, + size, + "\n[KTEST]\t" /* header */ + "TESTEND\t" /* type */ + "%lld\t" /* time */ + "%d\t" /* index */ + "%s\t" /* file */ + "%d\t" /* line */ + "%s\n", /* name */ + ktest_current_time, + ktest_test_index, + fname, + ktest_current_line, + ktest_test_name); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + EMIT(ktest_output_buf, (int)(msg - ktest_output_buf)); + +} + +void +ktest_emit_log(const char * log_msg, va_list args) { + char * msg = ktest_output_buf; + int size = sizeof(ktest_output_buf); + int ret; + + char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100); + + ret = snprintf(msg, + size, + "\n[KTEST]\t" /* header */ + "LOG\t" /* type */ + "%lld\t" /* time */ + "%s\t" /* file */ + "%d\t", /* line */ + ktest_current_time, + fname, + ktest_current_line); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + ret = vsnprintf(msg, size, log_msg, args); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + ret = snprintf(msg, size, "\n"); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + EMIT(ktest_output_buf, (int)(msg - ktest_output_buf)); + +} + +void +ktest_emit_perfdata(const char * metric, const char * unit, double value, const char * desc) +{ + static const char * perfstr = "%s\t%lld\t%s\t\"%s\""; + char * msg = ktest_output_buf; + int64_t print_value = (int64_t)value; + int size = sizeof(ktest_output_buf); + int ret; + + char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100); + + ret = snprintf(msg, size, + "\n[KTEST]\t" /* header */ + "PERF\t" /* type */ + "%lld\t" /* time */ + "%s\t" /* file */ + "%d\t", /* line */ + ktest_current_time, + fname, + ktest_current_line); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + ret = snprintf(msg, size, perfstr, metric, print_value, unit, desc); + 
BOUNDS_CHECK_AND_UPDATE(ret, size); + + ret = snprintf(msg, size, "\n"); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + EMIT(ktest_output_buf, (int)(msg - ktest_output_buf)); + +} + +void +ktest_emit_testcase(void) { + char * msg = ktest_output_buf; + int size = sizeof(ktest_output_buf); + int ret; + + char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100); + + ret = snprintf(msg, + size, + "\n[KTEST]\t" /* header */ + "%s\t" /* type */ + "%lld\t" /* time */ + "%d\t" /* index */ + "%s\t" /* file */ + "%d\t" /* line */ + "%s\t" /* message */ + "%s", /* current_expr */ + ktest_testcase_result_tokens[ktest_testcase_mode] + [ktest_testcase_result], + ktest_current_time, + ktest_expression_index, + fname, + ktest_current_line, + ktest_current_msg, + ktest_current_expr); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + for(int i = 0; i < KTEST_MAXVARS && ktest_current_var_names[i][0]; i++) { /* bounded: when every slot is in use no empty name terminates the scan */ + ret = snprintf(msg, + size, + "\t%s\t%s", + ktest_current_var_names[i], + ktest_current_var_values[i]); + BOUNDS_CHECK_AND_UPDATE(ret, size); + } + + ret = snprintf(msg, size, "\n"); + BOUNDS_CHECK_AND_UPDATE(ret, size); + + EMIT(ktest_output_buf, (int)(msg - ktest_output_buf)); +} diff --git a/osfmk/tests/ktest_global.c b/osfmk/tests/ktest_global.c new file mode 100644 index 000000000..eee7e0af4 --- /dev/null +++ b/osfmk/tests/ktest_global.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + +unsigned int ktest_current_line = 0; +const char * ktest_current_file = NULL; +const char * ktest_current_func = NULL; +uint64_t ktest_current_time = 0; + +const char * ktest_test_name = ""; + +char ktest_current_msg[KTEST_MAXLEN] = ""; +char ktest_current_expr[KTEST_MAXOUTLEN] = ""; +char ktest_current_var_names[KTEST_MAXVARS][KTEST_MAXLEN] = { "", "", "" }; +char ktest_current_var_values[KTEST_MAXVARS][KTEST_MAXLEN] = { "", "", "" }; +unsigned int ktest_expression_index = 0; +unsigned int ktest_current_var_index = 0; +unsigned int ktest_test_index = 0; +unsigned int ktest_passcount = 0; +unsigned int ktest_failcount = 0; +unsigned int ktest_xpasscount = 0; +unsigned int ktest_xfailcount = 0; +int ktest_expectfail = 0; +int ktest_quiet = 0; + +int ktest_testcase_result = T_RESULT_FAIL; +int ktest_test_result = T_STATE_UNRESOLVED; +int ktest_testcase_mode = T_MAIN; + +ktest_temp ktest_temp1, ktest_temp2, ktest_temp3; + +char ktest_output_buf[KTEST_MAXLEN] 
= ""; + +int +ktest_test_result_statetab[KTEST_NUM_TEST_STATES] + [KTEST_NUM_TESTCASE_STATES] + [KTEST_NUM_TESTCASE_MODES] = { + [T_STATE_UNRESOLVED][T_RESULT_PASS][T_MAIN] = T_STATE_PASS, + [T_STATE_UNRESOLVED][T_RESULT_FAIL][T_MAIN] = T_STATE_FAIL, + [T_STATE_UNRESOLVED][T_RESULT_UXPASS][T_MAIN] = T_STATE_FAIL, + [T_STATE_UNRESOLVED][T_RESULT_XFAIL][T_MAIN] = T_STATE_PASS, + + [T_STATE_PASS][T_RESULT_PASS][T_MAIN] = T_STATE_PASS, + [T_STATE_PASS][T_RESULT_FAIL][T_MAIN] = T_STATE_FAIL, + [T_STATE_PASS][T_RESULT_UXPASS][T_MAIN] = T_STATE_FAIL, + [T_STATE_PASS][T_RESULT_XFAIL][T_MAIN] = T_STATE_PASS, + + [T_STATE_FAIL][T_RESULT_PASS][T_MAIN] = T_STATE_FAIL, + [T_STATE_FAIL][T_RESULT_FAIL][T_MAIN] = T_STATE_FAIL, + [T_STATE_FAIL][T_RESULT_UXPASS][T_MAIN] = T_STATE_FAIL, + [T_STATE_FAIL][T_RESULT_XFAIL][T_MAIN] = T_STATE_FAIL, + + [T_STATE_SETUPFAIL][T_RESULT_PASS][T_MAIN] = T_STATE_SETUPFAIL, + [T_STATE_SETUPFAIL][T_RESULT_FAIL][T_MAIN] = T_STATE_SETUPFAIL, + [T_STATE_SETUPFAIL][T_RESULT_UXPASS][T_MAIN] = T_STATE_SETUPFAIL, + [T_STATE_SETUPFAIL][T_RESULT_XFAIL][T_MAIN] = T_STATE_SETUPFAIL, + + [T_STATE_UNRESOLVED][T_RESULT_PASS][T_SETUP] = T_STATE_UNRESOLVED, + [T_STATE_UNRESOLVED][T_RESULT_FAIL][T_SETUP] = T_STATE_SETUPFAIL, + [T_STATE_UNRESOLVED][T_RESULT_UXPASS][T_SETUP] = T_STATE_SETUPFAIL, + [T_STATE_UNRESOLVED][T_RESULT_XFAIL][T_SETUP] = T_STATE_UNRESOLVED, + + [T_STATE_PASS][T_RESULT_PASS][T_SETUP] = T_STATE_PASS, + [T_STATE_PASS][T_RESULT_FAIL][T_SETUP] = T_STATE_SETUPFAIL, + [T_STATE_PASS][T_RESULT_UXPASS][T_SETUP] = T_STATE_SETUPFAIL, + [T_STATE_PASS][T_RESULT_XFAIL][T_SETUP] = T_STATE_PASS, + + [T_STATE_FAIL][T_RESULT_PASS][T_SETUP] = T_STATE_FAIL, + [T_STATE_FAIL][T_RESULT_FAIL][T_SETUP] = T_STATE_FAIL, + [T_STATE_FAIL][T_RESULT_UXPASS][T_SETUP] = T_STATE_FAIL, + [T_STATE_FAIL][T_RESULT_XFAIL][T_SETUP] = T_STATE_FAIL, + + [T_STATE_SETUPFAIL][T_RESULT_PASS][T_SETUP] = T_STATE_SETUPFAIL, + [T_STATE_SETUPFAIL][T_RESULT_FAIL][T_SETUP] = T_STATE_SETUPFAIL, + 
[T_STATE_SETUPFAIL][T_RESULT_UXPASS][T_SETUP] = T_STATE_SETUPFAIL, + [T_STATE_SETUPFAIL][T_RESULT_XFAIL][T_SETUP] = T_STATE_SETUPFAIL, +}; + +const char * ktest_testcase_result_tokens[KTEST_NUM_TESTCASE_MODES] + [KTEST_NUM_TESTCASE_STATES] = { + [T_MAIN][T_RESULT_PASS] = "PASS", + [T_MAIN][T_RESULT_FAIL] = "FAIL", + [T_MAIN][T_RESULT_UXPASS] = "UXPASS", + [T_MAIN][T_RESULT_XFAIL] = "XFAIL", + [T_SETUP][T_RESULT_PASS] = "SETUP_PASS", + [T_SETUP][T_RESULT_FAIL] = "SETUP_FAIL", + [T_SETUP][T_RESULT_UXPASS] = "SETUP_UXPASS", + [T_SETUP][T_RESULT_XFAIL] = "SETUP_XFAIL", +}; + diff --git a/osfmk/tests/ktest_internal.h b/osfmk/tests/ktest_internal.h new file mode 100644 index 000000000..bf82d45d0 --- /dev/null +++ b/osfmk/tests/ktest_internal.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _TESTS_KTEST_INTERNAL_H +#define _TESTS_KTEST_INTERNAL_H + +#include +#include + +#define KTEST_VERSION 1 +#define KTEST_VERSION_STR T_TOSTRING(KTEST_VERSION) + +#define KTEST_MAXLEN 1024 +#define KTEST_MAXOUTLEN 4096 +#define KTEST_MAXVARS 3 + +#define KTEST_NUM_TESTCASE_MODES 2 +#define KTEST_NUM_TESTCASE_STATES 4 +#define KTEST_NUM_TEST_STATES 4 + +extern unsigned int ktest_current_line; +extern const char * ktest_current_file; +extern const char * ktest_current_func; +extern uint64_t ktest_current_time; + +extern const char * ktest_test_name; + +extern char ktest_current_msg[KTEST_MAXLEN]; +extern char ktest_current_expr[KTEST_MAXOUTLEN]; +extern char ktest_current_var_names[KTEST_MAXVARS][KTEST_MAXLEN]; +extern char ktest_current_var_values[KTEST_MAXVARS][KTEST_MAXLEN]; +extern unsigned int ktest_expression_index; +extern unsigned int ktest_current_var_index; +extern unsigned int ktest_test_index; +extern unsigned int ktest_passcount; +extern unsigned int ktest_failcount; +extern unsigned int ktest_xpasscount; +extern unsigned int ktest_xfailcount; +extern int ktest_expectfail; + +extern int ktest_testcase_result; +extern int ktest_test_result; +extern int ktest_testcase_mode; + +extern ktest_temp ktest_temp1, ktest_temp2, ktest_temp3; + +extern char ktest_output_buf[KTEST_MAXLEN]; + +extern int ktest_test_result_statetab[KTEST_NUM_TEST_STATES] + [KTEST_NUM_TESTCASE_STATES] + [KTEST_NUM_TESTCASE_MODES]; + +extern const char * ktest_testcase_result_tokens[KTEST_NUM_TESTCASE_MODES] + [KTEST_NUM_TESTCASE_STATES]; + + +void ktest_emit_start(void); +void ktest_emit_finish(void); +void ktest_emit_testbegin(const char * test_name); +void ktest_emit_testskip(const char * skip_msg, va_list args); +void ktest_emit_testend(void); +void ktest_emit_log(const char * log_msg, va_list args); +void 
ktest_emit_perfdata(const char * metric, const char * unit, double value, const char * desc); +void ktest_emit_testcase(void); + +#endif /* _TESTS_KTEST_INTERNAL_H */ + diff --git a/osfmk/tests/pmap_tests.c b/osfmk/tests/pmap_tests.c new file mode 100644 index 000000000..d0a116463 --- /dev/null +++ b/osfmk/tests/pmap_tests.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include + + +extern ledger_template_t task_ledger_template; + +kern_return_t test_pmap_enter_disconnect(unsigned int num_loops); +kern_return_t test_pmap_iommu_disconnect(void); + +#define PMAP_TEST_VA (0xDEAD << PAGE_SHIFT) + +typedef struct { + pmap_t pmap; + volatile boolean_t stop; + ppnum_t pn; +} pmap_test_thread_args; + +static pmap_t +pmap_create_wrapper() { + pmap_t new_pmap = NULL; + ledger_t ledger; + assert(task_ledger_template != NULL); + if ((ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES)) == NULL) + return NULL; + new_pmap = pmap_create(ledger, 0, FALSE); + ledger_dereference(ledger); + return new_pmap; +} + +static void +pmap_disconnect_thread(void *arg, wait_result_t __unused wres) { + pmap_test_thread_args *args = arg; + do { + pmap_disconnect(args->pn); + } while (!args->stop); + thread_wakeup((event_t)args); +} + +kern_return_t +test_pmap_enter_disconnect(unsigned int num_loops) +{ + kern_return_t kr = KERN_SUCCESS; + thread_t disconnect_thread; + pmap_t new_pmap = pmap_create_wrapper(); + if (new_pmap == NULL) + return KERN_FAILURE; + vm_page_t m = vm_page_grab(); + if (m == VM_PAGE_NULL) { + pmap_destroy(new_pmap); + return KERN_FAILURE; + } + ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); + pmap_test_thread_args args = {new_pmap, FALSE, phys_page}; + kern_return_t res = kernel_thread_start(pmap_disconnect_thread, &args, &disconnect_thread); + if (res) { + pmap_destroy(new_pmap); + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); + return res; + } + thread_deallocate(disconnect_thread); + + while (num_loops-- != 0) { + kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page, + VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE); + assert(kr == KERN_SUCCESS); + } + + assert_wait((event_t)&args, THREAD_UNINT); + args.stop = TRUE; + thread_block(THREAD_CONTINUE_NULL); + + pmap_remove(new_pmap, 
PMAP_TEST_VA, PMAP_TEST_VA + PAGE_SIZE); + vm_page_lock_queues(); + vm_page_free(m); + vm_page_unlock_queues(); + pmap_destroy(new_pmap); + return KERN_SUCCESS; +} + +kern_return_t +test_pmap_iommu_disconnect(void) +{ + return KERN_SUCCESS; +} + diff --git a/osfmk/tests/test_thread_call.c b/osfmk/tests/test_thread_call.c new file mode 100644 index 000000000..ad3702312 --- /dev/null +++ b/osfmk/tests/test_thread_call.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2017 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if !(DEVELOPMENT || DEBUG) +#error "Testing is not enabled on RELEASE configurations" +#endif + +#include +#include +#include +#include + +kern_return_t test_thread_call(void); + +lck_grp_t test_lock_grp; +lck_mtx_t test_lock; + +typedef enum { + TEST_ARG1 = 0x1234, + TEST_ARG2 = 0x3456, +} test_param; + +int wait_for_callback; +int wait_for_main; + +int once_callback_counter = 0; + +static void +test_once_callback(thread_call_param_t param0, + thread_call_param_t param1) +{ + T_ASSERT_EQ_INT((test_param)param0, TEST_ARG1, "param0 is correct"); + T_ASSERT_EQ_INT((test_param)param1, TEST_ARG2, "param1 is correct"); + + once_callback_counter++; + + T_ASSERT_EQ_INT(once_callback_counter, 1, "only one callback"); + + lck_mtx_lock(&test_lock); + + thread_wakeup(&wait_for_callback); + + uint64_t deadline; + clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline); + + kern_return_t kr; + /* wait for the main thread to finish, time out after 10s */ + kr = lck_mtx_sleep_deadline(&test_lock, LCK_SLEEP_DEFAULT, &wait_for_main, THREAD_UNINT, deadline); + T_ASSERT_EQ_INT(kr, THREAD_AWAKENED, " callback woken by main function"); + + lck_mtx_unlock(&test_lock); + + /* sleep for 1s to let the main thread begin the cancel and wait */ + delay_for_interval(1, NSEC_PER_SEC); +} + +static void +test_once_thread_call(void) +{ + lck_grp_init(&test_lock_grp, "test_thread_call", LCK_GRP_ATTR_NULL); + lck_mtx_init(&test_lock, &test_lock_grp, LCK_ATTR_NULL); + + thread_call_t call; + call = thread_call_allocate_with_options(&test_once_callback, + (thread_call_param_t)TEST_ARG1, + THREAD_CALL_PRIORITY_HIGH, + THREAD_CALL_OPTIONS_ONCE); + + thread_call_param_t arg2_param = (thread_call_param_t)TEST_ARG2; + + lck_mtx_lock(&test_lock); + + thread_call_enter1(call, arg2_param); + + uint64_t deadline; + clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline); + + kern_return_t kr; + /* wait for the call to execute, time out after 10s */ + kr 
= lck_mtx_sleep_deadline(&test_lock, LCK_SLEEP_DEFAULT, &wait_for_callback, THREAD_UNINT, deadline); + T_ASSERT_EQ_INT(kr, THREAD_AWAKENED, "main function woken by callback"); + + lck_mtx_unlock(&test_lock); + + /* at this point the callback is stuck waiting */ + + T_ASSERT_EQ_INT(once_callback_counter, 1, "callback fired"); + + boolean_t canceled, pending, freed; + + canceled = thread_call_cancel(call); + T_ASSERT_EQ_INT(canceled, FALSE, "thread_call_cancel should not succeed"); + + pending = thread_call_enter1(call, arg2_param); + T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending"); + + /* sleep for 10ms, the call should not execute */ + delay_for_interval(10, NSEC_PER_MSEC); + + canceled = thread_call_cancel(call); + T_ASSERT_EQ_INT(canceled, TRUE, "thread_call_cancel should succeed"); + + pending = thread_call_enter1(call, arg2_param); + T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending"); + + freed = thread_call_free(call); + T_ASSERT_EQ_INT(freed, FALSE, "thread_call_free should not succeed"); + + pending = thread_call_enter1(call, arg2_param); + T_ASSERT_EQ_INT(pending, TRUE, "call should be pending"); + + thread_wakeup(&wait_for_main); + + canceled = thread_call_cancel_wait(call); + T_ASSERT_EQ_INT(canceled, TRUE, "thread_call_cancel_wait should succeed"); + + canceled = thread_call_cancel(call); + T_ASSERT_EQ_INT(canceled, FALSE, "thread_call_cancel should not succeed"); + + freed = thread_call_free(call); + T_ASSERT_EQ_INT(freed, TRUE, "thread_call_free should succeed"); +} + +int signal_callback_counter = 0; + +static void +test_signal_callback(__unused thread_call_param_t param0, + __unused thread_call_param_t param1) +{ + /* + * ktest sometimes panics if you assert from interrupt context, + * and the serial logging will blow past the delay to wait for the interrupt + * so don't print in this context. 
+ */ + + signal_callback_counter++; +} + +static void +test_signal_thread_call(void) +{ + thread_call_t call; + call = thread_call_allocate_with_options(&test_signal_callback, + (thread_call_param_t)TEST_ARG1, + THREAD_CALL_PRIORITY_HIGH, + THREAD_CALL_OPTIONS_ONCE|THREAD_CALL_OPTIONS_SIGNAL); + + thread_call_param_t arg2_param = (thread_call_param_t)TEST_ARG2; + + uint64_t deadline; + + boolean_t canceled, pending, freed; + + clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline); + pending = thread_call_enter1_delayed(call, arg2_param, deadline); + T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending"); + + canceled = thread_call_cancel(call); + T_ASSERT_EQ_INT(canceled, TRUE, "thread_call_cancel should succeed"); + + clock_interval_to_deadline(10, NSEC_PER_MSEC, &deadline); + pending = thread_call_enter1_delayed(call, arg2_param, deadline); + T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending"); + + /* sleep for 50ms to let the interrupt fire */ + delay_for_interval(50, NSEC_PER_MSEC); + + T_ASSERT_EQ_INT(signal_callback_counter, 1, "callback fired"); + + canceled = thread_call_cancel(call); + T_ASSERT_EQ_INT(canceled, FALSE, "thread_call_cancel should not succeed"); + + freed = thread_call_free(call); + T_ASSERT_EQ_INT(freed, TRUE, "thread_call_free should succeed"); +} + +kern_return_t +test_thread_call(void) +{ + test_once_thread_call(); + test_signal_thread_call(); + + return KERN_SUCCESS; +} diff --git a/osfmk/tests/xnupost.h b/osfmk/tests/xnupost.h new file mode 100644 index 000000000..326858612 --- /dev/null +++ b/osfmk/tests/xnupost.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2015 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _TESTS_XNUPOST_H +#define _TESTS_XNUPOST_H + +#ifndef CONFIG_XNUPOST +#error "Testing is not enabled if CONFIG_XNUPOST is not enabled" +#endif + +#include +#include +#include + +#define XT_CONFIG_RUN 0x0 +#define XT_CONFIG_IGNORE 0x1 +#define XT_CONFIG_EXPECT_PANIC 0x2 + +#define XTCTL_RUN_TESTS 1 +#define XTCTL_RESET_TESTDATA 2 + +typedef enum { XT_ACTION_NONE = 0, XT_ACTION_SKIPPED, XT_ACTION_PASSED, XT_ACTION_FAILED } xnupost_test_action_t; + +typedef kern_return_t (*test_function)(void); +struct xnupost_test { + uint16_t xt_config; + uint16_t xt_test_num; + kern_return_t xt_retval; + kern_return_t xt_expected_retval; + uint64_t xt_begin_time; + uint64_t xt_end_time; + xnupost_test_action_t xt_test_actions; + test_function xt_func; + const char * xt_name; +}; + +typedef kern_return_t xt_panic_return_t; +#define XT_PANIC_UNRELATED 0x8 /* not related. 
continue panic */ +#define XT_RET_W_FAIL 0x9 /* report FAILURE and return from panic */ +#define XT_RET_W_SUCCESS 0xA /* report SUCCESS and return from panic */ +#define XT_PANIC_W_FAIL 0xB /* report FAILURE and continue to panic */ +#define XT_PANIC_W_SUCCESS 0xC /* report SUCCESS and continue to panic */ + +typedef xt_panic_return_t (*xt_panic_widget_func)(const char * panicstr, void * context, void ** outval); +struct xnupost_panic_widget { + void * xtp_context_p; + void ** xtp_outval_p; + const char * xtp_func_name; + xt_panic_widget_func xtp_func; +}; + +/* for internal use only. Use T_REGISTER_* macros */ +extern xt_panic_return_t _xt_generic_assert_check(const char * s, void * str_to_match, void ** outval); +kern_return_t xnupost_register_panic_widget(xt_panic_widget_func funcp, const char * funcname, void * context, void ** outval); + +#define T_REGISTER_PANIC_WIDGET(func, ctx, outval) xnupost_register_panic_widget((func), #func, (ctx), (outval)) +#define T_REGISTER_ASSERT_CHECK(assert_str, retval) \ + T_REGISTER_PANIC_WIDGET(_xt_generic_assert_check, (void *)__DECONST(char *, assert_str), retval) + +typedef struct xnupost_test xnupost_test_data_t; +typedef struct xnupost_test * xnupost_test_t; + +extern struct xnupost_test kernel_post_tests[]; +extern uint32_t kernel_post_tests_count; +extern uint32_t total_post_tests_count; + +#define XNUPOST_TEST_CONFIG_BASIC(func) \ + { \ + XT_CONFIG_RUN, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \ + } + +#define XNUPOST_TEST_CONFIG_TEST_PANIC(func) \ + { \ + XT_CONFIG_EXPECT_PANIC, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \ + } + +void xnupost_init(void); +/* + * Parse boot-args specific to POST testing and setup enabled/disabled settings + * returns: KERN_SUCCESS - if testing is enabled. 
+ */ +kern_return_t xnupost_parse_config(void); +kern_return_t xnupost_run_tests(xnupost_test_t test_list, uint32_t test_count); +kern_return_t xnupost_list_tests(xnupost_test_t test_list, uint32_t test_count); +kern_return_t xnupost_reset_tests(xnupost_test_t test_list, uint32_t test_count); + +int xnupost_export_testdata(void * outp, uint32_t size, uint32_t * lenp); +uint32_t xnupost_get_estimated_testdata_size(void); + +kern_return_t kernel_do_post(void); +kern_return_t xnupost_process_kdb_stop(const char * panic_s); +int xnupost_reset_all_tests(void); + +kern_return_t kernel_list_tests(void); +int bsd_do_post(void); +int bsd_list_tests(void); + +#endif /* _TESTS_XNUPOST_H */ diff --git a/osfmk/vm/bsd_vm.c b/osfmk/vm/bsd_vm.c index af3985f95..0999ea757 100644 --- a/osfmk/vm/bsd_vm.c +++ b/osfmk/vm/bsd_vm.c @@ -177,10 +177,6 @@ int pagerdebug=0; extern int proc_resetpcontrol(int); -#if DEVELOPMENT || DEBUG -extern unsigned long vm_cs_validated_resets; -#endif - extern int uiomove64(addr64_t, int, void *); #define MAX_RUN 32 @@ -240,7 +236,7 @@ memory_object_control_uiomove( break; - if (dst_page->busy || dst_page->cleaning) { + if (dst_page->vmp_busy || dst_page->vmp_cleaning) { /* * someone else is playing with the page... 
if we've * already collected pages into this run, go ahead @@ -253,28 +249,28 @@ memory_object_control_uiomove( PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } - if (dst_page->laundry) + if (dst_page->vmp_laundry) vm_pageout_steal_laundry(dst_page, FALSE); if (mark_dirty) { - if (dst_page->dirty == FALSE) + if (dst_page->vmp_dirty == FALSE) dirty_count++; SET_PAGE_DIRTY(dst_page, FALSE); - if (dst_page->cs_validated && - !dst_page->cs_tainted) { + if (dst_page->vmp_cs_validated && + !dst_page->vmp_cs_tainted) { /* * CODE SIGNING: * We're modifying a code-signed * page: force revalidate */ - dst_page->cs_validated = FALSE; -#if DEVELOPMENT || DEBUG - vm_cs_validated_resets++; -#endif + dst_page->vmp_cs_validated = FALSE; + + VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); } } - dst_page->busy = TRUE; + dst_page->vmp_busy = TRUE; page_run[cur_run++] = dst_page; @@ -334,7 +330,7 @@ memory_object_control_uiomove( * update clustered and speculative state * */ - if (dst_page->clustered) + if (dst_page->vmp_clustered) VM_PAGE_CONSUME_CLUSTERED(dst_page); PAGE_WAKEUP_DONE(dst_page); @@ -370,6 +366,8 @@ vnode_pager_bootstrap(void) #if __arm64__ fourk_pager_bootstrap(); #endif /* __arm64__ */ + shared_region_pager_bootstrap(); + return; } @@ -476,6 +474,21 @@ vnode_pager_data_unlock( return KERN_FAILURE; } +void +vnode_pager_dirtied( + memory_object_t mem_obj, + vm_object_offset_t s_offset, + vm_object_offset_t e_offset) +{ + vnode_pager_t vnode_object; + + if (mem_obj && mem_obj->mo_pager_ops == &vnode_pager_ops) { + + vnode_object = vnode_pager_lookup(mem_obj); + vnode_pager_was_dirtied(vnode_object->vnode_handle, s_offset, e_offset); + } +} + kern_return_t vnode_pager_get_isinuse( memory_object_t mem_obj, diff --git a/osfmk/vm/lz4.c b/osfmk/vm/lz4.c index 3c1d5be0a..caec717a6 100644 --- a/osfmk/vm/lz4.c +++ b/osfmk/vm/lz4.c @@ -419,8 +419,6 @@ size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t 
dst_size, return (size_t)(dst - dst_buffer); // bytes produced } -#define likely(expr) __builtin_expect((expr) != 0, 1) -#define unlikely(expr) __builtin_expect((expr) != 0, 0) typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1))); int lz4_decode(uint8_t ** dst_ptr, @@ -446,25 +444,25 @@ int lz4_decode(uint8_t ** dst_ptr, uint32_t matchLength = 4 + (cmd & 15); // 4..19 // extra bytes for literalLength - if (unlikely(literalLength == 15)) + if (__improbable(literalLength == 15)) { uint8_t s; do { #if DEBUG_LZ4_DECODE_ERRORS - if (unlikely(src >= src_end)) printf("Truncated SRC literal length\n"); + if (__improbable(src >= src_end)) printf("Truncated SRC literal length\n"); #endif - if (unlikely(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed) + if (__improbable(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed) s = *src++; literalLength += s; - } while (unlikely(s == 255)); + } while (__improbable(s == 255)); } // copy literal #if DEBUG_LZ4_DECODE_ERRORS - if (unlikely(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n"); + if (__improbable(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n"); #endif - if (unlikely(literalLength > (size_t)(src_end - src))) goto IN_FAIL; - if (unlikely(literalLength > (size_t)(dst_end - dst))) { + if (__improbable(literalLength > (size_t)(src_end - src))) goto IN_FAIL; + if (__improbable(literalLength > (size_t)(dst_end - dst))) { // literal will take us past the end of the destination buffer, // so we can only copy part of it. 
literalLength = (uint32_t)(dst_end - dst); @@ -476,11 +474,11 @@ int lz4_decode(uint8_t ** dst_ptr, src += literalLength; dst += literalLength; - if (unlikely(src >= src_end)) goto OUT_FULL; // valid end of stream + if (__improbable(src >= src_end)) goto OUT_FULL; // valid end of stream #if DEBUG_LZ4_DECODE_ERRORS - if (unlikely(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n"); + if (__improbable(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n"); #endif - if (unlikely(2 > (size_t)(src_end - src))) goto IN_FAIL; // unexpected end of input (2 bytes needed) + if (__improbable(2 > (size_t)(src_end - src))) goto IN_FAIL; // unexpected end of input (2 bytes needed) //DRKTODO: this causes an alignment increase warning (legitimate?) //DRKTODO: cast of char * to uint16_t* @@ -494,29 +492,29 @@ int lz4_decode(uint8_t ** dst_ptr, #if DEBUG_LZ4_DECODE_ERRORS if (matchDistance == 0) printf("Invalid match distance D = 0\n"); #endif - if (unlikely(matchDistance == 0)) goto IN_FAIL; // 0x0000 invalid + if (__improbable(matchDistance == 0)) goto IN_FAIL; // 0x0000 invalid uint8_t * ref = dst - matchDistance; #if DEBUG_LZ4_DECODE_ERRORS - if (unlikely(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end); + if (__improbable(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end); #endif - if (unlikely(ref < dst_begin)) goto OUT_FAIL; // out of range + if (__improbable(ref < dst_begin)) goto OUT_FAIL; // out of range // extra bytes for matchLength - if (unlikely(matchLength == 19)) + if (__improbable(matchLength == 19)) { uint8_t s; do { #if DEBUG_LZ4_DECODE_ERRORS - if (unlikely(src >= src_end)) printf("Truncated SRC match length\n"); + if (__improbable(src >= src_end)) printf("Truncated SRC match length\n"); #endif - if (unlikely(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed) + if 
(__improbable(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed) s = *src++; matchLength += s; - } while (unlikely(s == 255)); + } while (__improbable(s == 255)); } // copy match (may overlap) - if (unlikely(matchLength > (size_t)(dst_end - dst))) { + if (__improbable(matchLength > (size_t)(dst_end - dst))) { // match will take us past the end of the destination buffer, // so we can only copy part of it. matchLength = (uint32_t)(dst_end - dst); diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index d37eb4224..864969eda 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -122,15 +122,15 @@ decl_lck_mtx_data(, memory_manager_default_lock) * 2. Page is precious and should_return is RETURN_ALL. * 3. Should_return is RETURN_ANYTHING. * - * As a side effect, m->dirty will be made consistent + * As a side effect, m->vmp_dirty will be made consistent * with pmap_is_modified(m), if should_return is not * MEMORY_OBJECT_RETURN_NONE. */ #define memory_object_should_return_page(m, should_return) \ (should_return != MEMORY_OBJECT_RETURN_NONE && \ - (((m)->dirty || ((m)->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)))) || \ - ((m)->precious && (should_return) == MEMORY_OBJECT_RETURN_ALL) || \ + (((m)->vmp_dirty || ((m)->vmp_dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)))) || \ + ((m)->vmp_precious && (should_return) == MEMORY_OBJECT_RETURN_ALL) || \ (should_return) == MEMORY_OBJECT_RETURN_ANYTHING)) typedef int memory_object_lock_result_t; @@ -171,18 +171,18 @@ memory_object_lock_page( m, should_return, should_flush, prot, 0); - if (m->busy || m->cleaning) + if (m->vmp_busy || m->vmp_cleaning) return (MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); - if (m->laundry) + if (m->vmp_laundry) vm_pageout_steal_laundry(m, FALSE); /* * Don't worry about pages for which the kernel * does not have any data. 
*/ - if (m->absent || m->error || m->restart) { - if (m->error && should_flush && !VM_PAGE_WIRED(m)) { + if (m->vmp_absent || m->vmp_error || m->vmp_restart) { + if (m->vmp_error && should_flush && !VM_PAGE_WIRED(m)) { /* * dump the page, pager wants us to * clean it up and there is no @@ -192,7 +192,7 @@ memory_object_lock_page( } return (MEMORY_OBJECT_LOCK_RESULT_DONE); } - assert(!m->fictitious); + assert(!m->vmp_fictitious); if (VM_PAGE_WIRED(m)) { /* @@ -486,10 +486,6 @@ MACRO_BEGIN \ int upl_flags; \ memory_object_t pager; \ \ - if (object->object_slid) { \ - panic("Objects with slid pages not allowed\n"); \ - } \ - \ if ((pager = (object)->pager) != MEMORY_OBJECT_NULL) { \ vm_object_paging_begin(object); \ vm_object_unlock(object); \ @@ -598,7 +594,7 @@ vm_object_update_extent( break; case MEMORY_OBJECT_LOCK_RESULT_MUST_FREE: - if (m->dirty == TRUE) + if (m->vmp_dirty == TRUE) dirty_count++; dwp->dw_mask |= DW_vm_page_free; break; @@ -625,7 +621,7 @@ vm_object_update_extent( /* * add additional state for the flush */ - m->free_when_done = TRUE; + m->vmp_free_when_done = TRUE; } /* * we use to remove the page from the queues at this @@ -767,7 +763,7 @@ vm_object_update( vm_page_t page; vm_page_t top_page; kern_return_t error = 0; - struct vm_object_fault_info fault_info; + struct vm_object_fault_info fault_info = {}; if (copy_object != VM_OBJECT_NULL) { /* @@ -808,16 +804,11 @@ vm_object_update( } fault_info.interruptible = THREAD_UNINT; fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info.user_tag = 0; - fault_info.pmap_options = 0; fault_info.lo_offset = copy_offset; fault_info.hi_offset = copy_size; - fault_info.no_cache = FALSE; fault_info.stealth = TRUE; - fault_info.io_sync = FALSE; - fault_info.cs_bypass = FALSE; - fault_info.mark_zf_absent = FALSE; - fault_info.batch_pmap_op = FALSE; + assert(fault_info.cs_bypass == FALSE); + assert(fault_info.pmap_cs_associated == FALSE); vm_object_paging_begin(copy_object); @@ -958,24 +949,24 @@ 
vm_object_update( m = (vm_page_t) vm_page_queue_first(&object->memq); while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) m)) { - next = (vm_page_t) vm_page_queue_next(&m->listq); + next = (vm_page_t) vm_page_queue_next(&m->vmp_listq); - if ((m->offset >= start) && (m->offset < end)) { + if ((m->vmp_offset >= start) && (m->vmp_offset < end)) { /* * this is a page we're interested in * try to fit it into a current extent */ for (n = 0; n < num_of_extents; n++) { - if ((m->offset & e_mask) == extents[n].e_base) { + if ((m->vmp_offset & e_mask) == extents[n].e_base) { /* * use (PAGE_SIZE - 1) to determine the * max offset so that we don't wrap if * we're at the last page of the space */ - if (m->offset < extents[n].e_min) - extents[n].e_min = m->offset; - else if ((m->offset + (PAGE_SIZE - 1)) > extents[n].e_max) - extents[n].e_max = m->offset + (PAGE_SIZE - 1); + if (m->vmp_offset < extents[n].e_min) + extents[n].e_min = m->vmp_offset; + else if ((m->vmp_offset + (PAGE_SIZE - 1)) > extents[n].e_max) + extents[n].e_max = m->vmp_offset + (PAGE_SIZE - 1); break; } } @@ -989,9 +980,9 @@ vm_object_update( * if we still have room, * create a new extent */ - extents[n].e_base = m->offset & e_mask; - extents[n].e_min = m->offset; - extents[n].e_max = m->offset + (PAGE_SIZE - 1); + extents[n].e_base = m->vmp_offset & e_mask; + extents[n].e_min = m->vmp_offset; + extents[n].e_max = m->vmp_offset + (PAGE_SIZE - 1); num_of_extents++; } else { @@ -1556,23 +1547,33 @@ memory_object_super_upl_request( } kern_return_t -memory_object_cluster_size(memory_object_control_t control, memory_object_offset_t *start, - vm_size_t *length, uint32_t *io_streaming, memory_object_fault_info_t fault_info) +memory_object_cluster_size( + memory_object_control_t control, + memory_object_offset_t *start, + vm_size_t *length, + uint32_t *io_streaming, + memory_object_fault_info_t mo_fault_info) { vm_object_t object; + vm_object_fault_info_t fault_info; object = 
memory_object_control_to_vm_object(control); if (object == VM_OBJECT_NULL || object->paging_offset > *start) - return (KERN_INVALID_ARGUMENT); + return KERN_INVALID_ARGUMENT; *start -= object->paging_offset; - vm_object_cluster_size(object, (vm_object_offset_t *)start, length, (vm_object_fault_info_t)fault_info, io_streaming); + fault_info = (vm_object_fault_info_t)(uintptr_t) mo_fault_info; + vm_object_cluster_size(object, + (vm_object_offset_t *)start, + length, + fault_info, + io_streaming); *start += object->paging_offset; - return (KERN_SUCCESS); + return KERN_SUCCESS; } @@ -1931,7 +1932,7 @@ memory_object_is_signed( } boolean_t -memory_object_is_slid( +memory_object_is_shared_cache( memory_object_control_t control) { vm_object_t object = VM_OBJECT_NULL; @@ -1940,7 +1941,7 @@ memory_object_is_slid( if (object == VM_OBJECT_NULL) return FALSE; - return object->object_slid; + return object->object_is_shared_cache; } static zone_t mem_obj_control_zone; diff --git a/osfmk/vm/memory_object.h b/osfmk/vm/memory_object.h index d14b5e3c9..6023627ae 100644 --- a/osfmk/vm/memory_object.h +++ b/osfmk/vm/memory_object.h @@ -132,7 +132,7 @@ extern kern_return_t memory_object_signed( extern boolean_t memory_object_is_signed( memory_object_control_t control); -extern boolean_t memory_object_is_slid( +extern boolean_t memory_object_is_shared_cache( memory_object_control_t control); extern void memory_object_mark_used( diff --git a/osfmk/vm/pmap.h b/osfmk/vm/pmap.h index 4916313ce..e70231c43 100644 --- a/osfmk/vm/pmap.h +++ b/osfmk/vm/pmap.h @@ -74,6 +74,8 @@ #include #include +#include + #ifdef KERNEL_PRIVATE /* @@ -431,7 +433,7 @@ extern kern_return_t (pmap_attribute)( /* Get/Set special memory if (__obj->internal) { \ __options |= PMAP_OPTIONS_INTERNAL; \ } \ - if (__page->reusable || __obj->all_reusable) { \ + if (__page->vmp_reusable || __obj->all_reusable) { \ __options |= PMAP_OPTIONS_REUSABLE; \ } \ result = pmap_enter_options(__pmap, \ @@ -460,7 +462,7 @@ extern 
kern_return_t (pmap_attribute)( /* Get/Set special memory if (__obj->internal) { \ __extra_options |= PMAP_OPTIONS_INTERNAL; \ } \ - if (__page->reusable || __obj->all_reusable) { \ + if (__page->vmp_reusable || __obj->all_reusable) { \ __extra_options |= PMAP_OPTIONS_REUSABLE; \ } \ result = pmap_enter_options(__pmap, \ @@ -547,7 +549,7 @@ extern kern_return_t (pmap_attribute)( /* Get/Set special memory #define PMAP_ENTER_CHECK(pmap, page) \ { \ - if ((page)->error) { \ + if ((page)->vmp_error) { \ panic("VM page %p should not have an error\n", \ (page)); \ } \ @@ -688,6 +690,7 @@ extern pmap_t kernel_pmap; /* The kernel's map */ * iff page was modified */ #define PMAP_OPTIONS_PROTECT_IMMEDIATE 0x1000 /* allow protections to be * be upgraded */ +#define PMAP_OPTIONS_CLEAR_WRITE 0x2000 #if !defined(__LP64__) @@ -725,6 +728,21 @@ mach_vm_size_t pmap_query_resident(pmap_t pmap, /* Inform the pmap layer that there is a JIT entry in this map. */ extern void pmap_set_jit_entitled(pmap_t pmap); +/* + * Tell the pmap layer what range within the nested region the VM intends to + * use. + */ +extern void pmap_trim(pmap_t grand, pmap_t subord, addr64_t vstart, addr64_t nstart, uint64_t size); + +/* + * Dump page table contents into the specified buffer. Returns the number of + * bytes copied, 0 if insufficient space, (size_t)-1 if unsupported. + * This is expected to only be called from kernel debugger context, + * so synchronization is not required. + */ + +extern size_t pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end); + /* * Indicates if any special policy is applied to this protection by the pmap * layer. @@ -735,7 +753,7 @@ bool pmap_has_prot_policy(vm_prot_t prot); * Causes the pmap to return any available pages that it can return cheaply to * the VM. 
*/ -void pmap_release_pages_fast(void); +uint64_t pmap_release_pages_fast(void); #define PMAP_QUERY_PAGE_PRESENT 0x01 #define PMAP_QUERY_PAGE_REUSABLE 0x02 @@ -754,6 +772,11 @@ int pmap_pgtrace_delete_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t kern_return_t pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss); #endif + +extern void pmap_ledger_alloc_init(size_t); +extern ledger_t pmap_ledger_alloc(void); +extern void pmap_ledger_free(ledger_t); + #endif /* KERNEL_PRIVATE */ #endif /* _VM_PMAP_H_ */ diff --git a/osfmk/vm/vm32_user.c b/osfmk/vm/vm32_user.c index c8f3343f1..9918cf984 100644 --- a/osfmk/vm/vm32_user.c +++ b/osfmk/vm/vm32_user.c @@ -547,10 +547,9 @@ vm32__task_wire( if (map == VM_MAP_NULL) return(KERN_INVALID_ARGUMENT); - if (must_wire) - map->wiring_required = TRUE; - else - map->wiring_required = FALSE; + vm_map_lock(map); + map->wiring_required = (must_wire == TRUE); + vm_map_unlock(map); return(KERN_SUCCESS); } diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 707c3e695..508211e68 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -354,7 +354,6 @@ apple_protect_pager_data_request( unsigned int pl_count; vm_object_t src_top_object, src_page_object, dst_object; kern_return_t kr, retval; - vm_map_offset_t kernel_mapping; vm_offset_t src_vaddr, dst_vaddr; vm_offset_t cur_offset; vm_offset_t offset_in_page; @@ -370,10 +369,9 @@ apple_protect_pager_data_request( retval = KERN_SUCCESS; src_top_object = VM_OBJECT_NULL; src_page_object = VM_OBJECT_NULL; - kernel_mapping = 0; upl = NULL; upl_pl = NULL; - fault_info = *((struct vm_object_fault_info *) mo_fault_info); + fault_info = *((struct vm_object_fault_info *)(uintptr_t)mo_fault_info); fault_info.stealth = TRUE; fault_info.io_sync = FALSE; fault_info.mark_zf_absent = FALSE; @@ -386,6 +384,9 @@ apple_protect_pager_data_request( PAGER_DEBUG(PAGER_PAGEIN, ("apple_protect_pager_data_request: %p, %llx, %x, %x, pager 
%p\n", mem_obj, offset, length, protection_required, pager)); + fault_info.lo_offset += pager->backing_offset; + fault_info.hi_offset += pager->backing_offset; + /* * Gather in a UPL all the VM pages requested by VM. */ @@ -409,39 +410,6 @@ apple_protect_pager_data_request( dst_object = mo_control->moc_object; assert(dst_object != VM_OBJECT_NULL); - -#if __x86_64__ || __arm__ || __arm64__ - /* we'll use the 1-to-1 mapping of physical memory */ - src_vaddr = 0; - dst_vaddr = 0; -#else /* __x86_64__ || __arm__ || __arm64__ */ - /* - * Reserve 2 virtual pages in the kernel address space to map each - * source and destination physical pages when it's their turn to - * be processed. - */ - vm_map_entry_t map_entry; - - vm_object_reference(kernel_object); /* ref. for mapping */ - kr = vm_map_find_space(kernel_map, - &kernel_mapping, - 2 * PAGE_SIZE_64, - 0, - 0, - VM_MAP_KERNEL_FLAGS_NONE, - &map_entry); - if (kr != KERN_SUCCESS) { - vm_object_deallocate(kernel_object); - retval = kr; - goto done; - } - map_entry->object.vm_object = kernel_object; - map_entry->offset = kernel_mapping; - vm_map_unlock(kernel_map); - src_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping); - dst_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping + PAGE_SIZE_64); -#endif /* __x86_64__ || __arm__ || __arm64__ */ - /* * We'll map the encrypted data in the kernel address space from the * backing VM object (itself backed by the encrypted file via @@ -522,66 +490,42 @@ apple_protect_pager_data_request( kr); } assert(src_page != VM_PAGE_NULL); - assert(src_page->busy); + assert(src_page->vmp_busy); - if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(src_page))) { + if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) { vm_page_lockspin_queues(); - if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(src_page))) { - vm_page_deactivate(src_page); + if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) { + vm_page_speculate(src_page, FALSE); } vm_page_unlock_queues(); } /* - * Establish an explicit mapping of the source - * 
physical page. + * Establish pointers to the source + * and destination physical pages. */ + dst_pnum = (ppnum_t) + upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE)); + assert(dst_pnum != 0); #if __x86_64__ src_vaddr = (vm_map_offset_t) PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) << PAGE_SHIFT); + dst_vaddr = (vm_map_offset_t) + PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT); + #elif __arm__ || __arm64__ src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) << PAGE_SHIFT); -#else - kr = pmap_enter(kernel_pmap, - src_vaddr, - VM_PAGE_GET_PHYS_PAGE(src_page), - VM_PROT_READ, - VM_PROT_NONE, - 0, - TRUE); - - assert(kr == KERN_SUCCESS); -#endif - /* - * Establish an explicit pmap mapping of the destination - * physical page. - * We can't do a regular VM mapping because the VM page - * is "busy". - */ - dst_pnum = (ppnum_t) - upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE)); - assert(dst_pnum != 0); -#if __x86_64__ - dst_vaddr = (vm_map_offset_t) - PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT); -#elif __arm__ || __arm64__ dst_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT); #else - kr = pmap_enter(kernel_pmap, - dst_vaddr, - dst_pnum, - VM_PROT_READ | VM_PROT_WRITE, - VM_PROT_NONE, - 0, - TRUE); - - assert(kr == KERN_SUCCESS); +#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..." + src_vaddr = 0; + dst_vaddr = 0; #endif src_page_object = VM_PAGE_OBJECT(src_page); @@ -597,11 +541,11 @@ apple_protect_pager_data_request( * ... and transfer the results to the destination page. 
*/ UPL_SET_CS_VALIDATED(upl_pl, cur_offset / PAGE_SIZE, - src_page->cs_validated); + src_page->vmp_cs_validated); UPL_SET_CS_TAINTED(upl_pl, cur_offset / PAGE_SIZE, - src_page->cs_tainted); + src_page->vmp_cs_tainted); UPL_SET_CS_NX(upl_pl, cur_offset / PAGE_SIZE, - src_page->cs_nx); + src_page->vmp_cs_nx); /* * page_decrypt() might access a mapped file, so let's release @@ -610,7 +554,7 @@ apple_protect_pager_data_request( * "paging_in_progress" reference on its object, so it's safe * to unlock the object here. */ - assert(src_page->busy); + assert(src_page->vmp_busy); assert(src_page_object->paging_in_progress > 0); vm_object_unlock(src_page_object); @@ -630,6 +574,7 @@ apple_protect_pager_data_request( offset_in_page), (char *)(dst_vaddr + offset_in_page), 4096); + if (apple_protect_pager_data_request_debug) { printf("apple_protect_data_request" "(%p,0x%llx+0x%llx+0x%04llx): " @@ -651,9 +596,9 @@ apple_protect_pager_data_request( *(uint64_t *)(dst_vaddr+ offset_in_page+8), src_page_object->code_signed, - src_page->cs_validated, - src_page->cs_tainted, - src_page->cs_nx); + src_page->vmp_cs_validated, + src_page->vmp_cs_tainted, + src_page->vmp_cs_nx); } ret = 0; continue; @@ -667,6 +612,7 @@ apple_protect_pager_data_request( cur_offset + offset_in_page), pager->crypt_info->crypt_ops); + if (apple_protect_pager_data_request_debug) { printf("apple_protect_data_request" "(%p,0x%llx+0x%llx+0x%04llx): " @@ -697,9 +643,9 @@ apple_protect_pager_data_request( *(uint64_t *)(dst_vaddr+offset_in_page), *(uint64_t *)(dst_vaddr+offset_in_page+8), src_page_object->code_signed, - src_page->cs_validated, - src_page->cs_tainted, - src_page->cs_nx, + src_page->vmp_cs_validated, + src_page->vmp_cs_tainted, + src_page->vmp_cs_nx, ret); } if (ret) { @@ -714,53 +660,18 @@ apple_protect_pager_data_request( } assert(VM_PAGE_OBJECT(src_page) == src_page_object); - assert(src_page->busy); + assert(src_page->vmp_busy); assert(src_page_object->paging_in_progress > 0); 
vm_object_lock(src_page_object); -#if __x86_64__ || __arm__ || __arm64__ - /* we used the 1-to-1 mapping of physical memory */ - src_vaddr = 0; - dst_vaddr = 0; -#else /* __x86_64__ || __arm__ || __arm64__ */ - /* - * Remove the pmap mapping of the source and destination pages - * in the kernel. - */ - pmap_remove(kernel_pmap, - (addr64_t) kernel_mapping, - (addr64_t) (kernel_mapping + (2 * PAGE_SIZE_64))); -#endif /* __x86_64__ || __arm__ || __arm64__ */ - /* * Cleanup the result of vm_fault_page() of the source page. */ - if (retval == KERN_SUCCESS && - src_page->busy && - !VM_PAGE_WIRED(src_page) && - !src_page->dirty && - !src_page->precious && - !src_page->laundry && - !src_page->cleaning) { - int refmod_state; - - refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(src_page)); - - if (refmod_state & VM_MEM_MODIFIED) { - SET_PAGE_DIRTY(src_page, FALSE); - } - if (!src_page->dirty) { - vm_page_free_unlocked(src_page, TRUE); - src_page = VM_PAGE_NULL; - } else { - PAGE_WAKEUP_DONE(src_page); - } - } else { - PAGE_WAKEUP_DONE(src_page); - } + PAGE_WAKEUP_DONE(src_page); src_page = VM_PAGE_NULL; vm_object_paging_end(src_page_object); vm_object_unlock(src_page_object); + if (top_page != VM_PAGE_NULL) { assert(VM_PAGE_OBJECT(top_page) == src_top_object); vm_object_lock(src_top_object); @@ -824,21 +735,9 @@ apple_protect_pager_data_request( upl_deallocate(upl); upl = NULL; } - if (kernel_mapping != 0) { - /* clean up the mapping of the source and destination pages */ - kr = vm_map_remove(kernel_map, - kernel_mapping, - kernel_mapping + (2 * PAGE_SIZE_64), - VM_MAP_NO_FLAGS); - assert(kr == KERN_SUCCESS); - kernel_mapping = 0; - src_vaddr = 0; - dst_vaddr = 0; - } if (src_top_object != VM_OBJECT_NULL) { vm_object_deallocate(src_top_object); } - return retval; } @@ -1125,7 +1024,7 @@ apple_protect_pager_lookup( apple_protect_pager_t pager; assert(mem_obj->mo_pager_ops == &apple_protect_pager_ops); - pager = (apple_protect_pager_t) mem_obj; + pager = 
(apple_protect_pager_t)(uintptr_t) mem_obj; assert(pager->ref_count > 0); return pager; } diff --git a/osfmk/vm/vm_compressor.c b/osfmk/vm/vm_compressor.c index f474385b7..0f3a37fe1 100644 --- a/osfmk/vm/vm_compressor.c +++ b/osfmk/vm/vm_compressor.c @@ -50,6 +50,8 @@ #include +extern boolean_t vm_darkwake_mode; + #if POPCOUNT_THE_COMPRESSED_DATA boolean_t popcount_c_segs = TRUE; @@ -88,6 +90,9 @@ static inline uint32_t vmc_pop(uintptr_t ins, int sz) { } #endif +#if VALIDATE_C_SEGMENTS +boolean_t validate_c_segs = TRUE; +#endif /* * vm_compressor_mode has a heirarchy of control to set its value. * boot-args are checked first, then device-tree, and finally @@ -103,8 +108,7 @@ int vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT; void *freezer_chead; /* The chead used to track c_segs allocated for the exclusive use of holding just one task's compressed memory.*/ char *freezer_compressor_scratch_buf = NULL; -#define VM_MAX_FREEZER_CSEG_SWAP_COUNT 64 /* The maximum number of c_segs holding just one task's compressed memory that can be swapped out to disk.*/ -extern int c_freezer_swapout_count; /* This count keeps track of the # of c_segs holding just one task's compressed memory on the swapout queue. This count is used during each freeze i.e. on a per-task basis.*/ +extern int c_freezer_swapout_page_count; /* This count keeps track of the # of compressed pages holding just one task's compressed memory on the swapout queue. This count is used during each freeze i.e. 
on a per-task basis.*/ #else /* CONFIG_FREEZE */ int vm_compressor_mode = VM_PAGER_NOT_CONFIGURED; @@ -194,8 +198,9 @@ char *c_compressed_record_cptr; queue_head_t c_age_list_head; -queue_head_t c_swapout_list_head; queue_head_t c_swappedin_list_head; +queue_head_t c_swapout_list_head; +queue_head_t c_swapio_list_head; queue_head_t c_swappedout_list_head; queue_head_t c_swappedout_sparse_list_head; queue_head_t c_major_list_head; @@ -203,8 +208,9 @@ queue_head_t c_filling_list_head; queue_head_t c_bad_list_head; uint32_t c_age_count = 0; -uint32_t c_swapout_count = 0; uint32_t c_swappedin_count = 0; +uint32_t c_swapout_count = 0; +uint32_t c_swapio_count = 0; uint32_t c_swappedout_count = 0; uint32_t c_swappedout_sparse_count = 0; uint32_t c_major_count = 0; @@ -249,6 +255,11 @@ uint32_t vm_compressor_majorcompact_threshold_divisor = 10; uint32_t vm_compressor_unthrottle_threshold_divisor = 10; uint32_t vm_compressor_catchup_threshold_divisor = 10; +uint32_t vm_compressor_minorcompact_threshold_divisor_overridden = 0; +uint32_t vm_compressor_majorcompact_threshold_divisor_overridden = 0; +uint32_t vm_compressor_unthrottle_threshold_divisor_overridden = 0; +uint32_t vm_compressor_catchup_threshold_divisor_overridden = 0; + #define C_SEGMENTS_PER_PAGE (PAGE_SIZE / sizeof(union c_segu)) @@ -286,7 +297,6 @@ uint32_t vm_ripe_target_age = (60 * 60 * 48); uint32_t swapout_target_age = 0; uint32_t age_of_decompressions_during_sample_period[DECOMPRESSION_SAMPLE_MAX_AGE]; uint32_t overage_decompressions_during_sample_period = 0; -uint32_t vm_compressor_pages_grabbed = 0; void do_fastwake_warmup(queue_head_t *, boolean_t); @@ -546,6 +556,9 @@ vm_compressor_init(void) #endif #if CHECKSUM_THE_DATA || CHECKSUM_THE_COMPRESSED_DATA checksum_c_segs = FALSE; +#endif +#if VALIDATE_C_SEGMENTS + validate_c_segs = FALSE; #endif write_protect_c_segs = FALSE; } @@ -602,6 +615,7 @@ vm_compressor_init(void) queue_init(&c_major_list_head); queue_init(&c_filling_list_head); 
queue_init(&c_swapout_list_head); + queue_init(&c_swapio_list_head); queue_init(&c_swappedin_list_head); queue_init(&c_swappedout_list_head); queue_init(&c_swappedout_sparse_list_head); @@ -731,7 +745,18 @@ vm_compressor_init(void) compressor_scratch_bufs = kalloc_tag(compressor_cpus * vm_compressor_get_decode_scratch_size(), VM_KERN_MEMORY_COMPRESSOR); kdp_compressor_scratch_buf = kalloc_tag(vm_compressor_get_decode_scratch_size(), VM_KERN_MEMORY_COMPRESSOR); - kdp_compressor_decompressed_page = kalloc_tag(PAGE_SIZE, VM_KERN_MEMORY_COMPRESSOR); + + /* + * kdp_compressor_decompressed_page must be page aligned because we access + * it through the physical apperture by page number. kalloc() does not + * guarantee alignment. + */ + vm_offset_t addr; + if (kernel_memory_allocate(kernel_map, &addr, PAGE_SIZE, 0, KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) { + panic("vm_compressor_init: kernel_memory_allocate failed - kdp_compressor_decompressed_page\n"); + } + assert((addr & PAGE_MASK) == 0); + kdp_compressor_decompressed_page = (void *)addr; kdp_compressor_decompressed_page_paddr = kvtophys((vm_offset_t)kdp_compressor_decompressed_page); kdp_compressor_decompressed_page_ppnum = (ppnum_t) atop(kdp_compressor_decompressed_page_paddr); } @@ -783,6 +808,9 @@ c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact) uint32_t c_size; c_slot_t cs; + if (__probable(validate_c_segs == FALSE)) { + return; + } if (c_seg->c_firstemptyslot < c_seg->c_nextslot) { c_indx = c_seg->c_firstemptyslot; cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx); @@ -812,6 +840,16 @@ c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact) panic("Compressed data doesn't match original %p phys: 0x%llx %d %p %d %d 0x%x 0x%x", c_seg, csvphys, cs->c_offset, cs, c_indx, c_size, cs->c_hash_compressed_data, csvhash); } #endif +#if POPCOUNT_THE_COMPRESSED_DATA + unsigned csvpop; + if (c_size) { + uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset]; + if (cs->c_pop_cdata != 
(csvpop = vmc_pop(csvaddr, c_size))) { + panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata); + } + } +#endif + } if (bytes_used != c_seg->c_bytes_used) @@ -1053,14 +1091,20 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_SWAPOUT_Q: - assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || - new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY); + assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY || new_state == C_ON_SWAPIO_Q); queue_remove(&c_swapout_list_head, c_seg, c_segment_t, c_age_list); thread_wakeup((event_t)&compaction_swapper_running); c_swapout_count--; break; + case C_ON_SWAPIO_Q: + assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q); + + queue_remove(&c_swapio_list_head, c_seg, c_segment_t, c_age_list); + c_swapio_count--; + break; + case C_ON_SWAPPEDOUT_Q: assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || @@ -1116,7 +1160,8 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_AGE_Q: - assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q || old_state == C_ON_SWAPOUT_Q || + assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q || + old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPIO_Q || old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q); if (old_state == C_IS_FILLING) @@ -1134,7 +1179,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_SWAPPEDIN_Q: - assert(c_seg->c_state == C_ON_SWAPPEDOUT_Q || c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q); + 
assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q); if (insert_head == TRUE) queue_enter_first(&c_swappedin_list_head, c_seg, c_segment_t, c_age_list); @@ -1153,8 +1198,18 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) c_swapout_count++; break; + case C_ON_SWAPIO_Q: + assert(old_state == C_ON_SWAPOUT_Q); + + if (insert_head == TRUE) + queue_enter_first(&c_swapio_list_head, c_seg, c_segment_t, c_age_list); + else + queue_enter(&c_swapio_list_head, c_seg, c_segment_t, c_age_list); + c_swapio_count++; + break; + case C_ON_SWAPPEDOUT_Q: - assert(c_seg->c_state == C_ON_SWAPOUT_Q); + assert(old_state == C_ON_SWAPIO_Q); if (insert_head == TRUE) queue_enter_first(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list); @@ -1164,7 +1219,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_SWAPPEDOUTSPARSE_Q: - assert(c_seg->c_state == C_ON_SWAPOUT_Q || c_seg->c_state == C_ON_SWAPPEDOUT_Q); + assert(old_state == C_ON_SWAPIO_Q || old_state == C_ON_SWAPPEDOUT_Q); if (insert_head == TRUE) queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list); @@ -1175,7 +1230,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_MAJORCOMPACT_Q: - assert(c_seg->c_state == C_ON_AGE_Q); + assert(old_state == C_ON_AGE_Q); if (insert_head == TRUE) queue_enter_first(&c_major_list_head, c_seg, c_segment_t, c_age_list); @@ -1185,7 +1240,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head) break; case C_ON_BAD_Q: - assert(c_seg->c_state == C_ON_SWAPPEDOUT_Q || c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q); + assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q); if (insert_head == TRUE) queue_enter_first(&c_bad_list_head, c_seg, c_segment_t, c_age_list); @@ -1672,8 +1727,6 @@ uint32_t compressor_thrashing_min_per_10msecs = 20; /* When true, reset sample data next chance we get. 
*/ static boolean_t compressor_need_sample_reset = FALSE; -extern uint32_t vm_page_filecache_min; - void compute_swapout_target_age(void) @@ -1802,7 +1855,8 @@ int compaction_swapper_abort = 0; #if CONFIG_JETSAM -boolean_t memorystatus_kill_on_VM_thrashing(boolean_t); +boolean_t memorystatus_kill_on_VM_compressor_thrashing(boolean_t); +boolean_t memorystatus_kill_on_VM_compressor_space_shortage(boolean_t); boolean_t memorystatus_kill_on_FC_thrashing(boolean_t); int compressor_thrashing_induced_jetsam = 0; int filecache_thrashing_induced_jetsam = 0; @@ -1875,7 +1929,13 @@ compressor_needs_to_swap(void) vm_compressor_thrashing_detected = TRUE; if (swapout_target_age || vm_compressor_low_on_space() == TRUE) { - memorystatus_kill_on_VM_thrashing(TRUE /* async */); + if (swapout_target_age) { + /* The compressor is thrashing. */ + memorystatus_kill_on_VM_compressor_thrashing(TRUE /* async */); + } else { + /* The compressor is running low on space. */ + memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */); + } compressor_thrashing_induced_jetsam++; } else { memorystatus_kill_on_FC_thrashing(TRUE /* async */); @@ -1967,7 +2027,7 @@ vm_run_compactor(void) } if (compaction_swapper_running) { - if (vm_restricted_to_single_processor == FALSE) { + if (vm_pageout_state.vm_restricted_to_single_processor == FALSE) { vm_run_compactor_already_running++; lck_mtx_unlock_always(c_list_lock); @@ -2309,7 +2369,7 @@ vm_compressor_swap_trigger_thread(void) if (compaction_swapper_init_now) { vm_compaction_swapper_do_init(); - if (vm_restricted_to_single_processor == TRUE) + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) thread_vm_bind_group_add(); thread_set_thread_name(current_thread(), "VM_cswap_trigger"); compaction_swapper_init_now = 0; @@ -2904,7 +2964,8 @@ c_seg_allocate(c_segment_t *current_chead) if (size_to_populate > C_SEG_MAX_POPULATE_SIZE) size_to_populate = C_SEG_MAX_POPULATE_SIZE; - vm_compressor_pages_grabbed += size_to_populate / PAGE_SIZE; 
+ + OSAddAtomic64(size_to_populate / PAGE_SIZE, &vm_pageout_vminfo.vm_compressor_pages_grabbed); kernel_memory_populate(compressor_map, (vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset], @@ -2933,6 +2994,7 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) int new_state = C_ON_AGE_Q; clock_sec_t sec; clock_nsec_t nsec; + boolean_t head_insert = FALSE; unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset)); @@ -2989,23 +3051,33 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) #if CONFIG_FREEZE if (current_chead == (c_segment_t*)&freezer_chead && VM_CONFIG_SWAP_IS_PRESENT && - VM_CONFIG_FREEZER_SWAP_IS_ACTIVE && - c_freezer_swapout_count < VM_MAX_FREEZER_CSEG_SWAP_COUNT) { + VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { new_state = C_ON_SWAPOUT_Q; } #endif /* CONFIG_FREEZE */ + if (vm_darkwake_mode == TRUE) { + new_state = C_ON_SWAPOUT_Q; + head_insert = TRUE; + } + clock_get_system_nanotime(&sec, &nsec); c_seg->c_creation_ts = (uint32_t)sec; lck_mtx_lock_spin_always(c_list_lock); c_seg->c_generation_id = c_generation_id++; - c_seg_switch_state(c_seg, new_state, FALSE); + c_seg_switch_state(c_seg, new_state, head_insert); #if CONFIG_FREEZE - if (c_seg->c_state == C_ON_SWAPOUT_Q) - c_freezer_swapout_count++; + if (c_seg->c_state == C_ON_SWAPOUT_Q) { + /* + * darkwake and freezer can't co-exist together + * We'll need to fix this accounting as a start. 
+ */ + assert(vm_darkwake_mode == FALSE); + c_freezer_swapout_page_count += (C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset)) / PAGE_SIZE_64; + } #endif /* CONFIG_FREEZE */ if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) @@ -3013,10 +3085,8 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead) lck_mtx_unlock_always(c_list_lock); -#if CONFIG_FREEZE if (c_seg->c_state == C_ON_SWAPOUT_Q) thread_wakeup((event_t)&c_swapout_list_head); -#endif /* CONFIG_FREEZE */ *current_chead = NULL; } @@ -3770,7 +3840,7 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int lck_mtx_lock_spin_always(&c_seg->c_lock); C_SEG_WAKEUP_DONE(c_seg); } - if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPOUT_Q) + if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q) c_seg_need_delayed_compaction(c_seg, FALSE); } else { if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) { @@ -3790,7 +3860,8 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int } } else if ( !(C_SEG_IS_ONDISK(c_seg))) { - if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) { + if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q && + C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) { c_seg_need_delayed_compaction(c_seg, FALSE); } } else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) { @@ -3810,7 +3881,7 @@ c_decompress_page(char *dst, volatile c_slot_mapping_t slot_ptr, int flags, int PAGE_REPLACEMENT_DISALLOWED(FALSE); if (consider_defragmenting == TRUE) - vm_swap_consider_defragmenting(); + vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE); #if CONFIG_EMBEDDED if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact()) diff --git a/osfmk/vm/vm_compressor.h b/osfmk/vm/vm_compressor.h index 
4b0883873..0dea16c53 100644 --- a/osfmk/vm/vm_compressor.h +++ b/osfmk/vm/vm_compressor.h @@ -60,7 +60,9 @@ #if DEVELOPMENT || DEBUG - +#if defined(PLATFORM_WatchOS) +#define VALIDATE_C_SEGMENTS (1) +#endif #endif #endif @@ -78,7 +80,10 @@ #define CHECKSUM_THE_SWAP ENABLE_SWAP_CHECKS /* Debug swap data */ #define CHECKSUM_THE_DATA ENABLE_COMPRESSOR_CHECKS /* Debug compressor/decompressor data */ #define CHECKSUM_THE_COMPRESSED_DATA ENABLE_COMPRESSOR_CHECKS /* Debug compressor/decompressor compressed data */ + +#ifndef VALIDATE_C_SEGMENTS #define VALIDATE_C_SEGMENTS ENABLE_COMPRESSOR_CHECKS /* Debug compaction */ +#endif #define RECORD_THE_COMPRESSED_DATA 0 @@ -117,6 +122,7 @@ struct c_slot { #define C_ON_SWAPPEDIN_Q 7 #define C_ON_MAJORCOMPACT_Q 8 #define C_ON_BAD_Q 9 +#define C_ON_SWAPIO_Q 10 struct c_segment { @@ -222,7 +228,8 @@ extern vm_offset_t c_buffers; #define C_SEG_IS_ONDISK(cseg) ((cseg->c_state == C_ON_SWAPPEDOUT_Q || cseg->c_state == C_ON_SWAPPEDOUTSPARSE_Q)) #define C_SEG_IS_ON_DISK_OR_SOQ(cseg) ((cseg->c_state == C_ON_SWAPPEDOUT_Q || \ cseg->c_state == C_ON_SWAPPEDOUTSPARSE_Q || \ - cseg->c_state == C_ON_SWAPOUT_Q)) + cseg->c_state == C_ON_SWAPOUT_Q || \ + cseg->c_state == C_ON_SWAPIO_Q)) #define C_SEG_WAKEUP_DONE(cseg) \ @@ -317,7 +324,7 @@ extern void vm_swap_decrypt(c_segment_t); extern int vm_swap_low_on_space(void); extern kern_return_t vm_swap_get(c_segment_t, uint64_t, uint64_t); extern void vm_swap_free(uint64_t); -extern void vm_swap_consider_defragmenting(void); +extern void vm_swap_consider_defragmenting(int); extern void c_seg_swapin_requeue(c_segment_t, boolean_t, boolean_t, boolean_t); extern int c_seg_swapin(c_segment_t, boolean_t, boolean_t); @@ -358,6 +365,12 @@ extern uint32_t vm_compressor_minorcompact_threshold_divisor; extern uint32_t vm_compressor_majorcompact_threshold_divisor; extern uint32_t vm_compressor_unthrottle_threshold_divisor; extern uint32_t vm_compressor_catchup_threshold_divisor; + +extern uint32_t 
vm_compressor_minorcompact_threshold_divisor_overridden; +extern uint32_t vm_compressor_majorcompact_threshold_divisor_overridden; +extern uint32_t vm_compressor_unthrottle_threshold_divisor_overridden; +extern uint32_t vm_compressor_catchup_threshold_divisor_overridden; + extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, clock_sec_t, clock_nsec_t); #define PAGE_REPLACEMENT_DISALLOWED(enable) (enable == TRUE ? lck_rw_lock_shared(&c_master_lock) : lck_rw_done(&c_master_lock)) @@ -366,14 +379,25 @@ extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, c #define AVAILABLE_NON_COMPRESSED_MEMORY (vm_page_active_count + vm_page_inactive_count + vm_page_free_count + vm_page_speculative_count) #define AVAILABLE_MEMORY (AVAILABLE_NON_COMPRESSED_MEMORY + VM_PAGE_COMPRESSOR_COUNT) -/* TODO, there may be a minor optimisation opportunity to replace these divisions + +/* + * TODO, there may be a minor optimisation opportunity to replace these divisions * with multiplies and shifts + * + * By multiplying by 10, the divisors can have more precision w/o resorting to floating point... a divisor specified as 25 is in reality a divide by 2.5 + * By multiplying by 9, you get a number ~11% smaller which allows us to have another limit point derived from the same base + * By multiplying by 11, you get a number ~10% bigger which allows us to generate a reset limit derived from the same base which is useful for hysteresis */ -#define VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_minorcompact_threshold_divisor ? vm_compressor_minorcompact_threshold_divisor : 1)) -#define VM_PAGE_COMPRESSOR_SWAP_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_majorcompact_threshold_divisor ? vm_compressor_majorcompact_threshold_divisor : 1)) -#define VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_unthrottle_threshold_divisor ? 
vm_compressor_unthrottle_threshold_divisor : 1)) -#define VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 1)) +#define VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_minorcompact_threshold_divisor ? vm_compressor_minorcompact_threshold_divisor : 10)) +#define VM_PAGE_COMPRESSOR_SWAP_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_majorcompact_threshold_divisor ? vm_compressor_majorcompact_threshold_divisor : 10)) + +#define VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_unthrottle_threshold_divisor ? vm_compressor_unthrottle_threshold_divisor : 10)) +#define VM_PAGE_COMPRESSOR_SWAP_RETHROTTLE_THRESHOLD (((AVAILABLE_MEMORY) * 11) / (vm_compressor_unthrottle_threshold_divisor ? vm_compressor_unthrottle_threshold_divisor : 11)) + +#define VM_PAGE_COMPRESSOR_SWAP_HAS_CAUGHTUP_THRESHOLD (((AVAILABLE_MEMORY) * 11) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 11)) +#define VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD (((AVAILABLE_MEMORY) * 10) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 10)) +#define VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD (((AVAILABLE_MEMORY) * 9) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 9)) #ifdef CONFIG_EMBEDDED #define AVAILABLE_NON_COMPRESSED_MIN 20000 @@ -383,11 +407,11 @@ extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, c #define COMPRESSOR_NEEDS_TO_SWAP() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_THRESHOLD) ? 1 : 0) #endif -#define VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE() \ - (vm_compressor_mode == VM_PAGER_COMPRESSOR_WITH_SWAP && \ - ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD) ? 
1 : 0)) -#define HARD_THROTTLE_LIMIT_REACHED() ((AVAILABLE_NON_COMPRESSED_MEMORY < (VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 2) ? 1 : 0) +#define HARD_THROTTLE_LIMIT_REACHED() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD) ? 1 : 0) #define SWAPPER_NEEDS_TO_UNTHROTTLE() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) ? 1 : 0) +#define SWAPPER_NEEDS_TO_RETHROTTLE() ((AVAILABLE_NON_COMPRESSED_MEMORY > VM_PAGE_COMPRESSOR_SWAP_RETHROTTLE_THRESHOLD) ? 1 : 0) +#define SWAPPER_NEEDS_TO_CATCHUP() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD) ? 1 : 0) +#define SWAPPER_HAS_CAUGHTUP() ((AVAILABLE_NON_COMPRESSED_MEMORY > VM_PAGE_COMPRESSOR_SWAP_HAS_CAUGHTUP_THRESHOLD) ? 1 : 0) #define COMPRESSOR_NEEDS_TO_MINOR_COMPACT() ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0) diff --git a/osfmk/vm/vm_compressor_backing_store.c b/osfmk/vm/vm_compressor_backing_store.c index f4ec70ce5..e8c1342a1 100644 --- a/osfmk/vm/vm_compressor_backing_store.c +++ b/osfmk/vm/vm_compressor_backing_store.c @@ -27,6 +27,7 @@ */ #include "vm_compressor_backing_store.h" +#include #include #include @@ -37,12 +38,12 @@ boolean_t compressor_store_stop_compaction = FALSE; boolean_t vm_swapfile_create_needed = FALSE; boolean_t vm_swapfile_gc_needed = FALSE; -int swapper_throttle = -1; -boolean_t swapper_throttle_inited = FALSE; +int vm_swapper_throttle = -1; uint64_t vm_swapout_thread_id; uint64_t vm_swap_put_failures = 0; uint64_t vm_swap_get_failures = 0; +int vm_num_swap_files_config = 0; int vm_num_swap_files = 0; int vm_num_pinned_swap_files = 0; int vm_swapout_thread_processed_segments = 0; @@ -110,18 +111,21 @@ static void vm_swap_do_delayed_trim(struct swapfile *); static void vm_swap_wait_on_trim_handling_in_progress(void); +boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE; + #if CONFIG_EMBEDDED -/* - * Only 1 swap file currently allowed. 
- */ -#define VM_MAX_SWAP_FILE_NUM 1 + +#if DEVELOPMENT || DEBUG +#define VM_MAX_SWAP_FILE_NUM 100 +#else /* DEVELOPMENT || DEBUG */ +#define VM_MAX_SWAP_FILE_NUM 5 +#endif /* DEVELOPMENT || DEBUG */ + #define VM_SWAPFILE_DELAYED_TRIM_MAX 4 -#define VM_SWAP_SHOULD_DEFRAGMENT() (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 16) ? 1 : 0) -#define VM_SWAP_SHOULD_RECLAIM() FALSE -#define VM_SWAP_SHOULD_ABORT_RECLAIM() FALSE +#define VM_SWAP_SHOULD_DEFRAGMENT() (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 16))) ? 1 : 0) #define VM_SWAP_SHOULD_PIN(_size) FALSE -#define VM_SWAP_SHOULD_CREATE(cur_ts) ((vm_num_swap_files < VM_MAX_SWAP_FILE_NUM) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \ +#define VM_SWAP_SHOULD_CREATE(cur_ts) ((vm_num_swap_files < vm_num_swap_files_config) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \ ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0) #define VM_SWAP_SHOULD_TRIM(swf) ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0) @@ -130,19 +134,19 @@ static void vm_swap_wait_on_trim_handling_in_progress(void); #define VM_MAX_SWAP_FILE_NUM 100 #define VM_SWAPFILE_DELAYED_TRIM_MAX 128 -#define VM_SWAP_SHOULD_DEFRAGMENT() (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 4) ? 1 : 0) -#define VM_SWAP_SHOULD_RECLAIM() (((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= SWAPFILE_RECLAIM_THRESHOLD_SEGS) ? 1 : 0) -#define VM_SWAP_SHOULD_ABORT_RECLAIM() (((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= SWAPFILE_RECLAIM_MINIMUM_SEGS) ? 1 : 0) +#define VM_SWAP_SHOULD_DEFRAGMENT() (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 4))) ? 
1 : 0) #define VM_SWAP_SHOULD_PIN(_size) (vm_swappin_avail > 0 && vm_swappin_avail >= (int64_t)(_size)) -#define VM_SWAP_SHOULD_CREATE(cur_ts) ((vm_num_swap_files < VM_MAX_SWAP_FILE_NUM) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \ +#define VM_SWAP_SHOULD_CREATE(cur_ts) ((vm_num_swap_files < vm_num_swap_files_config) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \ ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0) #define VM_SWAP_SHOULD_TRIM(swf) ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0) #endif /* CONFIG_EMBEDDED */ +#define VM_SWAP_SHOULD_RECLAIM() (((vm_swap_force_reclaim == TRUE) || ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= SWAPFILE_RECLAIM_THRESHOLD_SEGS)) ? 1 : 0) +#define VM_SWAP_SHOULD_ABORT_RECLAIM() (((vm_swap_force_reclaim == FALSE) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= SWAPFILE_RECLAIM_MINIMUM_SEGS)) ? 1 : 0) #define VM_SWAPFILE_DELAYED_CREATE 15 -#define VM_SWAP_BUSY() ((c_swapout_count && (swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER1 || swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER0)) ? 1 : 0) +#define VM_SWAP_BUSY() ((c_swapout_count && (vm_swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER0)) ? 1 : 0) #if CHECKSUM_THE_SWAP @@ -197,152 +201,149 @@ vm_swapfile_for_handle(uint64_t f_offset) #if ENCRYPTED_SWAP -#include -extern u_int32_t random(void); /* from */ +#include -#define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! 
*/ +extern int cc_rand_generate(void *, size_t); /* from libkern/crypto/rand.h */ -boolean_t swap_crypt_ctx_initialized; -void swap_crypt_ctx_initialize(void); +boolean_t swap_crypt_initialized; +void swap_crypt_initialize(void); -aes_ctx swap_crypt_ctx; -const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, }; -uint32_t swap_crypt_key[8]; /* big enough for a 256 key */ +symmetric_xts xts_modectx; +uint32_t swap_crypt_key1[8]; /* big enough for a 256 bit random key */ +uint32_t swap_crypt_key2[8]; /* big enough for a 256 bit random key */ -unsigned long vm_page_encrypt_counter; -unsigned long vm_page_decrypt_counter; +#if DEVELOPMENT || DEBUG +boolean_t swap_crypt_xts_tested = FALSE; +unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096))); +unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096))); +unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096))); +#endif /* DEVELOPMENT || DEBUG */ +unsigned long vm_page_encrypt_counter; +unsigned long vm_page_decrypt_counter; -#if DEBUG -boolean_t swap_crypt_ctx_tested = FALSE; -unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096))); -unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096))); -unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096))); -#endif /* DEBUG */ -/* - * Initialize the encryption context: key and key size. - */ -void swap_crypt_ctx_initialize(void); /* forward */ void -swap_crypt_ctx_initialize(void) +swap_crypt_initialize(void) { - unsigned int i; + uint8_t *enckey1, *enckey2; + int keylen1, keylen2; + int error; - /* - * No need for locking to protect swap_crypt_ctx_initialized - * because the first use of encryption will come from the - * pageout thread (we won't pagein before there's been a pageout) - * and there's only one pageout thread. 
- */ - if (swap_crypt_ctx_initialized == FALSE) { - for (i = 0; - i < (sizeof (swap_crypt_key) / - sizeof (swap_crypt_key[0])); - i++) { - swap_crypt_key[i] = random(); - } - aes_encrypt_key((const unsigned char *) swap_crypt_key, - SWAP_CRYPT_AES_KEY_SIZE, - &swap_crypt_ctx.encrypt); - aes_decrypt_key((const unsigned char *) swap_crypt_key, - SWAP_CRYPT_AES_KEY_SIZE, - &swap_crypt_ctx.decrypt); - swap_crypt_ctx_initialized = TRUE; - } + assert(swap_crypt_initialized == FALSE); + + keylen1 = sizeof(swap_crypt_key1); + enckey1 = (uint8_t *)&swap_crypt_key1; + keylen2 = sizeof(swap_crypt_key2); + enckey2 = (uint8_t *)&swap_crypt_key2; + + error = cc_rand_generate((void *)enckey1, keylen1); + assert(!error); + + error = cc_rand_generate((void *)enckey2, keylen2); + assert(!error); + + error = xts_start(0, NULL, enckey1, keylen1, enckey2, keylen2, 0, 0, &xts_modectx); + assert(!error); + + swap_crypt_initialized = TRUE; + +#if DEVELOPMENT || DEBUG + uint8_t *encptr; + uint8_t *decptr; + uint8_t *refptr; + uint8_t *iv; + uint64_t ivnum[2]; + int size = 0; + int i = 0; + int rc = 0; + + assert(swap_crypt_xts_tested == FALSE); -#if DEBUG /* * Validate the encryption algorithms. + * + * First initialize the test data. 
*/ - if (swap_crypt_ctx_tested == FALSE) { - /* initialize */ - for (i = 0; i < 4096; i++) { - swap_crypt_test_page_ref[i] = (char) i; - } - /* encrypt */ - aes_encrypt_cbc(swap_crypt_test_page_ref, - swap_crypt_null_iv, - PAGE_SIZE / AES_BLOCK_SIZE, - swap_crypt_test_page_encrypt, - &swap_crypt_ctx.encrypt); - /* decrypt */ - aes_decrypt_cbc(swap_crypt_test_page_encrypt, - swap_crypt_null_iv, - PAGE_SIZE / AES_BLOCK_SIZE, - swap_crypt_test_page_decrypt, - &swap_crypt_ctx.decrypt); - /* compare result with original */ - for (i = 0; i < 4096; i ++) { - if (swap_crypt_test_page_decrypt[i] != - swap_crypt_test_page_ref[i]) { - panic("encryption test failed"); - } + for (i = 0; i < 4096; i++) { + swap_crypt_test_page_ref[i] = (char) i; + } + ivnum[0] = (uint64_t)0xaa; + ivnum[1] = 0; + iv = (uint8_t *)ivnum; + + refptr = (uint8_t *)swap_crypt_test_page_ref; + encptr = (uint8_t *)swap_crypt_test_page_encrypt; + decptr = (uint8_t *)swap_crypt_test_page_decrypt; + size = 4096; + + /* encrypt */ + rc = xts_encrypt(refptr, size, encptr, iv, &xts_modectx); + assert(!rc); + + /* compare result with original - should NOT match */ + for (i = 0; i < 4096; i ++) { + if (swap_crypt_test_page_encrypt[i] != + swap_crypt_test_page_ref[i]) { + break; } + } + assert(i != 4096); - /* encrypt again */ - aes_encrypt_cbc(swap_crypt_test_page_decrypt, - swap_crypt_null_iv, - PAGE_SIZE / AES_BLOCK_SIZE, - swap_crypt_test_page_decrypt, - &swap_crypt_ctx.encrypt); - /* decrypt in place */ - aes_decrypt_cbc(swap_crypt_test_page_decrypt, - swap_crypt_null_iv, - PAGE_SIZE / AES_BLOCK_SIZE, - swap_crypt_test_page_decrypt, - &swap_crypt_ctx.decrypt); - for (i = 0; i < 4096; i ++) { - if (swap_crypt_test_page_decrypt[i] != - swap_crypt_test_page_ref[i]) { - panic("in place encryption test failed"); - } - } + /* decrypt */ + rc = xts_decrypt(encptr, size, decptr, iv, &xts_modectx); + assert(!rc); - swap_crypt_ctx_tested = TRUE; + /* compare result with original */ + for (i = 0; i < 4096; i ++) { + if 
(swap_crypt_test_page_decrypt[i] != + swap_crypt_test_page_ref[i]) { + panic("encryption test failed"); + } + } + /* encrypt in place */ + rc = xts_encrypt(decptr, size, decptr, iv, &xts_modectx); + assert(!rc); + + /* decrypt in place */ + rc = xts_decrypt(decptr, size, decptr, iv, &xts_modectx); + assert(!rc); + + for (i = 0; i < 4096; i ++) { + if (swap_crypt_test_page_decrypt[i] != + swap_crypt_test_page_ref[i]) { + panic("in place encryption test failed"); + } } -#endif /* DEBUG */ + swap_crypt_xts_tested = TRUE; +#endif /* DEVELOPMENT || DEBUG */ } void vm_swap_encrypt(c_segment_t c_seg) { - vm_offset_t kernel_vaddr = 0; - uint64_t size = 0; + uint8_t *ptr; + uint8_t *iv; + uint64_t ivnum[2]; + int size = 0; + int rc = 0; + + if (swap_crypt_initialized == FALSE) + swap_crypt_initialize(); - union { - unsigned char aes_iv[AES_BLOCK_SIZE]; - void *c_seg; - } encrypt_iv; - - assert(swap_crypt_ctx_initialized); - #if DEVELOPMENT || DEBUG C_SEG_MAKE_WRITEABLE(c_seg); #endif - bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv)); - - encrypt_iv.c_seg = (void*)c_seg; - - /* encrypt the "initial vector" */ - aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0], - swap_crypt_null_iv, - 1, - &encrypt_iv.aes_iv[0], - &swap_crypt_ctx.encrypt); - - kernel_vaddr = (vm_offset_t) c_seg->c_store.c_buffer; + ptr = (uint8_t *)c_seg->c_store.c_buffer; size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset)); - /* - * Encrypt the c_segment. 
- */ - aes_encrypt_cbc((const unsigned char *) kernel_vaddr, - &encrypt_iv.aes_iv[0], - (unsigned int)(size / AES_BLOCK_SIZE), - (unsigned char *) kernel_vaddr, - &swap_crypt_ctx.encrypt); + ivnum[0] = (uint64_t)c_seg; + ivnum[1] = 0; + iv = (uint8_t *)ivnum; + + rc = xts_encrypt(ptr, size, ptr, iv, &xts_modectx); + assert(!rc); vm_page_encrypt_counter += (size/PAGE_SIZE_64); @@ -354,48 +355,26 @@ vm_swap_encrypt(c_segment_t c_seg) void vm_swap_decrypt(c_segment_t c_seg) { + uint8_t *ptr; + uint8_t *iv; + uint64_t ivnum[2]; + int size = 0; + int rc = 0; - vm_offset_t kernel_vaddr = 0; - uint64_t size = 0; - - union { - unsigned char aes_iv[AES_BLOCK_SIZE]; - void *c_seg; - } decrypt_iv; - - - assert(swap_crypt_ctx_initialized); + assert(swap_crypt_initialized); #if DEVELOPMENT || DEBUG C_SEG_MAKE_WRITEABLE(c_seg); #endif - /* - * Prepare an "initial vector" for the decryption. - * It has to be the same as the "initial vector" we - * used to encrypt that page. - */ - bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv)); - - decrypt_iv.c_seg = (void*)c_seg; - - /* encrypt the "initial vector" */ - aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0], - swap_crypt_null_iv, - 1, - &decrypt_iv.aes_iv[0], - &swap_crypt_ctx.encrypt); - - kernel_vaddr = (vm_offset_t) c_seg->c_store.c_buffer; + ptr = (uint8_t *)c_seg->c_store.c_buffer; size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset)); - /* - * Decrypt the c_segment. 
- */ - aes_decrypt_cbc((const unsigned char *) kernel_vaddr, - &decrypt_iv.aes_iv[0], - (unsigned int) (size / AES_BLOCK_SIZE), - (unsigned char *) kernel_vaddr, - &swap_crypt_ctx.decrypt); + ivnum[0] = (uint64_t)c_seg; + ivnum[1] = 0; + iv = (uint8_t *)ivnum; + + rc = xts_decrypt(ptr, size, ptr, iv, &xts_modectx); + assert(!rc); vm_page_decrypt_counter += (size/PAGE_SIZE_64); @@ -428,6 +407,7 @@ vm_compressor_swap_init() BASEPRI_VM, &thread) != KERN_SUCCESS) { panic("vm_swapout_thread: create failed"); } + thread_set_thread_name(thread, "VM_swapout"); vm_swapout_thread_id = thread->thread_id; thread_deallocate(thread); @@ -437,12 +417,14 @@ vm_compressor_swap_init() panic("vm_swapfile_create_thread: create failed"); } + thread_set_thread_name(thread, "VM_swapfile_create"); thread_deallocate(thread); if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_gc_thread, NULL, BASEPRI_VM, &thread) != KERN_SUCCESS) { panic("vm_swapfile_gc_thread: create failed"); } + thread_set_thread_name(thread, "VM_swapfile_gc"); thread_deallocate(thread); proc_set_thread_policy_with_tid(kernel_task, thread->thread_id, @@ -450,12 +432,6 @@ vm_compressor_swap_init() proc_set_thread_policy_with_tid(kernel_task, thread->thread_id, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); -#if ENCRYPTED_SWAP - if (swap_crypt_ctx_initialized == FALSE) { - swap_crypt_ctx_initialize(); - } -#endif /* ENCRYPTED_SWAP */ - #if CONFIG_EMBEDDED /* * dummy value until the swap file gets created @@ -465,6 +441,9 @@ vm_compressor_swap_init() */ c_overage_swapped_limit = 16; #endif + + vm_num_swap_files_config = VM_MAX_SWAP_FILE_NUM; + printf("VM Swap Subsystem is ON\n"); } @@ -521,9 +500,23 @@ vm_compaction_swapper_do_init(void) if (vp) { if (vnode_pager_isSSD(vp) == FALSE) { - vm_compressor_minorcompact_threshold_divisor = 18; - vm_compressor_majorcompact_threshold_divisor = 22; - vm_compressor_unthrottle_threshold_divisor = 32; + /* + * swap files live on an HDD, so let's make 
sure to start swapping + * much earlier since we're not worried about SSD write-wear and + * we have so little write bandwidth to work with + * these values were derived experimentally by running the performance + * team's stock test for evaluating HDD performance against various + * combinations and looking at and comparing overall results. + * Note that the > relationship between these 4 values must be maintained + */ + if (vm_compressor_minorcompact_threshold_divisor_overridden == 0) + vm_compressor_minorcompact_threshold_divisor = 15; + if (vm_compressor_majorcompact_threshold_divisor_overridden == 0) + vm_compressor_majorcompact_threshold_divisor = 18; + if (vm_compressor_unthrottle_threshold_divisor_overridden == 0) + vm_compressor_unthrottle_threshold_divisor = 24; + if (vm_compressor_catchup_threshold_divisor_overridden == 0) + vm_compressor_catchup_threshold_divisor = 30; } #if !CONFIG_EMBEDDED vnode_setswapmount(vp); @@ -542,16 +535,26 @@ vm_compaction_swapper_do_init(void) } - void -vm_swap_consider_defragmenting() +vm_swap_consider_defragmenting(int flags) { + boolean_t force_defrag = (flags & VM_SWAP_FLAGS_FORCE_DEFRAG); + boolean_t force_reclaim = (flags & VM_SWAP_FLAGS_FORCE_RECLAIM); + if (compressor_store_stop_compaction == FALSE && !VM_SWAP_BUSY() && - (VM_SWAP_SHOULD_DEFRAGMENT() || VM_SWAP_SHOULD_RECLAIM())) { + (force_defrag || force_reclaim || VM_SWAP_SHOULD_DEFRAGMENT() || VM_SWAP_SHOULD_RECLAIM())) { - if (!vm_swapfile_gc_thread_running) { + if (!vm_swapfile_gc_thread_running || force_defrag || force_reclaim) { lck_mtx_lock(&vm_swap_data_lock); + if (force_defrag) { + vm_swap_force_defrag = TRUE; + } + + if (force_reclaim) { + vm_swap_force_reclaim = TRUE; + } + if (!vm_swapfile_gc_thread_running) thread_wakeup((event_t) &vm_swapfile_gc_needed); @@ -783,6 +786,9 @@ vm_swapfile_gc_thread(void) if (need_defragment == FALSE && need_reclaim == FALSE) break; + vm_swap_force_defrag = FALSE; + vm_swap_force_reclaim = FALSE; + 
lck_mtx_unlock(&vm_swap_data_lock); if (need_defragment == TRUE) @@ -806,98 +812,217 @@ vm_swapfile_gc_thread(void) -int swapper_entered_T0 = 0; -int swapper_entered_T1 = 0; -int swapper_entered_T2 = 0; +#define VM_SWAPOUT_LIMIT_T2P 4 +#define VM_SWAPOUT_LIMIT_T1P 4 +#define VM_SWAPOUT_LIMIT_T0P 6 +#define VM_SWAPOUT_LIMIT_T0 8 +#define VM_SWAPOUT_LIMIT_MAX 8 + +#define VM_SWAPOUT_START 0 +#define VM_SWAPOUT_T2_PASSIVE 1 +#define VM_SWAPOUT_T1_PASSIVE 2 +#define VM_SWAPOUT_T0_PASSIVE 3 +#define VM_SWAPOUT_T0 4 + +int vm_swapout_state = VM_SWAPOUT_START; +int vm_swapout_limit = 1; + +int vm_swapper_entered_T0 = 0; +int vm_swapper_entered_T0P = 0; +int vm_swapper_entered_T1P = 0; +int vm_swapper_entered_T2P = 0; + static void vm_swapout_thread_throttle_adjust(void) { - int swapper_throttle_new; - if (swapper_throttle_inited == FALSE) { - /* - * force this thread to be set to the correct - * throttling tier - */ - swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER2; - swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER1; - swapper_throttle_inited = TRUE; - swapper_entered_T2++; - goto done; - } - swapper_throttle_new = swapper_throttle; + switch(vm_swapout_state) { + + case VM_SWAPOUT_START: + + vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2; + vm_swapper_entered_T2P++; + + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P; + vm_swapout_state = VM_SWAPOUT_T2_PASSIVE; + + break; + case VM_SWAPOUT_T2_PASSIVE: - switch(swapper_throttle) { + if (SWAPPER_NEEDS_TO_UNTHROTTLE()) { + vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0; + vm_swapper_entered_T0P++; - case THROTTLE_LEVEL_COMPRESSOR_TIER2: + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, 
vm_swapper_throttle); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P; + vm_swapout_state = VM_SWAPOUT_T0_PASSIVE; - if (SWAPPER_NEEDS_TO_UNTHROTTLE() || swapout_target_age || hibernate_flushing == TRUE) { - swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER1; - swapper_entered_T1++; break; } + if (swapout_target_age || hibernate_flushing == TRUE) { + vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER1; + vm_swapper_entered_T1P++; + + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T1P; + vm_swapout_state = VM_SWAPOUT_T1_PASSIVE; + } break; - case THROTTLE_LEVEL_COMPRESSOR_TIER1: + case VM_SWAPOUT_T1_PASSIVE: + + if (SWAPPER_NEEDS_TO_UNTHROTTLE()) { + vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0; + vm_swapper_entered_T0P++; + + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P; + vm_swapout_state = VM_SWAPOUT_T0_PASSIVE; - if (VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) { - swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER0; - swapper_entered_T0++; break; } - if (COMPRESSOR_NEEDS_TO_SWAP() == 0 && swapout_target_age == 0 && hibernate_flushing == FALSE) { - swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER2; - swapper_entered_T2++; - break; + if (swapout_target_age == 0 && hibernate_flushing == FALSE) { + + vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2; + vm_swapper_entered_T2P++; + + 
proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P; + vm_swapout_state = VM_SWAPOUT_T2_PASSIVE; } - break; + break; - case THROTTLE_LEVEL_COMPRESSOR_TIER0: + case VM_SWAPOUT_T0_PASSIVE: + + if (SWAPPER_NEEDS_TO_RETHROTTLE()) { + vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2; + vm_swapper_entered_T2P++; + + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle); + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P; + vm_swapout_state = VM_SWAPOUT_T2_PASSIVE; - if (COMPRESSOR_NEEDS_TO_SWAP() == 0) { - swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER2; - swapper_entered_T2++; break; } - if (SWAPPER_NEEDS_TO_UNTHROTTLE() == 0) { - swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER1; - swapper_entered_T1++; - break; + if (SWAPPER_NEEDS_TO_CATCHUP()) { + vm_swapper_entered_T0++; + + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_DISABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T0; + vm_swapout_state = VM_SWAPOUT_T0; + } + break; + + case VM_SWAPOUT_T0: + + if (SWAPPER_HAS_CAUGHTUP()) { + vm_swapper_entered_T0P++; + + proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, + TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); + vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P; + vm_swapout_state = VM_SWAPOUT_T0_PASSIVE; } break; } -done: - if (swapper_throttle != swapper_throttle_new) { - proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_IO, 
swapper_throttle_new); - proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id, - TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE); +} + +int vm_swapout_found_empty = 0; + +struct swapout_io_completion vm_swapout_ctx[VM_SWAPOUT_LIMIT_MAX]; - swapper_throttle = swapper_throttle_new; +int vm_swapout_soc_busy = 0; +int vm_swapout_soc_done = 0; + + +static struct swapout_io_completion * +vm_swapout_find_free_soc(void) +{ int i; + + for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) { + if (vm_swapout_ctx[i].swp_io_busy == 0) + return (&vm_swapout_ctx[i]); } + assert(vm_swapout_soc_busy == VM_SWAPOUT_LIMIT_MAX); + + return NULL; } +static struct swapout_io_completion * +vm_swapout_find_done_soc(void) +{ int i; + + if (vm_swapout_soc_done) { + for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) { + if (vm_swapout_ctx[i].swp_io_done) + return (&vm_swapout_ctx[i]); + } + } + return NULL; +} + +static void +vm_swapout_complete_soc(struct swapout_io_completion *soc) +{ + kern_return_t kr; + + if (soc->swp_io_error) + kr = KERN_FAILURE; + else + kr = KERN_SUCCESS; + + lck_mtx_unlock_always(c_list_lock); + + vm_swap_put_finish(soc->swp_swf, &soc->swp_f_offset, soc->swp_io_error); + vm_swapout_finish(soc->swp_c_seg, soc->swp_f_offset, soc->swp_c_size, kr); + + lck_mtx_lock_spin_always(c_list_lock); + + soc->swp_io_done = 0; + soc->swp_io_busy = 0; + + vm_swapout_soc_busy--; + vm_swapout_soc_done--; +} -int vm_swapout_found_empty = 0; static void vm_swapout_thread(void) { - uint64_t f_offset = 0; uint32_t size = 0; c_segment_t c_seg = NULL; kern_return_t kr = KERN_SUCCESS; - vm_offset_t addr = 0; + struct swapout_io_completion *soc; current_thread()->options |= TH_OPT_VMPRIV; vm_swapout_thread_awakened++; lck_mtx_lock_spin_always(c_list_lock); - - while (!queue_empty(&c_swapout_list_head)) { +again: + while (!queue_empty(&c_swapout_list_head) && vm_swapout_soc_busy < vm_swapout_limit) { c_seg = (c_segment_t)queue_first(&c_swapout_list_head); @@ -934,14 +1059,13 @@ 
vm_swapout_thread(void) C_SEG_BUSY(c_seg); c_seg->c_busy_swapping = 1; - lck_mtx_unlock_always(c_list_lock); - - addr = (vm_offset_t) c_seg->c_store.c_buffer; + c_seg_switch_state(c_seg, C_ON_SWAPIO_Q, FALSE); + lck_mtx_unlock_always(c_list_lock); lck_mtx_unlock_always(&c_seg->c_lock); #if CHECKSUM_THE_SWAP - c_seg->cseg_hash = hash_string((char*)addr, (int)size); + c_seg->cseg_hash = hash_string((char *)c_seg->c_store.c_buffer, (int)size); c_seg->cseg_swap_size = size; #endif /* CHECKSUM_THE_SWAP */ @@ -949,80 +1073,133 @@ vm_swapout_thread(void) vm_swap_encrypt(c_seg); #endif /* ENCRYPTED_SWAP */ - vm_swapout_thread_throttle_adjust(); + soc = vm_swapout_find_free_soc(); + assert(soc); - kr = vm_swap_put((vm_offset_t) addr, &f_offset, size, c_seg); + soc->swp_upl_ctx.io_context = (void *)soc; + soc->swp_upl_ctx.io_done = (void *)vm_swapout_iodone; + soc->swp_upl_ctx.io_error = 0; - PAGE_REPLACEMENT_DISALLOWED(TRUE); + kr = vm_swap_put((vm_offset_t)c_seg->c_store.c_buffer, &soc->swp_f_offset, size, c_seg, soc); - if (kr == KERN_SUCCESS) { - kernel_memory_depopulate(compressor_map, (vm_offset_t) addr, size, KMA_COMPRESSOR); - } -#if ENCRYPTED_SWAP - else { - vm_swap_decrypt(c_seg); + if (kr != KERN_SUCCESS) { + if (soc->swp_io_done) { + lck_mtx_lock_spin_always(c_list_lock); + + soc->swp_io_done = 0; + vm_swapout_soc_done--; + + lck_mtx_unlock_always(c_list_lock); + } + vm_swapout_finish(c_seg, soc->swp_f_offset, size, kr); + } else { + soc->swp_io_busy = 1; + vm_swapout_soc_busy++; } -#endif /* ENCRYPTED_SWAP */ + vm_swapout_thread_throttle_adjust(); + vm_pageout_io_throttle(); + +c_seg_is_empty: + if (c_swapout_count == 0) + vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE); + lck_mtx_lock_spin_always(c_list_lock); - lck_mtx_lock_spin_always(&c_seg->c_lock); - if (kr == KERN_SUCCESS) { - int new_state = C_ON_SWAPPEDOUT_Q; - boolean_t insert_head = FALSE; + if ((soc = vm_swapout_find_done_soc())) + vm_swapout_complete_soc(soc); + } + if ((soc = 
vm_swapout_find_done_soc())) { + vm_swapout_complete_soc(soc); + goto again; + } + assert_wait((event_t)&c_swapout_list_head, THREAD_UNINT); - if (hibernate_flushing == TRUE) { - if (c_seg->c_generation_id >= first_c_segment_to_warm_generation_id && - c_seg->c_generation_id <= last_c_segment_to_warm_generation_id) - insert_head = TRUE; - } else if (C_SEG_ONDISK_IS_SPARSE(c_seg)) - new_state = C_ON_SWAPPEDOUTSPARSE_Q; + lck_mtx_unlock_always(c_list_lock); - c_seg_switch_state(c_seg, new_state, insert_head); + thread_block((thread_continue_t)vm_swapout_thread); + + /* NOTREACHED */ +} - c_seg->c_store.c_swap_handle = f_offset; - VM_STAT_INCR_BY(swapouts, size >> PAGE_SHIFT); - - if (c_seg->c_bytes_used) - OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used); - } else { - if (c_seg->c_overage_swap == TRUE) { - c_seg->c_overage_swap = FALSE; - c_overage_swapped_count--; - } - c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE); +void +vm_swapout_iodone(void *io_context, int error) +{ + struct swapout_io_completion *soc; - if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) - c_seg_need_delayed_compaction(c_seg, TRUE); - } - assert(c_seg->c_busy_swapping); - assert(c_seg->c_busy); + soc = (struct swapout_io_completion *)io_context; - c_seg->c_busy_swapping = 0; - lck_mtx_unlock_always(c_list_lock); + lck_mtx_lock_spin_always(c_list_lock); - C_SEG_WAKEUP_DONE(c_seg); - lck_mtx_unlock_always(&c_seg->c_lock); + soc->swp_io_done = 1; + soc->swp_io_error = error; + vm_swapout_soc_done++; + + thread_wakeup((event_t)&c_swapout_list_head); + + lck_mtx_unlock_always(c_list_lock); +} - PAGE_REPLACEMENT_DISALLOWED(FALSE); - vm_pageout_io_throttle(); -c_seg_is_empty: - if (c_swapout_count == 0) - vm_swap_consider_defragmenting(); +static void +vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset, uint32_t size, kern_return_t kr) +{ - lck_mtx_lock_spin_always(c_list_lock); + PAGE_REPLACEMENT_DISALLOWED(TRUE); + + if (kr == KERN_SUCCESS) { + 
kernel_memory_depopulate(compressor_map, (vm_offset_t)c_seg->c_store.c_buffer, size, KMA_COMPRESSOR); + } +#if ENCRYPTED_SWAP + else { + vm_swap_decrypt(c_seg); } +#endif /* ENCRYPTED_SWAP */ + lck_mtx_lock_spin_always(c_list_lock); + lck_mtx_lock_spin_always(&c_seg->c_lock); - assert_wait((event_t)&c_swapout_list_head, THREAD_UNINT); + if (kr == KERN_SUCCESS) { + int new_state = C_ON_SWAPPEDOUT_Q; + boolean_t insert_head = FALSE; + + if (hibernate_flushing == TRUE) { + if (c_seg->c_generation_id >= first_c_segment_to_warm_generation_id && + c_seg->c_generation_id <= last_c_segment_to_warm_generation_id) + insert_head = TRUE; + } else if (C_SEG_ONDISK_IS_SPARSE(c_seg)) + new_state = C_ON_SWAPPEDOUTSPARSE_Q; + + c_seg_switch_state(c_seg, new_state, insert_head); + + c_seg->c_store.c_swap_handle = f_offset; + VM_STAT_INCR_BY(swapouts, size >> PAGE_SHIFT); + + if (c_seg->c_bytes_used) + OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used); + } else { + if (c_seg->c_overage_swap == TRUE) { + c_seg->c_overage_swap = FALSE; + c_overage_swapped_count--; + } + c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE); + + if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) + c_seg_need_delayed_compaction(c_seg, TRUE); + } + assert(c_seg->c_busy_swapping); + assert(c_seg->c_busy); + + c_seg->c_busy_swapping = 0; lck_mtx_unlock_always(c_list_lock); - thread_block((thread_continue_t)vm_swapout_thread); - - /* NOTREACHED */ + C_SEG_WAKEUP_DONE(c_seg); + lck_mtx_unlock_always(&c_seg->c_lock); + + PAGE_REPLACEMENT_DISALLOWED(FALSE); } + boolean_t vm_swap_create_file() { @@ -1199,7 +1376,7 @@ vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size) C_SEG_MAKE_WRITEABLE(c_seg); #endif file_offset = (f_offset & SWAP_SLOT_MASK); - retval = vm_swapfile_io(swf->swp_vp, file_offset, (uint64_t)c_seg->c_store.c_buffer, (int)(size / PAGE_SIZE_64), SWAP_READ); + retval = vm_swapfile_io(swf->swp_vp, file_offset, (uint64_t)c_seg->c_store.c_buffer, (int)(size / 
PAGE_SIZE_64), SWAP_READ, NULL); #if DEVELOPMENT || DEBUG C_SEG_WRITE_PROTECT(c_seg); @@ -1232,7 +1409,7 @@ vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size) } kern_return_t -vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_seg) +vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint32_t size, c_segment_t c_seg, struct swapout_io_completion *soc) { unsigned int segidx = 0; struct swapfile *swf = NULL; @@ -1246,6 +1423,7 @@ vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_s int error = 0; clock_sec_t sec; clock_nsec_t nsec; + void *upl_ctx = NULL; if (addr == 0 || f_offset == NULL) { return KERN_FAILURE; @@ -1278,8 +1456,9 @@ vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_s file_offset = segidx * COMPRESSED_SWAP_CHUNK_SIZE; swf->swp_nseginuse++; swf->swp_io_count++; - swapfile_index = swf->swp_index; + swf->swp_csegs[segidx] = c_seg; + swapfile_index = swf->swp_index; vm_swapfile_total_segs_used++; clock_get_system_nanotime(&sec, &nsec); @@ -1289,7 +1468,7 @@ vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_s lck_mtx_unlock(&vm_swap_data_lock); - goto done; + goto issue_io; } } swf = (struct swapfile*) queue_next(&swf->swp_queue); @@ -1336,32 +1515,48 @@ vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_s return KERN_FAILURE; -done: +issue_io: assert(c_seg->c_busy_swapping); assert(c_seg->c_busy); assert(!c_seg->c_on_minorcompact_q); - error = vm_swapfile_io(swf->swp_vp, file_offset, addr, (int) (size / PAGE_SIZE_64), SWAP_WRITE); + *f_offset = (swapfile_index << SWAP_DEVICE_SHIFT) | file_offset; + + if (soc) { + soc->swp_c_seg = c_seg; + soc->swp_c_size = size; - lck_mtx_lock(&vm_swap_data_lock); + soc->swp_swf = swf; - swf->swp_csegs[segidx] = c_seg; + soc->swp_io_error = 0; + soc->swp_io_done = 0; - swf->swp_io_count--; + upl_ctx = (void *)&soc->swp_upl_ctx; + } + error = vm_swapfile_io(swf->swp_vp, 
file_offset, addr, (int) (size / PAGE_SIZE_64), SWAP_WRITE, upl_ctx); - *f_offset = (swapfile_index << SWAP_DEVICE_SHIFT) | file_offset; + if (error || upl_ctx == NULL) + return (vm_swap_put_finish(swf, f_offset, error)); + + return KERN_SUCCESS; +} + +kern_return_t +vm_swap_put_finish(struct swapfile *swf, uint64_t *f_offset, int error) +{ + lck_mtx_lock(&vm_swap_data_lock); + + swf->swp_io_count--; if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) { swf->swp_flags &= ~SWAP_WANTED; thread_wakeup((event_t) &swf->swp_flags); } - lck_mtx_unlock(&vm_swap_data_lock); if (error) { vm_swap_free(*f_offset); - vm_swap_put_failures++; return KERN_FAILURE; @@ -1370,7 +1565,6 @@ vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_s } - static void vm_swap_free_now(struct swapfile *swf, uint64_t f_offset) { @@ -1737,7 +1931,7 @@ vm_swap_reclaim(void) lck_mtx_unlock_always(&c_seg->c_lock); - if (vm_swapfile_io(swf->swp_vp, f_offset, addr, (int)(c_size / PAGE_SIZE_64), SWAP_READ)) { + if (vm_swapfile_io(swf->swp_vp, f_offset, addr, (int)(c_size / PAGE_SIZE_64), SWAP_READ, NULL)) { /* * reading the data back in failed, so convert c_seg @@ -1753,7 +1947,7 @@ vm_swap_reclaim(void) } VM_STAT_INCR_BY(swapins, c_size >> PAGE_SHIFT); - if (vm_swap_put(addr, &f_offset, c_size, c_seg)) { + if (vm_swap_put(addr, &f_offset, c_size, c_seg, NULL)) { vm_offset_t c_buffer; /* @@ -1896,9 +2090,67 @@ vm_swap_files_pinned(void) boolean_t result; if (vm_swappin_enabled == FALSE) - return(TRUE); + return (TRUE); result = (vm_num_pinned_swap_files == vm_num_swap_files); return (result); } + +#if CONFIG_FREEZE +boolean_t +vm_swap_max_budget(uint64_t *freeze_daily_budget) +{ + boolean_t use_device_value = FALSE; + struct swapfile *swf = NULL; + + if (vm_num_swap_files) { + lck_mtx_lock(&vm_swap_data_lock); + + swf = (struct swapfile*) queue_first(&swf_global_queue); + + if (swf) { + while(queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) { + + if 
(swf->swp_flags == SWAP_READY) { + + assert(swf->swp_vp); + + if (vm_swap_vol_get_budget(swf->swp_vp, freeze_daily_budget) == 0) { + use_device_value = TRUE; + } + break; + } + swf = (struct swapfile*) queue_next(&swf->swp_queue); + } + } + + lck_mtx_unlock(&vm_swap_data_lock); + + } else { + + /* + * This block is used for the initial budget value before any swap files + * are created. We create a temp swap file to get the budget. + */ + + struct vnode *temp_vp = NULL; + + vm_swapfile_open(swapfilename, &temp_vp); + + if (temp_vp) { + + if (vm_swap_vol_get_budget(temp_vp, freeze_daily_budget) == 0) { + use_device_value = TRUE; + } + + vm_swapfile_close((uint64_t)&swapfilename, temp_vp); + temp_vp = NULL; + } else { + *freeze_daily_budget = 0; + } + } + + return use_device_value; +} +#endif /* CONFIG_FREEZE */ diff --git a/osfmk/vm/vm_compressor_backing_store.h b/osfmk/vm/vm_compressor_backing_store.h index 9dda1ab75..2bad2d6f1 100644 --- a/osfmk/vm/vm_compressor_backing_store.h +++ b/osfmk/vm/vm_compressor_backing_store.h @@ -78,7 +78,29 @@ lck_mtx_t vm_swap_data_lock; void vm_swap_init(void); boolean_t vm_swap_create_file(void); -kern_return_t vm_swap_put(vm_offset_t, uint64_t*, uint64_t, c_segment_t); + + +struct swapout_io_completion { + + int swp_io_busy; + int swp_io_done; + int swp_io_error; + + uint32_t swp_c_size; + c_segment_t swp_c_seg; + + struct swapfile *swp_swf; + uint64_t swp_f_offset; + + struct upl_io_completion swp_upl_ctx; +}; +void vm_swapout_iodone(void *, int); + + +static void vm_swapout_finish(c_segment_t, uint64_t, uint32_t, kern_return_t); +kern_return_t vm_swap_put_finish(struct swapfile *, uint64_t *, int); +kern_return_t vm_swap_put(vm_offset_t, uint64_t*, uint32_t, c_segment_t, struct swapout_io_completion *); + void vm_swap_flush(void); void vm_swap_reclaim(void); void vm_swap_encrypt(c_segment_t); @@ -92,7 +114,12 @@ extern void vm_swapfile_close(uint64_t path, struct vnode *vp); extern int vm_swapfile_preallocate(struct vnode *vp, 
uint64_t *size, boolean_t *pin); extern uint64_t vm_swapfile_get_blksize(struct vnode *vp); extern uint64_t vm_swapfile_get_transfer_size(struct vnode *vp); -extern int vm_swapfile_io(struct vnode *vp, uint64_t offset, uint64_t start, int npages, int flags); +extern int vm_swapfile_io(struct vnode *vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_ctx); + +#if CONFIG_FREEZE +boolean_t vm_swap_max_budget(uint64_t *); +int vm_swap_vol_get_budget(struct vnode* vp, uint64_t *freeze_daily_budget); +#endif /* CONFIG_FREEZE */ #if RECORD_THE_COMPRESSED_DATA extern int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size); diff --git a/osfmk/vm/vm_compressor_pager.c b/osfmk/vm/vm_compressor_pager.c index ae0195c86..c8ec4fed0 100644 --- a/osfmk/vm/vm_compressor_pager.c +++ b/osfmk/vm/vm_compressor_pager.c @@ -707,6 +707,8 @@ vm_compressor_pager_put( { compressor_pager_t pager; compressor_slot_t *slot_p; + unsigned int prev_wimg = VM_WIMG_DEFAULT; + boolean_t set_cache_attr = FALSE; compressor_pager_stats.put++; @@ -746,11 +748,33 @@ vm_compressor_pager_put( vm_compressor_free(slot_p, 0); *compressed_count_delta_p -= 1; } - if (vm_compressor_put(ppnum, slot_p, current_chead, scratch_buf)) - return (KERN_RESOURCE_SHORTAGE); + + /* + * cacheability should be set to the system default (usually writeback) + * during compressor operations, both for performance and correctness, + * e.g. to avoid compressor codec faults generated by an unexpected + * memory type. + */ + prev_wimg = pmap_cache_attributes(ppnum) & VM_WIMG_MASK; + + if ((prev_wimg != VM_WIMG_DEFAULT) && (prev_wimg != VM_WIMG_USE_DEFAULT)) { + set_cache_attr = TRUE; + pmap_set_cache_attributes(ppnum, VM_WIMG_DEFAULT); + } + /* + * If the compressor operation succeeds, we presumably don't need to + * undo any previous WIMG update, as all live mappings should be + * disconnected. 
+ */ + + if (vm_compressor_put(ppnum, slot_p, current_chead, scratch_buf)) { + if (set_cache_attr) + pmap_set_cache_attributes(ppnum, prev_wimg); + return KERN_RESOURCE_SHORTAGE; + } *compressed_count_delta_p += 1; - return (KERN_SUCCESS); + return KERN_SUCCESS; } @@ -796,6 +820,21 @@ vm_compressor_pager_get( if (kr == KERN_SUCCESS) { int retval; + unsigned int prev_wimg = VM_WIMG_DEFAULT; + boolean_t set_cache_attr = FALSE; + + /* + * cacheability should be set to the system default (usually writeback) + * during compressor operations, both for performance and correctness, + * e.g. to avoid compressor codec faults generated by an unexpected + * memory type. + */ + prev_wimg = pmap_cache_attributes(ppnum) & VM_WIMG_MASK; + + if ((prev_wimg != VM_WIMG_DEFAULT) && (prev_wimg != VM_WIMG_USE_DEFAULT)) { + set_cache_attr = TRUE; + pmap_set_cache_attributes(ppnum, VM_WIMG_DEFAULT); + } /* get the page from the compressor */ retval = vm_compressor_get(ppnum, slot_p, flags); @@ -807,6 +846,8 @@ vm_compressor_pager_get( assert((flags & C_DONT_BLOCK)); kr = KERN_FAILURE; } + if (set_cache_attr) + pmap_set_cache_attributes(ppnum, prev_wimg); } if (kr == KERN_SUCCESS) { diff --git a/osfmk/vm/vm_compressor_pager.h b/osfmk/vm/vm_compressor_pager.h index e723c9012..a42d1b9ee 100644 --- a/osfmk/vm/vm_compressor_pager.h +++ b/osfmk/vm/vm_compressor_pager.h @@ -88,11 +88,11 @@ extern vm_external_state_t vm_compressor_pager_state_get( (object)); \ } \ if (_num_pages_cleared && \ - (object)->purgable != VM_PURGABLE_DENY && \ - (object)->vo_purgeable_owner != NULL) { \ - /* less compressed purgeable pages */ \ + ((object)->purgable != VM_PURGABLE_DENY || \ + (object)->vo_ledger_tag)) { \ + /* less compressed purgeable/tagged pages */ \ assert(_num_pages_cleared == 1); \ - vm_purgeable_compressed_update( \ + vm_object_owner_compressed_update( \ (object), \ -_num_pages_cleared); \ } \ diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 256c70dfe..abbe202ff 100644 --- 
a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -139,11 +139,12 @@ uint64_t vm_hard_throttle_threshold; #define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \ - (vm_page_free_count < vm_page_throttle_limit && \ - proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED)) + ((vm_page_free_count < vm_page_throttle_limit || \ + HARD_THROTTLE_LIMIT_REACHED()) && \ + proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED)) -#define HARD_THROTTLE_DELAY 5000 /* 5000 us == 5 ms */ +#define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ #define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */ #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6 @@ -197,6 +198,10 @@ unsigned long vm_cs_revalidates = 0; unsigned long vm_cs_query_modified = 0; unsigned long vm_cs_validated_dirtied = 0; unsigned long vm_cs_bitmap_validated = 0; +#if PMAP_CS +uint64_t vm_cs_defer_to_pmap_cs = 0; +uint64_t vm_cs_defer_to_pmap_cs_not = 0; +#endif /* PMAP_CS */ void vm_pre_fault(vm_map_offset_t); @@ -204,6 +209,24 @@ extern char *kdp_compressor_decompressed_page; extern addr64_t kdp_compressor_decompressed_page_paddr; extern ppnum_t kdp_compressor_decompressed_page_ppnum; +struct vmrtfr { + int vmrtfr_maxi; + int vmrtfr_curi; + int64_t vmrtf_total; + vm_rtfault_record_t *vm_rtf_records; +} vmrtfrs; +#define VMRTF_DEFAULT_BUFSIZE (4096) +#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t)) +int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT; + +static void vm_rtfrecord_lock(void); +static void vm_rtfrecord_unlock(void); +static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int); + +lck_spin_t vm_rtfr_slock; +extern lck_grp_t vm_page_lck_grp_bucket; +extern lck_attr_t vm_page_lck_attr; + /* * Routine: vm_fault_init * Purpose: @@ -245,11 +268,20 @@ vm_fault_init(void) /* If no boot arg or incorrect boot arg, try device tree. 
*/ PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode)); } - PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count)); - printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode); } +void vm_rtfault_record_init(void) { + PE_parse_boot_argn("vm_rtfault_records", &vmrtf_num_records, sizeof(vmrtf_num_records)); + + assert(vmrtf_num_records >= 1); + vmrtf_num_records = MAX(vmrtf_num_records, 1); + size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t); + vmrtfrs.vm_rtf_records = kalloc(kallocsz); + bzero(vmrtfrs.vm_rtf_records, kallocsz); + vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1; + lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr); +} /* * Routine: vm_fault_cleanup * Purpose: @@ -282,24 +314,6 @@ vm_fault_cleanup( } } -#if MACH_CLUSTER_STATS -#define MAXCLUSTERPAGES 16 -struct { - unsigned long pages_in_cluster; - unsigned long pages_at_higher_offsets; - unsigned long pages_at_lower_offsets; -} cluster_stats_in[MAXCLUSTERPAGES]; -#define CLUSTER_STAT(clause) clause -#define CLUSTER_STAT_HIGHER(x) \ - ((cluster_stats_in[(x)].pages_at_higher_offsets)++) -#define CLUSTER_STAT_LOWER(x) \ - ((cluster_stats_in[(x)].pages_at_lower_offsets)++) -#define CLUSTER_STAT_CLUSTER(x) \ - ((cluster_stats_in[(x)].pages_in_cluster)++) -#else /* MACH_CLUSTER_STATS */ -#define CLUSTER_STAT(clause) -#endif /* MACH_CLUSTER_STATS */ - #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) @@ -530,7 +544,7 @@ vm_fault_deactivate_behind( for (n = 0; n < max_pages_in_run; n++) { m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); - if (m && !m->laundry && !m->busy && !m->no_cache && (m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->fictitious && !m->absent) { + if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) { page_run[pages_in_run++] = m; /* @@ -630,7 
+644,7 @@ vm_page_throttled(boolean_t page_kept) thread->t_page_creation_time = tv_sec; thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1); } - ++vm_page_throttle_count; + VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1); thread->t_page_creation_throttled = 1; @@ -664,10 +678,10 @@ vm_page_throttled(boolean_t page_kept) * cleanup is based on being called from vm_fault_page * * object must be locked - * object == m->object + * object == m->vmp_object */ static vm_fault_return_t -vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle) +vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle) { int throttle_delay; @@ -688,26 +702,6 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int return (VM_FAULT_MEMORY_ERROR); } - if (vm_backing_store_low) { - /* - * are we protecting the system from - * backing store exhaustion. If so - * sleep unless we are privileged. - */ - if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { - - if (m != VM_PAGE_NULL) - VM_PAGE_FREE(m); - vm_fault_cleanup(object, first_m); - - assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); - - thread_block(THREAD_CONTINUE_NULL); - thread_interrupt_level(interruptible_state); - - return (VM_FAULT_RETRY); - } - } if (page_throttle == TRUE) { if ((throttle_delay = vm_page_throttled(FALSE))) { /* @@ -739,7 +733,7 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int * do the work to zero fill a page and * inject it into the correct paging queue * - * m->object must be locked + * m->vmp_object must be locked * page queue lock must NOT be held */ static int @@ -765,16 +759,16 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) * sending a program into this area. 
We * choose this approach for performance */ - m->pmapped = TRUE; + m->vmp_pmapped = TRUE; - m->cs_validated = FALSE; - m->cs_tainted = FALSE; - m->cs_nx = FALSE; + m->vmp_cs_validated = FALSE; + m->vmp_cs_tainted = FALSE; + m->vmp_cs_nx = FALSE; if (no_zero_fill == TRUE) { my_fault = DBG_NZF_PAGE_FAULT; - if (m->absent && m->busy) + if (m->vmp_absent && m->vmp_busy) return (my_fault); } else { vm_page_zero_fill(m); @@ -782,9 +776,9 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) VM_STAT_INCR(zero_fill_count); DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); } - assert(!m->laundry); + assert(!m->vmp_laundry); assert(object != kernel_object); - //assert(m->pageq.next == 0 && m->pageq.prev == 0); + //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); if (!VM_DYNAMIC_PAGING_ENABLED() && (object->purgable == VM_PURGABLE_DENY || @@ -802,8 +796,8 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) */ vm_page_queues_remove(m, TRUE); vm_page_check_pageable_safe(m); - vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; + vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; } vm_page_unlock_queues(); @@ -897,10 +891,8 @@ vm_fault_page( boolean_t force_fault_retry = FALSE; vm_prot_t access_required = fault_type; vm_prot_t wants_copy_flag; - CLUSTER_STAT(int pages_at_higher_offsets;) - CLUSTER_STAT(int pages_at_lower_offsets;) kern_return_t wait_result; - boolean_t interruptible_state; + wait_interrupt_t interruptible_state; boolean_t data_already_requested = FALSE; vm_behavior_t orig_behavior; vm_size_t orig_cluster_size; @@ -1072,7 +1064,7 @@ vm_fault_page( #endif if (m != VM_PAGE_NULL) { - if (m->busy) { + if (m->vmp_busy) { /* * The page is being brought in, * wait for it and then retry. 
@@ -1099,10 +1091,10 @@ vm_fault_page( } continue; } - if (m->laundry) { - m->free_when_done = FALSE; + if (m->vmp_laundry) { + m->vmp_free_when_done = FALSE; - if (!m->cleaning) + if (!m->vmp_cleaning) vm_pageout_steal_laundry(m, FALSE); } if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { @@ -1116,7 +1108,7 @@ vm_fault_page( * be just to wire or unwire it. * Let's pretend it succeeded... */ - m->busy = TRUE; + m->vmp_busy = TRUE; *result_page = m; assert(first_m == VM_PAGE_NULL); *top_page = first_m; @@ -1135,7 +1127,7 @@ vm_fault_page( } } - if (m->error) { + if (m->vmp_error) { /* * The page is in error, give up now. */ @@ -1151,7 +1143,7 @@ vm_fault_page( return (VM_FAULT_MEMORY_ERROR); } - if (m->restart) { + if (m->vmp_restart) { /* * The pager wants us to restart * at the top of the chain, @@ -1168,7 +1160,7 @@ vm_fault_page( return (VM_FAULT_RETRY); } - if (m->absent) { + if (m->vmp_absent) { /* * The page isn't busy, but is absent, * therefore it's deemed "unavailable". @@ -1238,11 +1230,11 @@ vm_fault_page( * we're going to use the absent page we just found * so convert it to a 'busy' page */ - m->absent = FALSE; - m->busy = TRUE; + m->vmp_absent = FALSE; + m->vmp_busy = TRUE; } if (fault_info->mark_zf_absent && no_zero_fill == TRUE) - m->absent = TRUE; + m->vmp_absent = TRUE; /* * zero-fill the page and put it on * the correct paging queue @@ -1258,8 +1250,8 @@ vm_fault_page( VM_PAGE_FREE(m); } else { first_m = m; - m->absent = FALSE; - m->busy = TRUE; + m->vmp_absent = FALSE; + m->vmp_busy = TRUE; vm_page_lockspin_queues(); vm_page_queues_remove(m, FALSE); @@ -1289,7 +1281,7 @@ vm_fault_page( continue; } } - if ((m->cleaning) + if ((m->vmp_cleaning) && ((object != first_object) || (object->copy != VM_OBJECT_NULL)) && (fault_type & VM_PROT_WRITE)) { /* @@ -1322,7 +1314,7 @@ vm_fault_page( m = vm_page_lookup(object, offset); - if (m != VM_PAGE_NULL && m->cleaning) { + if (m != VM_PAGE_NULL && m->vmp_cleaning) { PAGE_ASSERT_WAIT(m, interruptible); 
vm_object_unlock(object); @@ -1339,14 +1331,14 @@ vm_fault_page( return (VM_FAULT_RETRY); } } - if (type_of_fault == NULL && (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) && + if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) && !(fault_info != NULL && fault_info->stealth)) { /* * If we were passed a non-NULL pointer for * "type_of_fault", than we came from * vm_fault... we'll let it deal with * this condition, since it - * needs to see m->speculative to correctly + * needs to see m->vmp_speculative to correctly * account the pageins, otherwise... * take it off the speculative queue, we'll * let the caller of vm_fault_page deal @@ -1357,7 +1349,7 @@ vm_fault_page( * the page in the speculative queue. */ vm_page_lockspin_queues(); - if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) + if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) vm_page_queues_remove(m, FALSE); vm_page_unlock_queues(); } @@ -1388,10 +1380,10 @@ vm_fault_page( XPR(XPR_VM_FAULT, "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n", object, offset, m, 0, 0); - assert(!m->busy); - assert(!m->absent); + assert(!m->vmp_busy); + assert(!m->vmp_absent); - m->busy = TRUE; + m->vmp_busy = TRUE; break; } @@ -1561,16 +1553,16 @@ vm_fault_page( return (VM_FAULT_MEMORY_SHORTAGE); } - m->absent = TRUE; + m->vmp_absent = TRUE; if (fault_info && fault_info->batch_pmap_op == TRUE) { vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL); } else { vm_page_insert(m, object, offset); } } - assert(m->busy); + assert(m->vmp_busy); - m->absent = TRUE; + m->vmp_absent = TRUE; pager = object->pager; assert(object->paging_in_progress > 0); @@ -1610,8 +1602,8 @@ vm_fault_page( switch (rc) { case KERN_SUCCESS: - m->absent = FALSE; - m->dirty = TRUE; + m->vmp_absent = FALSE; + m->vmp_dirty = TRUE; if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) { @@ -1624,7 +1616,7 @@ vm_fault_page( pmap_sync_page_attributes_phys( 
VM_PAGE_GET_PHYS_PAGE(m)); } else { - m->written_by_kernel = TRUE; + m->vmp_written_by_kernel = TRUE; } /* @@ -1635,27 +1627,28 @@ vm_fault_page( * "compressed purgeable" ledger, so * update that now. */ - if ((object->purgable != - VM_PURGABLE_DENY) && - (object->vo_purgeable_owner != + if (((object->purgable != + VM_PURGABLE_DENY) || + object->vo_ledger_tag) && + (object->vo_owner != NULL)) { /* * One less compressed - * purgeable page. + * purgeable/tagged page. */ - vm_purgeable_compressed_update( + vm_object_owner_compressed_update( object, -1); } break; case KERN_MEMORY_FAILURE: - m->unusual = TRUE; - m->error = TRUE; - m->absent = FALSE; + m->vmp_unusual = TRUE; + m->vmp_error = TRUE; + m->vmp_absent = FALSE; break; case KERN_MEMORY_ERROR: - assert(m->absent); + assert(m->vmp_absent); break; default: panic("vm_fault_page(): unexpected " @@ -1698,7 +1691,7 @@ vm_fault_page( * so we can release the object lock. */ - if (object->object_slid == TRUE) { + if (object->object_is_shared_cache) { set_thread_rwlock_boost(); } @@ -1786,7 +1779,7 @@ vm_fault_page( #endif vm_object_lock(object); - if (object->object_slid == TRUE) { + if (object->object_is_shared_cache) { clear_thread_rwlock_boost(); } @@ -1859,7 +1852,7 @@ vm_fault_page( * We get here if the object has no pager, or an existence map * exists and indicates the page isn't present on the pager * or we're unwiring a page. 
If a pager exists, but there - * is no existence map, then the m->absent case above handles + * is no existence map, then the m->vmp_absent case above handles * the ZF case when the pager can't provide the page */ #if TRACEFAULTPAGE @@ -1920,7 +1913,7 @@ vm_fault_page( vm_page_insert(m, object, offset); } if (fault_info->mark_zf_absent && no_zero_fill == TRUE) - m->absent = TRUE; + m->vmp_absent = TRUE; my_fault = vm_fault_zero_page(m, no_zero_fill); @@ -1969,10 +1962,10 @@ vm_fault_page( dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ #endif #if EXTRA_ASSERTIONS - assert(m->busy && !m->absent); + assert(m->vmp_busy && !m->vmp_absent); assert((first_m == VM_PAGE_NULL) || - (first_m->busy && !first_m->absent && - !first_m->active && !first_m->inactive && !first_m->secluded)); + (first_m->vmp_busy && !first_m->vmp_absent && + !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded)); #endif /* EXTRA_ASSERTIONS */ XPR(XPR_VM_FAULT, @@ -2000,25 +1993,6 @@ vm_fault_page( */ assert(!must_be_resident); - /* - * are we protecting the system from - * backing store exhaustion. If so - * sleep unless we are privileged. - */ - if (vm_backing_store_low) { - if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { - - RELEASE_PAGE(m); - vm_fault_cleanup(object, first_m); - - assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); - - thread_block(THREAD_CONTINUE_NULL); - thread_interrupt_level(interruptible_state); - - return (VM_FAULT_RETRY); - } - } /* * If we try to collapse first_object at this * point, we may deadlock when we try to get @@ -2067,14 +2041,14 @@ vm_fault_page( * access to this page, then we could * avoid the pmap_disconnect() call. 
*/ - if (m->pmapped) + if (m->vmp_pmapped) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); - if (m->clustered) { + if (m->vmp_clustered) { VM_PAGE_COUNT_AS_PAGEIN(m); VM_PAGE_CONSUME_CLUSTERED(m); } - assert(!m->cleaning); + assert(!m->vmp_cleaning); /* * We no longer need the old page or object. @@ -2114,7 +2088,7 @@ vm_fault_page( * and replace it with the * page we just copied into */ - assert(copy_m->busy); + assert(copy_m->vmp_busy); vm_page_insert(copy_m, object, offset); SET_PAGE_DIRTY(copy_m, TRUE); @@ -2200,7 +2174,7 @@ vm_fault_page( /* * Page currently exists in the copy object */ - if (copy_m->busy) { + if (copy_m->vmp_busy) { /* * If the page is being brought * in, wait for it and then retry. @@ -2223,7 +2197,7 @@ vm_fault_page( assert(copy_object->ref_count > 0); copy_m = vm_page_lookup(copy_object, copy_offset); - if (copy_m != VM_PAGE_NULL && copy_m->busy) { + if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) { PAGE_ASSERT_WAIT(copy_m, interruptible); vm_object_unlock(copy_object); @@ -2249,32 +2223,7 @@ vm_fault_page( * for example) or it hasn't been paged out. * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT) * We must copy the page to the copy object. - */ - - if (vm_backing_store_low) { - /* - * we are protecting the system from - * backing store exhaustion. If so - * sleep unless we are privileged. - */ - if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) { - assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT); - - RELEASE_PAGE(m); - VM_OBJ_RES_DECR(copy_object); - vm_object_lock_assert_exclusive(copy_object); - copy_object->ref_count--; - assert(copy_object->ref_count > 0); - - vm_object_unlock(copy_object); - vm_fault_cleanup(object, first_m); - thread_block(THREAD_CONTINUE_NULL); - thread_interrupt_level(interruptible_state); - - return (VM_FAULT_RETRY); - } - } - /* + * * Allocate a page for the copy */ copy_m = vm_page_alloc(copy_object, copy_offset); @@ -2304,10 +2253,10 @@ vm_fault_page( * from all pmaps. 
(We can't know which * pmaps use it.) */ - if (m->pmapped) + if (m->vmp_pmapped) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); - if (m->clustered) { + if (m->vmp_clustered) { VM_PAGE_COUNT_AS_PAGEIN(m); VM_PAGE_CONSUME_CLUSTERED(m); } @@ -2321,7 +2270,7 @@ vm_fault_page( ) { vm_page_lockspin_queues(); - assert(!m->cleaning); + assert(!m->vmp_cleaning); vm_page_activate(copy_m); vm_page_unlock_queues(); @@ -2330,8 +2279,8 @@ vm_fault_page( } else { - assert(copy_m->busy == TRUE); - assert(!m->cleaning); + assert(copy_m->vmp_busy == TRUE); + assert(!m->vmp_cleaning); /* * dirty is protected by the object lock @@ -2384,8 +2333,8 @@ vm_fault_page( * wait result]. Can't turn off the page's * busy bit because we're not done with it. */ - if (m->wanted) { - m->wanted = FALSE; + if (m->vmp_wanted) { + m->vmp_wanted = FALSE; thread_wakeup_with_result((event_t) m, THREAD_RESTART); } } @@ -2434,8 +2383,20 @@ vm_fault_page( * state being up to date */ vm_fault_is_sequential(object, offset, fault_info->behavior); + vm_fault_deactivate_behind(object, offset, fault_info->behavior); + } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) { + /* + * we weren't called from vm_fault, so handle the + * accounting here for hits in the cache + */ + if (m->vmp_clustered) { + VM_PAGE_COUNT_AS_PAGEIN(m); + VM_PAGE_CONSUME_CLUSTERED(m); + } + vm_fault_is_sequential(object, offset, fault_info->behavior); vm_fault_deactivate_behind(object, offset, fault_info->behavior); + } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) { VM_STAT_INCR(decompressions); @@ -2477,16 +2438,16 @@ vm_fault_page( */ #define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj) \ ((pmap) != kernel_pmap /*1*/ && \ - !(page)->cs_tainted /*2*/ && \ + !(page)->vmp_cs_tainted /*2*/ && \ (page_obj)->code_signed /*3*/ && \ - (!(page)->cs_validated || (page)->wpmapped /*4*/)) + (!(page)->vmp_cs_validated || (page)->vmp_wpmapped /*4*/)) /* * page queue lock must NOT be held - * 
m->object must be locked + * m->vmp_object must be locked * - * NOTE: m->object could be locked "shared" only if we are called + * NOTE: m->vmp_object could be locked "shared" only if we are called * from vm_fault() as part of a soft fault. If so, we must be * careful not to modify the VM object in any way that is not * legal under a shared lock... @@ -2505,20 +2466,21 @@ vm_fault_enter(vm_page_t m, boolean_t wired, boolean_t change_wiring, vm_tag_t wire_tag, - boolean_t no_cache, - boolean_t cs_bypass, - __unused int user_tag, - int pmap_options, + vm_object_fault_info_t fault_info, boolean_t *need_retry, int *type_of_fault) { kern_return_t kr, pe_result; - boolean_t previously_pmapped = m->pmapped; + boolean_t previously_pmapped = m->vmp_pmapped; boolean_t must_disconnect = 0; boolean_t map_is_switched, map_is_switch_protected; + boolean_t cs_violation; int cs_enforcement_enabled; vm_prot_t fault_type; vm_object_t object; + boolean_t no_cache = fault_info->no_cache; + boolean_t cs_bypass = fault_info->cs_bypass; + int pmap_options = fault_info->pmap_options; fault_type = change_wiring ? 
VM_PROT_NONE : caller_prot; object = VM_PAGE_OBJECT(m); @@ -2534,7 +2496,7 @@ vm_fault_enter(vm_page_t m, LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { - assert(m->fictitious); + assert(m->vmp_fictitious); return KERN_SUCCESS; } @@ -2542,7 +2504,12 @@ vm_fault_enter(vm_page_t m, vm_object_lock_assert_exclusive(object); - } else if ((fault_type & VM_PROT_WRITE) == 0 && !m->wpmapped) { + } else if ((fault_type & VM_PROT_WRITE) == 0 && + (!m->vmp_wpmapped +#if VM_OBJECT_ACCESS_TRACKING + || object->access_tracking +#endif /* VM_OBJECT_ACCESS_TRACKING */ + )) { /* * This is not a "write" fault, so we * might not have taken the object lock @@ -2562,13 +2529,13 @@ vm_fault_enter(vm_page_t m, assert(cs_bypass); } } - if (m->pmapped == FALSE) { + if (m->vmp_pmapped == FALSE) { - if (m->clustered) { + if (m->vmp_clustered) { if (*type_of_fault == DBG_CACHE_HIT_FAULT) { /* * found it in the cache, but this - * is the first fault-in of the page (m->pmapped == FALSE) + * is the first fault-in of the page (m->vmp_pmapped == FALSE) * so it must have come in as part of * a cluster... account 1 pagein against it */ @@ -2592,20 +2559,43 @@ vm_fault_enter(vm_page_t m, } /* Validate code signature if necessary. */ - if (VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) { + if (!cs_bypass && + VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) { vm_object_lock_assert_exclusive(object); - if (m->cs_validated) { + if (m->vmp_cs_validated) { vm_cs_revalidates++; } /* VM map is locked, so 1 ref will remain on VM object - * so no harm if vm_page_validate_cs drops the object lock */ + +#if PMAP_CS + if (fault_info->pmap_cs_associated && + pmap_cs_enforced(pmap) && + !m->vmp_cs_validated && + !m->vmp_cs_tainted && + !m->vmp_cs_nx && + (prot & VM_PROT_EXECUTE) && + (caller_prot & VM_PROT_EXECUTE)) { + /* + * With pmap_cs, the pmap layer will validate the + * code signature for any executable pmap mapping. 
+ * No need for us to validate this page too: + * in pmap_cs we trust... + */ + vm_cs_defer_to_pmap_cs++; + } else { + vm_cs_defer_to_pmap_cs_not++; + vm_page_validate_cs(m); + } +#else /* PMAP_CS */ vm_page_validate_cs(m); +#endif /* PMAP_CS */ } -#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/) -#define page_nx(m) ((m)->cs_nx) +#define page_immutable(m,prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/) +#define page_nx(m) ((m)->vmp_cs_nx) map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) && (pmap == vm_map_pmap(current_thread()->map))); @@ -2621,7 +2611,7 @@ vm_fault_enter(vm_page_t m, * from the current map. We do that below right before we do the * PMAP_ENTER. */ - cs_enforcement_enabled = cs_enforcement(NULL); + cs_enforcement_enabled = cs_process_enforcement(NULL); if(cs_enforcement_enabled && map_is_switched && map_is_switch_protected && page_immutable(m, prot) && @@ -2636,31 +2626,6 @@ vm_fault_enter(vm_page_t m, return KERN_CODESIGN_ERROR; } - if (cs_enforcement_enabled && - !m->cs_validated && - (prot & VM_PROT_EXECUTE) && - !(caller_prot & VM_PROT_EXECUTE)) { - /* - * FOURK PAGER: - * This page has not been validated and will not be - * allowed to be mapped for "execute". - * But the caller did not request "execute" access for this - * fault, so we should not raise a code-signing violation - * (and possibly kill the process) below. - * Instead, let's just remove the "execute" access request. - * - * This can happen on devices with a 4K page size if a 16K - * page contains a mix of signed&executable and - * unsigned&non-executable 4K pages, making the whole 16K - * mapping "executable". - */ - if (!pmap_has_prot_policy(prot)) { - prot &= ~VM_PROT_EXECUTE; - } else { - assert(cs_bypass); - } - } - /* A page could be tainted, or pose a risk of being tainted later. * Check whether the receiving process wants it, and make it feel * the consequences (that hapens in cs_invalid_page()). 
@@ -2671,28 +2636,52 @@ vm_fault_enter(vm_page_t m, * can be changed without the kernel noticing, therefore unsigned * code can be created */ - if (!cs_bypass && - (m->cs_tainted || - (cs_enforcement_enabled && - (/* The page is unsigned and wants to be executable */ - (!m->cs_validated && (prot & VM_PROT_EXECUTE)) || - /* The page should be immutable, but is in danger of being modified - * This is the case where we want policy from the code directory - - * is the page immutable or not? For now we have to assume that - * code pages will be immutable, data pages not. - * We'll assume a page is a code page if it has a code directory - * and we fault for execution. - * That is good enough since if we faulted the code page for - * writing in another map before, it is wpmapped; if we fault - * it for writing in this map later it will also be faulted for executing - * at the same time; and if we fault for writing in another map - * later, we will disconnect it from this pmap so we'll notice - * the change. - */ - (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped)) - )) - )) - { + if (cs_bypass) { + /* code-signing is bypassed */ + cs_violation = FALSE; + } else if (m->vmp_cs_tainted) { + /* tainted page */ + cs_violation = TRUE; + } else if (!cs_enforcement_enabled) { + /* no further code-signing enforcement */ + cs_violation = FALSE; + } else if (page_immutable(m, prot) && + ((prot & VM_PROT_WRITE) || + m->vmp_wpmapped)) { + /* + * The page should be immutable, but is in danger of being + * modified. + * This is the case where we want policy from the code + * directory - is the page immutable or not? For now we have + * to assume that code pages will be immutable, data pages not. + * We'll assume a page is a code page if it has a code directory + * and we fault for execution. 
+ * That is good enough since if we faulted the code page for + * writing in another map before, it is wpmapped; if we fault + * it for writing in this map later it will also be faulted for + * executing at the same time; and if we fault for writing in + * another map later, we will disconnect it from this pmap so + * we'll notice the change. + */ + cs_violation = TRUE; + } else if (!m->vmp_cs_validated && + (prot & VM_PROT_EXECUTE) +#if PMAP_CS + /* + * Executable pages will be validated by pmap_cs; + * in pmap_cs we trust... + * If pmap_cs is turned off, this is a code-signing + * violation. + */ + && ! (pmap_cs_enforced(pmap)) +#endif /* PMAP_CS */ + ) { + cs_violation = TRUE; + } else { + cs_violation = FALSE; + } + + if (cs_violation) { /* We will have a tainted page. Have to handle the special case * of a switched map now. If the map is not switched, standard * procedure applies - call cs_invalid_page(). @@ -2707,12 +2696,11 @@ vm_fault_enter(vm_page_t m, reject_page = FALSE; } else { if (cs_debug > 5) - printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n", + printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n", object->code_signed ? "yes" : "no", - m->cs_validated ? "yes" : "no", - m->cs_tainted ? "yes" : "no", - m->wpmapped ? "yes" : "no", - m->slid ? "yes" : "no", + m->vmp_cs_validated ? "yes" : "no", + m->vmp_cs_tainted ? "yes" : "no", + m->vmp_wpmapped ? 
"yes" : "no", (int)prot); reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed); } @@ -2744,7 +2732,7 @@ vm_fault_enter(vm_page_t m, /* get file's VM object */ file_object = object; - file_offset = m->offset; + file_offset = m->vmp_offset; for (shadow = file_object->shadow, shadow_depth = 0; shadow != VM_OBJECT_NULL; @@ -2798,7 +2786,7 @@ vm_fault_enter(vm_page_t m, "from offset 0x%llx in file \"%s%s%s\" " "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " "(signed:%d validated:%d tainted:%d nx:%d " - "wpmapped:%d slid:%d dirty:%d depth:%d)\n", + "wpmapped:%d dirty:%d depth:%d)\n", pid, procname, (addr64_t) vaddr, file_offset, (pathname ? pathname : ""), @@ -2811,12 +2799,11 @@ vm_fault_enter(vm_page_t m, : "!="), mtime.tv_sec, mtime.tv_nsec, object->code_signed, - m->cs_validated, - m->cs_tainted, - m->cs_nx, - m->wpmapped, - m->slid, - m->dirty, + m->vmp_cs_validated, + m->vmp_cs_tainted, + m->vmp_cs_nx, + m->vmp_wpmapped, + m->vmp_dirty, shadow_depth); /* @@ -2861,12 +2848,12 @@ vm_fault_enter(vm_page_t m, ceri->ceri_page_modtime_secs = mtime.tv_sec; ceri->ceri_page_modtime_nsecs = mtime.tv_nsec; ceri->ceri_object_codesigned = (object->code_signed); - ceri->ceri_page_codesig_validated = (m->cs_validated); - ceri->ceri_page_codesig_tainted = (m->cs_tainted); - ceri->ceri_page_codesig_nx = (m->cs_nx); - ceri->ceri_page_wpmapped = (m->wpmapped); - ceri->ceri_page_slid = (m->slid); - ceri->ceri_page_dirty = (m->dirty); + ceri->ceri_page_codesig_validated = (m->vmp_cs_validated); + ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted); + ceri->ceri_page_codesig_nx = (m->vmp_cs_nx); + ceri->ceri_page_wpmapped = (m->vmp_wpmapped); + ceri->ceri_page_slid = 0; + ceri->ceri_page_dirty = (m->vmp_dirty); ceri->ceri_page_shadow_depth = shadow_depth; } else { #if DEBUG || DEVELOPMENT @@ -2883,13 +2870,13 @@ vm_fault_enter(vm_page_t m, set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE); } if (panic_on_cs_killed && - object->object_slid) { + 
object->object_is_shared_cache) { panic("CODE SIGNING: process %d[%s]: " "rejecting invalid page at address 0x%llx " "from offset 0x%llx in file \"%s%s%s\" " "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " "(signed:%d validated:%d tainted:%d nx:%d" - "wpmapped:%d slid:%d dirty:%d depth:%d)\n", + "wpmapped:%d dirty:%d depth:%d)\n", pid, procname, (addr64_t) vaddr, file_offset, (pathname ? pathname : ""), @@ -2902,12 +2889,11 @@ vm_fault_enter(vm_page_t m, : "!="), mtime.tv_sec, mtime.tv_nsec, object->code_signed, - m->cs_validated, - m->cs_tainted, - m->cs_nx, - m->wpmapped, - m->slid, - m->dirty, + m->vmp_cs_validated, + m->vmp_cs_tainted, + m->vmp_cs_nx, + m->vmp_wpmapped, + m->vmp_dirty, shadow_depth); } @@ -2922,7 +2908,7 @@ vm_fault_enter(vm_page_t m, } else { /* proceed with the invalid page */ kr = KERN_SUCCESS; - if (!m->cs_validated && + if (!m->vmp_cs_validated && !object->code_signed) { /* * This page has not been (fully) validated but @@ -2951,8 +2937,8 @@ vm_fault_enter(vm_page_t m, * through that code path for re-consideration * of the validity of that page. */ - must_disconnect = !m->cs_tainted; - m->cs_tainted = TRUE; + must_disconnect = !m->vmp_cs_tainted; + m->vmp_cs_tainted = TRUE; } cs_enter_tainted_accepted++; } @@ -2995,12 +2981,12 @@ MACRO_END * the page queues. Change wiring * case is obvious. */ - assert((m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object); + assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object); #if CONFIG_BACKGROUND_QUEUE vm_page_update_background_state(m); #endif - if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { + if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { /* * Compressor pages are neither wired * nor pageable and should never change. 
@@ -3030,10 +3016,10 @@ MACRO_END __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); vm_page_deactivate(m); /* we keep the page queues lock, if we need it later */ - } else if (((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) || - (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) || - (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || - ((m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) && + } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || + (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) || + (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || + ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) && !VM_PAGE_WIRED(m)) { if (vm_page_local_q && @@ -3042,7 +3028,7 @@ MACRO_END struct vpl *lq; uint32_t lid; - assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); vm_object_lock_assert_exclusive(object); @@ -3069,9 +3055,9 @@ MACRO_END vm_page_check_pageable_safe(m); vm_page_queue_enter(&lq->vpl_queue, m, - vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q; - m->local_id = lid; + vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q; + m->vmp_local_id = lid; lq->vpl_count++; if (object->internal) @@ -3110,11 +3096,11 @@ MACRO_END * page queue lock */ if (!VM_PAGE_WIRED(m)) { - if (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { + if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { vm_page_queues_remove(m, FALSE); - vm_pageout_cleaned_reactivated++; - vm_pageout_cleaned_fault_reactivated++; + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1); } if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m) || @@ -3134,10 +3120,10 @@ MACRO_END if (no_cache && (!previously_pmapped || - m->no_cache)) { - m->no_cache = TRUE; + m->vmp_no_cache)) { + m->vmp_no_cache = TRUE; - if (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q) + if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) vm_page_speculate(m, FALSE); } else if ( 
!VM_PAGE_ACTIVE_OR_INACTIVE(m)) { @@ -3166,7 +3152,7 @@ MACRO_END * properly serialize updating the pmapped and * xpmapped bits */ - if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) { + if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) { ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); pmap_lock_phys_page(phys_page); @@ -3176,11 +3162,11 @@ MACRO_END * need to grab this lock a 2nd time * just below */ - m->pmapped = TRUE; + m->vmp_pmapped = TRUE; - if (!m->xpmapped) { + if (!m->vmp_xpmapped) { - m->xpmapped = TRUE; + m->vmp_xpmapped = TRUE; pmap_unlock_phys_page(phys_page); @@ -3210,44 +3196,23 @@ MACRO_END } else pmap_unlock_phys_page(phys_page); } else { - if (m->pmapped == FALSE) { + if (m->vmp_pmapped == FALSE) { ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); pmap_lock_phys_page(phys_page); - m->pmapped = TRUE; + m->vmp_pmapped = TRUE; pmap_unlock_phys_page(phys_page); } } - if (vm_page_is_slideable(m)) { - boolean_t was_busy = m->busy; - - vm_object_lock_assert_exclusive(object); - - m->busy = TRUE; - kr = vm_page_slide(m, 0); - assert(m->busy); - if(!was_busy) { - PAGE_WAKEUP_DONE(m); - } - if (kr != KERN_SUCCESS) { - /* - * This page has not been slid correctly, - * do not do the pmap_enter() ! - * Let vm_fault_enter() return the error - * so the caller can fail the fault. 
- */ - goto after_the_pmap_enter; - } - } if (fault_type & VM_PROT_WRITE) { - if (m->wpmapped == FALSE) { + if (m->vmp_wpmapped == FALSE) { vm_object_lock_assert_exclusive(object); if (!object->internal && object->pager) { task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager)); } - m->wpmapped = TRUE; + m->vmp_wpmapped = TRUE; } if (must_disconnect) { /* @@ -3273,6 +3238,33 @@ MACRO_END } assert(VM_PAGE_OBJECT(m) == object); +#if VM_OBJECT_ACCESS_TRACKING + if (object->access_tracking) { + DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type); + if (fault_type & VM_PROT_WRITE) { + object->access_tracking_writes++; + vm_object_access_tracking_writes++; + } else { + object->access_tracking_reads++; + vm_object_access_tracking_reads++; + } + } +#endif /* VM_OBJECT_ACCESS_TRACKING */ + +#if PMAP_CS + /* + * If CS enforcement is on, we don't ask for an executable page if the + * fault does not call for execution, because that can fail in + * situations where the caller only actually wanted read access. + * However, it may be better to instead retry without execute on + * failure, or pass a flag into pmap_enter to do the right thing. + */ + // TODO: maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults + if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) { + prot &= ~VM_PROT_EXECUTE; + } +#endif + /* Prevent a deadlock by not * holding the object lock if we need to wait for a page in * pmap_enter() - */ @@ -3317,11 +3309,11 @@ MACRO_END * at the level above us, so * use the blocking version instead. 
Requires marking * the page busy and unlocking the object */ - boolean_t was_busy = m->busy; + boolean_t was_busy = m->vmp_busy; vm_object_lock_assert_exclusive(object); - m->busy = TRUE; + m->vmp_busy = TRUE; vm_object_unlock(object); PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, @@ -3335,7 +3327,7 @@ MACRO_END /* If the page was busy, someone else will wake it up. * Otherwise, we have to do it now. */ - assert(m->busy); + assert(m->vmp_busy); if(!was_busy) { PAGE_WAKEUP_DONE(m); } @@ -3449,19 +3441,22 @@ vm_fault_internal( vm_object_t new_object; int type_of_fault; pmap_t pmap; - boolean_t interruptible_state; + wait_interrupt_t interruptible_state; vm_map_t real_map = map; vm_map_t original_map = map; boolean_t object_locks_dropped = FALSE; vm_prot_t fault_type; vm_prot_t original_fault_type; - struct vm_object_fault_info fault_info; + struct vm_object_fault_info fault_info = {}; boolean_t need_collapse = FALSE; boolean_t need_retry = FALSE; boolean_t *need_retry_ptr = NULL; int object_lock_type = 0; int cur_object_lock_type; vm_object_t top_object = VM_OBJECT_NULL; + vm_object_t written_on_object = VM_OBJECT_NULL; + memory_object_t written_on_pager = NULL; + vm_object_offset_t written_on_offset = 0; int throttle_delay; int compressed_count_delta; int grab_options; @@ -3502,6 +3497,14 @@ vm_fault_internal( return (KERN_FAILURE); } + thread_t cthread = current_thread(); + boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME); + uint64_t fstart = 0; + + if (rtfault) { + fstart = mach_continuous_time(); + } + interruptible_state = thread_interrupt_level(interruptible); fault_type = (change_wiring ? 
VM_PROT_NONE : caller_prot); @@ -3526,6 +3529,8 @@ vm_fault_internal( } } RetryFault: + assert(written_on_object == VM_OBJECT_NULL); + /* * assume we will hit a page in the cache * otherwise, explicitly override with @@ -3677,7 +3682,7 @@ vm_fault_internal( if (m != VM_PAGE_NULL) { m_object = cur_object; - if (m->busy) { + if (m->vmp_busy) { wait_result_t result; /* @@ -3726,9 +3731,9 @@ vm_fault_internal( continue; } } - if ((m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) { + if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) { /* - * m->busy == TRUE and the object is locked exclusively + * m->vmp_busy == TRUE and the object is locked exclusively * if m->pageout_queue == TRUE after we acquire the * queues lock, we are guaranteed that it is stable on * the pageout queue and therefore reclaimable @@ -3740,7 +3745,7 @@ vm_fault_internal( vm_page_lock_queues(); - if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { + if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { vm_pageout_throttle_up(m); vm_page_unlock_queues(); @@ -3772,7 +3777,7 @@ vm_fault_internal( goto done; } reclaimed_from_pageout: - if (m->laundry) { + if (m->vmp_laundry) { if (object != cur_object) { if (cur_object_lock_type == OBJECT_LOCK_SHARED) { cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; @@ -3815,7 +3820,7 @@ vm_fault_internal( */ break; } - if (m->unusual && (m->error || m->restart || m->private || m->absent)) { + if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) { /* * Unusual case... let the slow path deal with it */ @@ -3831,32 +3836,6 @@ vm_fault_internal( kr = KERN_MEMORY_ERROR; goto done; } - if (vm_page_is_slideable(m)) { - /* - * We might need to slide this page, and so, - * we want to hold the VM object exclusively. 
- */ - if (object != cur_object) { - if (cur_object_lock_type == OBJECT_LOCK_SHARED) { - vm_object_unlock(object); - vm_object_unlock(cur_object); - - cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; - - vm_map_unlock_read(map); - if (real_map != map) - vm_map_unlock(real_map); - - goto RetryFault; - } - } else if (object_lock_type == OBJECT_LOCK_SHARED) { - - vm_object_unlock(object); - object_lock_type = OBJECT_LOCK_EXCLUSIVE; - vm_map_unlock_read(map); - goto RetryFault; - } - } assert(m_object == VM_PAGE_OBJECT(m)); if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) || @@ -3963,7 +3942,7 @@ vm_fault_internal( * prepare for the pmap_enter... * object and map are both locked * m contains valid data - * object == m->object + * object == m->vmp_object * cur_object == NULL or it's been unlocked * no paging references on either object or cur_object */ @@ -3981,10 +3960,7 @@ vm_fault_internal( wired, change_wiring, wire_tag, - fault_info.no_cache, - fault_info.cs_bypass, - fault_info.user_tag, - fault_info.pmap_options, + &fault_info, need_retry_ptr, &type_of_fault); } else { @@ -3996,10 +3972,7 @@ vm_fault_internal( wired, change_wiring, wire_tag, - fault_info.no_cache, - fault_info.cs_bypass, - fault_info.user_tag, - fault_info.pmap_options, + &fault_info, need_retry_ptr, &type_of_fault); } @@ -4009,14 +3982,14 @@ vm_fault_internal( if (m_object->internal) event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); - else if (m_object->object_slid) + else if (m_object->object_is_shared_cache) event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); else event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot 
<< 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0); - DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); + DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); } #endif if (kr == KERN_SUCCESS && @@ -4025,7 +3998,7 @@ vm_fault_internal( *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (prot & VM_PROT_WRITE) { vm_object_lock_assert_exclusive(m_object); - m->dirty = TRUE; + m->vmp_dirty = TRUE; } } @@ -4053,16 +4026,25 @@ vm_fault_internal( * vm_fault_deactivate_behind depends on the * state being up to date */ - vm_fault_is_sequential(object, cur_offset, fault_info.behavior); + vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior); - vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior); + vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior); } /* * That's it, clean up and return. 
*/ - if (m->busy) + if (m->vmp_busy) PAGE_WAKEUP_DONE(m); + if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) { + + vm_object_paging_begin(m_object); + + assert(written_on_object == VM_OBJECT_NULL); + written_on_object = m_object; + written_on_pager = m_object->pager; + written_on_offset = m_object->paging_offset + m->vmp_offset; + } vm_object_unlock(object); vm_map_unlock_read(map); @@ -4156,10 +4138,10 @@ vm_fault_internal( /* * Now cope with the source page and object */ - if (object->ref_count > 1 && cur_m->pmapped) + if (object->ref_count > 1 && cur_m->vmp_pmapped) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m)); - if (cur_m->clustered) { + if (cur_m->vmp_clustered) { VM_PAGE_COUNT_AS_PAGEIN(cur_m); VM_PAGE_CONSUME_CLUSTERED(cur_m); vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior); @@ -4349,7 +4331,7 @@ vm_fault_internal( m = VM_PAGE_NULL; break; } - m->dirty = TRUE; + m->vmp_dirty = TRUE; /* * If the object is purgeable, its @@ -4377,22 +4359,25 @@ vm_fault_internal( * no ledger update in that * case. */ - } else if ((cur_object->purgable == - VM_PURGABLE_DENY) || - (cur_object->vo_purgeable_owner == + } else if (((cur_object->purgable == + VM_PURGABLE_DENY) && + (!cur_object->vo_ledger_tag)) || + (cur_object->vo_owner == NULL)) { /* * "cur_object" is not purgeable - * or is not owned, so no - * purgeable ledgers to update. + * and is not ledger-taged, or + * there's no owner for it, + * so no owner's ledgers to + * update. */ } else { /* * One less compressed - * purgeable page for + * purgeable/tagged page for * cur_object's owner. */ - vm_purgeable_compressed_update( + vm_object_owner_compressed_update( cur_object, -1); } @@ -4460,16 +4445,6 @@ vm_fault_internal( kr = KERN_MEMORY_ERROR; goto done; } - if (vm_backing_store_low) { - /* - * we are protecting the system from - * backing store exhaustion... 
- * must take the slow path if we're - * not privileged - */ - if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) - break; - } if (cur_object != object) { vm_object_unlock(cur_object); @@ -4891,10 +4866,7 @@ vm_fault_internal( wired, change_wiring, wire_tag, - fault_info.no_cache, - fault_info.cs_bypass, - fault_info.user_tag, - fault_info.pmap_options, + &fault_info, NULL, &type_of_fault); } else { @@ -4906,10 +4878,7 @@ vm_fault_internal( wired, change_wiring, wire_tag, - fault_info.no_cache, - fault_info.cs_bypass, - fault_info.user_tag, - fault_info.pmap_options, + &fault_info, NULL, &type_of_fault); } @@ -4921,15 +4890,15 @@ vm_fault_internal( if (m_object->internal) event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); - else if (m_object->object_slid) + else if (m_object->object_is_shared_cache) event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); else event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); - KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0); + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0); - DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); - } + DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); + } #endif if (kr != KERN_SUCCESS) { /* abort this page fault */ @@ -4946,7 +4915,7 @@ vm_fault_internal( *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (prot & VM_PROT_WRITE) { vm_object_lock_assert_exclusive(m_object); - m->dirty = TRUE; + m->vmp_dirty = TRUE; } } } else { @@ -4960,28 
+4929,6 @@ vm_fault_internal( * in the object */ -#ifdef ppc - /* While we do not worry about execution protection in */ - /* general, certian pages may have instruction execution */ - /* disallowed. We will check here, and if not allowed */ - /* to execute, we return with a protection failure. */ - - if ((fault_type & VM_PROT_EXECUTE) && - (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) { - - vm_map_unlock_read(map); - - if (real_map != map) - vm_map_unlock(real_map); - - vm_fault_cleanup(object, top_page); - vm_object_deallocate(object); - - kr = KERN_PROTECTION_FAILURE; - goto done; - } -#endif /* ppc */ - if (real_map != map) vm_map_unlock(real_map); @@ -5098,6 +5045,15 @@ vm_fault_internal( if (m != VM_PAGE_NULL) { assert(VM_PAGE_OBJECT(m) == m_object); + if (!m_object->internal && (fault_type & VM_PROT_WRITE)) { + + vm_object_paging_begin(m_object); + + assert(written_on_object == VM_OBJECT_NULL); + written_on_object = m_object; + written_on_pager = m_object->pager; + written_on_offset = m_object->paging_offset + m->vmp_offset; + } PAGE_WAKEUP_DONE(m); vm_fault_cleanup(m_object, top_page); @@ -5133,6 +5089,22 @@ vm_fault_internal( } } } + + if (written_on_object) { + + vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64); + + vm_object_lock(written_on_object); + vm_object_paging_end(written_on_object); + vm_object_unlock(written_on_object); + + written_on_object = VM_OBJECT_NULL; + } + + if (rtfault) { + vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault); + } + KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, ((uint64_t)trace_vaddr >> 32), @@ -5231,7 +5203,7 @@ vm_fault_unwire( vm_map_offset_t va; vm_map_offset_t end_addr = entry->vme_end; vm_object_t object; - struct vm_object_fault_info fault_info; + struct vm_object_fault_info fault_info = {}; unsigned int unwired_pages; object = (entry->is_sub_map) ? 
VM_OBJECT_NULL : VME_OBJECT(entry); @@ -5248,7 +5220,6 @@ vm_fault_unwire( fault_info.interruptible = THREAD_UNINT; fault_info.behavior = entry->behavior; fault_info.user_tag = VME_ALIAS(entry); - fault_info.pmap_options = 0; if (entry->iokit_acct || (!entry->is_sub_map && !entry->use_pmap)) { fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; @@ -5257,10 +5228,6 @@ vm_fault_unwire( fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry); fault_info.no_cache = entry->no_cache; fault_info.stealth = TRUE; - fault_info.io_sync = FALSE; - fault_info.cs_bypass = FALSE; - fault_info.mark_zf_absent = FALSE; - fault_info.batch_pmap_op = FALSE; unwired_pages = 0; @@ -5285,13 +5252,12 @@ vm_fault_unwire( vm_object_t result_object; vm_fault_return_t result; - if (end_addr - va > (vm_size_t) -1) { - /* 32-bit overflow */ - fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE); - } else { - fault_info.cluster_size = (vm_size_t) (end_addr - va); - assert(fault_info.cluster_size == end_addr - va); + /* cap cluster size at maximum UPL size */ + upl_size_t cluster_size; + if (os_sub_overflow(end_addr, va, &cluster_size)) { + cluster_size = 0 - (upl_size_t)PAGE_SIZE; } + fault_info.cluster_size = cluster_size; do { prot = VM_PROT_NONE; @@ -5427,6 +5393,7 @@ vm_fault_wire_fast( thread_t thread = current_thread(); int type_of_fault; kern_return_t kr; + struct vm_object_fault_info fault_info = {}; VM_STAT_INCR(faults); @@ -5511,12 +5478,12 @@ vm_fault_wire_fast( * there's something going on, give up. 
*/ m = vm_page_lookup(object, offset); - if ((m == VM_PAGE_NULL) || (m->busy) || - (m->unusual && ( m->error || m->restart || m->absent))) { + if ((m == VM_PAGE_NULL) || (m->vmp_busy) || + (m->vmp_unusual && ( m->vmp_error || m->vmp_restart || m->vmp_absent))) { GIVE_UP; } - if (m->fictitious && + if (m->vmp_fictitious && VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { /* * Guard pages are fictitious pages and are never @@ -5538,9 +5505,9 @@ vm_fault_wire_fast( /* * Mark page busy for other threads. */ - assert(!m->busy); - m->busy = TRUE; - assert(!m->absent); + assert(!m->vmp_busy); + m->vmp_busy = TRUE; + assert(!m->vmp_absent); /* * Give up if the page is being written and there's a copy object @@ -5550,6 +5517,13 @@ vm_fault_wire_fast( GIVE_UP; } + fault_info.user_tag = VME_ALIAS(entry); + fault_info.pmap_options = 0; + if (entry->iokit_acct || + (!entry->is_sub_map && !entry->use_pmap)) { + fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; + } + /* * Put this page into the physical map. */ @@ -5562,13 +5536,7 @@ vm_fault_wire_fast( TRUE, /* wired */ FALSE, /* change_wiring */ wire_tag, - FALSE, /* no_cache */ - FALSE, /* cs_bypass */ - VME_ALIAS(entry), - ((entry->iokit_acct || - (!entry->is_sub_map && !entry->use_pmap)) - ? 
PMAP_OPTIONS_ALT_ACCT - : 0), + &fault_info, NULL, &type_of_fault); if (kr != KERN_SUCCESS) { @@ -5588,7 +5556,7 @@ vm_fault_wire_fast( *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); if (prot & VM_PROT_WRITE) { vm_object_lock_assert_exclusive(object); - m->dirty = TRUE; + m->vmp_dirty = TRUE; } } else { *physpage_p = 0; @@ -5699,8 +5667,8 @@ vm_fault_copy( vm_fault_return_t result; vm_map_size_t part_size; - struct vm_object_fault_info fault_info_src; - struct vm_object_fault_info fault_info_dst; + struct vm_object_fault_info fault_info_src = {}; + struct vm_object_fault_info fault_info_dst = {}; /* * In order not to confuse the clustered pageins, align @@ -5717,29 +5685,15 @@ vm_fault_copy( fault_info_src.interruptible = interruptible; fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info_src.user_tag = 0; - fault_info_src.pmap_options = 0; fault_info_src.lo_offset = vm_object_trunc_page(src_offset); fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; - fault_info_src.no_cache = FALSE; fault_info_src.stealth = TRUE; - fault_info_src.io_sync = FALSE; - fault_info_src.cs_bypass = FALSE; - fault_info_src.mark_zf_absent = FALSE; - fault_info_src.batch_pmap_op = FALSE; fault_info_dst.interruptible = interruptible; fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info_dst.user_tag = 0; - fault_info_dst.pmap_options = 0; fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; - fault_info_dst.no_cache = FALSE; fault_info_dst.stealth = TRUE; - fault_info_dst.io_sync = FALSE; - fault_info_dst.cs_bypass = FALSE; - fault_info_dst.mark_zf_absent = FALSE; - fault_info_dst.batch_pmap_op = FALSE; do { /* while (amount_left > 0) */ /* @@ -5756,13 +5710,12 @@ vm_fault_copy( vm_object_lock(dst_object); vm_object_paging_begin(dst_object); - if (amount_left > (vm_size_t) -1) { - /* 32-bit overflow */ - fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE); - } else { - 
fault_info_dst.cluster_size = (vm_size_t) amount_left; - assert(fault_info_dst.cluster_size == amount_left); + /* cap cluster size at maximum UPL size */ + upl_size_t cluster_size; + if (os_convert_overflow(amount_left, &cluster_size)) { + cluster_size = 0 - (upl_size_t)PAGE_SIZE; } + fault_info_dst.cluster_size = cluster_size; XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0); dst_page = VM_PAGE_NULL; @@ -5851,13 +5804,11 @@ vm_fault_copy( src_prot = VM_PROT_READ; vm_object_paging_begin(src_object); - if (amount_left > (vm_size_t) -1) { - /* 32-bit overflow */ - fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE); - } else { - fault_info_src.cluster_size = (vm_size_t) amount_left; - assert(fault_info_src.cluster_size == amount_left); + /* cap cluster size at maximum UPL size */ + if (os_convert_overflow(amount_left, &cluster_size)) { + cluster_size = 0 - (upl_size_t)PAGE_SIZE; } + fault_info_src.cluster_size = cluster_size; XPR(XPR_VM_FAULT, "vm_fault_copy(2) -> vm_fault_page\n", @@ -5972,7 +5923,7 @@ vm_fault_copy( dst_page, (vm_offset_t) dst_po, (vm_size_t)part_size); - if(!dst_page->dirty){ + if(!dst_page->vmp_dirty){ vm_object_lock(dst_object); SET_PAGE_DIRTY(dst_page, TRUE); vm_object_unlock(dst_object); @@ -5989,7 +5940,7 @@ vm_fault_copy( vm_page_copy(result_page, dst_page); vm_object_unlock(result_page_object); - if(!dst_page->dirty){ + if(!dst_page->vmp_dirty){ vm_object_lock(dst_object); SET_PAGE_DIRTY(dst_page, TRUE); vm_object_unlock(dst_object); @@ -6050,7 +6001,7 @@ vm_fault_classify(vm_object_t object, while (TRUE) { m = vm_page_lookup(object, offset); if (m != VM_PAGE_NULL) { - if (m->busy || m->error || m->restart || m->absent) { + if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) { type = VM_FAULT_TYPE_OTHER; break; } @@ -6164,23 +6115,23 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr) return 0; } - if (m->laundry || m->busy || m->free_when_done || m->absent || m->error || m->cleaning || - 
m->overwriting || m->restart || m->unusual) { + if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning || + m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) { return 0; } - assert(!m->private); - if (m->private) { + assert(!m->vmp_private); + if (m->vmp_private) { return 0; } - assert(!m->fictitious); - if (m->fictitious) { + assert(!m->vmp_fictitious); + if (m->vmp_fictitious) { return 0; } - assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); - if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { + assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); + if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { return 0; } @@ -6212,23 +6163,26 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr) } -void -vm_page_validate_cs_mapped( - vm_page_t page, - const void *kaddr) +/* + * vm_page_validate_cs_fast(): + * Performs a few quick checks to determine if the page's code signature + * really needs to be fully validated. It could: + * 1. have been modified (i.e. automatically tainted), + * 2. have already been validated, + * 3. have already been found to be tainted, + * 4. no longer have a backing store. + * Returns FALSE if the page needs to be fully validated. + */ +static boolean_t +vm_page_validate_cs_fast( + vm_page_t page) { - vm_object_t object; - vm_object_offset_t offset; - memory_object_t pager; - struct vnode *vnode; - boolean_t validated; - unsigned tainted; + vm_object_t object; - assert(page->busy); object = VM_PAGE_OBJECT(page); - vm_object_lock_assert_exclusive(object); + vm_object_lock_assert_held(object); - if (page->wpmapped && !page->cs_tainted) { + if (page->vmp_wpmapped && !page->vmp_cs_tainted) { /* * This page was mapped for "write" access sometime in the * past and could still be modifiable in the future. @@ -6236,38 +6190,76 @@ vm_page_validate_cs_mapped( * [ If the page was already found to be "tainted", no * need to re-validate. 
] */ - page->cs_validated = TRUE; - page->cs_tainted = TRUE; + vm_object_lock_assert_exclusive(object); + page->vmp_cs_validated = TRUE; + page->vmp_cs_tainted = TRUE; if (cs_debug) { - printf("CODESIGNING: vm_page_validate_cs: " + printf("CODESIGNING: %s: " "page %p obj %p off 0x%llx " "was modified\n", - page, object, page->offset); + __FUNCTION__, + page, object, page->vmp_offset); } vm_cs_validated_dirtied++; } - if (page->cs_validated || page->cs_tainted) { - return; + if (page->vmp_cs_validated || page->vmp_cs_tainted) { + return TRUE; } + vm_object_lock_assert_exclusive(object); - vm_cs_validates++; +#if CHECK_CS_VALIDATION_BITMAP + kern_return_t kr; - assert(object->code_signed); - offset = page->offset; + kr = vnode_pager_cs_check_validation_bitmap( + object->pager, + page->vmp_offset + object->paging_offset, + CS_BITMAP_CHECK); + if (kr == KERN_SUCCESS) { + page->vmp_cs_validated = TRUE; + page->vmp_cs_tainted = FALSE; + vm_cs_bitmap_validated++; + return TRUE; + } +#endif /* CHECK_CS_VALIDATION_BITMAP */ if (!object->alive || object->terminating || object->pager == NULL) { /* * The object is terminating and we don't have its pager * so we can't validate the data... */ - return; + return TRUE; } + + /* we need to really validate this page */ + vm_object_lock_assert_exclusive(object); + return FALSE; +} + +void +vm_page_validate_cs_mapped_slow( + vm_page_t page, + const void *kaddr) +{ + vm_object_t object; + memory_object_offset_t mo_offset; + memory_object_t pager; + struct vnode *vnode; + boolean_t validated; + unsigned tainted; + + assert(page->vmp_busy); + object = VM_PAGE_OBJECT(page); + vm_object_lock_assert_exclusive(object); + + vm_cs_validates++; + /* * Since we get here to validate a page that was brought in by * the pager, we know that this pager is all setup and ready * by now. 
*/ + assert(object->code_signed); assert(!object->internal); assert(object->pager != NULL); assert(object->pager_ready); @@ -6275,26 +6267,43 @@ vm_page_validate_cs_mapped( pager = object->pager; assert(object->paging_in_progress); vnode = vnode_pager_lookup_vnode(pager); + mo_offset = page->vmp_offset + object->paging_offset; /* verify the SHA1 hash for this page */ tainted = 0; validated = cs_validate_range(vnode, pager, - (object->paging_offset + - offset), + mo_offset, (const void *)((const char *)kaddr), PAGE_SIZE_64, &tainted); if (tainted & CS_VALIDATE_TAINTED) { - page->cs_tainted = TRUE; + page->vmp_cs_tainted = TRUE; } if (tainted & CS_VALIDATE_NX) { - page->cs_nx = TRUE; + page->vmp_cs_nx = TRUE; } - if (validated) { - page->cs_validated = TRUE; + page->vmp_cs_validated = TRUE; + } + +#if CHECK_CS_VALIDATION_BITMAP + if (page->vmp_cs_validated && !page->vmp_cs_tainted) { + vnode_pager_cs_check_validation_bitmap(object->pager, + mo_offset, + CS_BITMAP_SET); + } +#endif /* CHECK_CS_VALIDATION_BITMAP */ +} + +void +vm_page_validate_cs_mapped( + vm_page_t page, + const void *kaddr) +{ + if (!vm_page_validate_cs_fast(page)) { + vm_page_validate_cs_mapped_slow(page, kaddr); } } @@ -6314,53 +6323,18 @@ vm_page_validate_cs( object = VM_PAGE_OBJECT(page); vm_object_lock_assert_held(object); - if (page->wpmapped && !page->cs_tainted) { - vm_object_lock_assert_exclusive(object); - - /* - * This page was mapped for "write" access sometime in the - * past and could still be modifiable in the future. - * Consider it tainted. - * [ If the page was already found to be "tainted", no - * need to re-validate. 
] - */ - page->cs_validated = TRUE; - page->cs_tainted = TRUE; - if (cs_debug) { - printf("CODESIGNING: vm_page_validate_cs: " - "page %p obj %p off 0x%llx " - "was modified\n", - page, object, page->offset); - } - vm_cs_validated_dirtied++; - } - - if (page->cs_validated || page->cs_tainted) { - return; - } - - if (page->slid) { - panic("vm_page_validate_cs(%p): page is slid\n", page); - } - assert(!page->slid); - -#if CHECK_CS_VALIDATION_BITMAP - if ( vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page(page->offset + object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) { - page->cs_validated = TRUE; - page->cs_tainted = FALSE; - vm_cs_bitmap_validated++; + if (vm_page_validate_cs_fast(page)) { return; } -#endif vm_object_lock_assert_exclusive(object); assert(object->code_signed); - offset = page->offset; + offset = page->vmp_offset; - busy_page = page->busy; + busy_page = page->vmp_busy; if (!busy_page) { /* keep page busy while we map (and unlock) the VM object */ - page->busy = TRUE; + page->vmp_busy = TRUE; } /* @@ -6383,19 +6357,14 @@ vm_page_validate_cs( &koffset, &need_unmap); if (kr != KERN_SUCCESS) { - panic("vm_page_validate_cs: could not map page: 0x%x\n", kr); + panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr); } kaddr = CAST_DOWN(vm_offset_t, koffset); /* validate the mapped page */ - vm_page_validate_cs_mapped(page, (const void *) kaddr); + vm_page_validate_cs_mapped_slow(page, (const void *) kaddr); -#if CHECK_CS_VALIDATION_BITMAP - if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) { - vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET ); - } -#endif - assert(page->busy); + assert(page->vmp_busy); assert(object == VM_PAGE_OBJECT(page)); vm_object_lock_assert_exclusive(object); @@ -6431,12 +6400,12 @@ vm_page_validate_cs_mapped_chunk( *validated_p = FALSE; *tainted_p = 0; - assert(page->busy); + assert(page->vmp_busy); object = 
VM_PAGE_OBJECT(page); vm_object_lock_assert_exclusive(object); assert(object->code_signed); - offset = page->offset; + offset = page->vmp_offset; if (!object->alive || object->terminating || object->pager == NULL) { /* @@ -6479,3 +6448,98 @@ vm_page_validate_cs_mapped_chunk( *tainted_p = tainted; } } + +static void vm_rtfrecord_lock(void) { + lck_spin_lock(&vm_rtfr_slock); +} + +static void vm_rtfrecord_unlock(void) { + lck_spin_unlock(&vm_rtfr_slock); +} + +unsigned int vmrtfaultinfo_bufsz(void) { + return (vmrtf_num_records * sizeof(vm_rtfault_record_t)); +} + +#include + +static void vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault) { + uint64_t fend = mach_continuous_time(); + + uint64_t cfpc = 0; + uint64_t ctid = cthread->thread_id; + uint64_t cupid = get_current_unique_pid(); + + uintptr_t bpc = 0; + uint32_t bfrs = 0; + bool u64 = false; + + /* Capture a single-frame backtrace; this extracts just the program + * counter at the point of the fault into "bpc", and should perform no + * further user stack traversals, thus avoiding copyin()s and further + * faults. 
+ */ + int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64); + + if ((btr == 0) && (bfrs > 0)) { + cfpc = bpc; + } + + assert((fstart != 0) && fend >= fstart); + vm_rtfrecord_lock(); + assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi); + + vmrtfrs.vmrtf_total++; + vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++]; + + cvmr->rtfabstime = fstart; + cvmr->rtfduration = fend - fstart; + cvmr->rtfaddr = fault_vaddr; + cvmr->rtfpc = cfpc; + cvmr->rtftype = type_of_fault; + cvmr->rtfupid = cupid; + cvmr->rtftid = ctid; + + if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) { + vmrtfrs.vmrtfr_curi = 0; + } + + vm_rtfrecord_unlock(); +} + +int vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv) { + vm_rtfault_record_t *cvmrd = vrecords; + size_t residue = vrecordsz; + int numextracted = 0; + boolean_t early_exit = FALSE; + + vm_rtfrecord_lock(); + + for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) { + + if (residue < sizeof(vm_rtfault_record_t)) { + early_exit = TRUE; + break; + } + + if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) { +#if DEVELOPMENT || DEBUG + if (isroot == FALSE) { + continue; + } +#else + continue; +#endif /* DEVDEBUG */ + } + + *cvmrd = vmrtfrs.vm_rtf_records[vmfi]; + cvmrd++; + residue -= sizeof(vm_rtfault_record_t); + numextracted++; + } + + vm_rtfrecord_unlock(); + + *vmrtfrv = numextracted; + return (early_exit); +} diff --git a/osfmk/vm/vm_fault.h b/osfmk/vm/vm_fault.h index 666b7ef52..1dc0839e7 100644 --- a/osfmk/vm/vm_fault.h +++ b/osfmk/vm/vm_fault.h @@ -182,10 +182,7 @@ extern kern_return_t vm_fault_enter( boolean_t wired, boolean_t change_wiring, vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */ - boolean_t no_cache, - boolean_t cs_bypass, - int user_tag, - int pmap_options, + vm_object_fault_info_t fault_info, boolean_t *need_retry, int *type_of_fault); @@ -193,6 +190,7 @@ extern vm_offset_t kdp_lightweight_fault( vm_map_t map, 
vm_offset_t cur_target_addr); +extern void vm_rtfault_record_init(void); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_fourk_pager.c b/osfmk/vm/vm_fourk_pager.c index 407cbb916..d8de27077 100644 --- a/osfmk/vm/vm_fourk_pager.c +++ b/osfmk/vm/vm_fourk_pager.c @@ -1068,7 +1068,7 @@ fourk_pager_data_request( kr); } assert(src_page != VM_PAGE_NULL); - assert(src_page->busy); + assert(src_page->vmp_busy); src_page_object = VM_PAGE_OBJECT(src_page); @@ -1164,7 +1164,7 @@ fourk_pager_data_request( offset, cur_offset, (sub_page-sub_page_idx)*FOURK_PAGE_SIZE, src_page_object, - src_page->offset + offset_in_src_page, + src_page->vmp_offset + offset_in_src_page, *(uint64_t *)(dst_vaddr + ((sub_page-sub_page_idx) * FOURK_PAGE_SIZE)), @@ -1302,7 +1302,7 @@ fourk_pager_data_request( kr = vm_map_remove(kernel_map, kernel_mapping, kernel_mapping + (2 * PAGE_SIZE_64), - VM_MAP_NO_FLAGS); + VM_MAP_REMOVE_NO_FLAGS); assert(kr == KERN_SUCCESS); kernel_mapping = 0; src_vaddr = 0; diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index 326ac01a5..82f7ce30c 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -113,31 +113,12 @@ vm_mem_bootstrap_log(const char *message) * This is done only by the first cpu up. */ -int pacified_footprint_suspend = 0; -int pacified_purgeable_iokit = 0; - void vm_mem_bootstrap(void) { vm_offset_t start, end; vm_size_t zsizearg; mach_vm_size_t zsize; - int pacified; - - pacified = 0; - PE_parse_boot_argn("pacified", - &pacified, - sizeof (pacified)); - if (pacified) { - pacified_footprint_suspend = 1; - pacified_purgeable_iokit = 1; - } - PE_parse_boot_argn("pacified_footprint_suspend", - &pacified_footprint_suspend, - sizeof (pacified_footprint_suspend)); - PE_parse_boot_argn("pacified_purgeable_iokit", - &pacified_purgeable_iokit, - sizeof (pacified_purgeable_iokit)); /* * Initializes resident memory structures. 
@@ -198,9 +179,10 @@ vm_mem_bootstrap(void) if (zsize < ZONE_MAP_MIN) zsize = ZONE_MAP_MIN; /* Clamp to min */ + #if defined(__LP64__) zsize += zsize >> 1; -#endif /* __LP64__ */ +#endif /* __LP64__ */ if (zsize > sane_size >> 1) zsize = sane_size >> 1; /* Clamp to half of RAM max */ #if !__LP64__ @@ -208,25 +190,6 @@ vm_mem_bootstrap(void) zsize = ZONE_MAP_MAX; /* Clamp to 1.5GB max for K32 */ #endif /* !__LP64__ */ -#if CONFIG_EMBEDDED -#if defined(__LP64__) - { - mach_vm_size_t max_zsize; - - /* - * because of the limited kernel virtual space for embedded systems, - * we need to clamp the size of the zone map being created... replicate - * the above calculation for a 1Gbyte, LP64 system and use that as the - * maximum size for the zone map - */ - max_zsize = (1024ULL * 1024ULL * 1024ULL) >> 2ULL; - max_zsize += max_zsize >> 1; - - if (zsize > max_zsize) - zsize = max_zsize; - } -#endif -#endif vm_mem_bootstrap_log("kext_alloc_init"); kext_alloc_init(); @@ -261,6 +224,11 @@ vm_mem_bootstrap(void) vm_paging_map_init(); vm_mem_bootstrap_log("vm_mem_bootstrap done"); + +#ifdef CONFIG_ZCACHE + zcache_bootstrap(); +#endif + vm_rtfault_record_init(); } void diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c index a14a10db9..8e53cbd13 100644 --- a/osfmk/vm/vm_kern.c +++ b/osfmk/vm/vm_kern.c @@ -175,7 +175,7 @@ kmem_alloc_contig( VM_MAP_PAGE_MASK(map)), vm_map_round_page(map_addr + map_size, VM_MAP_PAGE_MASK(map)), - 0); + VM_MAP_REMOVE_NO_FLAGS); vm_object_deallocate(object); *addrp = 0; return kr; @@ -186,7 +186,7 @@ kmem_alloc_contig( m = pages; pages = NEXT_PAGE(m); *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL; - m->busy = FALSE; + m->vmp_busy = FALSE; vm_page_insert(m, object, offset + i); } vm_object_unlock(object); @@ -210,7 +210,7 @@ kmem_alloc_contig( VM_MAP_PAGE_MASK(map)), vm_map_round_page(map_addr + map_size, VM_MAP_PAGE_MASK(map)), - 0); + VM_MAP_REMOVE_NO_FLAGS); vm_object_deallocate(object); return kr; } @@ -265,6 +265,7 @@ kernel_memory_allocate( vm_page_t 
wired_page_list = NULL; int guard_page_count = 0; int wired_page_count = 0; + int page_grab_count = 0; int i; int vm_alloc_flags; vm_map_kernel_flags_t vmk_flags; @@ -294,7 +295,8 @@ kernel_memory_allocate( * limit raised to 2GB with 128GB max physical limit, * but scaled by installed memory above this */ - if ( !(flags & KMA_VAONLY) && map_size > MAX(1ULL<<31, sane_size/64)) { + if (!(flags & (KMA_VAONLY | KMA_PAGEABLE)) && + map_size > MAX(1ULL<<31, sane_size/64)) { return KERN_RESOURCE_SHORTAGE; } @@ -340,6 +342,10 @@ kernel_memory_allocate( wired_page_count = (int) (fill_size / PAGE_SIZE_64); assert(wired_page_count * PAGE_SIZE_64 == fill_size); +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0); +#endif + for (i = 0; i < guard_page_count; i++) { for (;;) { mem = vm_page_grab_guard(); @@ -352,11 +358,11 @@ kernel_memory_allocate( } vm_page_more_fictitious(); } - mem->snext = guard_page_list; + mem->vmp_snext = guard_page_list; guard_page_list = mem; } - if (! 
(flags & KMA_VAONLY)) { + if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) { for (i = 0; i < wired_page_count; i++) { uint64_t unavailable; @@ -385,8 +391,9 @@ kernel_memory_allocate( } VM_PAGE_WAIT(); } + page_grab_count++; if (KMA_ZERO & flags) vm_page_zero_fill(mem); - mem->snext = wired_page_list; + mem->vmp_snext = wired_page_list; wired_page_list = mem; } } @@ -424,7 +431,7 @@ kernel_memory_allocate( VME_OBJECT_SET(entry, object); VME_OFFSET_SET(entry, offset); - if (object != compressor_object) + if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) entry->wired_count++; if (flags & KMA_PERMANENT) @@ -443,12 +450,12 @@ kernel_memory_allocate( panic("kernel_memory_allocate: guard_page_list == NULL"); mem = guard_page_list; - guard_page_list = mem->snext; - mem->snext = NULL; + guard_page_list = mem->vmp_snext; + mem->vmp_snext = NULL; vm_page_insert(mem, object, offset + pg_offset); - mem->busy = FALSE; + mem->vmp_busy = FALSE; pg_offset += PAGE_SIZE_64; } @@ -461,7 +468,7 @@ kernel_memory_allocate( } #endif - if (flags & KMA_VAONLY) { + if (flags & (KMA_VAONLY | KMA_PAGEABLE)) { pg_offset = fill_start + fill_size; } else { for (pg_offset = fill_start; pg_offset < fill_start + fill_size; pg_offset += PAGE_SIZE_64) { @@ -469,24 +476,24 @@ kernel_memory_allocate( panic("kernel_memory_allocate: wired_page_list == NULL"); mem = wired_page_list; - wired_page_list = mem->snext; - mem->snext = NULL; + wired_page_list = mem->vmp_snext; + mem->vmp_snext = NULL; - assert(mem->wire_count == 0); - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(mem->vmp_wire_count == 0); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); - mem->vm_page_q_state = VM_PAGE_IS_WIRED; - mem->wire_count++; - if (__improbable(mem->wire_count == 0)) { + mem->vmp_q_state = VM_PAGE_IS_WIRED; + mem->vmp_wire_count++; + if (__improbable(mem->vmp_wire_count == 0)) { panic("kernel_memory_allocate(%p): wire_count overflow", mem); } vm_page_insert_wired(mem, object, offset + pg_offset, tag); - mem->busy = 
FALSE; - mem->pmapped = TRUE; - mem->wpmapped = TRUE; + mem->vmp_busy = FALSE; + mem->vmp_pmapped = TRUE; + mem->vmp_wpmapped = TRUE; PMAP_ENTER_OPTIONS(kernel_pmap, map_addr + pg_offset, mem, kma_prot, VM_PROT_NONE, ((flags & KMA_KSTACK) ? VM_MEM_STACK : 0), TRUE, @@ -517,17 +524,17 @@ kernel_memory_allocate( panic("kernel_memory_allocate: guard_page_list == NULL"); mem = guard_page_list; - guard_page_list = mem->snext; - mem->snext = NULL; + guard_page_list = mem->vmp_snext; + mem->vmp_snext = NULL; vm_page_insert(mem, object, offset + pg_offset); - mem->busy = FALSE; + mem->vmp_busy = FALSE; } if (guard_page_list || wired_page_list) panic("kernel_memory_allocate: non empty list\n"); - if (! (flags & KMA_VAONLY)) { + if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) { vm_page_lockspin_queues(); vm_page_wire_count += wired_page_count; vm_page_unlock_queues(); @@ -543,6 +550,10 @@ kernel_memory_allocate( else vm_object_deallocate(object); +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); +#endif + /* * Return the memory, not zeroed. 
*/ @@ -556,6 +567,10 @@ kernel_memory_allocate( if (wired_page_list) vm_page_free_list(wired_page_list, FALSE); +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); +#endif + return kr; } @@ -573,8 +588,13 @@ kernel_memory_populate( vm_page_t mem; vm_page_t page_list = NULL; int page_count = 0; + int page_grab_count = 0; int i; +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0); +#endif + page_count = (int) (size / PAGE_SIZE_64); assert((flags & (KMA_COMPRESSOR|KMA_KOBJECT)) != (KMA_COMPRESSOR|KMA_KOBJECT)); @@ -592,8 +612,9 @@ kernel_memory_populate( VM_PAGE_WAIT(); } + page_grab_count++; if (KMA_ZERO & flags) vm_page_zero_fill(mem); - mem->snext = page_list; + mem->vmp_snext = page_list; page_list = mem; pg_offset -= PAGE_SIZE_64; @@ -616,16 +637,16 @@ kernel_memory_populate( pg_offset += PAGE_SIZE_64) { mem = page_list; - page_list = mem->snext; - mem->snext = NULL; + page_list = mem->vmp_snext; + mem->vmp_snext = NULL; vm_page_insert(mem, object, offset + pg_offset); - assert(mem->busy); + assert(mem->vmp_busy); - mem->busy = FALSE; - mem->pmapped = TRUE; - mem->wpmapped = TRUE; - mem->vm_page_q_state = VM_PAGE_USED_BY_COMPRESSOR; + mem->vmp_busy = FALSE; + mem->vmp_pmapped = TRUE; + mem->vmp_wpmapped = TRUE; + mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR; } vm_object_unlock(object); @@ -636,6 +657,10 @@ kernel_memory_populate( kasan_notify_address(addr, size); } #endif + +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); +#endif return KERN_SUCCESS; } @@ -660,8 +685,9 @@ kernel_memory_populate( } VM_PAGE_WAIT(); } + page_grab_count++; if (KMA_ZERO & flags) vm_page_zero_fill(mem); - mem->snext = page_list; + mem->vmp_snext = page_list; page_list = mem; } if (flags & KMA_KOBJECT) { @@ -691,22 +717,21 @@ kernel_memory_populate( 
panic("kernel_memory_populate: page_list == NULL"); mem = page_list; - page_list = mem->snext; - mem->snext = NULL; - - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); - mem->vm_page_q_state = VM_PAGE_IS_WIRED; - mem->wire_count++; - if (__improbable(mem->wire_count == 0)) { - panic("kernel_memory_populate(%p): wire_count overflow", - mem); + page_list = mem->vmp_snext; + mem->vmp_snext = NULL; + + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); + mem->vmp_q_state = VM_PAGE_IS_WIRED; + mem->vmp_wire_count++; + if (__improbable(mem->vmp_wire_count == 0)) { + panic("kernel_memory_populate(%p): wire_count overflow", mem); } vm_page_insert_wired(mem, object, offset + pg_offset, tag); - mem->busy = FALSE; - mem->pmapped = TRUE; - mem->wpmapped = TRUE; + mem->vmp_busy = FALSE; + mem->vmp_pmapped = TRUE; + mem->vmp_wpmapped = TRUE; PMAP_ENTER_OPTIONS(kernel_pmap, addr + pg_offset, mem, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, @@ -732,10 +757,14 @@ kernel_memory_populate( pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); } } - vm_page_lock_queues(); + vm_page_lockspin_queues(); vm_page_wire_count += page_count; vm_page_unlock_queues(); +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); +#endif + if (kernel_object == object) vm_tag_update_size(tag, size); vm_object_unlock(object); @@ -753,6 +782,10 @@ kernel_memory_populate( if (page_list) vm_page_free_list(page_list, FALSE); +#if DEBUG || DEVELOPMENT + VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); +#endif + return kr; } @@ -804,21 +837,21 @@ kernel_memory_depopulate( assert(mem); - if (mem->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR) + if (mem->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem)); - mem->busy = TRUE; + mem->vmp_busy = TRUE; - assert(mem->tabled); + assert(mem->vmp_tabled); vm_page_remove(mem, TRUE); - assert(mem->busy); + 
assert(mem->vmp_busy); - assert(mem->pageq.next == 0 && mem->pageq.prev == 0); - assert((mem->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || - (mem->vm_page_q_state == VM_PAGE_NOT_ON_Q)); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0); + assert((mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (mem->vmp_q_state == VM_PAGE_NOT_ON_Q)); - mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; - mem->snext = local_freeq; + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; + mem->vmp_snext = local_freeq; local_freeq = mem; } vm_object_unlock(object); @@ -977,7 +1010,7 @@ kmem_realloc( kr = vm_map_wire_kernel(map, newmapaddr, newmapaddr + newmapsize, VM_PROT_DEFAULT, tag, FALSE); if (KERN_SUCCESS != kr) { - vm_map_remove(map, newmapaddr, newmapaddr + newmapsize, 0); + vm_map_remove(map, newmapaddr, newmapaddr + newmapsize, VM_MAP_REMOVE_NO_FLAGS); vm_object_lock(object); for(offset = oldsize; offset < newmapsize; offset += PAGE_SIZE) { if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { @@ -1167,7 +1200,7 @@ kmem_alloc_pages( VM_PAGE_WAIT(); vm_object_lock(object); } - mem->busy = FALSE; + mem->vmp_busy = FALSE; alloc_size -= PAGE_SIZE; offset += PAGE_SIZE; @@ -1248,7 +1281,8 @@ kmem_suballoc( /* * See comment preceding vm_map_submap(). */ - vm_map_remove(parent, map_addr, map_addr + map_size, VM_MAP_NO_FLAGS); + vm_map_remove(parent, map_addr, map_addr + map_size, + VM_MAP_REMOVE_NO_FLAGS); vm_map_deallocate(map); /* also removes ref to pmap */ vm_object_deallocate(vm_submap_object); return (kr); diff --git a/osfmk/vm/vm_kern.h b/osfmk/vm/vm_kern.h index 8cab89ce4..d63523e08 100644 --- a/osfmk/vm/vm_kern.h +++ b/osfmk/vm/vm_kern.h @@ -66,6 +66,10 @@ #ifndef _VM_VM_KERN_H_ #define _VM_VM_KERN_H_ +#ifdef __cplusplus +extern "C" { +#endif + #include #include #include @@ -98,6 +102,7 @@ extern kern_return_t kernel_memory_allocate( #define KMA_COMPRESSOR 0x400 /* Pages belonging to the compressor are not on the paging queues, nor are they counted as wired. 
*/ #define KMA_ATOMIC 0x800 #define KMA_ZERO 0x1000 +#define KMA_PAGEABLE 0x2000 extern kern_return_t kmem_alloc( vm_map_t map, @@ -326,6 +331,7 @@ extern kern_return_t mach_vm_map_kernel( mach_vm_size_t initial_size, mach_vm_offset_t mask, int flags, + vm_map_kernel_flags_t vmk_flags, vm_tag_t tag, ipc_port_t port, vm_object_offset_t offset, @@ -341,6 +347,7 @@ extern kern_return_t vm_map_kernel( vm_size_t size, vm_offset_t mask, int flags, + vm_map_kernel_flags_t vmk_flags, vm_tag_t tag, ipc_port_t port, vm_offset_t offset, @@ -383,6 +390,7 @@ extern kern_return_t vm_map_64_kernel( vm_size_t size, vm_offset_t mask, int flags, + vm_map_kernel_flags_t vmk_flags, vm_tag_t tag, ipc_port_t port, vm_object_offset_t offset, @@ -455,4 +463,9 @@ extern void vm_kernel_addrhash_external( extern void vm_init_before_launchd(void); #endif /* KERNEL */ + +#ifdef __cplusplus +} +#endif + #endif /* _VM_VM_KERN_H_ */ diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index 573c2a34e..ad60f1693 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -84,10 +84,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -95,6 +97,7 @@ #include #include #include +#include #include #include #include @@ -112,9 +115,18 @@ #include +#include +#include +#if DEVELOPMENT || DEBUG +extern int proc_selfcsflags(void); +#if CONFIG_EMBEDDED +extern int panic_on_unsigned_execute; +#endif /* CONFIG_EMBEDDED */ +#endif /* DEVELOPMENT || DEBUG */ + #if __arm64__ -extern int fourk_binary_compatibility_unsafe; -extern int fourk_binary_compatibility_allow_wx; +extern const int fourk_binary_compatibility_unsafe; +extern const int fourk_binary_compatibility_allow_wx; #endif /* __arm64__ */ extern int proc_selfpid(void); extern char *proc_name_address(void *p); @@ -126,8 +138,8 @@ int vm_map_debug_apple_protect = 0; int vm_map_debug_fourk = 0; #endif /* VM_MAP_DEBUG_FOURK */ -int vm_map_executable_immutable = 0; -int vm_map_executable_immutable_no_log = 
0; +SECURITY_READ_ONLY_LATE(int) vm_map_executable_immutable = 1; +int vm_map_executable_immutable_verbose = 0; extern u_int32_t random(void); /* from */ /* Internal prototypes @@ -180,6 +192,11 @@ static kern_return_t vm_map_delete( int flags, vm_map_t zap_map); +static void vm_map_copy_insert( + vm_map_t map, + vm_map_entry_t after_where, + vm_map_copy_t copy); + static kern_return_t vm_map_copy_overwrite_unaligned( vm_map_t dst_map, vm_map_entry_t entry, @@ -317,6 +334,9 @@ static kern_return_t vm_map_pageout( vm_map_offset_t end); #endif /* MACH_ASSERT */ +static void vm_map_corpse_footprint_destroy( + vm_map_t map); + pid_t find_largest_process_vm_map_entries(void); /* @@ -329,6 +349,34 @@ pid_t find_largest_process_vm_map_entries(void); * vm_map_copyout. */ +#if CONFIG_EMBEDDED + +/* + * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy(). + * But for security reasons on embedded platforms, we don't want the + * new mapping to be "used for jit", so we always reset the flag here. + * Same for "pmap_cs_associated". + */ +#define VM_MAP_ENTRY_COPY_CODE_SIGNING(NEW,OLD) \ +MACRO_BEGIN \ + (NEW)->used_for_jit = FALSE; \ + (NEW)->pmap_cs_associated = FALSE; \ +MACRO_END + +#else /* CONFIG_EMBEDDED */ + +/* + * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy(). + * On macOS, the new mapping can be "used for jit". 
+ */ +#define VM_MAP_ENTRY_COPY_CODE_SIGNING(NEW,OLD) \ +MACRO_BEGIN \ + assert((NEW)->used_for_jit == (OLD)->used_for_jit); \ + assert((NEW)->pmap_cs_associated == FALSE); \ +MACRO_END + +#endif /* CONFIG_EMBEDDED */ + #define vm_map_entry_copy(NEW,OLD) \ MACRO_BEGIN \ boolean_t _vmec_reserved = (NEW)->from_reserved_zone; \ @@ -339,7 +387,7 @@ boolean_t _vmec_reserved = (NEW)->from_reserved_zone; \ (NEW)->wired_count = 0; \ (NEW)->user_wired_count = 0; \ (NEW)->permanent = FALSE; \ - (NEW)->used_for_jit = FALSE; \ + VM_MAP_ENTRY_COPY_CODE_SIGNING((NEW),(OLD)); \ (NEW)->from_reserved_zone = _vmec_reserved; \ if ((NEW)->iokit_acct) { \ assertf(!(NEW)->use_pmap, "old %p new %p\n", (OLD), (NEW)); \ @@ -708,7 +756,7 @@ vm_map_apple_protected( vm_flags, vmk_flags, VM_KERN_MEMORY_NONE, - (ipc_port_t) unprotected_mem_obj, + (ipc_port_t)(uintptr_t) unprotected_mem_obj, 0, TRUE, tmp_entry.protection, @@ -771,6 +819,14 @@ lck_grp_attr_t vm_map_lck_grp_attr; lck_attr_t vm_map_lck_attr; lck_attr_t vm_map_lck_rw_attr; +#if CONFIG_EMBEDDED +int malloc_no_cow = 1; +#define VM_PROTECT_WX_FAIL 0 +#else /* CONFIG_EMBEDDED */ +int malloc_no_cow = 0; +#define VM_PROTECT_WX_FAIL 1 +#endif /* CONFIG_EMBEDDED */ +uint64_t vm_memory_malloc_no_cow_mask = 0ULL; /* * vm_map_init: @@ -893,9 +949,29 @@ vm_map_init( PE_parse_boot_argn("vm_map_executable_immutable", &vm_map_executable_immutable, sizeof(vm_map_executable_immutable)); - PE_parse_boot_argn("vm_map_executable_immutable_no_log", - &vm_map_executable_immutable_no_log, - sizeof(vm_map_executable_immutable_no_log)); + PE_parse_boot_argn("vm_map_executable_immutable_verbose", + &vm_map_executable_immutable_verbose, + sizeof(vm_map_executable_immutable_verbose)); + + PE_parse_boot_argn("malloc_no_cow", + &malloc_no_cow, + sizeof(malloc_no_cow)); + if (malloc_no_cow) { + vm_memory_malloc_no_cow_mask = 0ULL; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL; + 
vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE; +// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE; +// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO; +// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC; + PE_parse_boot_argn("vm_memory_malloc_no_cow_mask", + &vm_memory_malloc_no_cow_mask, + sizeof(vm_memory_malloc_no_cow_mask)); + } } void @@ -954,7 +1030,7 @@ vm_map_disable_hole_optimization(vm_map_t map) if (map->holelistenabled) { - head_entry = hole_entry = (vm_map_entry_t) map->holes_list; + head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); while (hole_entry != NULL) { @@ -994,15 +1070,35 @@ vm_kernel_map_is_kernel(vm_map_t map) { vm_map_t vm_map_create( - pmap_t pmap, + pmap_t pmap, vm_map_offset_t min, vm_map_offset_t max, - boolean_t pageable) + boolean_t pageable) +{ + int options; + + options = 0; + if (pageable) { + options |= VM_MAP_CREATE_PAGEABLE; + } + return vm_map_create_options(pmap, min, max, options); +} + +vm_map_t +vm_map_create_options( + pmap_t pmap, + vm_map_offset_t min, + vm_map_offset_t max, + int options) { - static int color_seed = 0; vm_map_t result; struct vm_map_links *hole_entry = NULL; + if (options & ~(VM_MAP_CREATE_ALL_OPTIONS)) { + /* unknown option */ + return VM_MAP_NULL; + } + result = (vm_map_t) zalloc(vm_map_zone); if (result == VM_MAP_NULL) panic("vm_map_create"); @@ -1010,7 +1106,11 @@ vm_map_create( vm_map_first_entry(result) = vm_map_to_entry(result); vm_map_last_entry(result) = vm_map_to_entry(result); result->hdr.nentries = 0; - result->hdr.entries_pageable = pageable; + if (options & VM_MAP_CREATE_PAGEABLE) { + result->hdr.entries_pageable = TRUE; + } else { + 
result->hdr.entries_pageable = FALSE; + } vm_map_store_init( &(result->hdr) ); @@ -1022,7 +1122,7 @@ vm_map_create( #if __x86_64__ result->vmmap_high_start = 0; #endif /* __x86_64__ */ - result->ref_count = 1; + result->map_refcnt = 1; #if TASK_SWAPPER result->res_count = 1; result->sw_state = MAP_SW_IN; @@ -1042,25 +1142,30 @@ vm_map_create( result->highest_entry_end = 0; result->first_free = vm_map_to_entry(result); result->hint = vm_map_to_entry(result); - result->color_rr = (color_seed++) & vm_color_mask; result->jit_entry_exists = FALSE; - if (vm_map_supports_hole_optimization) { - hole_entry = zalloc(vm_map_holes_zone); + /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */ + if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) { + result->has_corpse_footprint = TRUE; + result->holelistenabled = FALSE; + result->vmmap_corpse_footprint = NULL; + } else { + result->has_corpse_footprint = FALSE; + if (vm_map_supports_hole_optimization) { + hole_entry = zalloc(vm_map_holes_zone); - hole_entry->start = min; + hole_entry->start = min; #if defined(__arm__) || defined(__arm64__) - hole_entry->end = result->max_offset; + hole_entry->end = result->max_offset; #else - hole_entry->end = (max > (vm_map_offset_t)MACH_VM_MAX_ADDRESS) ? max : (vm_map_offset_t)MACH_VM_MAX_ADDRESS; + hole_entry->end = (max > (vm_map_offset_t)MACH_VM_MAX_ADDRESS) ? 
max : (vm_map_offset_t)MACH_VM_MAX_ADDRESS; #endif - result->holes_list = result->hole_hint = hole_entry; - hole_entry->prev = hole_entry->next = (vm_map_entry_t) hole_entry; - result->holelistenabled = TRUE; - - } else { - - result->holelistenabled = FALSE; + result->holes_list = result->hole_hint = hole_entry; + hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry); + result->holelistenabled = TRUE; + } else { + result->holelistenabled = FALSE; + } } vm_map_lock_init(result); @@ -1190,7 +1295,7 @@ void vm_map_res_reference(vm_map_t map) { /* assert map is locked */ assert(map->res_count >= 0); - assert(map->ref_count >= map->res_count); + assert(map->map_refcnt >= map->res_count); if (map->res_count == 0) { lck_mtx_unlock(&map->s_lock); vm_map_lock(map); @@ -1215,8 +1320,8 @@ void vm_map_reference_swap(vm_map_t map) assert(map != VM_MAP_NULL); lck_mtx_lock(&map->s_lock); assert(map->res_count >= 0); - assert(map->ref_count >= map->res_count); - map->ref_count++; + assert(map->map_refcnt >= map->res_count); + map->map_refcnt++; vm_map_res_reference(map); lck_mtx_unlock(&map->s_lock); } @@ -1241,7 +1346,7 @@ void vm_map_res_deallocate(vm_map_t map) vm_map_unlock(map); lck_mtx_lock(&map->s_lock); } - assert(map->ref_count >= map->res_count); + assert(map->map_refcnt >= map->res_count); } #endif /* MACH_ASSERT && TASK_SWAPPER */ @@ -1261,6 +1366,8 @@ vm_map_destroy( flags |= VM_MAP_REMOVE_NO_UNNESTING; /* final cleanup: ok to remove immutable mappings */ flags |= VM_MAP_REMOVE_IMMUTABLE; + /* final cleanup: allow gaps in range */ + flags |= VM_MAP_REMOVE_GAPS_OK; /* clean up regular map entries */ (void) vm_map_delete(map, map->min_offset, map->max_offset, @@ -1272,6 +1379,8 @@ vm_map_destroy( #endif /* !__arm__ && !__arm64__ */ vm_map_disable_hole_optimization(map); + vm_map_corpse_footprint_destroy(map); + vm_map_unlock(map); assert(map->hdr.nentries == 0); @@ -1590,7 +1699,7 @@ vm_map_find_space( VM_MAP_HIGHEST_ENTRY(map, entry, start); } else { 
if (map->holelistenabled) { - hole_entry = (vm_map_entry_t)map->holes_list; + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); if (hole_entry == NULL) { /* @@ -1638,7 +1747,9 @@ vm_map_find_space( return(KERN_NO_SPACE); } start = end; + assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); end += size; + assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); if ((end > map->max_offset) || (end < start)) { vm_map_entry_dispose(map, new_entry); @@ -1675,7 +1786,7 @@ vm_map_find_space( entry = next; if (map->holelistenabled) { - if (entry == (vm_map_entry_t) map->holes_list) { + if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { /* * Wrapped around */ @@ -1748,6 +1859,7 @@ vm_map_find_space( } new_entry->used_for_jit = FALSE; + new_entry->pmap_cs_associated = FALSE; new_entry->zero_wired_pages = FALSE; new_entry->iokit_acct = FALSE; new_entry->vme_resilient_codesign = FALSE; @@ -1763,7 +1875,7 @@ vm_map_find_space( * Insert the new entry into the list */ - vm_map_store_entry_link(map, entry, new_entry); + vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE); map->size += size; @@ -1804,6 +1916,7 @@ vm_map_pmap_enter( { int type_of_fault; kern_return_t kr; + struct vm_object_fault_info fault_info = {}; if(map->pmap == 0) return; @@ -1826,8 +1939,8 @@ vm_map_pmap_enter( m = vm_page_lookup(object, offset); - if (m == VM_PAGE_NULL || m->busy || m->fictitious || - (m->unusual && ( m->error || m->restart || m->absent))) { + if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious || + (m->vmp_unusual && ( m->vmp_error || m->vmp_restart || m->vmp_absent))) { vm_object_unlock(object); return; } @@ -1838,16 +1951,14 @@ vm_map_pmap_enter( map, (unsigned long long)addr, object, (unsigned long long)offset); } type_of_fault = DBG_CACHE_HIT_FAULT; - kr = vm_fault_enter(m, map->pmap, addr, protection, protection, - VM_PAGE_WIRED(m), - FALSE, /* change_wiring */ - VM_KERN_MEMORY_NONE, /* tag - not wiring */ - FALSE, /* no_cache */ - FALSE, /* 
cs_bypass */ - 0, /* XXX need user tag / alias? */ - 0, /* pmap_options */ - NULL, /* need_retry */ - &type_of_fault); + kr = vm_fault_enter(m, map->pmap, + addr, protection, protection, + VM_PAGE_WIRED(m), + FALSE, /* change_wiring */ + VM_KERN_MEMORY_NONE, /* tag - not wiring */ + &fault_info, + NULL, /* need_retry */ + &type_of_fault); vm_object_unlock(object); @@ -1944,6 +2055,19 @@ vm_map_random_address_for_size( return kr; } +static boolean_t +vm_memory_malloc_no_cow( + int alias) +{ + uint64_t alias_mask; + + alias_mask = 1ULL << alias; + if (alias_mask & vm_memory_malloc_no_cow_mask) { + return TRUE; + } + return FALSE; +} + /* * Routine: vm_map_enter * @@ -1977,6 +2101,7 @@ vm_map_enter( vm_map_offset_t start, tmp_start, tmp_offset; vm_map_offset_t end, tmp_end; vm_map_offset_t tmp2_start, tmp2_end; + vm_map_offset_t desired_empty_end; vm_map_offset_t step; kern_return_t result = KERN_SUCCESS; vm_map_t zap_old_map = VM_MAP_NULL; @@ -2042,16 +2167,36 @@ vm_map_enter( } -#if CONFIG_EMBEDDED - if (cur_protection & VM_PROT_WRITE){ - if ((cur_protection & VM_PROT_EXECUTE) && !entry_for_jit){ - printf("EMBEDDED: %s: curprot cannot be write+execute. " - "turning off execute\n", - __FUNCTION__); - cur_protection &= ~VM_PROT_EXECUTE; - } + if ((cur_protection & VM_PROT_WRITE) && + (cur_protection & VM_PROT_EXECUTE) && +#if !CONFIG_EMBEDDED + map != kernel_map && + (cs_process_global_enforcement() || + (vmk_flags.vmkf_cs_enforcement_override + ? vmk_flags.vmkf_cs_enforcement + : cs_process_enforcement(NULL))) && +#endif /* !CONFIG_EMBEDDED */ + !entry_for_jit) { + DTRACE_VM3(cs_wx, + uint64_t, 0, + uint64_t, 0, + vm_prot_t, cur_protection); + printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. " +#if VM_PROTECT_WX_FAIL + "failing\n", +#else /* VM_PROTECT_WX_FAIL */ + "turning off execute\n", +#endif /* VM_PROTECT_WX_FAIL */ + proc_selfpid(), + (current_task()->bsd_info + ? 
proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + cur_protection &= ~VM_PROT_EXECUTE; +#if VM_PROTECT_WX_FAIL + return KERN_PROTECTION_FAILURE; +#endif /* VM_PROTECT_WX_FAIL */ } -#endif /* CONFIG_EMBEDDED */ /* * If the task has requested executable lockdown, @@ -2190,11 +2335,13 @@ StartAgain: ; map_locked = TRUE; if (entry_for_jit) { +#if CONFIG_EMBEDDED if (map->jit_entry_exists) { result = KERN_INVALID_ARGUMENT; goto BailOut; } random_address = TRUE; +#endif /* CONFIG_EMBEDDED */ } if (random_address) { @@ -2236,7 +2383,7 @@ StartAgain: ; } else { if (map->holelistenabled) { - hole_entry = (vm_map_entry_t)map->holes_list; + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); if (hole_entry == NULL) { /* @@ -2261,7 +2408,7 @@ StartAgain: ; } hole_entry = hole_entry->vme_next; - } while (hole_entry != (vm_map_entry_t) map->holes_list); + } while (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)); if (found_hole == FALSE) { result = KERN_NO_SPACE; @@ -2338,7 +2485,10 @@ StartAgain: ; VM_MAP_PAGE_MASK(map))); end += size; - if ((end > effective_max_offset) || (end < start)) { + /* We want an entire page of empty space, but don't increase the allocation size. 
*/ + desired_empty_end = vm_map_round_page(end, VM_MAP_PAGE_MASK(map)); + + if ((desired_empty_end > effective_max_offset) || (desired_empty_end < start)) { if (map->wait_for_space) { assert(!keep_map_locked); if (size <= (effective_max_offset - @@ -2357,7 +2507,7 @@ StartAgain: ; next = entry->vme_next; if (map->holelistenabled) { - if (entry->vme_end >= end) + if (entry->vme_end >= desired_empty_end) break; } else { /* @@ -2372,7 +2522,7 @@ StartAgain: ; if (next == vm_map_to_entry(map)) break; - if (next->vme_start >= end) + if (next->vme_start >= desired_empty_end) break; } @@ -2383,7 +2533,7 @@ StartAgain: ; entry = next; if (map->holelistenabled) { - if (entry == (vm_map_entry_t) map->holes_list) { + if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { /* * Wrapped around */ @@ -2557,12 +2707,14 @@ StartAgain: ; * semantics. */ - if (purgable || entry_for_jit) { + if (purgable || + entry_for_jit || + vm_memory_malloc_no_cow(user_alias)) { if (object == VM_OBJECT_NULL) { object = vm_object_allocate(size); object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - object->true_share = TRUE; + object->true_share = FALSE; if (purgable) { task_t owner; object->purgable = VM_PURGABLE_NONVOLATILE; @@ -2580,7 +2732,7 @@ StartAgain: ; } else { owner = current_task(); } - assert(object->vo_purgeable_owner == NULL); + assert(object->vo_owner == NULL); assert(object->resident_page_count == 0); assert(object->wired_page_count == 0); vm_object_lock(object); @@ -2616,6 +2768,7 @@ StartAgain: ; (!entry->map_aligned || !clear_map_aligned) && (!entry->zero_wired_pages) && (!entry->used_for_jit && !entry_for_jit) && + (!entry->pmap_cs_associated) && (entry->iokit_acct == iokit_acct) && (!entry->vme_resilient_codesign) && (!entry->vme_resilient_media) && @@ -2715,12 +2868,13 @@ StartAgain: ; assert(!new_entry->iokit_acct); if (!is_submap && object != VM_OBJECT_NULL && - object->purgable != VM_PURGABLE_DENY) { + (object->purgable != VM_PURGABLE_DENY || + object->vo_ledger_tag)) { 
assert(new_entry->use_pmap); assert(!new_entry->iokit_acct); /* * Turn off pmap accounting since - * purgeable objects have their + * purgeable (or tagged) objects have their * own ledgers. */ new_entry->use_pmap = FALSE; @@ -2991,7 +3145,8 @@ StartAgain: ; vm_map_store_entry_unlink(zap_old_map, entry2); zap_old_map->size -= entry_size; - vm_map_store_entry_link(map, entry1, entry2); + vm_map_store_entry_link(map, entry1, entry2, + VM_MAP_KERNEL_FLAGS_NONE); map->size += entry_size; entry1 = entry2; } @@ -3089,17 +3244,26 @@ vm_map_enter_fourk( return KERN_NOT_SUPPORTED; } -#if CONFIG_EMBEDDED - if (cur_protection & VM_PROT_WRITE) { - if ((cur_protection & VM_PROT_EXECUTE) && - !entry_for_jit) { - printf("EMBEDDED: %s: curprot cannot be write+execute. " - "turning off execute\n", - __FUNCTION__); - cur_protection &= ~VM_PROT_EXECUTE; - } + if ((cur_protection & VM_PROT_WRITE) && + (cur_protection & VM_PROT_EXECUTE) && +#if !CONFIG_EMBEDDED + map != kernel_map && + cs_process_enforcement(NULL) && +#endif /* !CONFIG_EMBEDDED */ + !entry_for_jit) { + DTRACE_VM3(cs_wx, + uint64_t, 0, + uint64_t, 0, + vm_prot_t, cur_protection); + printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. " + "turning off execute\n", + proc_selfpid(), + (current_task()->bsd_info + ? 
proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + cur_protection &= ~VM_PROT_EXECUTE; } -#endif /* CONFIG_EMBEDDED */ /* * If the task has requested executable lockdown, @@ -3611,7 +3775,8 @@ vm_map_enter_fourk( vm_map_store_entry_unlink(zap_old_map, entry2); zap_old_map->size -= entry_size; - vm_map_store_entry_link(map, entry1, entry2); + vm_map_store_entry_link(map, entry1, entry2, + VM_MAP_KERNEL_FLAGS_NONE); map->size += entry_size; entry1 = entry2; } @@ -3828,7 +3993,7 @@ vm_map_enter_mem_object_helper( flags, vmk_flags, tag, - (vm_object_t) submap, + (vm_object_t)(uintptr_t) submap, offset, copy, cur_protection, @@ -3970,7 +4135,7 @@ vm_map_enter_mem_object_helper( vm_map_lock(copy_submap); vm_map_reference(copy_submap); vm_map_unlock(copy_submap); - copy_object = (vm_object_t) copy_submap; + copy_object = (vm_object_t)(uintptr_t) copy_submap; } else if (!copy && copy_object != VM_OBJECT_NULL && (copy_entry->needs_copy || @@ -4039,6 +4204,11 @@ vm_map_enter_mem_object_helper( */ assert(!copy_entry->needs_copy); } +#if !CONFIG_EMBEDDED + if (copy_entry->used_for_jit) { + vmk_remap_flags.vmkf_map_jit = TRUE; + } +#endif /* !CONFIG_EMBEDDED */ kr = vm_map_enter(target_map, ©_addr, copy_size, @@ -4081,7 +4251,7 @@ vm_map_enter_mem_object_helper( vm_map_remove(target_map, map_addr, map_addr + offset, - 0); + VM_MAP_REMOVE_NO_FLAGS); *address += offset; } if (offset + map_size < named_entry->size) { @@ -4095,7 +4265,7 @@ vm_map_enter_mem_object_helper( offset + map_size), (map_addr + named_entry->size), - 0); + VM_MAP_REMOVE_NO_FLAGS); } } named_entry_unlock(named_entry); @@ -4737,15 +4907,15 @@ vm_map_enter_cpm( pages = NEXT_PAGE(m); *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL; - assert(!m->gobbled); - assert(!m->wanted); - assert(!m->pageout); - assert(!m->tabled); + assert(!m->vmp_gobbled); + assert(!m->vmp_wanted); + assert(!m->vmp_pageout); + assert(!m->vmp_tabled); assert(VM_PAGE_WIRED(m)); - assert(m->busy); + assert(m->vmp_busy); 
assert(VM_PAGE_GET_PHYS_PAGE(m)>=(avail_start>>PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m)<=(avail_end>>PAGE_SHIFT)); - m->busy = FALSE; + m->vmp_busy = FALSE; vm_page_insert(m, cpm_obj, offset); } assert(cpm_obj->resident_page_count == size / PAGE_SIZE); @@ -4849,17 +5019,17 @@ vm_map_enter_cpm( if (m == VM_PAGE_NULL) panic("vm_allocate_cpm: obj %p off 0x%llx no page", cpm_obj, (uint64_t)offset); - assert(m->tabled); - assert(!m->busy); - assert(!m->wanted); - assert(!m->fictitious); - assert(!m->private); - assert(!m->absent); - assert(!m->error); - assert(!m->cleaning); - assert(!m->laundry); - assert(!m->precious); - assert(!m->clustered); + assert(m->vmp_tabled); + assert(!m->vmp_busy); + assert(!m->vmp_wanted); + assert(!m->vmp_fictitious); + assert(!m->vmp_private); + assert(!m->vmp_absent); + assert(!m->vmp_error); + assert(!m->vmp_cleaning); + assert(!m->vmp_laundry); + assert(!m->vmp_precious); + assert(!m->vmp_clustered); if (offset != 0) { if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) { printf("start 0x%llx end 0x%llx va 0x%llx\n", @@ -4971,7 +5141,7 @@ vm_map_clip_unnest( pmap_unnest(map->pmap, entry->vme_start, entry->vme_end - entry->vme_start); - if ((map->mapped_in_other_pmaps) && (map->ref_count)) { + if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, @@ -5029,6 +5199,15 @@ vm_map_clip_start( if (entry->vme_atomic) { panic("Attempting to clip an atomic VM entry! (map: %p, entry: %p)\n", map, entry); } + + DTRACE_VM5( + vm_map_clip_start, + vm_map_t, map, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + vm_map_offset_t, startaddr, + int, VME_ALIAS(entry)); + _vm_map_clip_start(&map->hdr, entry, startaddr); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); @@ -5137,6 +5316,14 @@ vm_map_clip_end( if (entry->vme_atomic) { panic("Attempting to clip an atomic VM entry! 
(map: %p, entry: %p)\n", map, entry); } + DTRACE_VM5( + vm_map_clip_end, + vm_map_t, map, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + vm_map_offset_t, endaddr, + int, VME_ALIAS(entry)); + _vm_map_clip_end(&map->hdr, entry, endaddr); if (map->holelistenabled) { vm_map_store_update_first_free(map, NULL, FALSE); @@ -5381,12 +5568,6 @@ vm_map_submap( return(result); } -#if CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) -#include -extern int proc_selfcsflags(void); -extern int panic_on_unsigned_execute; -#endif /* CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) */ - /* * vm_map_protect: * @@ -5424,8 +5605,39 @@ vm_map_protect( return KERN_INVALID_ADDRESS; } +#if VM_PROTECT_WX_FAIL + if ((new_prot & VM_PROT_EXECUTE) && + map != kernel_map && + cs_process_enforcement(NULL)) { + DTRACE_VM3(cs_wx, + uint64_t, (uint64_t) start, + uint64_t, (uint64_t) end, + vm_prot_t, new_prot); + printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + return KERN_PROTECTION_FAILURE; + } +#endif /* VM_PROTECT_WX_FAIL */ + + /* + * Let vm_map_remap_extract() know that it will need to: + * + make a copy of the mapping + * + add VM_PROT_WRITE to the max protections + * + remove any protections that are no longer allowed from the + * max protections (to avoid any WRITE/EXECUTE conflict, for + * example). + * Note that "max_prot" is an IN/OUT parameter only for this + * specific (VM_PROT_COPY) case. It's usually an OUT parameter + * only. 
+ */ + max_prot = new_prot & VM_PROT_ALL; kflags = VM_MAP_KERNEL_FLAGS_NONE; kflags.vmkf_remap_prot_copy = TRUE; + kflags.vmkf_overwrite_immutable = TRUE; new_start = start; kr = vm_map_remap(map, &new_start, @@ -5500,14 +5712,29 @@ vm_map_protect( return(KERN_PROTECTION_FAILURE); } -#if CONFIG_EMBEDDED - if (new_prot & VM_PROT_WRITE) { - if ((new_prot & VM_PROT_EXECUTE) && !(current->used_for_jit)) { - printf("EMBEDDED: %s can't have both write and exec at the same time\n", __FUNCTION__); - new_prot &= ~VM_PROT_EXECUTE; - } + if ((new_prot & VM_PROT_WRITE) && + (new_prot & VM_PROT_EXECUTE) && +#if !CONFIG_EMBEDDED + map != kernel_map && + cs_process_enforcement(NULL) && +#endif /* !CONFIG_EMBEDDED */ + !(current->used_for_jit)) { + DTRACE_VM3(cs_wx, + uint64_t, (uint64_t) current->vme_start, + uint64_t, (uint64_t) current->vme_end, + vm_prot_t, new_prot); + printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n", + proc_selfpid(), + (current_task()->bsd_info + ? 
proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + new_prot &= ~VM_PROT_EXECUTE; +#if VM_PROTECT_WX_FAIL + vm_map_unlock(map); + return KERN_PROTECTION_FAILURE; +#endif /* VM_PROTECT_WX_FAIL */ } -#endif /* * If the task has requested executable lockdown, @@ -5617,8 +5844,8 @@ vm_map_protect( #if CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) if (!(old_prot & VM_PROT_EXECUTE) && (prot & VM_PROT_EXECUTE) && - (proc_selfcsflags() & CS_KILL) && - panic_on_unsigned_execute) { + panic_on_unsigned_execute && + (proc_selfcsflags() & CS_KILL)) { panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - code-signing bypass?\n", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot); } #endif /* CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) */ @@ -5864,9 +6091,7 @@ subtract_wire_counts( } } -#if CONFIG_EMBEDDED int cs_executable_wire = 0; -#endif /* CONFIG_EMBEDDED */ /* * vm_map_wire: @@ -6283,7 +6508,7 @@ vm_map_wire_nested( if (entry->protection & VM_PROT_WRITE) { vm_object_lock_assert_exclusive( object); - m->dirty = TRUE; + m->vmp_dirty = TRUE; } } else { /* not already wired !? */ @@ -6302,14 +6527,20 @@ vm_map_wire_nested( * Unwired entry or wire request transmitted via submap */ -#if CONFIG_EMBEDDED /* * Wiring would copy the pages to the shadow object. * The shadow object would not be code-signed so * attempting to execute code from these copied pages * would trigger a code-signing violation. 
*/ - if (entry->protection & VM_PROT_EXECUTE) { + + if ((entry->protection & VM_PROT_EXECUTE) +#if !CONFIG_EMBEDDED + && + map != kernel_map && + cs_process_enforcement(NULL) +#endif /* !CONFIG_EMBEDDED */ + ) { #if MACH_ASSERT printf("pid %d[%s] wiring executable range from " "0x%llx to 0x%llx: rejected to preserve " @@ -6328,8 +6559,6 @@ vm_map_wire_nested( rc = KERN_PROTECTION_FAILURE; goto done; } -#endif /* CONFIG_EMBEDDED */ - /* * Perform actions of vm_map_lookup that need the write @@ -7044,7 +7273,7 @@ vm_map_submap_pmap_clean( VME_OFFSET(entry)); } else { - if((map->mapped_in_other_pmaps) && (map->ref_count) + if((map->mapped_in_other_pmaps) && (map->map_refcnt) && (VME_OBJECT(entry) != NULL)) { vm_object_pmap_protect_options( VME_OBJECT(entry), @@ -7080,7 +7309,7 @@ vm_map_submap_pmap_clean( VME_SUBMAP(entry), VME_OFFSET(entry)); } else { - if((map->mapped_in_other_pmaps) && (map->ref_count) + if((map->mapped_in_other_pmaps) && (map->map_refcnt) && (VME_OBJECT(entry) != NULL)) { vm_object_pmap_protect_options( VME_OBJECT(entry), @@ -7104,6 +7333,87 @@ vm_map_submap_pmap_clean( return; } +/* + * virt_memory_guard_ast: + * + * Handle the AST callout for a virtual memory guard. + * raise an EXC_GUARD exception and terminate the task + * if configured to do so. 
+ */ +void +virt_memory_guard_ast( + thread_t thread, + mach_exception_data_type_t code, + mach_exception_data_type_t subcode) +{ + task_t task = thread->task; + assert(task != kernel_task); + assert(task == current_task()); + uint32_t behavior; + + behavior = task->task_exc_guard; + + /* Is delivery enabled */ + if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) { + return; + } + + /* If only once, make sure we're that once */ + while (behavior & TASK_EXC_GUARD_VM_ONCE) { + uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER; + + if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) { + break; + } + behavior = task->task_exc_guard; + if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) { + return; + } + } + + /* Raise exception via corpse fork or synchronously */ + if ((task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) && + (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) == 0) { + task_violated_guard(code, subcode, NULL); + } else { + task_exception_notify(EXC_GUARD, code, subcode); + } + + /* Terminate the task if desired */ + if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) { + task_bsdtask_kill(current_task()); + } +} + +/* + * vm_map_guard_exception: + * + * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception. + * + * Right now, we do this when we find nothing mapped, or a + * gap in the mapping when a user address space deallocate + * was requested. We report the address of the first gap found. + */ +static void +vm_map_guard_exception( + vm_map_offset_t gap_start, + unsigned reason) +{ + mach_exception_code_t code = 0; + unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY; + unsigned int target = 0; /* should we pass in pid associated with map? 
*/ + mach_exception_data_type_t subcode = (uint64_t)gap_start; + + /* Can't deliver exceptions to kernel task */ + if (current_task() == kernel_task) + return; + + EXC_GUARD_ENCODE_TYPE(code, guard_type); + EXC_GUARD_ENCODE_FLAVOR(code, reason); + EXC_GUARD_ENCODE_TARGET(code, target); + thread_guard_violation(current_thread(), code, subcode); +} + /* * vm_map_delete: [ internal use only ] * @@ -7130,6 +7440,16 @@ vm_map_delete( boolean_t need_wakeup; unsigned int last_timestamp = ~0; /* unlikely value */ int interruptible; + vm_map_offset_t gap_start; + vm_map_offset_t save_start = start; + vm_map_offset_t save_end = end; + const vm_map_offset_t FIND_GAP = 1; /* a not page aligned value */ + const vm_map_offset_t GAPS_OK = 2; /* a different not page aligned value */ + + if (map != kernel_map && !(flags & VM_MAP_REMOVE_GAPS_OK)) + gap_start = FIND_GAP; + else + gap_start = GAPS_OK; interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ? THREAD_ABORTSAFE : THREAD_UNINT; @@ -7165,10 +7485,15 @@ vm_map_delete( (uint64_t)entry->vme_start, (uint64_t)entry->vme_end); } - if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) { /* extend request to whole entry */ start = SUPERPAGE_ROUND_DOWN(start); + + /* + * If in a superpage, extend the range to include the start of the mapping. + */ + if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) { start = SUPERPAGE_ROUND_DOWN(start); continue; } + if (start == entry->vme_start) { /* * No need to clip. We don't want to cause @@ -7204,9 +7529,11 @@ vm_map_delete( * time through the loop. 
*/ SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + } else { + if (map->pmap == kernel_pmap && - map->ref_count != 0) { + map->map_refcnt != 0) { panic("vm_map_delete(%p,0x%llx,0x%llx): " "no map entry at 0x%llx\n", map, @@ -7215,6 +7542,8 @@ vm_map_delete( (uint64_t)start); } entry = first_entry->vme_next; + if (gap_start == FIND_GAP) + gap_start = start; } break; } @@ -7315,8 +7644,27 @@ vm_map_delete( } else if (flags & VM_MAP_REMOVE_IMMUTABLE) { // printf("FBDP %d[%s] removing permanent entry %p [0x%llx:0x%llx] prot 0x%x/0x%x\n", proc_selfpid(), (current_task()->bsd_info ? proc_name_address(current_task()->bsd_info) : "?"), entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->protection, entry->max_protection); entry->permanent = FALSE; +#if PMAP_CS + } else if ((entry->protection & VM_PROT_EXECUTE) && !pmap_cs_enforced(map->pmap)) { + entry->permanent = FALSE; + + printf("%d[%s] %s(0x%llx,0x%llx): " + "pmap_cs disabled, allowing for permanent executable entry [0x%llx:0x%llx] " + "prot 0x%x/0x%x\n", + proc_selfpid(), + (current_task()->bsd_info + ? 
proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__, + (uint64_t) start, + (uint64_t) end, + (uint64_t)entry->vme_start, + (uint64_t)entry->vme_end, + entry->protection, + entry->max_protection); +#endif } else { - if (!vm_map_executable_immutable_no_log) { + if (vm_map_executable_immutable_verbose) { printf("%d[%s] %s(0x%llx,0x%llx): " "permanent entry [0x%llx:0x%llx] " "prot 0x%x/0x%x\n", @@ -7383,6 +7731,8 @@ vm_map_delete( /* * User: use the next entry */ + if (gap_start == FIND_GAP) + gap_start = s; entry = first_entry->vme_next; s = entry->vme_start; } else { @@ -7452,6 +7802,8 @@ vm_map_delete( /* * User: use the next entry */ + if (gap_start == FIND_GAP) + gap_start = s; entry = first_entry->vme_next; s = entry->vme_start; } else { @@ -7527,6 +7879,8 @@ vm_map_delete( if (!vm_map_lookup_entry(map, s, &first_entry)){ assert((map != kernel_map) && (!entry->is_sub_map)); + if (gap_start == FIND_GAP) + gap_start = s; first_entry = first_entry->vme_next; s = first_entry->vme_start; } else { @@ -7607,7 +7961,7 @@ vm_map_delete( entry->vme_end - entry->vme_start, pmap_flags); #endif /* NO_NESTED_PMAP */ - if ((map->mapped_in_other_pmaps) && (map->ref_count)) { + if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) { /* clean up parent map/maps */ vm_map_submap_pmap_clean( map, entry->vme_start, @@ -7624,7 +7978,7 @@ vm_map_delete( } else if (VME_OBJECT(entry) != kernel_object && VME_OBJECT(entry) != compressor_object) { object = VME_OBJECT(entry); - if ((map->mapped_in_other_pmaps) && (map->ref_count)) { + if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) { vm_object_pmap_protect_options( object, VME_OFFSET(entry), entry->vme_end - entry->vme_start, @@ -7679,7 +8033,7 @@ vm_map_delete( next = entry->vme_next; if (map->pmap == kernel_pmap && - map->ref_count != 0 && + map->map_refcnt != 0 && entry->vme_end < end && (next == vm_map_to_entry(map) || next->vme_start != entry->vme_end)) { @@ -7692,6 +8046,19 @@ vm_map_delete( 
(uint64_t)entry->vme_end); } + /* + * If the desired range didn't end with "entry", then there is a gap if + * we wrapped around to the start of the map or if "entry" and "next" + * aren't contiguous. + * + * The vm_map_round_page() is needed since an entry can be less than VM_MAP_PAGE_MASK() sized. + * For example, devices which have h/w 4K pages, but entry sizes are all now 16K. + */ + if (gap_start == FIND_GAP && + vm_map_round_page(entry->vme_end, VM_MAP_PAGE_MASK(map)) < end && + (next == vm_map_to_entry(map) || entry->vme_end != next->vme_start)) { + gap_start = entry->vme_end; + } s = next->vme_start; last_timestamp = map->timestamp; @@ -7714,8 +8081,9 @@ vm_map_delete( vm_map_store_entry_unlink(map, entry); /* ... and add it to the end of the "zap_map" */ vm_map_store_entry_link(zap_map, - vm_map_last_entry(zap_map), - entry); + vm_map_last_entry(zap_map), + entry, + VM_MAP_KERNEL_FLAGS_NONE); entry_size = entry->vme_end - entry->vme_start; map->size -= entry_size; zap_map->size += entry_size; @@ -7732,17 +8100,23 @@ vm_map_delete( if(entry == vm_map_to_entry(map)) { break; } - if (last_timestamp+1 != map->timestamp) { + if (last_timestamp + 1 != map->timestamp) { /* - * we are responsible for deleting everything - * from the give space, if someone has interfered - * we pick up where we left off, back fills should - * be all right for anyone except map_delete and + * We are responsible for deleting everything + * from the given space. If someone has interfered, + * we pick up where we left off. Back fills should + * be all right for anyone, except map_delete, and * we have to assume that the task has been fully * disabled before we get here */ if (!vm_map_lookup_entry(map, s, &entry)){ entry = entry->vme_next; + + /* + * Nothing found for s. If we weren't already done, then there is a gap. 
+ */ + if (gap_start == FIND_GAP && s < end) + gap_start = s; s = entry->vme_start; } else { SAVE_HINT_MAP_WRITE(map, entry->vme_prev); @@ -7751,7 +8125,7 @@ vm_map_delete( * others can not only allocate behind us, we can * also see coalesce while we don't have the map lock */ - if(entry == vm_map_to_entry(map)) { + if (entry == vm_map_to_entry(map)) { break; } } @@ -7766,6 +8140,28 @@ vm_map_delete( if (need_wakeup) vm_map_entry_wakeup(map); + if (gap_start != FIND_GAP && gap_start != GAPS_OK) { + DTRACE_VM3(kern_vm_deallocate_gap, + vm_map_offset_t, gap_start, + vm_map_offset_t, save_start, + vm_map_offset_t, save_end); + if (!(flags & VM_MAP_REMOVE_GAPS_OK)) { +#if defined(DEVELOPMENT) || defined(DEBUG) + /* log just once if not checking, otherwise log each one */ + if (!map->warned_delete_gap || + (task_exc_guard_default & TASK_EXC_GUARD_VM_ALL) != 0) { + printf("vm_map_delete: map %p [%p...%p] nothing at %p\n", + (void *)map, (void *)save_start, (void *)save_end, + (void *)gap_start); + if (!map->warned_delete_gap) { + map->warned_delete_gap = 1; + } + } +#endif + vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP); + } + } + return KERN_SUCCESS; } @@ -7822,6 +8218,25 @@ vm_map_remove_locked( } +/* + * Routine: vm_map_copy_allocate + * + * Description: + * Allocates and initializes a map copy object. 
+ */ +static vm_map_copy_t +vm_map_copy_allocate(void) +{ + vm_map_copy_t new_copy; + + new_copy = zalloc(vm_map_copy_zone); + bzero(new_copy, sizeof (*new_copy)); + new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; + vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy); + vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy); + return new_copy; +} + /* * Routine: vm_map_copy_discard * @@ -7902,7 +8317,6 @@ vm_map_copy_copy( */ new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; *new_copy = *copy; if (copy->type == VM_MAP_COPY_ENTRY_LIST) { @@ -8543,12 +8957,7 @@ vm_map_copy_overwrite_nested( } /* otherwise copy no longer exists, it was */ /* destroyed after successful copy_overwrite */ - copy = (vm_map_copy_t) - zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = - vm_map_copy_to_entry(copy); + copy = vm_map_copy_allocate(); copy->type = VM_MAP_COPY_ENTRY_LIST; copy->offset = new_offset; @@ -8859,14 +9268,8 @@ vm_map_copy_overwrite( /* * Extract "head_copy" out of "copy". */ - head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - head_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(head_copy) = - vm_map_copy_to_entry(head_copy); - vm_map_copy_last_entry(head_copy) = - vm_map_copy_to_entry(head_copy); + head_copy = vm_map_copy_allocate(); head_copy->type = VM_MAP_COPY_ENTRY_LIST; - head_copy->cpy_hdr.nentries = 0; head_copy->cpy_hdr.entries_pageable = copy->cpy_hdr.entries_pageable; vm_map_store_init(&head_copy->cpy_hdr); @@ -8904,14 +9307,8 @@ vm_map_copy_overwrite( /* * Extract "tail_copy" out of "copy". 
*/ - tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - tail_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(tail_copy) = - vm_map_copy_to_entry(tail_copy); - vm_map_copy_last_entry(tail_copy) = - vm_map_copy_to_entry(tail_copy); + tail_copy = vm_map_copy_allocate(); tail_copy->type = VM_MAP_COPY_ENTRY_LIST; - tail_copy->cpy_hdr.nentries = 0; tail_copy->cpy_hdr.entries_pageable = copy->cpy_hdr.entries_pageable; vm_map_store_init(&tail_copy->cpy_hdr); @@ -9714,7 +10111,7 @@ vm_map_copyin_kernel_buffer( VM_MAP_PAGE_MASK(src_map)), (VM_MAP_REMOVE_INTERRUPTIBLE | VM_MAP_REMOVE_WAIT_FOR_KWIRE | - ((src_map == kernel_map) ? VM_MAP_REMOVE_KUNWIRE : 0))); + ((src_map == kernel_map) ? VM_MAP_REMOVE_KUNWIRE : VM_MAP_REMOVE_NO_FLAGS))); } *copy_result = copy; return KERN_SUCCESS; @@ -9831,7 +10228,7 @@ vm_map_copyout_kernel_buffer( vm_map_round_page(copy_size, VM_MAP_PAGE_MASK(map))), VM_MAP_PAGE_MASK(map)), - VM_MAP_NO_FLAGS); + VM_MAP_REMOVE_NO_FLAGS); *addr = 0; } } else { @@ -9845,21 +10242,31 @@ vm_map_copyout_kernel_buffer( } /* - * Macro: vm_map_copy_insert + * Routine: vm_map_copy_insert [internal use only] * * Description: * Link a copy chain ("copy") into a map at the * specified location (after "where"). * Side effects: * The copy chain is destroyed. - * Warning: - * The arguments are evaluated multiple times. 
*/ -#define vm_map_copy_insert(map, where, copy) \ -MACRO_BEGIN \ - vm_map_store_copy_insert(map, where, copy); \ - zfree(vm_map_copy_zone, copy); \ -MACRO_END +static void +vm_map_copy_insert( + vm_map_t map, + vm_map_entry_t after_where, + vm_map_copy_t copy) +{ + vm_map_entry_t entry; + + while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) { + entry = vm_map_copy_first_entry(copy); + vm_map_copy_entry_unlink(copy, entry); + vm_map_store_entry_link(map, after_where, entry, + VM_MAP_KERNEL_FLAGS_NONE); + after_where = entry; + } + zfree(vm_map_copy_zone, copy); +} void vm_map_copy_remap( @@ -9899,7 +10306,8 @@ vm_map_copy_remap( vm_object_reference(VME_OBJECT(new_entry)); } /* insert the new entry in the map */ - vm_map_store_entry_link(map, where, new_entry); + vm_map_store_entry_link(map, where, new_entry, + VM_MAP_KERNEL_FLAGS_NONE); /* continue inserting the "copy entries" after the new entry */ where = new_entry; } @@ -10089,7 +10497,7 @@ StartAgain: ; last = entry; } else { if (dst_map->holelistenabled) { - hole_entry = (vm_map_entry_t)dst_map->holes_list; + hole_entry = CAST_TO_VM_MAP_ENTRY(dst_map->holes_list); if (hole_entry == NULL) { /* @@ -10151,7 +10559,7 @@ StartAgain: ; last = next; if (dst_map->holelistenabled) { - if (last == (vm_map_entry_t) dst_map->holes_list) { + if (last == CAST_TO_VM_MAP_ENTRY(dst_map->holes_list)) { /* * Wrapped around */ @@ -10291,6 +10699,7 @@ StartAgain: ; while (va < entry->vme_end) { vm_page_t m; + struct vm_object_fault_info fault_info = {}; /* * Look up the page in the object. 
@@ -10313,7 +10722,7 @@ StartAgain: ; m = vm_page_lookup(object, offset); if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) || - m->absent) + m->vmp_absent) panic("vm_map_copyout: wiring %p", m); prot = entry->protection; @@ -10324,20 +10733,24 @@ StartAgain: ; type_of_fault = DBG_CACHE_HIT_FAULT; - vm_fault_enter(m, dst_map->pmap, va, prot, prot, - VM_PAGE_WIRED(m), - FALSE, /* change_wiring */ - VM_KERN_MEMORY_NONE, /* tag - not wiring */ - FALSE, /* no_cache */ - FALSE, /* cs_bypass */ - VME_ALIAS(entry), - ((entry->iokit_acct || - (!entry->is_sub_map && - !entry->use_pmap)) - ? PMAP_OPTIONS_ALT_ACCT - : 0), /* pmap_options */ - NULL, /* need_retry */ - &type_of_fault); + fault_info.user_tag = VME_ALIAS(entry); + fault_info.pmap_options = 0; + if (entry->iokit_acct || + (!entry->is_sub_map && !entry->use_pmap)) { + fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; + } + + vm_fault_enter(m, + dst_map->pmap, + va, + prot, + prot, + VM_PAGE_WIRED(m), + FALSE, /* change_wiring */ + VM_KERN_MEMORY_NONE, /* tag - not wiring */ + &fault_info, + NULL, /* need_retry */ + &type_of_fault); vm_object_unlock(object); @@ -10566,12 +10979,8 @@ vm_map_copyin_internal( * remember the endpoints prior to rounding. 
*/ - copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); + copy = vm_map_copy_allocate(); copy->type = VM_MAP_COPY_ENTRY_LIST; - copy->cpy_hdr.nentries = 0; copy->cpy_hdr.entries_pageable = TRUE; #if 00 copy->cpy_hdr.page_shift = src_map->hdr.page_shift; @@ -10944,7 +11353,7 @@ vm_map_copyin_internal( assert(new_object->ref_count == 1); assert(new_object->shadow == VM_OBJECT_NULL); assert(new_object->copy == VM_OBJECT_NULL); - assert(new_object->vo_purgeable_owner == NULL); + assert(new_object->vo_owner == NULL); new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; new_object->true_share = TRUE; @@ -11180,7 +11589,7 @@ vm_map_copyin_internal( src_end, ((src_map == kernel_map) ? VM_MAP_REMOVE_KUNWIRE : - VM_MAP_NO_FLAGS), + VM_MAP_REMOVE_NO_FLAGS), VM_MAP_NULL); } else { /* fix up the damage we did in the base map */ @@ -11392,12 +11801,8 @@ vm_map_copy_extract( * remember the endpoints prior to rounding. */ - copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); + copy = vm_map_copy_allocate(); copy->type = VM_MAP_COPY_ENTRY_LIST; - copy->cpy_hdr.nentries = 0; copy->cpy_hdr.entries_pageable = TRUE; vm_map_store_init(©->cpy_hdr); @@ -11446,8 +11851,7 @@ vm_map_copyin_object( * that contains the object directly. */ - copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; + copy = vm_map_copy_allocate(); copy->type = VM_MAP_COPY_OBJECT; copy->cpy_object = object; copy->offset = offset; @@ -11699,7 +12103,8 @@ vm_map_fork_share( * map. 
*/ - vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry); + vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry, + VM_MAP_KERNEL_FLAGS_NONE); /* * Update the physical map @@ -11816,9 +12221,13 @@ vm_map_fork( boolean_t new_entry_needs_copy; boolean_t pmap_is64bit; int vm_map_copyin_flags; + vm_inherit_t old_entry_inheritance; + int map_create_options; + kern_return_t footprint_collect_kr; if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE | - VM_MAP_FORK_PRESERVE_PURGEABLE)) { + VM_MAP_FORK_PRESERVE_PURGEABLE | + VM_MAP_FORK_CORPSE_FOOTPRINT)) { /* unsupported option */ return VM_MAP_NULL; } @@ -11839,10 +12248,18 @@ vm_map_fork( vm_map_reference_swap(old_map); vm_map_lock(old_map); - new_map = vm_map_create(new_pmap, - old_map->min_offset, - old_map->max_offset, - old_map->hdr.entries_pageable); + map_create_options = 0; + if (old_map->hdr.entries_pageable) { + map_create_options |= VM_MAP_CREATE_PAGEABLE; + } + if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) { + map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT; + footprint_collect_kr = KERN_SUCCESS; + } + new_map = vm_map_create_options(new_pmap, + old_map->min_offset, + old_map->max_offset, + map_create_options); vm_map_lock(new_map); vm_commit_pagezero_status(new_map); /* inherit the parent map's page size */ @@ -11854,20 +12271,40 @@ vm_map_fork( entry_size = old_entry->vme_end - old_entry->vme_start; - switch (old_entry->inheritance) { - case VM_INHERIT_NONE: + old_entry_inheritance = old_entry->inheritance; + /* + * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option + * share VM_INHERIT_NONE entries that are not backed by a + * device pager. 
+ */ + if (old_entry_inheritance == VM_INHERIT_NONE && + (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) && + !(!old_entry->is_sub_map && + VME_OBJECT(old_entry) != NULL && + VME_OBJECT(old_entry)->pager != NULL && + is_device_pager_ops( + VME_OBJECT(old_entry)->pager->mo_pager_ops))) { + old_entry_inheritance = VM_INHERIT_SHARE; + } + + if (old_entry_inheritance != VM_INHERIT_NONE && + (options & VM_MAP_FORK_CORPSE_FOOTPRINT) && + footprint_collect_kr == KERN_SUCCESS) { /* - * Skip making a share entry if VM_MAP_FORK_SHARE_IF_INHERIT_NONE - * is not passed or it is backed by a device pager. + * The corpse won't have old_map->pmap to query + * footprint information, so collect that data now + * and store it in new_map->vmmap_corpse_footprint + * for later autopsy. */ - if ((!(options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE)) || - (!old_entry->is_sub_map && - VME_OBJECT(old_entry) != NULL && - VME_OBJECT(old_entry)->pager != NULL && - is_device_pager_ops(VME_OBJECT(old_entry)->pager->mo_pager_ops))) { - break; - } - /* FALLTHROUGH */ + footprint_collect_kr = + vm_map_corpse_footprint_collect(old_map, + old_entry, + new_map); + } + + switch (old_entry_inheritance) { + case VM_INHERIT_NONE: + break; case VM_INHERIT_SHARE: vm_map_fork_share(old_map, old_entry, new_map); @@ -11960,8 +12397,10 @@ vm_map_fork( * of the map. 
*/ - vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), - new_entry); + vm_map_store_entry_link(new_map, + vm_map_last_entry(new_map), + new_entry, + VM_MAP_KERNEL_FLAGS_NONE); new_size += entry_size; break; @@ -11987,6 +12426,11 @@ vm_map_fork( #endif new_map->size = new_size; + + if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) { + vm_map_corpse_footprint_collect_done(new_map); + } + vm_map_unlock(new_map); vm_map_unlock(old_map); vm_map_deallocate(old_map); @@ -12007,24 +12451,27 @@ vm_map_exec( task_t task, boolean_t is64bit, void *fsroot, - cpu_type_t cpu) + cpu_type_t cpu, + cpu_subtype_t cpu_subtype) { SHARED_REGION_TRACE_DEBUG( - ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x): ->\n", + ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n", (void *)VM_KERNEL_ADDRPERM(current_task()), (void *)VM_KERNEL_ADDRPERM(new_map), (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu)); + cpu, + cpu_subtype)); (void) vm_commpage_enter(new_map, task, is64bit); - (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu); + (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype); SHARED_REGION_TRACE_DEBUG( - ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x): <-\n", + ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n", (void *)VM_KERNEL_ADDRPERM(current_task()), (void *)VM_KERNEL_ADDRPERM(new_map), (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu)); + cpu, + cpu_subtype)); return KERN_SUCCESS; } @@ -12404,17 +12851,41 @@ vm_map_lookup_locked( VME_OBJECT_SET(entry, copy_object); /* propagate the submap entry's protections */ - entry->protection |= subentry_protection; + if (entry->protection != VM_PROT_READ) { + /* + * Someone has already altered the top entry's + * protections via vm_protect(VM_PROT_COPY). + * Respect these new values and ignore the + * submap entry's protections. 
+ */ + } else { + /* + * Regular copy-on-write: propagate the submap + * entry's protections to the top map entry. + */ + entry->protection |= subentry_protection; + } entry->max_protection |= subentry_max_protection; -#if CONFIG_EMBEDDED - if (entry->protection & VM_PROT_WRITE) { - if ((entry->protection & VM_PROT_EXECUTE) && !(entry->used_for_jit)) { - printf("EMBEDDED: %s can't have both write and exec at the same time\n", __FUNCTION__); - entry->protection &= ~VM_PROT_EXECUTE; - } + if ((entry->protection & VM_PROT_WRITE) && + (entry->protection & VM_PROT_EXECUTE) && +#if !CONFIG_EMBEDDED + map != kernel_map && + cs_process_enforcement(NULL) && +#endif /* !CONFIG_EMBEDDED */ + !(entry->used_for_jit)) { + DTRACE_VM3(cs_wx, + uint64_t, (uint64_t)entry->vme_start, + uint64_t, (uint64_t)entry->vme_end, + vm_prot_t, entry->protection); + printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + entry->protection &= ~VM_PROT_EXECUTE; } -#endif if(copied_slowly) { VME_OFFSET_SET(entry, local_start - old_start); @@ -12592,6 +13063,16 @@ vm_map_lookup_locked( } else { fault_info->cs_bypass = FALSE; } + fault_info->pmap_cs_associated = FALSE; +#if CONFIG_PMAP_CS + if (entry->pmap_cs_associated) { + /* + * The pmap layer will validate this page + * before allowing it to be executed from. 
+ */ + fault_info->pmap_cs_associated = TRUE; + } +#endif /* CONFIG_PMAP_CS */ fault_info->mark_zf_absent = FALSE; fault_info->batch_pmap_op = FALSE; } @@ -13089,7 +13570,7 @@ vm_map_region_recurse_64( } else { extended.share_mode = SM_PRIVATE; } - extended.ref_count = VME_SUBMAP(curr_entry)->ref_count; + extended.ref_count = VME_SUBMAP(curr_entry)->map_refcnt; } } @@ -13486,9 +13967,27 @@ vm_map_region_walk( int disp; disp = 0; - pmap_query_page_info(map->pmap, va, &disp); + if (map->has_corpse_footprint) { + /* + * Query the page info data we saved + * while forking the corpse. + */ + vm_map_corpse_footprint_query_page_info( + map, + va, + &disp); + } else { + /* + * Query the pmap. + */ + pmap_query_page_info(map->pmap, + va, + &disp); + } if (disp & PMAP_QUERY_PAGE_PRESENT) { - extended->pages_resident++; + if (!(disp & PMAP_QUERY_PAGE_ALTACCT)) { + extended->pages_resident++; + } if (disp & PMAP_QUERY_PAGE_REUSABLE) { extended->pages_reusable++; } else if (!(disp & PMAP_QUERY_PAGE_INTERNAL) || @@ -13505,7 +14004,57 @@ vm_map_region_walk( } } /* deal with alternate accounting */ - if (obj->purgable != VM_PURGABLE_DENY) { + if (obj->purgable == VM_PURGABLE_NONVOLATILE && + /* && not tagged as no-footprint? */ + VM_OBJECT_OWNER(obj) != NULL && + VM_OBJECT_OWNER(obj)->map == map) { + if ((((va + - entry->vme_start + + VME_OFFSET(entry)) + / PAGE_SIZE) < + (obj->resident_page_count + + vm_compressor_pager_get_count(obj->pager)))) { + /* + * Non-volatile purgeable object owned + * by this task: report the first + * "#resident + #compressed" pages as + * "resident" (to show that they + * contribute to the footprint) but not + * "dirty" (to avoid double-counting + * with the fake "non-volatile" region + * we'll report at the end of the + * address space to account for all + * (mapped or not) non-volatile memory + * owned by this task. 
+ */ + extended->pages_resident++; + } + } else if ((obj->purgable == VM_PURGABLE_VOLATILE || + obj->purgable == VM_PURGABLE_EMPTY) && + /* && not tagged as no-footprint? */ + VM_OBJECT_OWNER(obj) != NULL && + VM_OBJECT_OWNER(obj)->map == map) { + if ((((va + - entry->vme_start + + VME_OFFSET(entry)) + / PAGE_SIZE) < + obj->wired_page_count)) { + /* + * Volatile|empty purgeable object owned + * by this task: report the first + * "#wired" pages as "resident" (to + * show that they contribute to the + * footprint) but not "dirty" (to avoid + * double-counting with the fake + * "non-volatile" region we'll report + * at the end of the address space to + * account for all (mapped or not) + * non-volatile memory owned by this + * task. + */ + extended->pages_resident++; + } + } else if (obj->purgable != VM_PURGABLE_DENY) { /* * Pages from purgeable objects * will be reported as dirty @@ -13645,11 +14194,11 @@ vm_map_region_look_for_page( if (shadow && (max_refcnt == 1)) extended->pages_shared_now_private++; - if (!p->fictitious && - (p->dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) + if (!p->vmp_fictitious && + (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) extended->pages_dirtied++; else if (count >= VM_REGION_EXTENDED_INFO_COUNT) { - if (p->reusable || object->all_reusable) { + if (p->vmp_reusable || object->all_reusable) { extended->pages_reusable++; } } @@ -13784,6 +14333,7 @@ vm_map_simplify_entry( (prev_entry->map_aligned == this_entry->map_aligned) && (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) && (prev_entry->used_for_jit == this_entry->used_for_jit) && + (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) && /* from_reserved_zone: OK if that field doesn't match */ (prev_entry->iokit_acct == this_entry->iokit_acct) && (prev_entry->vme_resilient_codesign == @@ -13975,7 +14525,7 @@ vm_map_machine_attribute( m = vm_page_lookup( object, offset); - if (m && !m->fictitious) { + if (m && !m->vmp_fictitious) { ret 
= pmap_attribute_cache_sync( VM_PAGE_GET_PHYS_PAGE(m), @@ -14149,25 +14699,14 @@ vm_map_willneed( vm_map_entry_t entry; vm_object_t object; memory_object_t pager; - struct vm_object_fault_info fault_info; + struct vm_object_fault_info fault_info = {}; kern_return_t kr; vm_object_size_t len; vm_object_offset_t offset; - /* - * Fill in static values in fault_info. Several fields get ignored by the code - * we call, but we'll fill them in anyway since uninitialized fields are bad - * when it comes to future backwards compatibility. - */ - - fault_info.interruptible = THREAD_UNINT; /* ignored value */ + fault_info.interruptible = THREAD_UNINT; /* ignored value */ fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info.no_cache = FALSE; /* ignored value */ fault_info.stealth = TRUE; - fault_info.io_sync = FALSE; - fault_info.cs_bypass = FALSE; - fault_info.mark_zf_absent = FALSE; - fault_info.batch_pmap_op = FALSE; /* * The MADV_WILLNEED operation doesn't require any changes to the @@ -14347,7 +14886,7 @@ vm_map_entry_is_reusable( return TRUE; } - if (entry->is_shared || + if (/*entry->is_shared ||*/ entry->is_sub_map || entry->in_transition || entry->protection != VM_PROT_DEFAULT || @@ -14385,8 +14924,9 @@ vm_map_entry_is_reusable( object->wired_page_count == 0 && object->copy == VM_OBJECT_NULL && object->shadow == VM_OBJECT_NULL && - object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && object->internal && + object->purgable == VM_PURGABLE_DENY && + object->copy_strategy != MEMORY_OBJECT_COPY_DELAY && !object->true_share && object->wimg_bits == VM_WIMG_USE_DEFAULT && !object->code_signed) { @@ -14758,7 +15298,7 @@ vm_map_pageout( /* * Routine: vm_map_entry_insert * - * Descritpion: This routine inserts a new vm_entry in a locked map. + * Description: This routine inserts a new vm_entry in a locked map. 
*/ vm_map_entry_t vm_map_entry_insert( @@ -14787,6 +15327,7 @@ vm_map_entry_insert( vm_map_entry_t new_entry; assert(insp_entry != (vm_map_entry_t)0); + vm_map_lock_assert_exclusive(map); #if DEVELOPMENT || DEBUG vm_object_offset_t end_offset = 0; @@ -14853,7 +15394,10 @@ vm_map_entry_insert( else new_entry->superpage_size = FALSE; if (used_for_jit){ - if (!(map->jit_entry_exists)){ +#if CONFIG_EMBEDDED + if (!(map->jit_entry_exists)) +#endif /* CONFIG_EMBEDDED */ + { new_entry->used_for_jit = TRUE; map->jit_entry_exists = TRUE; @@ -14863,6 +15407,7 @@ vm_map_entry_insert( } else { new_entry->used_for_jit = FALSE; } + new_entry->pmap_cs_associated = FALSE; new_entry->iokit_acct = FALSE; new_entry->vme_resilient_codesign = FALSE; new_entry->vme_resilient_media = FALSE; @@ -14872,7 +15417,8 @@ vm_map_entry_insert( * Insert the new entry into the list. */ - vm_map_store_entry_link(map, insp_entry, new_entry); + vm_map_store_entry_link(map, insp_entry, new_entry, + VM_MAP_KERNEL_FLAGS_NONE); map->size += end - start; /* @@ -14918,6 +15464,7 @@ vm_map_remap_extract( boolean_t new_entry_needs_copy; vm_map_entry_t saved_src_entry; boolean_t src_entry_was_wired; + vm_prot_t max_prot_for_prot_copy; assert(map != VM_MAP_NULL); assert(size != 0); @@ -14936,14 +15483,19 @@ vm_map_remap_extract( /* * Initialize map_header. 
*/ - map_header->links.next = (struct vm_map_entry *)&map_header->links; - map_header->links.prev = (struct vm_map_entry *)&map_header->links; + map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links); + map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links); map_header->nentries = 0; map_header->entries_pageable = pageable; map_header->page_shift = PAGE_SHIFT; vm_map_store_init( map_header ); + if (copy && vmk_flags.vmkf_remap_prot_copy) { + max_prot_for_prot_copy = *max_protection & VM_PROT_ALL; + } else { + max_prot_for_prot_copy = VM_PROT_NONE; + } *cur_protection = VM_PROT_ALL; *max_protection = VM_PROT_ALL; @@ -15124,6 +15676,7 @@ vm_map_remap_extract( * VM_PROT_WRITE to the max protection. */ new_entry->inheritance = src_entry->inheritance; + new_entry->protection &= max_prot_for_prot_copy; new_entry->max_protection |= VM_PROT_WRITE; } else { new_entry->inheritance = inheritance; @@ -15140,8 +15693,10 @@ vm_map_remap_extract( * region to be shared across address spaces. */ if (src_entry->used_for_jit == TRUE && !same_map) { +#if CONFIG_EMBEDDED result = KERN_INVALID_ARGUMENT; break; +#endif /* CONFIG_EMBEDDED */ } src_entry->is_shared = TRUE; new_entry->is_shared = TRUE; @@ -15313,7 +15868,7 @@ vm_map_remap_extract( * Free all allocated elements. 
*/ for (src_entry = map_header->links.next; - src_entry != (struct vm_map_entry *)&map_header->links; + src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links); src_entry = new_entry) { new_entry = src_entry->vme_next; _vm_map_store_entry_unlink(map_header, src_entry); @@ -15431,14 +15986,13 @@ vm_map_remap( &insp_entry); for (entry = map_header.links.next; - entry != (struct vm_map_entry *)&map_header.links; + entry != CAST_TO_VM_MAP_ENTRY(&map_header.links); entry = new_entry) { new_entry = entry->vme_next; _vm_map_store_entry_unlink(&map_header, entry); if (result == KERN_SUCCESS) { if (flags & VM_FLAGS_RESILIENT_CODESIGN) { /* no codesigning -> read-only access */ - assert(!entry->used_for_jit); entry->max_protection = VM_PROT_READ; entry->protection = VM_PROT_READ; entry->vme_resilient_codesign = TRUE; @@ -15446,7 +16000,8 @@ vm_map_remap( entry->vme_start += *address; entry->vme_end += *address; assert(!entry->map_aligned); - vm_map_store_entry_link(target_map, insp_entry, entry); + vm_map_store_entry_link(target_map, insp_entry, entry, + vmk_flags); insp_entry = entry; } else { if (!entry->is_sub_map) { @@ -15474,9 +16029,37 @@ vm_map_remap( target_map->size += size; SAVE_HINT_MAP_WRITE(target_map, insp_entry); - } - vm_map_unlock(target_map); - +#if PMAP_CS + if (*max_protection & VM_PROT_EXECUTE) { + vm_map_address_t region_start = 0, region_size = 0; + struct pmap_cs_code_directory *region_cd = NULL; + vm_map_address_t base = 0; + struct pmap_cs_lookup_results results = {}; + vm_map_size_t page_addr = vm_map_trunc_page(memory_address, PAGE_MASK); + vm_map_size_t assoc_size = vm_map_round_page(memory_address + size - page_addr, PAGE_MASK); + + pmap_cs_lookup(src_map->pmap, memory_address, &results); + region_size = results.region_size; + region_start = results.region_start; + region_cd = results.region_cd_entry; + base = results.base; + + if (region_cd != NULL && (page_addr != region_start || assoc_size != region_size)) { + *cur_protection = VM_PROT_READ; + 
*max_protection = VM_PROT_READ; + printf("mismatched remap of executable range 0x%llx-0x%llx to 0x%llx, " + "region_start 0x%llx, region_size 0x%llx, cd_entry %sNULL, making non-executable.\n", + page_addr, page_addr+assoc_size, *address, + region_start, region_size, + region_cd != NULL ? "not " : "" // Don't leak kernel slide + ); + } + } +#endif + + } + vm_map_unlock(target_map); + if (result == KERN_SUCCESS && target_map->wiring_required) result = vm_map_wire_kernel(target_map, *address, *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK, @@ -15511,13 +16094,14 @@ vm_map_remap_range_allocate( vm_map_size_t size, vm_map_offset_t mask, int flags, - __unused vm_map_kernel_flags_t vmk_flags, + vm_map_kernel_flags_t vmk_flags, __unused vm_tag_t tag, vm_map_entry_t *map_entry) /* OUT */ { vm_map_entry_t entry; vm_map_offset_t start; vm_map_offset_t end; + vm_map_offset_t desired_empty_end; kern_return_t kr; vm_map_entry_t hole_entry; @@ -15559,7 +16143,7 @@ StartAgain: ; } else { if (map->holelistenabled) { - hole_entry = (vm_map_entry_t)map->holes_list; + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); if (hole_entry == NULL) { /* @@ -15583,7 +16167,7 @@ StartAgain: ; } hole_entry = hole_entry->vme_next; - } while (hole_entry != (vm_map_entry_t) map->holes_list); + } while (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)); if (found_hole == FALSE) { return (KERN_NO_SPACE); @@ -15630,7 +16214,10 @@ StartAgain: ; start = end; end += size; - if ((end > map->max_offset) || (end < start)) { + /* We want an entire page of empty space, but don't increase the allocation size. 
*/ + desired_empty_end = vm_map_round_page(end, VM_MAP_PAGE_MASK(map)); + + if ((desired_empty_end > map->max_offset) || (desired_empty_end < start)) { if (map->wait_for_space) { if (size <= (map->max_offset - map->min_offset)) { @@ -15648,7 +16235,7 @@ StartAgain: ; next = entry->vme_next; if (map->holelistenabled) { - if (entry->vme_end >= end) + if (entry->vme_end >= desired_empty_end) break; } else { /* @@ -15663,7 +16250,7 @@ StartAgain: ; if (next == vm_map_to_entry(map)) break; - if (next->vme_start >= end) + if (next->vme_start >= desired_empty_end) break; } @@ -15674,7 +16261,7 @@ StartAgain: ; entry = next; if (map->holelistenabled) { - if (entry == (vm_map_entry_t) map->holes_list) { + if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { /* * Wrapped around */ @@ -15726,6 +16313,7 @@ StartAgain: ; */ if (flags & VM_FLAGS_OVERWRITE) { vm_map_t zap_map; + int remove_flags = VM_MAP_REMOVE_SAVE_ENTRIES | VM_MAP_REMOVE_NO_MAP_ALIGN; /* * We use a "zap_map" to avoid having to unlock @@ -15743,9 +16331,11 @@ StartAgain: ; vm_map_set_page_shift(zap_map, VM_MAP_PAGE_SHIFT(map)); vm_map_disable_hole_optimization(zap_map); + if (vmk_flags.vmkf_overwrite_immutable) { + remove_flags |= VM_MAP_REMOVE_IMMUTABLE; + } kr = vm_map_delete(map, start, end, - (VM_MAP_REMOVE_SAVE_ENTRIES | - VM_MAP_REMOVE_NO_MAP_ALIGN), + remove_flags, zap_map); if (kr == KERN_SUCCESS) { vm_map_destroy(zap_map, @@ -16342,10 +16932,76 @@ vm_map_page_range_info_internal( disposition = 0; pmap_disp = 0; - pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp); - if (map_entry->iokit_acct && - object->internal && - object->purgable == VM_PURGABLE_DENY) { + if (map->has_corpse_footprint) { + /* + * Query the page info data we saved + * while forking the corpse. + */ + vm_map_corpse_footprint_query_page_info( + map, + curr_s_offset, + &pmap_disp); + } else { + /* + * Query the pmap. 
+ */ + pmap_query_page_info(map->pmap, + curr_s_offset, + &pmap_disp); + } + if (object->purgable == VM_PURGABLE_NONVOLATILE && + /* && not tagged as no-footprint? */ + VM_OBJECT_OWNER(object) != NULL && + VM_OBJECT_OWNER(object)->map == map) { + if ((((curr_s_offset + - map_entry->vme_start + + VME_OFFSET(map_entry)) + / PAGE_SIZE) < + (object->resident_page_count + + vm_compressor_pager_get_count(object->pager)))) { + /* + * Non-volatile purgeable object owned + * by this task: report the first + * "#resident + #compressed" pages as + * "resident" (to show that they + * contribute to the footprint) but not + * "dirty" (to avoid double-counting + * with the fake "non-volatile" region + * we'll report at the end of the + * address space to account for all + * (mapped or not) non-volatile memory + * owned by this task. + */ + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; + } + } else if ((object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) && + /* && not tagged as no-footprint? */ + VM_OBJECT_OWNER(object) != NULL && + VM_OBJECT_OWNER(object)->map == map) { + if ((((curr_s_offset + - map_entry->vme_start + + VME_OFFSET(map_entry)) + / PAGE_SIZE) < + object->wired_page_count)) { + /* + * Volatile|empty purgeable object owned + * by this task: report the first + * "#wired" pages as "resident" (to + * show that they contribute to the + * footprint) but not "dirty" (to avoid + * double-counting with the fake + * "non-volatile" region we'll report + * at the end of the address space to + * account for all (mapped or not) + * non-volatile memory owned by this + * task. + */ + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; + } + } else if (map_entry->iokit_acct && + object->internal && + object->purgable == VM_PURGABLE_DENY) { /* * Non-purgeable IOKit memory: phys_footprint * includes the entire virtual mapping. 
@@ -16356,7 +17012,25 @@ vm_map_page_range_info_internal( } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT | PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) { /* alternate accounting */ -// assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); +#if CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) + if (map->pmap->footprint_was_suspended || + /* + * XXX corpse does not know if original + * pmap had its footprint suspended... + */ + map->has_corpse_footprint) { + /* + * The assertion below can fail if dyld + * suspended footprint accounting + * while doing some adjustments to + * this page; the mapping would say + * "use pmap accounting" but the page + * would be marked "alternate + * accounting". + */ + } else +#endif /* CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) */ + assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); pmap_disp = 0; } else { if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) { @@ -16478,25 +17152,25 @@ vm_map_page_range_info_internal( if (m != VM_PAGE_NULL) { - if (m->fictitious) { + if (m->vmp_fictitious) { disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS; } else { - if (m->dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) + if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) disposition |= VM_PAGE_QUERY_PAGE_DIRTY; - if (m->reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) + if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) disposition |= VM_PAGE_QUERY_PAGE_REF; - if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) + if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE; - if (m->cs_validated) + if (m->vmp_cs_validated) disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED; - if (m->cs_tainted) + if (m->vmp_cs_tainted) disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED; - if (m->cs_nx) + if (m->vmp_cs_nx) disposition |= VM_PAGE_QUERY_PAGE_CS_NX; } } @@ -16944,10 +17618,10 @@ vm_map_reference( lck_mtx_lock(&map->s_lock); 
#if TASK_SWAPPER assert(map->res_count > 0); - assert(map->ref_count >= map->res_count); + assert(map->map_refcnt >= map->res_count); map->res_count++; #endif - map->ref_count++; + map->map_refcnt++; lck_mtx_unlock(&map->s_lock); } @@ -16968,13 +17642,13 @@ vm_map_deallocate( return; lck_mtx_lock(&map->s_lock); - ref = --map->ref_count; + ref = --map->map_refcnt; if (ref > 0) { vm_map_res_deallocate(map); lck_mtx_unlock(&map->s_lock); return; } - assert(map->ref_count == 0); + assert(map->map_refcnt == 0); lck_mtx_unlock(&map->s_lock); #if TASK_SWAPPER @@ -16986,7 +17660,7 @@ vm_map_deallocate( */ #endif - vm_map_destroy(map, VM_MAP_NO_FLAGS); + vm_map_destroy(map, VM_MAP_REMOVE_NO_FLAGS); } @@ -17035,15 +17709,43 @@ vm_map_set_64bit(vm_map_t map) } /* - * Expand the maximum size of an existing map. + * Expand the maximum size of an existing map to the maximum supported. */ void vm_map_set_jumbo(vm_map_t map) { #if defined (__arm64__) + vm_map_set_max_addr(map, ~0); +#else /* arm64 */ + (void) map; +#endif +} + +/* + * Expand the maximum size of an existing map. + */ +void +vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset) +{ +#if defined(__arm64__) + vm_map_offset_t max_supported_offset = 0; vm_map_offset_t old_max_offset = map->max_offset; - map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_JUMBO); - if (map->holes_list->prev->vme_end == pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE)) { + max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO); + + new_max_offset = trunc_page(new_max_offset); + + /* The address space cannot be shrunk using this routine. */ + if (old_max_offset >= new_max_offset) { + return; + } + + if (max_supported_offset < new_max_offset) { + new_max_offset = max_supported_offset; + } + + map->max_offset = new_max_offset; + + if (map->holes_list->prev->vme_end == old_max_offset) { /* * There is already a hole at the end of the map; simply make it bigger. 
*/ @@ -17061,8 +17763,9 @@ vm_map_set_jumbo(vm_map_t map) map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole; map->holes_list->prev = (struct vm_map_entry *)new_hole; } -#else /* arm64 */ - (void) map; +#else + (void)map; + (void)new_max_offset; #endif } @@ -17336,8 +18039,8 @@ kern_return_t vm_map_sign(vm_map_t map, return KERN_FAILURE; } /* deal with special page status */ - if (m->busy || - (m->unusual && (m->error || m->restart || m->private || m->absent))) { + if (m->vmp_busy || + (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent))) { vm_object_unlock(object); return KERN_FAILURE; } @@ -17345,18 +18048,18 @@ kern_return_t vm_map_sign(vm_map_t map, /* Page is OK... now "validate" it */ /* This is the place where we'll call out to create a code * directory, later */ - m->cs_validated = TRUE; + m->vmp_cs_validated = TRUE; /* The page is now "clean" for codesigning purposes. That means * we don't consider it as modified (wpmapped) anymore. But * we'll disconnect the page so we note any future modification * attempts. 
*/ - m->wpmapped = FALSE; + m->vmp_wpmapped = FALSE; refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); /* Pull the dirty status from the pmap, since we cleared the * wpmapped bit */ - if ((refmod & VM_MEM_MODIFIED) && !m->dirty) { + if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) { SET_PAGE_DIRTY(m, FALSE); } @@ -17490,24 +18193,33 @@ vm_map_disconnect_page_mappings( #if CONFIG_FREEZE -int c_freezer_swapout_count; +int c_freezer_swapout_page_count; int c_freezer_compression_count = 0; AbsoluteTime c_freezer_last_yield_ts = 0; -kern_return_t vm_map_freeze( +extern unsigned int memorystatus_freeze_private_shared_pages_ratio; +extern unsigned int memorystatus_freeze_shared_mb_per_process_max; + +kern_return_t +vm_map_freeze( vm_map_t map, unsigned int *purgeable_count, unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, __unused unsigned int dirty_budget, - boolean_t *has_shared) + unsigned int *shared_count, + int *freezer_error_code, + boolean_t eval_only) { vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL; kern_return_t kr = KERN_SUCCESS; + boolean_t evaluation_phase = TRUE; + vm_object_t cur_shared_object = NULL; + int cur_shared_obj_ref_cnt = 0; + unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0; - *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; - *has_shared = FALSE; + *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0; /* * We need the exclusive lock here so that we can @@ -17519,12 +18231,39 @@ kern_return_t vm_map_freeze( assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { + if (vm_compressor_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE; + } + + if (vm_swap_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE; + } + kr = KERN_NO_SPACE; goto done; } - c_freezer_compression_count = 0; - clock_get_uptime(&c_freezer_last_yield_ts); + if 
(VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) { + /* + * In-memory compressor backing the freezer. No disk. + * So no need to do the evaluation phase. + */ + evaluation_phase = FALSE; + + if (eval_only == TRUE) { + /* + * We don't support 'eval_only' mode + * in this non-swap config. + */ + *freezer_error_code = FREEZER_ERROR_GENERIC; + kr = KERN_INVALID_ARGUMENT; + goto done; + } + + c_freezer_compression_count = 0; + clock_get_uptime(&c_freezer_last_yield_ts); + } +again: for (entry2 = vm_map_first_entry(map); entry2 != vm_map_to_entry(map); @@ -17544,31 +18283,118 @@ kern_return_t vm_map_freeze( * Pages belonging to this object could be swapped to disk. * Make sure it's not a shared object because we could end * up just bringing it back in again. + * + * We try to optimize somewhat by checking for objects that are mapped + * more than once within our own map. But we don't do full searches, + * we just look at the entries following our current entry. */ if (src_object->ref_count > 1) { + if (src_object != cur_shared_object) { + obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager); + dirty_shared_count += obj_pages_snapshot; + + cur_shared_object = src_object; + cur_shared_obj_ref_cnt = 1; + continue; + } else { + cur_shared_obj_ref_cnt++; + if (src_object->ref_count == cur_shared_obj_ref_cnt) { + /* + * Fall through to below and treat this object as private. + * So deduct its pages from our shared total and add it to the + * private total. 
+ */ + + dirty_shared_count -= obj_pages_snapshot; + dirty_private_count += obj_pages_snapshot; + } else { + continue; + } + } + } + + + if (src_object->ref_count == 1) { + dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager); + } + + if (evaluation_phase == TRUE) { + continue; } } + vm_object_compressed_freezer_pageout(src_object); + *wired_count += src_object->wired_page_count; + if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { + if (vm_compressor_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE; + } + + if (vm_swap_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE; + } + kr = KERN_NO_SPACE; break; } } } } + + if (evaluation_phase) { + + unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64; + + if (dirty_shared_count > shared_pages_threshold) { + *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY; + kr = KERN_FAILURE; + goto done; + } + + if (dirty_shared_count && + ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) { + *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO; + kr = KERN_FAILURE; + goto done; + } + + evaluation_phase = FALSE; + dirty_shared_count = dirty_private_count = 0; + + c_freezer_compression_count = 0; + clock_get_uptime(&c_freezer_last_yield_ts); + + if (eval_only) { + kr = KERN_SUCCESS; + goto done; + } + + goto again; + + } else { + + kr = KERN_SUCCESS; + *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL)); + } + done: vm_map_unlock(map); - vm_object_compressed_freezer_done(); + if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) { + vm_object_compressed_freezer_done(); - if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { - /* - * reset the counter tracking the # of swapped c_segs - * because we are now done with this freeze session and task. 
- */ - c_freezer_swapout_count = 0; + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * reset the counter tracking the # of swapped compressed pages + * because we are now done with this freeze session and task. + */ + + *dirty_count = c_freezer_swapout_page_count; //used to track pageouts + c_freezer_swapout_page_count = 0; + } } return kr; } @@ -17915,3 +18741,841 @@ vm_map_set_high_start( map->vmmap_high_start = high_start; } #endif /* __x86_64__ */ + +#if PMAP_CS +kern_return_t +vm_map_entry_cs_associate( + vm_map_t map, + vm_map_entry_t entry, + vm_map_kernel_flags_t vmk_flags) +{ + vm_object_t cs_object, cs_shadow; + vm_object_offset_t cs_offset; + void *cs_blobs; + struct vnode *cs_vnode; + kern_return_t cs_ret; + + if (map->pmap == NULL || + entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */ + VME_OBJECT(entry) == VM_OBJECT_NULL || + ! (entry->protection & VM_PROT_EXECUTE)) { + return KERN_SUCCESS; + } + + vm_map_lock_assert_exclusive(map); + + if (entry->used_for_jit) { + cs_ret = pmap_cs_associate(map->pmap, + PMAP_CS_ASSOCIATE_JIT, + entry->vme_start, + entry->vme_end - entry->vme_start); + goto done; + } + + if (vmk_flags.vmkf_remap_prot_copy) { + cs_ret = pmap_cs_associate(map->pmap, + PMAP_CS_ASSOCIATE_COW, + entry->vme_start, + entry->vme_end - entry->vme_start); + goto done; + } + + vm_object_lock_shared(VME_OBJECT(entry)); + cs_offset = VME_OFFSET(entry); + for (cs_object = VME_OBJECT(entry); + (cs_object != VM_OBJECT_NULL && + !cs_object->code_signed); + cs_object = cs_shadow) { + cs_shadow = cs_object->shadow; + if (cs_shadow != VM_OBJECT_NULL) { + cs_offset += cs_object->vo_shadow_offset; + vm_object_lock_shared(cs_shadow); + } + vm_object_unlock(cs_object); + } + if (cs_object == VM_OBJECT_NULL) { + return KERN_SUCCESS; + } + + cs_offset += cs_object->paging_offset; + cs_vnode = vnode_pager_lookup_vnode(cs_object->pager); + cs_ret = vnode_pager_get_cs_blobs(cs_vnode, + &cs_blobs); + assert(cs_ret == KERN_SUCCESS); + cs_ret = 
cs_associate_blob_with_mapping(map->pmap, + entry->vme_start, + (entry->vme_end - + entry->vme_start), + cs_offset, + cs_blobs); + vm_object_unlock(cs_object); + cs_object = VM_OBJECT_NULL; + + done: + if (cs_ret == KERN_SUCCESS) { + DTRACE_VM2(vm_map_entry_cs_associate_success, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end); + if (vm_map_executable_immutable) { + /* + * Prevent this executable + * mapping from being unmapped + * or modified. + */ + entry->permanent = TRUE; + } + /* + * pmap says it will validate the + * code-signing validity of pages + * faulted in via this mapping, so + * this map entry should be marked so + * that vm_fault() bypasses code-signing + * validation for faults coming through + * this mapping. + */ + entry->pmap_cs_associated = TRUE; + } else if (cs_ret == KERN_NOT_SUPPORTED) { + /* + * pmap won't check the code-signing + * validity of pages faulted in via + * this mapping, so VM should keep + * doing it. + */ + DTRACE_VM3(vm_map_entry_cs_associate_off, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + int, cs_ret); + } else { + /* + * A real error: do not allow + * execution in this mapping. + */ + DTRACE_VM3(vm_map_entry_cs_associate_failure, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + int, cs_ret); + entry->protection &= ~VM_PROT_EXECUTE; + entry->max_protection &= ~VM_PROT_EXECUTE; + } + + return cs_ret; +} +#endif /* PMAP_CS */ + +/* + * FORKED CORPSE FOOTPRINT + * + * A forked corpse gets a copy of the original VM map but its pmap is mostly + * empty since it never ran and never got to fault in any pages. + * Collecting footprint info (via "sysctl vm.self_region_footprint") for + * a forked corpse would therefore return very little information. 
+ * + * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option + * to vm_map_fork() to collect footprint information from the original VM map + * and its pmap, and store it in the forked corpse's VM map. That information + * is stored in place of the VM map's "hole list" since we'll never need to + * look up holes in the corpse's map. + * + * The corpse's footprint info looks like this: + * + * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out + * as follows: + * +---------------------------------------+ + * header-> | cf_size | + * +-------------------+-------------------+ + * | cf_last_region | cf_last_zeroes | + * +-------------------+-------------------+ + * region1-> | cfr_vaddr | + * +-------------------+-------------------+ + * | cfr_num_pages | d0 | d1 | d2 | d3 | + * +---------------------------------------+ + * | d4 | d5 | ... | + * +---------------------------------------+ + * | ... | + * +-------------------+-------------------+ + * | dy | dz | na | na | cfr_vaddr... | <-region2 + * +-------------------+-------------------+ + * | cfr_vaddr (ctd) | cfr_num_pages | + * +---------------------------------------+ + * | d0 | d1 ... | + * +---------------------------------------+ + * ... + * +---------------------------------------+ + * last region-> | cfr_vaddr | + * +---------------------------------------+ + * | cfr_num_pages | d0 | d1 | d2 | d3 | + * +---------------------------------------+ + * ...
+ * +---------------------------------------+ + * | dx | dy | dz | na | na | na | na | na | + * +---------------------------------------+ + * + * where: + * cf_size: total size of the buffer (rounded to page size) + * cf_last_region: offset in the buffer of the last "region" sub-header + * cf_last_zeroes: number of trailing "zero" dispositions at the end + * of last region + * cfr_vaddr: virtual address of the start of the covered "region" + * cfr_num_pages: number of pages in the covered "region" + * d*: disposition of the page at that virtual address + * Regions in the buffer are word-aligned. + * + * We estimate the size of the buffer based on the number of memory regions + * and the virtual size of the address space. While copying each memory region + * during vm_map_fork(), we also collect the footprint info for that region + * and store it in the buffer, packing it as much as possible (coalescing + * contiguous memory regions to avoid having too many region headers and + * avoiding long streaks of "zero" page dispositions by splitting footprint + * "regions"), so the number of regions in the footprint buffer might not match + * the number of memory regions in the address space. + * + * We also have to copy the original task's "nonvolatile" ledgers since that's + * part of the footprint and will need to be reported to any tool asking for + * the footprint information of the forked corpse. + */ + +uint64_t vm_map_corpse_footprint_count = 0; +uint64_t vm_map_corpse_footprint_size_avg = 0; +uint64_t vm_map_corpse_footprint_size_max = 0; +uint64_t vm_map_corpse_footprint_full = 0; +uint64_t vm_map_corpse_footprint_no_buf = 0; + +/* + * vm_map_corpse_footprint_new_region: + * closes the current footprint "region" and creates a new one + * + * Returns NULL if there's not enough space in the buffer for a new region.
+ */ +static struct vm_map_corpse_footprint_region * +vm_map_corpse_footprint_new_region( + struct vm_map_corpse_footprint_header *footprint_header) +{ + uintptr_t footprint_edge; + uint32_t new_region_offset; + struct vm_map_corpse_footprint_region *footprint_region; + struct vm_map_corpse_footprint_region *new_footprint_region; + + footprint_edge = ((uintptr_t)footprint_header + + footprint_header->cf_size); + footprint_region = ((struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region)); + assert((uintptr_t)footprint_region + sizeof (*footprint_region) <= + footprint_edge); + + /* get rid of trailing zeroes in the last region */ + assert(footprint_region->cfr_num_pages >= + footprint_header->cf_last_zeroes); + footprint_region->cfr_num_pages -= + footprint_header->cf_last_zeroes; + footprint_header->cf_last_zeroes = 0; + + /* reuse this region if it's now empty */ + if (footprint_region->cfr_num_pages == 0) { + return footprint_region; + } + + /* compute offset of new region */ + new_region_offset = footprint_header->cf_last_region; + new_region_offset += sizeof (*footprint_region); + new_region_offset += footprint_region->cfr_num_pages; + new_region_offset = roundup(new_region_offset, sizeof (int)); + + /* check if we're going over the edge */ + if (((uintptr_t)footprint_header + + new_region_offset + + sizeof (*footprint_region)) >= + footprint_edge) { + /* over the edge: no new region */ + return NULL; + } + + /* adjust offset of last region in header */ + footprint_header->cf_last_region = new_region_offset; + + new_footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + new_footprint_region->cfr_vaddr = 0; + new_footprint_region->cfr_num_pages = 0; + /* caller needs to initialize new region */ + + return new_footprint_region; +} + +/* + * vm_map_corpse_footprint_collect: + * collect footprint information for "old_entry" in "old_map" 
and + * store it in "new_map"'s vmmap_corpse_footprint. + */ +kern_return_t +vm_map_corpse_footprint_collect( + vm_map_t old_map, + vm_map_entry_t old_entry, + vm_map_t new_map) +{ + vm_map_offset_t va; + int disp; + kern_return_t kr; + struct vm_map_corpse_footprint_header *footprint_header; + struct vm_map_corpse_footprint_region *footprint_region; + struct vm_map_corpse_footprint_region *new_footprint_region; + unsigned char *next_disp_p; + uintptr_t footprint_edge; + uint32_t num_pages_tmp; + + va = old_entry->vme_start; + + vm_map_lock_assert_exclusive(old_map); + vm_map_lock_assert_exclusive(new_map); + + assert(new_map->has_corpse_footprint); + assert(!old_map->has_corpse_footprint); + if (!new_map->has_corpse_footprint || + old_map->has_corpse_footprint) { + /* + * This can only transfer footprint info from a + * map with a live pmap to a map with a corpse footprint. + */ + return KERN_NOT_SUPPORTED; + } + + if (new_map->vmmap_corpse_footprint == NULL) { + vm_offset_t buf; + vm_size_t buf_size; + + buf = 0; + buf_size = (sizeof (*footprint_header) + + (old_map->hdr.nentries + * + (sizeof (*footprint_region) + + + 3)) /* potential alignment for each region */ + + + ((old_map->size / PAGE_SIZE) + * + sizeof (char))); /* disposition for each page */ +// printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size); + buf_size = round_page(buf_size); + + /* limit buffer to 1 page to validate overflow detection */ +// buf_size = PAGE_SIZE; + + /* limit size to a somewhat sane amount */ +#if CONFIG_EMBEDDED +#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */ +#else /* CONFIG_EMBEDDED */ +#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */ +#endif /* CONFIG_EMBEDDED */ + if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) { + buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE; + } + + /* + * Allocate the pageable buffer (with a trailing guard page). + * It will be zero-filled on demand.
+ */ + kr = kernel_memory_allocate(kernel_map, + &buf, + (buf_size + + PAGE_SIZE), /* trailing guard page */ + 0, /* mask */ + KMA_PAGEABLE | KMA_GUARD_LAST, + VM_KERN_MEMORY_DIAG); + if (kr != KERN_SUCCESS) { + vm_map_corpse_footprint_no_buf++; + return kr; + } + + /* initialize header and 1st region */ + footprint_header = (struct vm_map_corpse_footprint_header *)buf; + new_map->vmmap_corpse_footprint = footprint_header; + + footprint_header->cf_size = buf_size; + footprint_header->cf_last_region = + sizeof (*footprint_header); + footprint_header->cf_last_zeroes = 0; + + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + footprint_region->cfr_vaddr = 0; + footprint_region->cfr_num_pages = 0; + } else { + /* retrieve header and last region */ + footprint_header = (struct vm_map_corpse_footprint_header *) + new_map->vmmap_corpse_footprint; + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + } + footprint_edge = ((uintptr_t)footprint_header + + footprint_header->cf_size); + + if ((footprint_region->cfr_vaddr + + (((vm_map_offset_t)footprint_region->cfr_num_pages) * + PAGE_SIZE)) + != old_entry->vme_start) { + uint64_t num_pages_delta; + uint32_t region_offset_delta; + + /* + * Not the next contiguous virtual address: + * start a new region or store "zero" dispositions for + * the missing pages? 
+ */ + /* size of gap in actual page dispositions */ + num_pages_delta = (((old_entry->vme_start - + footprint_region->cfr_vaddr) / PAGE_SIZE) + - footprint_region->cfr_num_pages); + /* size of gap as a new footprint region header */ + region_offset_delta = + (sizeof (*footprint_region) + + roundup((footprint_region->cfr_num_pages - + footprint_header->cf_last_zeroes), + sizeof (int)) - + (footprint_region->cfr_num_pages - + footprint_header->cf_last_zeroes)); +// printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta); + if (region_offset_delta < num_pages_delta || + os_add3_overflow(footprint_region->cfr_num_pages, + (uint32_t) num_pages_delta, + 1, + &num_pages_tmp)) { + /* + * Storing data for this gap would take more space + * than inserting a new footprint region header: + * let's start a new region and save space. If it's a + * tie, let's avoid using a new region, since that + * would require more region hops to find the right + * range during lookups. + * + * If the current region's cfr_num_pages would overflow + * if we added "zero" page dispositions for the gap, + * no choice but to start a new region. + */ +// printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__); + new_footprint_region = + vm_map_corpse_footprint_new_region(footprint_header); + /* check that we're not going over the edge */ + if (new_footprint_region == NULL) { + goto over_the_edge; + } + footprint_region = new_footprint_region; + /* initialize new region as empty */ + footprint_region->cfr_vaddr = old_entry->vme_start; + footprint_region->cfr_num_pages = 0; + } else { + /* + * Store "zero" page dispositions for the missing + * pages. 
+ */ +// printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__); + for (; num_pages_delta > 0; num_pages_delta--) { + next_disp_p = + ((unsigned char *) footprint_region + + sizeof (*footprint_region) + + footprint_region->cfr_num_pages); + /* check that we're not going over the edge */ + if ((uintptr_t)next_disp_p >= footprint_edge) { + goto over_the_edge; + } + /* store "zero" disposition for this gap page */ + footprint_region->cfr_num_pages++; + *next_disp_p = (unsigned char) 0; + footprint_header->cf_last_zeroes++; + } + } + } + + for (va = old_entry->vme_start; + va < old_entry->vme_end; + va += PAGE_SIZE) { + vm_object_t object; + + object = VME_OBJECT(old_entry); + if (!old_entry->is_sub_map && + old_entry->iokit_acct && + object != VM_OBJECT_NULL && + object->internal && + object->purgable == VM_PURGABLE_DENY) { + /* + * Non-purgeable IOKit memory: phys_footprint + * includes the entire virtual mapping. + * Since the forked corpse's VM map entry will not + * have "iokit_acct", pretend that this page's + * disposition is "present & internal", so that it + * shows up in the forked corpse's footprint. + */ + disp = (PMAP_QUERY_PAGE_PRESENT | + PMAP_QUERY_PAGE_INTERNAL); + } else { + disp = 0; + pmap_query_page_info(old_map->pmap, + va, + &disp); + } + +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp); + + if (disp == 0 && footprint_region->cfr_num_pages == 0) { + /* + * Ignore "zero" dispositions at start of + * region: just move start of region. + */ + footprint_region->cfr_vaddr += PAGE_SIZE; + continue; + } + + /* would region's cfr_num_pages overflow? 
*/ + if (os_add_overflow(footprint_region->cfr_num_pages, 1, + &num_pages_tmp)) { + /* overflow: create a new region */ + new_footprint_region = + vm_map_corpse_footprint_new_region( + footprint_header); + if (new_footprint_region == NULL) { + goto over_the_edge; + } + footprint_region = new_footprint_region; + footprint_region->cfr_vaddr = va; + footprint_region->cfr_num_pages = 0; + } + + next_disp_p = ((unsigned char *)footprint_region + + sizeof (*footprint_region) + + footprint_region->cfr_num_pages); + /* check that we're not going over the edge */ + if ((uintptr_t)next_disp_p >= footprint_edge) { + goto over_the_edge; + } + /* store this disposition */ + *next_disp_p = (unsigned char) disp; + footprint_region->cfr_num_pages++; + + if (disp != 0) { + /* non-zero disp: break the current zero streak */ + footprint_header->cf_last_zeroes = 0; + /* done */ + continue; + } + + /* zero disp: add to the current streak of zeroes */ + footprint_header->cf_last_zeroes++; + if ((footprint_header->cf_last_zeroes + + roundup((footprint_region->cfr_num_pages - + footprint_header->cf_last_zeroes) & + (sizeof (int) - 1), + sizeof (int))) < + (sizeof (*footprint_header))) { + /* + * There are not enough trailing "zero" dispositions + * (+ the extra padding we would need for the previous + * region); creating a new region would not save space + * at this point, so let's keep this "zero" disposition + * in this region and reconsider later. + */ + continue; + } + /* + * Create a new region to avoid having too many consecutive + * "zero" dispositions. + */ + new_footprint_region = + vm_map_corpse_footprint_new_region(footprint_header); + if (new_footprint_region == NULL) { + goto over_the_edge; + } + footprint_region = new_footprint_region; + /* initialize the new region as empty ... */ + footprint_region->cfr_num_pages = 0; + /* ...
and skip this "zero" disp */ + footprint_region->cfr_vaddr = va + PAGE_SIZE; + } + + return KERN_SUCCESS; + +over_the_edge: +// printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va); + vm_map_corpse_footprint_full++; + return KERN_RESOURCE_SHORTAGE; +} + +/* + * vm_map_corpse_footprint_collect_done: + * completes the footprint collection by getting rid of any remaining + * trailing "zero" dispositions and trimming the unused part of the + * kernel buffer + */ +void +vm_map_corpse_footprint_collect_done( + vm_map_t new_map) +{ + struct vm_map_corpse_footprint_header *footprint_header; + struct vm_map_corpse_footprint_region *footprint_region; + vm_size_t buf_size, actual_size; + kern_return_t kr; + + assert(new_map->has_corpse_footprint); + if (!new_map->has_corpse_footprint || + new_map->vmmap_corpse_footprint == NULL) { + return; + } + + footprint_header = (struct vm_map_corpse_footprint_header *) + new_map->vmmap_corpse_footprint; + buf_size = footprint_header->cf_size; + + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + + /* get rid of trailing zeroes in last region */ + assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes); + footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes; + footprint_header->cf_last_zeroes = 0; + + actual_size = (vm_size_t)(footprint_header->cf_last_region + + sizeof (*footprint_region) + + footprint_region->cfr_num_pages); + +// printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size); + vm_map_corpse_footprint_size_avg = + (((vm_map_corpse_footprint_size_avg * + vm_map_corpse_footprint_count) + + actual_size) / + (vm_map_corpse_footprint_count + 1)); + vm_map_corpse_footprint_count++; + if (actual_size > vm_map_corpse_footprint_size_max) { + vm_map_corpse_footprint_size_max = actual_size; + } + + actual_size = round_page(actual_size); + if 
(buf_size > actual_size) { + kr = vm_deallocate(kernel_map, + ((vm_address_t)footprint_header + + actual_size + + PAGE_SIZE), /* trailing guard page */ + (buf_size - actual_size)); + assertf(kr == KERN_SUCCESS, + "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n", + footprint_header, + (uint64_t) buf_size, + (uint64_t) actual_size, + kr); + kr = vm_protect(kernel_map, + ((vm_address_t)footprint_header + + actual_size), + PAGE_SIZE, + FALSE, /* set_maximum */ + VM_PROT_NONE); + assertf(kr == KERN_SUCCESS, + "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n", + footprint_header, + (uint64_t) buf_size, + (uint64_t) actual_size, + kr); + } + + footprint_header->cf_size = actual_size; +} + +/* + * vm_map_corpse_footprint_query_page_info: + * retrieves the disposition of the page at virtual address "vaddr" + * in the forked corpse's VM map + * + * This is the equivalent of pmap_query_page_info() for a forked corpse. + */ +kern_return_t +vm_map_corpse_footprint_query_page_info( + vm_map_t map, + vm_map_offset_t va, + int *disp) +{ + struct vm_map_corpse_footprint_header *footprint_header; + struct vm_map_corpse_footprint_region *footprint_region; + uint32_t footprint_region_offset; + vm_map_offset_t region_start, region_end; + int disp_idx; + kern_return_t kr; + + if (!map->has_corpse_footprint) { + *disp = 0; + kr = KERN_INVALID_ARGUMENT; + goto done; + } + + footprint_header = map->vmmap_corpse_footprint; + if (footprint_header == NULL) { + *disp = 0; +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disp); + kr = KERN_INVALID_ARGUMENT; + goto done; + } + + /* start looking at the hint ("cf_hint_region") */ + footprint_region_offset = footprint_header->cf_hint_region; + +lookup_again: + if (footprint_region_offset < sizeof (*footprint_header)) { + /* hint too low: start from 1st region */ + footprint_region_offset = sizeof (*footprint_header); + } + if 
(footprint_region_offset >= footprint_header->cf_last_region) { + /* hint too high: re-start from 1st region */ + footprint_region_offset = sizeof (*footprint_header); + } + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + footprint_region_offset); + region_start = footprint_region->cfr_vaddr; + region_end = (region_start + + ((vm_map_offset_t)(footprint_region->cfr_num_pages) * + PAGE_SIZE)); + if (va < region_start && + footprint_region_offset != sizeof (*footprint_header)) { + /* our range starts before the hint region */ + + /* reset the hint (in a racy way...) */ + footprint_header->cf_hint_region = sizeof (*footprint_header); + /* lookup "va" again from 1st region */ + footprint_region_offset = sizeof (*footprint_header); + goto lookup_again; + } + + while (va >= region_end) { + if (footprint_region_offset >= footprint_header->cf_last_region) { + break; + } + /* skip the region's header */ + footprint_region_offset += sizeof (*footprint_region); + /* skip the region's page dispositions */ + footprint_region_offset += footprint_region->cfr_num_pages; + /* align to next word boundary */ + footprint_region_offset = + roundup(footprint_region_offset, + sizeof (int)); + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + footprint_region_offset); + region_start = footprint_region->cfr_vaddr; + region_end = (region_start + + ((vm_map_offset_t)(footprint_region->cfr_num_pages) * + PAGE_SIZE)); + } + if (va < region_start || va >= region_end) { + /* page not found */ + *disp = 0; +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disp); + kr = KERN_SUCCESS; + goto done; + } + + /* "va" found: set the lookup hint for next lookup (in a racy way...) 
*/ + footprint_header->cf_hint_region = footprint_region_offset; + + /* get page disposition for "va" in this region */ + disp_idx = (int) ((va - footprint_region->cfr_vaddr) / PAGE_SIZE); + *disp = (int) (footprint_region->cfr_disposition[disp_idx]); + + kr = KERN_SUCCESS; +done: +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disp); + /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */ + DTRACE_VM4(footprint_query_page_info, + vm_map_t, map, + vm_map_offset_t, va, + int, *disp, + kern_return_t, kr); + + return kr; +} + + +static void +vm_map_corpse_footprint_destroy( + vm_map_t map) +{ + if (map->has_corpse_footprint && + map->vmmap_corpse_footprint != 0) { + struct vm_map_corpse_footprint_header *footprint_header; + vm_size_t buf_size; + kern_return_t kr; + + footprint_header = map->vmmap_corpse_footprint; + buf_size = footprint_header->cf_size; + kr = vm_deallocate(kernel_map, + (vm_offset_t) map->vmmap_corpse_footprint, + ((vm_size_t) buf_size + + PAGE_SIZE)); /* trailing guard page */ + assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr); + map->vmmap_corpse_footprint = 0; + map->has_corpse_footprint = FALSE; + } +} + +/* + * vm_map_copy_footprint_ledgers: + * copies any ledger that's relevant to the memory footprint of "old_task" + * into the forked corpse's task ("new_task") + */ +void +vm_map_copy_footprint_ledgers( + task_t old_task, + task_t new_task) +{ + vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile); + vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.internal); + vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped); + 
vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting); + vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table); + vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile); + vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem); +} + +/* + * vm_map_copy_ledger: + * copy a single ledger from "old_task" to "new_task" + */ +void +vm_map_copy_ledger( + task_t old_task, + task_t new_task, + int ledger_entry) +{ + ledger_amount_t old_balance, new_balance, delta; + + assert(new_task->map->has_corpse_footprint); + if (!new_task->map->has_corpse_footprint) + return; + + /* turn off sanity checks for the ledger we're about to mess with */ + ledger_disable_panic_on_negative(new_task->ledger, + ledger_entry); + + /* adjust "new_task" to match "old_task" */ + ledger_get_balance(old_task->ledger, + ledger_entry, + &old_balance); + ledger_get_balance(new_task->ledger, + ledger_entry, + &new_balance); + if (new_balance == old_balance) { + /* new == old: done */ + } else if (new_balance > old_balance) { + /* new > old ==> new -= new - old */ + delta = new_balance - old_balance; + ledger_debit(new_task->ledger, + ledger_entry, + delta); + } else { + /* new < old ==> new += old - new */ + delta = old_balance - new_balance; + ledger_credit(new_task->ledger, + ledger_entry, + delta); + } +} diff --git a/osfmk/vm/vm_map.h b/osfmk/vm/vm_map.h index 23592b8e4..44cef715d 100644 --- a/osfmk/vm/vm_map.h +++ b/osfmk/vm/vm_map.h @@ -91,11 +91,12 @@ extern vm_map_t current_map(void); /* Setup reserved areas in a new VM map */ extern kern_return_t vm_map_exec( - vm_map_t new_map, - task_t task, - boolean_t is64bit, - void *fsroot, - cpu_type_t cpu); + vm_map_t new_map, + task_t task, + boolean_t is64bit, + void *fsroot, + cpu_type_t cpu, + cpu_subtype_t cpu_subtype); 
__END_DECLS @@ -147,6 +148,9 @@ typedef union vm_map_object { #define named_entry_lock_destroy(object) lck_mtx_destroy(&(object)->Lock, &vm_object_lck_grp) #define named_entry_lock(object) lck_mtx_lock(&(object)->Lock) #define named_entry_unlock(object) lck_mtx_unlock(&(object)->Lock) +#if VM_NAMED_ENTRY_LIST +extern queue_head_t vm_named_entry_list; +#endif /* VM_NAMED_ENTRY_LIST */ /* * Type: vm_named_entry_t [internal use only] @@ -182,6 +186,13 @@ struct vm_named_entry { /* boolean_t */ internal:1, /* ... an internal object */ /* boolean_t */ is_sub_map:1, /* ... a submap? */ /* boolean_t */ is_copy:1; /* ... a VM map copy */ +#if VM_NAMED_ENTRY_LIST + queue_chain_t named_entry_list; + int named_entry_alias; + mach_port_t named_entry_port; +#define NAMED_ENTRY_BT_DEPTH 16 + void *named_entry_bt[NAMED_ENTRY_BT_DEPTH]; +#endif /* VM_NAMED_ENTRY_LIST */ }; /* @@ -323,6 +334,7 @@ struct vm_map_entry { * this entry it is being deleted * without unwiring them */ /* boolean_t */ used_for_jit:1, + /* boolean_t */ pmap_cs_associated:1, /* pmap_cs will validate */ /* boolean_t */ from_reserved_zone:1, /* Allocated from * kernel reserved zone */ @@ -331,7 +343,7 @@ struct vm_map_entry { /* boolean_t */ vme_resilient_codesign:1, /* boolean_t */ vme_resilient_media:1, /* boolean_t */ vme_atomic:1, /* entry cannot be split/coalesced */ - __unused:5; + __unused:4; ; unsigned short wired_count; /* can be paged if = 0 */ @@ -405,7 +417,7 @@ struct vm_map_header { * quickly find free space. 
*/ struct _vm_map { - lck_rw_t lock; /* map lock */ + lck_rw_t lock; /* map lock */ struct vm_map_header hdr; /* Map entry header */ #define min_offset hdr.links.start /* start of range */ #define max_offset hdr.links.end /* end of range */ @@ -433,24 +445,30 @@ struct _vm_map { } vmu1; #define highest_entry_end vmu1.vmu1_highest_entry_end #define lowest_unnestable_start vmu1.vmu1_lowest_unnestable_start - - int ref_count; /* Reference count */ -#if TASK_SWAPPER - int res_count; /* Residence count (swap) */ - int sw_state; /* Swap state */ -#endif /* TASK_SWAPPER */ decl_lck_mtx_data(, s_lock) /* Lock ref, res fields */ lck_mtx_ext_t s_lock_ext; vm_map_entry_t hint; /* hint for quick lookups */ - struct vm_map_links* hole_hint; /* hint for quick hole lookups */ + union { + struct vm_map_links* vmmap_hole_hint; /* hint for quick hole lookups */ + struct vm_map_corpse_footprint_header *vmmap_corpse_footprint; + } vmmap_u_1; +#define hole_hint vmmap_u_1.vmmap_hole_hint +#define vmmap_corpse_footprint vmmap_u_1.vmmap_corpse_footprint union{ vm_map_entry_t _first_free; /* First free space hint */ struct vm_map_links* _holes; /* links all holes between entries */ - }f_s; /* Union for free space data structures being used */ + } f_s; /* Union for free space data structures being used */ #define first_free f_s._first_free #define holes_list f_s._holes + int map_refcnt; /* Reference count */ + +#if TASK_SWAPPER + int res_count; /* Residence count (swap) */ + int sw_state; /* Swap state */ +#endif /* TASK_SWAPPER */ + unsigned int /* boolean_t */ wait_for_space:1, /* Should callers wait for space? */ /* boolean_t */ wiring_required:1, /* All memory wired? 
*/ @@ -462,14 +480,15 @@ struct _vm_map { /* boolean_t */ holelistenabled:1, /* boolean_t */ is_nested_map:1, /* boolean_t */ map_disallow_new_exec:1, /* Disallow new executable code */ - /* reserved */ pad:22; + /* boolean_t */ jit_entry_exists:1, + /* boolean_t */ has_corpse_footprint:1, + /* boolean_t */ warned_delete_gap:1, + /* reserved */ pad:19; unsigned int timestamp; /* Version number */ - unsigned int color_rr; /* next color (not protected by a lock) */ - - boolean_t jit_entry_exists; -} ; +}; -#define vm_map_to_entry(map) ((struct vm_map_entry *) &(map)->hdr.links) +#define CAST_TO_VM_MAP_ENTRY(x) ((struct vm_map_entry *)(uintptr_t)(x)) +#define vm_map_to_entry(map) CAST_TO_VM_MAP_ENTRY(&(map)->hdr.links) #define vm_map_first_entry(map) ((map)->hdr.links.next) #define vm_map_last_entry(map) ((map)->hdr.links.prev) @@ -563,8 +582,7 @@ struct vm_map_copy { * Useful macros for entry list copy objects */ -#define vm_map_copy_to_entry(copy) \ - ((struct vm_map_entry *) &(copy)->cpy_hdr.links) +#define vm_map_copy_to_entry(copy) CAST_TO_VM_MAP_ENTRY(&(copy)->cpy_hdr.links) #define vm_map_copy_first_entry(copy) \ ((copy)->cpy_hdr.links.next) #define vm_map_copy_last_entry(copy) \ @@ -745,7 +763,7 @@ MACRO_BEGIN \ if (Map) { \ lck_mtx_lock(&Map->s_lock); \ Map->res_count++; \ - Map->ref_count++; \ + Map->map_refcnt++; \ lck_mtx_unlock(&Map->s_lock); \ } \ MACRO_END @@ -780,7 +798,7 @@ MACRO_END MACRO_BEGIN \ vm_map_t Map = (map); \ lck_mtx_lock(&Map->s_lock); \ - ++Map->ref_count; \ + ++Map->map_refcnt; \ vm_map_res_reference(Map); \ lck_mtx_unlock(&Map->s_lock); \ MACRO_END @@ -799,7 +817,7 @@ MACRO_BEGIN \ vm_map_t Map = (map); \ if (Map) { \ lck_mtx_lock(&Map->s_lock); \ - Map->ref_count++; \ + Map->map_refcnt++; \ lck_mtx_unlock(&Map->s_lock); \ } \ MACRO_END @@ -953,6 +971,7 @@ extern vm_map_t vm_map_fork( int options); #define VM_MAP_FORK_SHARE_IF_INHERIT_NONE 0x00000001 #define VM_MAP_FORK_PRESERVE_PURGEABLE 0x00000002 +#define 
VM_MAP_FORK_CORPSE_FOOTPRINT 0x00000004 /* Change inheritance */ extern kern_return_t vm_map_inherit( @@ -1049,6 +1068,13 @@ extern kern_return_t vm_map_set_cache_attr( extern int override_nx(vm_map_t map, uint32_t user_tag); +#if PMAP_CS +extern kern_return_t vm_map_entry_cs_associate( + vm_map_t map, + vm_map_entry_t entry, + vm_map_kernel_flags_t vmk_flags); +#endif /* PMAP_CS */ + extern void vm_map_region_top_walk( vm_map_entry_t entry, vm_region_top_info_t top); @@ -1062,6 +1088,46 @@ extern void vm_map_region_walk( boolean_t look_for_pages, mach_msg_type_number_t count); + +struct vm_map_corpse_footprint_header { + vm_size_t cf_size; /* allocated buffer size */ + uint32_t cf_last_region; /* offset of last region in buffer */ + union { + uint32_t cfu_last_zeroes; /* during creation: + * number of "zero" dispositions at + * end of last region */ + uint32_t cfu_hint_region; /* during lookup: + * offset of last looked up region */ +#define cf_last_zeroes cfu.cfu_last_zeroes +#define cf_hint_region cfu.cfu_hint_region + } cfu; +}; +struct vm_map_corpse_footprint_region { + vm_map_offset_t cfr_vaddr; /* region start virtual address */ + uint32_t cfr_num_pages; /* number of pages in this "region" */ + unsigned char cfr_disposition[0]; /* disposition of each page */ +} __attribute__((packed)); + +extern kern_return_t vm_map_corpse_footprint_collect( + vm_map_t old_map, + vm_map_entry_t old_entry, + vm_map_t new_map); +extern void vm_map_corpse_footprint_collect_done( + vm_map_t new_map); + +extern kern_return_t vm_map_corpse_footprint_query_page_info( + vm_map_t map, + vm_map_offset_t va, + int *disp); + +extern void vm_map_copy_footprint_ledgers( + task_t old_task, + task_t new_task); +extern void vm_map_copy_ledger( + task_t old_task, + task_t new_task, + int ledger_entry); + #endif /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -1072,6 +1138,15 @@ extern vm_map_t vm_map_create( vm_map_offset_t min_off, vm_map_offset_t max_off, boolean_t pageable); +extern vm_map_t 
vm_map_create_options( + pmap_t pmap, + vm_map_offset_t min_off, + vm_map_offset_t max_off, + int options); +#define VM_MAP_CREATE_PAGEABLE 0x00000001 +#define VM_MAP_CREATE_CORPSE_FOOTPRINT 0x00000002 +#define VM_MAP_CREATE_ALL_OPTIONS (VM_MAP_CREATE_PAGEABLE | \ + VM_MAP_CREATE_CORPSE_FOOTPRINT) extern void vm_map_disable_hole_optimization(vm_map_t map); @@ -1320,6 +1395,9 @@ extern void vm_map_set_32bit( extern void vm_map_set_jumbo( vm_map_t map); +extern void vm_map_set_max_addr( + vm_map_t map, vm_map_offset_t new_max_offset); + extern boolean_t vm_map_has_hard_pagezero( vm_map_t map, vm_map_offset_t pagezero_size); @@ -1479,7 +1557,7 @@ extern kern_return_t vm_map_set_page_shift(vm_map_t map, int pageshift); /* * Flags for vm_map_remove() and vm_map_delete() */ -#define VM_MAP_NO_FLAGS 0x0 +#define VM_MAP_REMOVE_NO_FLAGS 0x0 #define VM_MAP_REMOVE_KUNWIRE 0x1 #define VM_MAP_REMOVE_INTERRUPTIBLE 0x2 #define VM_MAP_REMOVE_WAIT_FOR_KWIRE 0x4 @@ -1488,6 +1566,7 @@ extern kern_return_t vm_map_set_page_shift(vm_map_t map, int pageshift); #define VM_MAP_REMOVE_NO_MAP_ALIGN 0x20 #define VM_MAP_REMOVE_NO_UNNESTING 0x40 #define VM_MAP_REMOVE_IMMUTABLE 0x80 +#define VM_MAP_REMOVE_GAPS_OK 0x100 /* Support for UPLs from vm_maps */ @@ -1535,13 +1614,23 @@ extern int vm_map_disconnect_page_mappings( #if CONFIG_FREEZE extern kern_return_t vm_map_freeze( - vm_map_t map, + vm_map_t map, unsigned int *purgeable_count, unsigned int *wired_count, unsigned int *clean_count, unsigned int *dirty_count, unsigned int dirty_budget, - boolean_t *has_shared); + unsigned int *shared_count, + int *freezer_error_code, + boolean_t eval_only); + + +#define FREEZER_ERROR_GENERIC (-1) +#define FREEZER_ERROR_EXCESS_SHARED_MEMORY (-2) +#define FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO (-3) +#define FREEZER_ERROR_NO_COMPRESSOR_SPACE (-4) +#define FREEZER_ERROR_NO_SWAP_SPACE (-5) + #endif __END_DECLS diff --git a/osfmk/vm/vm_map_store.c b/osfmk/vm/vm_map_store.c index 26b3477a4..8690d27ad 100644 --- 
a/osfmk/vm/vm_map_store.c +++ b/osfmk/vm/vm_map_store.c @@ -96,33 +96,6 @@ vm_map_store_update( vm_map_t map, vm_map_entry_t entry, int update_type ) } } -void vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy) -{ - if (__improbable(vm_debug_events)) { - vm_map_entry_t entry; - for (entry = vm_map_copy_first_entry(copy); entry != vm_map_copy_to_entry(copy); entry = entry->vme_next) { - DTRACE_VM4(map_entry_link_copy, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->links.start, vm_address_t, entry->links.end); - } - } - - if (map->holelistenabled) { - vm_map_entry_t entry = NULL; - - entry = vm_map_copy_first_entry(copy); - while (entry != vm_map_copy_to_entry(copy)) { - vm_map_store_update_first_free(map, entry, TRUE); - entry = entry->vme_next; - } - } - - vm_map_store_copy_insert_ll(map, after_where, copy); -#ifdef VM_MAP_STORE_USE_RB - if (vm_map_store_has_RB_support( &map->hdr )) { - vm_map_store_copy_insert_rb(map, after_where, copy); - } -#endif -} - /* * vm_map_entry_{un,}link: * @@ -156,7 +129,11 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh } void -vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) +vm_map_store_entry_link( + vm_map_t map, + vm_map_entry_t after_where, + vm_map_entry_t entry, + vm_map_kernel_flags_t vmk_flags) { vm_map_t VMEL_map; vm_map_entry_t VMEL_entry; @@ -174,6 +151,11 @@ vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_ } #endif } +#if PMAP_CS + (void) vm_map_entry_cs_associate(map, entry, vmk_flags); +#else /* PMAP_CS */ + (void) vmk_flags; +#endif /* PMAP_CS */ } void diff --git a/osfmk/vm/vm_map_store.h b/osfmk/vm/vm_map_store.h index cc8b60df4..8a0641c70 100644 --- a/osfmk/vm/vm_map_store.h +++ b/osfmk/vm/vm_map_store.h @@ -132,11 +132,10 @@ void vm_map_store_init( struct vm_map_header* ); boolean_t vm_map_store_lookup_entry( struct _vm_map*, vm_map_offset_t, struct 
vm_map_entry**); void vm_map_store_update( struct _vm_map*, struct vm_map_entry*, int); void _vm_map_store_entry_link( struct vm_map_header *, struct vm_map_entry*, struct vm_map_entry*); -void vm_map_store_entry_link( struct _vm_map*, struct vm_map_entry*, struct vm_map_entry*); +void vm_map_store_entry_link( struct _vm_map*, struct vm_map_entry*, struct vm_map_entry*, vm_map_kernel_flags_t); void _vm_map_store_entry_unlink( struct vm_map_header *, struct vm_map_entry*); void vm_map_store_entry_unlink( struct _vm_map*, struct vm_map_entry*); void vm_map_store_update_first_free( struct _vm_map*, struct vm_map_entry*, boolean_t new_entry_creation); -void vm_map_store_copy_insert( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*); void vm_map_store_copy_reset( struct vm_map_copy*, struct vm_map_entry*); #if MACH_ASSERT boolean_t first_free_is_valid_store( struct _vm_map*); diff --git a/osfmk/vm/vm_map_store_ll.c b/osfmk/vm/vm_map_store_ll.c index c7c1afd98..5f33f8c0c 100644 --- a/osfmk/vm/vm_map_store_ll.c +++ b/osfmk/vm/vm_map_store_ll.c @@ -238,12 +238,6 @@ vm_map_store_entry_unlink_ll( struct vm_map_header *mapHdr, vm_map_entry_t entry _vm_map_entry_unlink_ll( mapHdr, entry); } -void -vm_map_store_copy_insert_ll( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy) -{ - _vm_map_copy_insert_ll( map, after_where, copy); -} - void vm_map_store_copy_reset_ll( vm_map_copy_t copy, __unused vm_map_entry_t entry, __unused int nentries) { diff --git a/osfmk/vm/vm_map_store_ll.h b/osfmk/vm/vm_map_store_ll.h index 0bbe00d48..0c15b914d 100644 --- a/osfmk/vm/vm_map_store_ll.h +++ b/osfmk/vm/vm_map_store_ll.h @@ -38,7 +38,6 @@ boolean_t vm_map_store_lookup_entry_ll( struct _vm_map*, vm_map_offset_t, struct void vm_map_store_entry_link_ll( struct vm_map_header*, struct vm_map_entry*, struct vm_map_entry*); void vm_map_store_entry_unlink_ll( struct vm_map_header*, struct vm_map_entry*); void update_first_free_ll(struct _vm_map*, struct vm_map_entry*); -void 
vm_map_store_copy_insert_ll( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*); void vm_map_store_copy_reset_ll( struct vm_map_copy*, struct vm_map_entry*, int); #endif /* _VM_VM_MAP_STORE_LL_H */ diff --git a/osfmk/vm/vm_map_store_rb.c b/osfmk/vm/vm_map_store_rb.c index 70fb9be4c..9485f0cb8 100644 --- a/osfmk/vm/vm_map_store_rb.c +++ b/osfmk/vm/vm_map_store_rb.c @@ -120,33 +120,6 @@ void vm_map_store_entry_unlink_rb( struct vm_map_header *mapHdr, vm_map_entry_t RB_REMOVE( rb_head, rbh, store ); } -void vm_map_store_copy_insert_rb( vm_map_t map, __unused vm_map_entry_t after_where, vm_map_copy_t copy) -{ - struct vm_map_header *mapHdr = &(map->hdr); - struct rb_head *rbh = &(mapHdr->rb_head_store); - struct vm_map_store *store; - vm_map_entry_t entry = vm_map_copy_first_entry(copy); - int inserted=0, nentries = copy->cpy_hdr.nentries; - - while (entry != vm_map_copy_to_entry(copy) && nentries > 0) { - vm_map_entry_t prev = entry; - store = &(entry->store); - if( RB_INSERT( rb_head, rbh, store ) != NULL){ - panic("VMSCIR1: INSERT FAILED: %d: %p, %p, %p, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx",inserted, prev, entry, vm_map_copy_to_entry(copy), - (uintptr_t)prev->vme_start, (uintptr_t)prev->vme_end, (uintptr_t)entry->vme_start, (uintptr_t)entry->vme_end, - (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_start, (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_end); - } else { -#if MAP_ENTRY_INSERTION_DEBUG - backtrace(&entry->vme_insertion_bt[0], - (sizeof (entry->vme_insertion_bt) / sizeof (uintptr_t))); -#endif - entry = entry->vme_next; - inserted++; - nentries--; - } - } -} - void vm_map_store_copy_reset_rb( vm_map_copy_t copy, vm_map_entry_t entry, int nentries ) { @@ -200,9 +173,9 @@ vm_map_delete_hole(vm_map_t map, vm_map_entry_t hole_entry); void vm_map_delete_hole(vm_map_t map, vm_map_entry_t hole_entry) { - if (hole_entry == (vm_map_entry_t) map->holes_list) { + if (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { - if (hole_entry->vme_next == 
(vm_map_entry_t) map->holes_list) { + if (hole_entry->vme_next == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { map->holes_list = NULL; SAVE_HINT_HOLE_WRITE(map, NULL); @@ -322,7 +295,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) #endif /* DEBUG */ boolean_t create_new_hole = TRUE; - hole_entry = (vm_map_entry_t) map->hole_hint; + hole_entry = CAST_TO_VM_MAP_ENTRY(map->hole_hint); if (hole_entry) { @@ -334,7 +307,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) } else if (hole_entry->vme_start == old_entry->vme_end) { - if (hole_entry != (vm_map_entry_t) map->holes_list) { + if (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)) { /* * Found a hole right after below our entry but @@ -352,10 +325,10 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) * Useless hint. Start from the top. */ - hole_entry = (vm_map_entry_t) map->holes_list; + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); } - if (hole_entry != (vm_map_entry_t) map->holes_list) { + if (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)) { if (hole_entry->vme_start > old_entry->vme_start) { panic("Hole hint failed: Hole entry start: 0x%llx, entry start: 0x%llx, map hole start: 0x%llx, map hint start: 0x%llx\n", (unsigned long long)hole_entry->vme_start, @@ -433,7 +406,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) copy_hole_info(hole_entry, &old_hole_entry); #endif /* DEBUG */ - if (hole_entry != (vm_map_entry_t) map->holes_list) { + if (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)) { assert(hole_entry->vme_start != old_entry->vme_start); hole_entry = hole_entry->vme_prev; } @@ -442,7 +415,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) hole_entry = next_hole_entry; - if (hole_entry == (vm_map_entry_t)map->holes_list) { + if (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { hole_entry = hole_entry->vme_prev; break; } @@ -460,21 +433,21 @@ 
update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) * OR * A hole that is located above the current first hole in the map? */ - if (map->holes_list == NULL || (hole_entry == (vm_map_entry_t) map->holes_list && hole_entry->vme_start > old_entry->vme_start)) { + if (map->holes_list == NULL || (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list) && hole_entry->vme_start > old_entry->vme_start)) { if (map->holes_list == NULL) { map->holes_list = new_hole_entry; - new_hole_entry->prev = new_hole_entry->next = (vm_map_entry_t)map->holes_list; + new_hole_entry->prev = new_hole_entry->next = CAST_TO_VM_MAP_ENTRY(map->holes_list); } else { - l_next = (vm_map_entry_t) map->holes_list; + l_next = CAST_TO_VM_MAP_ENTRY(map->holes_list); l_prev = map->holes_list->prev; map->holes_list = new_hole_entry; new_hole_entry->next = l_next; new_hole_entry->prev = l_prev; - l_prev->vme_next = l_next->vme_prev = (vm_map_entry_t) new_hole_entry; + l_prev->vme_next = l_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry); } } else { @@ -484,14 +457,14 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry) new_hole_entry->prev = hole_entry; new_hole_entry->next = l_next; - hole_entry->vme_next = (vm_map_entry_t)new_hole_entry; - l_next->vme_prev = (vm_map_entry_t) new_hole_entry; + hole_entry->vme_next = CAST_TO_VM_MAP_ENTRY(new_hole_entry); + l_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry); } new_hole_entry->start = old_entry->vme_start; new_hole_entry->end = old_entry->vme_end; - hole_entry = (vm_map_entry_t) new_hole_entry; + hole_entry = CAST_TO_VM_MAP_ENTRY(new_hole_entry); assert(new_hole_entry->start < new_hole_entry->end); } @@ -529,7 +502,7 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry) * This will reduce the size of the hole or delete the hole completely if it is smaller than the entry. 
*/ - hole_entry = (vm_map_entry_t) map->holes_list; + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); assert(hole_entry); next_hole_entry = hole_entry->vme_next; @@ -593,8 +566,8 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry) new_hole_entry->prev = hole_entry; new_hole_entry->next = hole_entry->vme_next; - hole_entry->vme_next->vme_prev = (vm_map_entry_t)new_hole_entry; - hole_entry->vme_next = (vm_map_entry_t)new_hole_entry; + hole_entry->vme_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry); + hole_entry->vme_next = CAST_TO_VM_MAP_ENTRY(new_hole_entry); new_hole_entry->start = new_entry->vme_end; new_hole_entry->end = hole_entry->vme_end; @@ -664,7 +637,7 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry) hole_entry = next_hole_entry; next_hole_entry = hole_entry->vme_next; - if (hole_entry == (vm_map_entry_t)map->holes_list) + if (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) break; } diff --git a/osfmk/vm/vm_map_store_rb.h b/osfmk/vm/vm_map_store_rb.h index d9506e6b4..82ac40321 100644 --- a/osfmk/vm/vm_map_store_rb.h +++ b/osfmk/vm/vm_map_store_rb.h @@ -39,7 +39,6 @@ void vm_map_store_walk_rb( struct _vm_map*, struct vm_map_entry**, struct vm_map boolean_t vm_map_store_lookup_entry_rb( struct _vm_map*, vm_map_offset_t, struct vm_map_entry**); void vm_map_store_entry_link_rb( struct vm_map_header*, struct vm_map_entry*, struct vm_map_entry*); void vm_map_store_entry_unlink_rb( struct vm_map_header*, struct vm_map_entry*); -void vm_map_store_copy_insert_rb( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*); void vm_map_store_copy_reset_rb( struct vm_map_copy*, struct vm_map_entry*, int); void update_first_free_rb(struct _vm_map*, struct vm_map_entry*, boolean_t new_entry_creation); diff --git a/osfmk/vm/vm_object.c b/osfmk/vm/vm_object.c index 821929f88..84f0ff6e8 100644 --- a/osfmk/vm/vm_object.c +++ b/osfmk/vm/vm_object.c @@ -105,6 +105,11 @@ #include #endif +#if 
VM_OBJECT_ACCESS_TRACKING +uint64_t vm_object_access_tracking_reads = 0; +uint64_t vm_object_access_tracking_writes = 0; +#endif /* VM_OBJECT_ACCESS_TRACKING */ + boolean_t vm_object_collapse_compressor_allowed = TRUE; struct vm_counters vm_counters; @@ -399,6 +404,10 @@ lck_attr_t vm_object_lck_attr; lck_attr_t kernel_object_lck_attr; lck_attr_t compressor_object_lck_attr; +extern void vm_named_entry_init(void); + +int workaround_41447923 = 0; + /* * vm_object_bootstrap: * @@ -466,8 +475,7 @@ vm_object_bootstrap(void) vm_object_template.res_count = 1; #endif /* TASK_SWAPPER */ vm_object_template.resident_page_count = 0; - // static vm_object_template is zeroed - // vm_object_template.wired_page_count = 0; + vm_object_template.wired_page_count = 0; vm_object_template.reusable_page_count = 0; vm_object_template.copy = VM_OBJECT_NULL; vm_object_template.shadow = VM_OBJECT_NULL; @@ -521,7 +529,7 @@ vm_object_bootstrap(void) /* cache bitfields */ vm_object_template.wimg_bits = VM_WIMG_USE_DEFAULT; vm_object_template.set_cache_attr = FALSE; - vm_object_template.object_slid = FALSE; + vm_object_template.object_is_shared_cache = FALSE; vm_object_template.code_signed = FALSE; vm_object_template.transposed = FALSE; vm_object_template.mapping_in_progress = FALSE; @@ -530,6 +538,7 @@ vm_object_bootstrap(void) vm_object_template.volatile_fault = FALSE; vm_object_template.all_reusable = FALSE; vm_object_template.blocked_access = FALSE; + vm_object_template.vo_ledger_tag = VM_OBJECT_LEDGER_TAG_NONE; vm_object_template.__object2_unused_bits = 0; #if CONFIG_IOSCHED || UPL_DEBUG vm_object_template.uplq.prev = NULL; @@ -551,6 +560,10 @@ vm_object_bootstrap(void) vm_object_template.vo_cache_ts = 0; vm_object_template.wire_tag = VM_KERN_MEMORY_NONE; +#if ! VM_TAG_ACTIVE_UPDATE + vm_object_template.wired_objq.next = NULL; + vm_object_template.wired_objq.prev = NULL; +#endif /* ! 
VM_TAG_ACTIVE_UPDATE */ vm_object_template.io_tracking = FALSE; @@ -561,6 +574,12 @@ vm_object_bootstrap(void) vm_object_template.__object3_unused_bits = 0; #endif /* CONFIG_SECLUDED_MEMORY */ +#if VM_OBJECT_ACCESS_TRACKING + vm_object_template.access_tracking = FALSE; + vm_object_template.access_tracking_reads = 0; + vm_object_template.access_tracking_writes = 0; +#endif /* VM_OBJECT_ACCESS_TRACKING */ + #if DEBUG bzero(&vm_object_template.purgeable_owner_bt[0], sizeof (vm_object_template.purgeable_owner_bt)); @@ -605,6 +624,11 @@ vm_object_bootstrap(void) * non-zone memory. */ vm_object_reference(vm_submap_object); + + vm_named_entry_init(); + + PE_parse_boot_argn("workaround_41447923", &workaround_41447923, + sizeof (workaround_41447923)); } #if CONFIG_IOSCHED @@ -929,44 +953,44 @@ vm_object_page_grab( while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && --p_limit > 0) { p = next_p; - next_p = (vm_page_t)vm_page_queue_next(&next_p->listq); + next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq); - if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry || p->fictitious) + if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning || p->vmp_laundry || p->vmp_fictitious) goto move_page_in_obj; - if (p->pmapped || p->dirty || p->precious) { + if (p->vmp_pmapped || p->vmp_dirty || p->vmp_precious) { vm_page_lockspin_queues(); - if (p->pmapped) { + if (p->vmp_pmapped) { int refmod_state; vm_object_page_grab_pmapped++; - if (p->reference == FALSE || p->dirty == FALSE) { + if (p->vmp_reference == FALSE || p->vmp_dirty == FALSE) { refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(p)); if (refmod_state & VM_MEM_REFERENCED) - p->reference = TRUE; + p->vmp_reference = TRUE; if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(p, FALSE); } } - if (p->dirty == FALSE && p->precious == FALSE) { + if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) { refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); if (refmod_state & VM_MEM_REFERENCED) 
- p->reference = TRUE; + p->vmp_reference = TRUE; if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(p, FALSE); } - if (p->dirty == FALSE) + if (p->vmp_dirty == FALSE) goto take_page; } } - if ((p->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) && p->reference == TRUE) { + if ((p->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) && p->vmp_reference == TRUE) { vm_page_activate(p); VM_STAT_INCR(reactivations); @@ -974,8 +998,8 @@ vm_object_page_grab( } vm_page_unlock_queues(); move_page_in_obj: - vm_page_queue_remove(&object->memq, p, vm_page_t, listq); - vm_page_queue_enter(&object->memq, p, vm_page_t, listq); + vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq); p_skipped++; continue; @@ -1115,7 +1139,6 @@ vm_object_cache_evict( next_obj = (vm_object_t)queue_next(&next_obj->cached_list); assert(object->purgable == VM_PURGABLE_DENY); - assert(object->wired_page_count == 0); if (sec < object->vo_cache_ts) { KERNEL_DEBUG(0x130020c, object, object->resident_page_count, object->vo_cache_ts, sec, 0); @@ -1180,20 +1203,20 @@ vm_object_cache_evict( while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) { p = next_p; - next_p = (vm_page_t)vm_page_queue_next(&next_p->listq); + next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq); object->vo_cache_pages_to_scan--; - if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry) { - vm_page_queue_remove(&object->memq, p, vm_page_t, listq); - vm_page_queue_enter(&object->memq, p, vm_page_t, listq); + if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning || p->vmp_laundry) { + vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq); ep_skipped++; continue; } - if (p->wpmapped || p->dirty || p->precious) { - vm_page_queue_remove(&object->memq, p, vm_page_t, listq); - vm_page_queue_enter(&object->memq, p, vm_page_t, listq); + if 
(p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) { + vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq); pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(p)); } @@ -1207,9 +1230,9 @@ vm_object_cache_evict( p = ep_array[ep_index]; - if (p->wpmapped || p->dirty || p->precious) { - p->reference = FALSE; - p->no_cache = FALSE; + if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) { + p->vmp_reference = FALSE; + p->vmp_no_cache = FALSE; /* * we've already filtered out pages that are in the laundry @@ -1225,12 +1248,12 @@ vm_object_cache_evict( #endif vm_page_free_prepare_queues(p); - assert(p->pageq.next == 0 && p->pageq.prev == 0); + assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0); /* * Add this page to our list of reclaimed pages, * to be freed later. */ - p->snext = local_free_q; + p->vmp_snext = local_free_q; local_free_q = p; ep_freed++; @@ -1453,11 +1476,15 @@ vm_object_reap( * from its pager, to properly account for compressed pages. */ if (object->internal && - object->purgable != VM_PURGABLE_DENY) { - vm_purgeable_accounting(object, - object->purgable, - TRUE, /* disown */ - FALSE); /* task_objq locked? 
*/ + (object->purgable != VM_PURGABLE_DENY || + object->vo_ledger_tag)) { + assert(!object->alive); + assert(object->terminating); + vm_object_ownership_change(object, + object->vo_ledger_tag, /* unchanged */ + NULL, /* no owner */ + FALSE); /* task_objq not locked */ + assert(object->vo_owner == NULL); } pager = object->pager; @@ -1477,9 +1504,7 @@ vm_object_reap( * remove from purgeable queue if it's on */ if (object->internal) { - task_t owner; - - owner = object->vo_purgeable_owner; + assert(VM_OBJECT_OWNER(object) == TASK_NULL); VM_OBJECT_UNWIRED(object); @@ -1488,8 +1513,6 @@ vm_object_reap( } else if (object->purgable == VM_PURGABLE_VOLATILE) { purgeable_q_t queue; - assert(object->vo_purgeable_owner == NULL); - queue = vm_purgeable_object_remove(object); assert(queue); @@ -1532,7 +1555,6 @@ vm_object_reap( else if (object->purgable == VM_PURGABLE_NONVOLATILE || object->purgable == VM_PURGABLE_EMPTY) { /* remove from nonvolatile queue */ - assert(object->vo_purgeable_owner == TASK_NULL); vm_purgeable_nonvolatile_dequeue(object); } else { panic("object %p in unexpected purgeable state 0x%x\n", @@ -1622,8 +1644,8 @@ unsigned int vm_max_batch = 256; vm_page_t m; \ for (m = _local_free_q; \ m != VM_PAGE_NULL; \ - m = m->snext) { \ - if (m->pmapped) { \ + m = m->vmp_snext) { \ + if (m->vmp_pmapped) { \ pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); \ } \ } \ @@ -1678,7 +1700,7 @@ vm_object_reap_pages( while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) { p = next; - next = (vm_page_t)vm_page_queue_next(&next->listq); + next = (vm_page_t)vm_page_queue_next(&next->vmp_listq); if (--loop_count == 0) { @@ -1706,7 +1728,7 @@ vm_object_reap_pages( } if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) { - if (p->busy || p->cleaning) { + if (p->vmp_busy || p->vmp_cleaning) { vm_page_unlock_queues(); /* @@ -1719,7 +1741,7 @@ vm_object_reap_pages( goto restart_after_sleep; } - if (p->laundry) + if (p->vmp_laundry) vm_pageout_steal_laundry(p, 
TRUE); } switch (reap_type) { @@ -1744,10 +1766,10 @@ vm_object_reap_pages( vm_page_purged_wired++; continue; } - if (p->laundry && !p->busy && !p->cleaning) + if (p->vmp_laundry && !p->vmp_busy && !p->vmp_cleaning) vm_pageout_steal_laundry(p, TRUE); - if (p->cleaning || p->laundry || p->absent) { + if (p->vmp_cleaning || p->vmp_laundry || p->vmp_absent) { /* * page is being acted upon, * so don't mess with it @@ -1755,7 +1777,7 @@ vm_object_reap_pages( vm_page_purged_others++; continue; } - if (p->busy) { + if (p->vmp_busy) { /* * We can't reclaim a busy page but we can * make it more likely to be paged (it's not wired) to make @@ -1773,7 +1795,7 @@ vm_object_reap_pages( /* * we can discard this page... */ - if (p->pmapped == TRUE) { + if (p->vmp_pmapped == TRUE) { /* * unmap the page */ @@ -1784,7 +1806,7 @@ vm_object_reap_pages( break; case REAP_TERMINATE: - if (p->absent || p->private) { + if (p->vmp_absent || p->vmp_private) { /* * For private pages, VM_PAGE_FREE just * leaves the page structure around for @@ -1794,20 +1816,20 @@ vm_object_reap_pages( */ break; } - if (p->fictitious) { + if (p->vmp_fictitious) { assert (VM_PAGE_GET_PHYS_PAGE(p) == vm_page_guard_addr); break; } - if (!p->dirty && p->wpmapped) - p->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)); + if (!p->vmp_dirty && p->vmp_wpmapped) + p->vmp_dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)); - if ((p->dirty || p->precious) && !p->error && object->alive) { + if ((p->vmp_dirty || p->vmp_precious) && !p->vmp_error && object->alive) { assert(!object->internal); - p->free_when_done = TRUE; + p->vmp_free_when_done = TRUE; - if (!p->laundry) { + if (!p->vmp_laundry) { vm_page_queues_remove(p, TRUE); /* * flush page... 
page will be freed @@ -1832,12 +1854,12 @@ vm_object_reap_pages( break; } vm_page_free_prepare_queues(p); - assert(p->pageq.next == 0 && p->pageq.prev == 0); + assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0); /* * Add this page to our list of reclaimed pages, * to be freed later. */ - p->snext = local_free_q; + p->vmp_snext = local_free_q; local_free_q = p; } vm_page_unlock_queues(); @@ -2195,8 +2217,8 @@ deactivate_pages_in_object( MARK_PAGE_HANDLED(*chunk_state, p); - if (( !VM_PAGE_WIRED(m)) && (!m->private) && (!m->gobbled) && (!m->busy) && - (!m->laundry) && (!m->cleaning) && !(m->free_when_done)) { + if (( !VM_PAGE_WIRED(m)) && (!m->vmp_private) && (!m->vmp_gobbled) && (!m->vmp_busy) && + (!m->vmp_laundry) && (!m->vmp_cleaning) && !(m->vmp_free_when_done)) { int clear_refmod; int pmap_options; @@ -2215,11 +2237,11 @@ deactivate_pages_in_object( */ pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m)); } - m->precious = FALSE; - m->dirty = FALSE; + m->vmp_precious = FALSE; + m->vmp_dirty = FALSE; clear_refmod |= VM_MEM_MODIFIED; - if (m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) { + if (m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) { /* * This page is now clean and * reclaimable. 
Move it out @@ -2232,10 +2254,10 @@ deactivate_pages_in_object( VM_COMPRESSOR_PAGER_STATE_CLR(object, offset); - if (reusable_page && !m->reusable) { + if (reusable_page && !m->vmp_reusable) { assert(!all_reusable); assert(!object->all_reusable); - m->reusable = TRUE; + m->vmp_reusable = TRUE; object->reusable_page_count++; assert(object->resident_page_count >= object->reusable_page_count); reusable++; @@ -2253,7 +2275,7 @@ deactivate_pages_in_object( pmap_options, (void *)pfc); - if ((m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && !(reusable_page || all_reusable)) + if ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !(reusable_page || all_reusable)) dwp->dw_mask |= DW_move_page; if (dwp->dw_mask) @@ -2506,12 +2528,12 @@ vm_object_reuse_pages( #define VM_OBJECT_REUSE_PAGE(object, m, reused) \ MACRO_BEGIN \ if ((m) != VM_PAGE_NULL && \ - (m)->reusable) { \ + (m)->vmp_reusable) { \ assert((object)->reusable_page_count <= \ (object)->resident_page_count); \ assert((object)->reusable_page_count > 0); \ (object)->reusable_page_count--; \ - (m)->reusable = FALSE; \ + (m)->vmp_reusable = FALSE; \ (reused)++; \ /* \ * Tell pmap that this page is no longer \ @@ -2543,15 +2565,15 @@ vm_object_reuse_pages( reused = object->resident_page_count; } else { vm_page_stats_reusable.partial_reuse_calls++; - vm_page_queue_iterate(&object->memq, m, vm_page_t, listq) { - if (m->offset < start_offset || - m->offset >= end_offset) { - m->reusable = TRUE; + vm_page_queue_iterate(&object->memq, m, vm_page_t, vmp_listq) { + if (m->vmp_offset < start_offset || + m->vmp_offset >= end_offset) { + m->vmp_reusable = TRUE; object->reusable_page_count++; assert(object->resident_page_count >= object->reusable_page_count); continue; } else { - assert(!m->reusable); + assert(!m->vmp_reusable); reused++; } } @@ -2570,12 +2592,12 @@ vm_object_reuse_pages( } } else { vm_page_stats_reusable.partial_reuse_calls++; - vm_page_queue_iterate(&object->memq, m, vm_page_t, listq) { + 
vm_page_queue_iterate(&object->memq, m, vm_page_t, vmp_listq) { if (object->reusable_page_count == 0) { break; } - if (m->offset < start_offset || - m->offset >= end_offset) { + if (m->vmp_offset < start_offset || + m->vmp_offset >= end_offset) { continue; } VM_OBJECT_REUSE_PAGE(object, m, reused); @@ -2706,11 +2728,11 @@ vm_object_pmap_protect_options( end = offset + size; - vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { - if (!p->fictitious && (offset <= p->offset) && (p->offset < end)) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) { + if (!p->vmp_fictitious && (offset <= p->vmp_offset) && (p->vmp_offset < end)) { vm_map_offset_t start; - start = pmap_start + p->offset - offset; + start = pmap_start + p->vmp_offset - offset; if (pmap != PMAP_NULL) pmap_protect_options( @@ -2745,7 +2767,7 @@ vm_object_pmap_protect_options( if (p != VM_PAGE_NULL) { vm_object_offset_t start; - start = pmap_start + (p->offset - offset); + start = pmap_start + (p->vmp_offset - offset); if (pmap != PMAP_NULL) pmap_protect_options( @@ -2801,6 +2823,8 @@ vm_object_pmap_protect_options( vm_object_unlock(object); } +uint32_t vm_page_busy_absent_skipped = 0; + /* * Routine: vm_object_copy_slowly * @@ -2842,7 +2866,7 @@ vm_object_copy_slowly( vm_object_t new_object; vm_object_offset_t new_offset; - struct vm_object_fault_info fault_info; + struct vm_object_fault_info fault_info = {}; XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n", src_object, src_offset, size, 0, 0); @@ -2876,16 +2900,9 @@ vm_object_copy_slowly( fault_info.interruptible = interruptible; fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info.user_tag = 0; - fault_info.pmap_options = 0; fault_info.lo_offset = src_offset; fault_info.hi_offset = src_offset + size; - fault_info.no_cache = FALSE; fault_info.stealth = TRUE; - fault_info.io_sync = FALSE; - fault_info.cs_bypass = FALSE; - fault_info.mark_zf_absent = FALSE; - fault_info.batch_pmap_op = FALSE; for ( ; size != 0 ; @@ 
-2925,40 +2942,75 @@ vm_object_copy_slowly( if (src_object->internal && src_object->shadow == VM_OBJECT_NULL && - (vm_page_lookup(src_object, - src_offset) == VM_PAGE_NULL) && (src_object->pager == NULL || (VM_COMPRESSOR_PAGER_STATE_GET(src_object, src_offset) == VM_EXTERNAL_STATE_ABSENT))) { - /* - * This page is neither resident nor compressed - * and there's no shadow object below - * "src_object", so this page is really missing. - * There's no need to zero-fill it just to copy - * it: let's leave it missing in "new_object" - * and get zero-filled on demand. - */ - vm_object_unlock(src_object); - /* free the unused "new_page"... */ - vm_object_lock(new_object); - VM_PAGE_FREE(new_page); - new_page = VM_PAGE_NULL; - vm_object_unlock(new_object); - /* ...and go to next page in "src_object" */ - result = VM_FAULT_SUCCESS; - break; + boolean_t can_skip_page; + + _result_page = vm_page_lookup(src_object, + src_offset); + if (_result_page == VM_PAGE_NULL) { + /* + * This page is neither resident nor + * compressed and there's no shadow + * object below "src_object", so this + * page is really missing. + * There's no need to zero-fill it just + * to copy it: let's leave it missing + * in "new_object" and get zero-filled + * on demand. + */ + can_skip_page = TRUE; + } else if (workaround_41447923 && + src_object->pager == NULL && + _result_page != VM_PAGE_NULL && + _result_page->vmp_busy && + _result_page->vmp_absent && + src_object->purgable == VM_PURGABLE_DENY && + !src_object->blocked_access) { + /* + * This page is "busy" and "absent" + * but not because we're waiting for + * it to be decompressed. It must + * be because it's a "no zero fill" + * page that is currently not + * accessible until it gets overwritten + * by a device driver. + * Since its initial state would have + * been "zero-filled", let's leave the + * copy page missing and get zero-filled + * on demand. 
+ */ + assert(src_object->internal); + assert(src_object->shadow == NULL); + assert(src_object->pager == NULL); + can_skip_page = TRUE; + vm_page_busy_absent_skipped++; + } else { + can_skip_page = FALSE; + } + if (can_skip_page) { + vm_object_unlock(src_object); + /* free the unused "new_page"... */ + vm_object_lock(new_object); + VM_PAGE_FREE(new_page); + new_page = VM_PAGE_NULL; + vm_object_unlock(new_object); + /* ...and go to next page in "src_object" */ + result = VM_FAULT_SUCCESS; + break; + } } vm_object_paging_begin(src_object); - if (size > (vm_size_t) -1) { - /* 32-bit overflow */ - fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE); - } else { - fault_info.cluster_size = (vm_size_t) size; - assert(fault_info.cluster_size == size); + /* cap size at maximum UPL size */ + upl_size_t cluster_size; + if (os_convert_overflow(size, &cluster_size)) { + cluster_size = 0 - (upl_size_t)PAGE_SIZE; } + fault_info.cluster_size = cluster_size; XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0); _result_page = VM_PAGE_NULL; @@ -2999,8 +3051,8 @@ vm_object_copy_slowly( PAGE_WAKEUP_DONE(result_page); vm_page_lockspin_queues(); - if ((result_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) || - (result_page->vm_page_q_state == VM_PAGE_NOT_ON_Q)) { + if ((result_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) || + (result_page->vmp_q_state == VM_PAGE_NOT_ON_Q)) { vm_page_activate(result_page); } vm_page_activate(new_page); @@ -3439,10 +3491,10 @@ vm_object_copy_delayed( pmap_flush_context_init(&pmap_flush_context_storage); delayed_pmap_flush = FALSE; - vm_page_queue_iterate(&src_object->memq, p, vm_page_t, listq) { - if (!p->fictitious && - p->offset >= old_copy->vo_size && - p->offset < copy_size) { + vm_page_queue_iterate(&src_object->memq, p, vm_page_t, vmp_listq) { + if (!p->vmp_fictitious && + p->vmp_offset >= old_copy->vo_size && + p->vmp_offset < copy_size) { if (VM_PAGE_WIRED(p)) { vm_object_unlock(old_copy); vm_object_unlock(src_object); @@ 
-3539,8 +3591,8 @@ vm_object_copy_delayed( pmap_flush_context_init(&pmap_flush_context_storage); delayed_pmap_flush = FALSE; - vm_page_queue_iterate(&src_object->memq, p, vm_page_t, listq) { - if (!p->fictitious && p->offset < copy_size) { + vm_page_queue_iterate(&src_object->memq, p, vm_page_t, vmp_listq) { + if (!p->vmp_fictitious && p->vmp_offset < copy_size) { if (VM_PAGE_WIRED(p)) { if (old_copy) vm_object_unlock(old_copy); @@ -4225,9 +4277,9 @@ vm_object_do_collapse( p = (vm_page_t) vm_page_queue_first(&backing_object->memq); - new_offset = (p->offset - backing_offset); + new_offset = (p->vmp_offset - backing_offset); - assert(!p->busy || p->absent); + assert(!p->vmp_busy || p->vmp_absent); /* * If the parent has a page here, or if @@ -4237,7 +4289,7 @@ vm_object_do_collapse( * Otherwise, move it as planned. */ - if (p->offset < backing_offset || new_offset >= size) { + if (p->vmp_offset < backing_offset || new_offset >= size) { VM_PAGE_FREE(p); } else { pp = vm_page_lookup(object, new_offset); @@ -4262,7 +4314,7 @@ vm_object_do_collapse( vm_page_rename(p, object, new_offset); } } else { - assert(! pp->absent); + assert(! pp->vmp_absent); /* * Parent object has a real page. 
@@ -4875,7 +4927,7 @@ vm_object_collapse( backing_rcount = backing_object->resident_page_count; p = (vm_page_t)vm_page_queue_first(&backing_object->memq); do { - offset = (p->offset - backing_offset); + offset = (p->vmp_offset - backing_offset); if (offset < object->vo_size && offset != hint_offset && @@ -4885,7 +4937,7 @@ vm_object_collapse( break; } - p = (vm_page_t) vm_page_queue_next(&p->listq); + p = (vm_page_t) vm_page_queue_next(&p->vmp_listq); } while (--backing_rcount); if (backing_rcount != 0 ) { @@ -5003,8 +5055,8 @@ vm_object_page_remove( for (; start < end; start += PAGE_SIZE_64) { p = vm_page_lookup(object, start); if (p != VM_PAGE_NULL) { - assert(!p->cleaning && !p->laundry); - if (!p->fictitious && p->pmapped) + assert(!p->vmp_cleaning && !p->vmp_laundry); + if (!p->vmp_fictitious && p->vmp_pmapped) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); VM_PAGE_FREE(p); } @@ -5014,10 +5066,10 @@ vm_object_page_remove( p = (vm_page_t) vm_page_queue_first(&object->memq); while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) p)) { - next = (vm_page_t) vm_page_queue_next(&p->listq); - if ((start <= p->offset) && (p->offset < end)) { - assert(!p->cleaning && !p->laundry); - if (!p->fictitious && p->pmapped) + next = (vm_page_t) vm_page_queue_next(&p->vmp_listq); + if ((start <= p->vmp_offset) && (p->vmp_offset < end)) { + assert(!p->vmp_cleaning && !p->vmp_laundry); + if (!p->vmp_fictitious && p->vmp_pmapped) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); VM_PAGE_FREE(p); } @@ -5162,25 +5214,25 @@ vm_object_populate_with_private( m = vm_page_lookup(object, base_offset); if (m != VM_PAGE_NULL) { - if (m->fictitious) { + if (m->vmp_fictitious) { if (VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr) { vm_page_lockspin_queues(); - m->private = TRUE; + m->vmp_private = TRUE; vm_page_unlock_queues(); - m->fictitious = FALSE; + m->vmp_fictitious = FALSE; VM_PAGE_SET_PHYS_PAGE(m, base_page); } } else if (VM_PAGE_GET_PHYS_PAGE(m) != base_page) { - if ( !m->private) { + 
if ( !m->vmp_private) { /* * we'd leak a real page... that can't be right */ panic("vm_object_populate_with_private - %p not private", m); } - if (m->pmapped) { + if (m->vmp_pmapped) { /* * pmap call to clear old mapping */ @@ -5197,11 +5249,11 @@ vm_object_populate_with_private( * private normally requires lock_queues but since we * are initializing the page, its not necessary here */ - m->private = TRUE; - m->fictitious = FALSE; + m->vmp_private = TRUE; + m->vmp_fictitious = FALSE; VM_PAGE_SET_PHYS_PAGE(m, base_page); - m->unusual = TRUE; - m->busy = FALSE; + m->vmp_unusual = TRUE; + m->vmp_busy = FALSE; vm_page_insert(m, object, base_offset); } @@ -5564,8 +5616,8 @@ vm_object_purge(vm_object_t object, int flags) -pgcount, FALSE, /* shared */ object); - vm_purgeable_compressed_update(object, - -pgcount); + vm_object_owner_compressed_update(object, + -pgcount); } if ( !(flags & C_DONT_BLOCK)) { assert(vm_compressor_pager_get_count(object->pager) @@ -5807,9 +5859,7 @@ vm_object_purgable_control( * Transfer the object's pages from the volatile to * non-volatile ledgers. */ - vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE, - FALSE, /* disown */ - FALSE); /* task_objq locked? 
*/ + vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE); } break; @@ -5819,15 +5869,15 @@ vm_object_purgable_control( vm_page_t p; int refmod; - vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { - if (p->busy || + vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) { + if (p->vmp_busy || VM_PAGE_WIRED(p) || - p->fictitious) { + p->vmp_fictitious) { continue; } refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); if ((refmod & VM_MEM_MODIFIED) && - !p->dirty) { + !p->vmp_dirty) { SET_PAGE_DIRTY(p, FALSE); } } @@ -5938,9 +5988,7 @@ vm_object_purgable_control( vm_purgeable_object_add(object, queue, (*state&VM_VOLATILE_GROUP_MASK)>>VM_VOLATILE_GROUP_SHIFT ); if (old_state == VM_PURGABLE_NONVOLATILE) { vm_purgeable_accounting(object, - VM_PURGABLE_NONVOLATILE, - FALSE, /* disown */ - FALSE); /* task_objq locked? */ + VM_PURGABLE_NONVOLATILE); } assert(queue->debug_count_objects>=0); @@ -5953,15 +6001,15 @@ vm_object_purgable_control( vm_page_t p; int refmod; - vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { - if (p->busy || + vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) { + if (p->vmp_busy || VM_PAGE_WIRED(p) || - p->fictitious) { + p->vmp_fictitious) { continue; } refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p)); if ((refmod & VM_MEM_MODIFIED) && - !p->dirty) { + !p->vmp_dirty) { SET_PAGE_DIRTY(p, FALSE); } } @@ -5990,9 +6038,7 @@ vm_object_purgable_control( * "volatile". */ vm_purgeable_accounting(object, - VM_PURGABLE_NONVOLATILE, - FALSE, /* disown */ - FALSE); /* task_objq locked? 
*/ + VM_PURGABLE_NONVOLATILE); /* * Set to VM_PURGABLE_EMPTY because the pages are no * longer accounted in the "non-volatile" ledger @@ -6062,15 +6108,15 @@ vm_object_get_page_counts( if (object->resident_page_count <= (size >> PAGE_SHIFT)) { - vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) { - if (p->offset >= cur_offset && p->offset < end_offset) { + if (p->vmp_offset >= cur_offset && p->vmp_offset < end_offset) { local_resident_count++; if (count_dirty_pages) { - if (p->dirty || (p->wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { + if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { local_dirty_count++; } @@ -6089,7 +6135,7 @@ vm_object_get_page_counts( if (count_dirty_pages) { - if (p->dirty || (p->wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { + if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { local_dirty_count++; } @@ -6342,7 +6388,7 @@ vm_object_transpose( */ while (!vm_page_queue_empty(&object2->memq)) { page = (vm_page_t) vm_page_queue_first(&object2->memq); - vm_page_rename(page, object1, page->offset); + vm_page_rename(page, object1, page->vmp_offset); } assert(vm_page_queue_empty(&object2->memq)); } else if (object2->phys_contiguous || vm_page_queue_empty(&object2->memq)) { @@ -6353,31 +6399,31 @@ vm_object_transpose( */ while (!vm_page_queue_empty(&object1->memq)) { page = (vm_page_t) vm_page_queue_first(&object1->memq); - vm_page_rename(page, object2, page->offset); + vm_page_rename(page, object2, page->vmp_offset); } assert(vm_page_queue_empty(&object1->memq)); } else { /* transfer object1's pages to tmp_object */ while (!vm_page_queue_empty(&object1->memq)) { page = (vm_page_t) vm_page_queue_first(&object1->memq); - page_offset = page->offset; + page_offset = page->vmp_offset; vm_page_remove(page, TRUE); - page->offset = page_offset; - vm_page_queue_enter(&tmp_object->memq, 
page, vm_page_t, listq); + page->vmp_offset = page_offset; + vm_page_queue_enter(&tmp_object->memq, page, vm_page_t, vmp_listq); } assert(vm_page_queue_empty(&object1->memq)); /* transfer object2's pages to object1 */ while (!vm_page_queue_empty(&object2->memq)) { page = (vm_page_t) vm_page_queue_first(&object2->memq); - vm_page_rename(page, object1, page->offset); + vm_page_rename(page, object1, page->vmp_offset); } assert(vm_page_queue_empty(&object2->memq)); /* transfer tmp_object's pages to object2 */ while (!vm_page_queue_empty(&tmp_object->memq)) { page = (vm_page_t) vm_page_queue_first(&tmp_object->memq); vm_page_queue_remove(&tmp_object->memq, page, - vm_page_t, listq); - vm_page_insert(page, object2, page->offset); + vm_page_t, vmp_listq); + vm_page_insert(page, object2, page->vmp_offset); } assert(vm_page_queue_empty(&tmp_object->memq)); } @@ -6401,6 +6447,9 @@ MACRO_END #endif /* "resident_page_count" was updated above when transposing pages */ /* "wired_page_count" was updated above when transposing pages */ +#if ! VM_TAG_ACTIVE_UPDATE + /* "wired_objq" was dealt with along with "wired_page_count" */ +#endif /* ! 
VM_TAG_ACTIVE_UPDATE */ /* "reusable_page_count" was updated above when transposing pages */ /* there should be no "copy" */ assert(!object1->copy); @@ -6487,6 +6536,26 @@ MACRO_END __TRANSPOSE_FIELD(all_reusable); assert(object1->blocked_access); assert(object2->blocked_access); + __TRANSPOSE_FIELD(set_cache_attr); + assert(!object1->object_is_shared_cache); + assert(!object2->object_is_shared_cache); + /* ignore purgeable_queue_type and purgeable_queue_group */ + assert(!object1->io_tracking); + assert(!object2->io_tracking); +#if VM_OBJECT_ACCESS_TRACKING + assert(!object1->access_tracking); + assert(!object2->access_tracking); +#endif /* VM_OBJECT_ACCESS_TRACKING */ + __TRANSPOSE_FIELD(no_tag_update); +#if CONFIG_SECLUDED_MEMORY + assert(!object1->eligible_for_secluded); + assert(!object2->eligible_for_secluded); + assert(!object1->can_grab_secluded); + assert(!object2->can_grab_secluded); +#else /* CONFIG_SECLUDED_MEMORY */ + assert(object1->__object3_unused_bits == 0); + assert(object2->__object3_unused_bits == 0); +#endif /* CONFIG_SECLUDED_MEMORY */ assert(object1->__object2_unused_bits == 0); assert(object2->__object2_unused_bits == 0); #if UPL_DEBUG @@ -6927,7 +6996,7 @@ vm_object_page_op( } /* Sync up on getting the busy bit */ - if((dst_page->busy || dst_page->cleaning) && + if((dst_page->vmp_busy || dst_page->vmp_cleaning) && (((ops & UPL_POP_SET) && (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) { /* someone else is playing with the page, we will */ @@ -6937,7 +7006,7 @@ vm_object_page_op( } if (ops & UPL_POP_DUMP) { - if (dst_page->pmapped == TRUE) + if (dst_page->vmp_pmapped == TRUE) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); VM_PAGE_FREE(dst_page); @@ -6950,11 +7019,11 @@ vm_object_page_op( /* Get the condition of flags before requested ops */ /* are undertaken */ - if(dst_page->dirty) *flags |= UPL_POP_DIRTY; - if(dst_page->free_when_done) *flags |= UPL_POP_PAGEOUT; - if(dst_page->precious) *flags |= UPL_POP_PRECIOUS; - 
if(dst_page->absent) *flags |= UPL_POP_ABSENT; - if(dst_page->busy) *flags |= UPL_POP_BUSY; + if(dst_page->vmp_dirty) *flags |= UPL_POP_DIRTY; + if(dst_page->vmp_free_when_done) *flags |= UPL_POP_PAGEOUT; + if(dst_page->vmp_precious) *flags |= UPL_POP_PRECIOUS; + if(dst_page->vmp_absent) *flags |= UPL_POP_ABSENT; + if(dst_page->vmp_busy) *flags |= UPL_POP_BUSY; } /* The caller should have made a call either contingent with */ @@ -6967,24 +7036,24 @@ vm_object_page_op( /* because the page may already be busy. However */ /* if such violations occur we will assert sooner */ /* or later. */ - assert(dst_page->busy || (ops & UPL_POP_BUSY)); + assert(dst_page->vmp_busy || (ops & UPL_POP_BUSY)); if (ops & UPL_POP_DIRTY) { SET_PAGE_DIRTY(dst_page, FALSE); } - if (ops & UPL_POP_PAGEOUT) dst_page->free_when_done = TRUE; - if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE; - if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE; - if (ops & UPL_POP_BUSY) dst_page->busy = TRUE; + if (ops & UPL_POP_PAGEOUT) dst_page->vmp_free_when_done = TRUE; + if (ops & UPL_POP_PRECIOUS) dst_page->vmp_precious = TRUE; + if (ops & UPL_POP_ABSENT) dst_page->vmp_absent = TRUE; + if (ops & UPL_POP_BUSY) dst_page->vmp_busy = TRUE; } if(ops & UPL_POP_CLR) { - assert(dst_page->busy); - if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE; - if (ops & UPL_POP_PAGEOUT) dst_page->free_when_done = FALSE; - if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE; - if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE; + assert(dst_page->vmp_busy); + if (ops & UPL_POP_DIRTY) dst_page->vmp_dirty = FALSE; + if (ops & UPL_POP_PAGEOUT) dst_page->vmp_free_when_done = FALSE; + if (ops & UPL_POP_PRECIOUS) dst_page->vmp_precious = FALSE; + if (ops & UPL_POP_ABSENT) dst_page->vmp_absent = FALSE; if (ops & UPL_POP_BUSY) { - dst_page->busy = FALSE; + dst_page->vmp_busy = FALSE; PAGE_WAKEUP(dst_page); } } @@ -6993,7 +7062,7 @@ vm_object_page_op( * The physical page number will remain valid * only if the page is kept busy. 
*/ - assert(dst_page->busy); + assert(dst_page->vmp_busy); *phys_entry = VM_PAGE_GET_PHYS_PAGE(dst_page); } @@ -7054,7 +7123,7 @@ vm_object_range_op( dst_page = vm_page_lookup(object, offset); if (dst_page != VM_PAGE_NULL) { if (ops & UPL_ROP_DUMP) { - if (dst_page->busy || dst_page->cleaning) { + if (dst_page->vmp_busy || dst_page->vmp_cleaning) { /* * someone else is playing with the * page, we will have to wait @@ -7068,16 +7137,16 @@ vm_object_range_op( */ continue; } - if (dst_page->laundry) + if (dst_page->vmp_laundry) vm_pageout_steal_laundry(dst_page, FALSE); - if (dst_page->pmapped == TRUE) + if (dst_page->vmp_pmapped == TRUE) pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page)); VM_PAGE_FREE(dst_page); } else if ((ops & UPL_ROP_ABSENT) - && (!dst_page->absent || dst_page->busy)) { + && (!dst_page->vmp_absent || dst_page->vmp_busy)) { break; } } else if (ops & UPL_ROP_PRESENT) @@ -7272,9 +7341,9 @@ vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode) vm_object_paging_wait(object, THREAD_UNINT); - vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) { - if (!p->fictitious) + if (!p->vmp_fictitious) pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(p), wimg_mode); } if (wimg_mode == VM_WIMG_USE_DEFAULT) @@ -7414,22 +7483,22 @@ vm_object_compressed_freezer_pageout( vm_page_lockspin_queues(); - if (p->cleaning || p->fictitious || p->busy || p->absent || p->unusual || p->error || VM_PAGE_WIRED(p)) { + if (p->vmp_cleaning || p->vmp_fictitious || p->vmp_busy || p->vmp_absent || p->vmp_unusual || p->vmp_error || VM_PAGE_WIRED(p)) { vm_page_unlock_queues(); KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 1, 0, 0); - vm_page_queue_remove(&object->memq, p, vm_page_t, listq); - vm_page_queue_enter(&object->memq, p, vm_page_t, listq); + vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq); continue; } - 
if (p->pmapped == TRUE) { + if (p->vmp_pmapped == TRUE) { int refmod_state, pmap_flags; - if (p->dirty || p->precious) { + if (p->vmp_dirty || p->vmp_precious) { pmap_flags = PMAP_OPTIONS_COMPRESSOR; } else { pmap_flags = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; @@ -7441,7 +7510,7 @@ vm_object_compressed_freezer_pageout( } } - if (p->dirty == FALSE && p->precious == FALSE) { + if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) { /* * Clean and non-precious page. */ @@ -7452,7 +7521,7 @@ vm_object_compressed_freezer_pageout( continue; } - if (p->laundry) + if (p->vmp_laundry) vm_pageout_steal_laundry(p, TRUE); vm_page_queues_remove(p, TRUE); @@ -7466,34 +7535,32 @@ vm_object_compressed_freezer_pageout( * Make the move here while we have the object lock held. */ - vm_page_queue_remove(&object->memq, p, vm_page_t, listq); - vm_page_queue_enter(&object->memq, p, vm_page_t, listq); + vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq); + vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq); /* * Grab an activity_in_progress here for vm_pageout_compress_page() to consume. * * Mark the page busy so no one messes with it while we have the object lock dropped. */ - - p->busy = TRUE; + p->vmp_busy = TRUE; vm_object_activity_begin(object); vm_object_unlock(object); - /* - * arg3 == FALSE tells vm_pageout_compress_page that we don't hold the object lock and the pager may not be initialized. 
- */ - if (vm_pageout_compress_page(&freezer_chead, freezer_compressor_scratch_buf, p, FALSE) == KERN_SUCCESS) { + if (vm_pageout_compress_page(&freezer_chead, freezer_compressor_scratch_buf, p) == KERN_SUCCESS) { /* * page has already been un-tabled from the object via 'vm_page_remove' */ - p->snext = local_freeq; + p->vmp_snext = local_freeq; local_freeq = p; local_freed++; if (local_freed >= MAX_FREE_BATCH) { - + + OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); + vm_page_free_list(local_freeq, TRUE); local_freeq = NULL; @@ -7513,6 +7580,8 @@ vm_object_compressed_freezer_pageout( } if (local_freeq) { + OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); + vm_page_free_list(local_freeq, TRUE); local_freeq = NULL; @@ -7577,33 +7646,31 @@ vm_object_pageout( while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) { p = next; - next = (vm_page_t)vm_page_queue_next(&next->listq); + next = (vm_page_t)vm_page_queue_next(&next->vmp_listq); - assert(p->vm_page_q_state != VM_PAGE_ON_FREE_Q); + assert(p->vmp_q_state != VM_PAGE_ON_FREE_Q); - if ((p->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) || - p->cleaning || - p->laundry || - p->busy || - p->absent || - p->error || - p->fictitious || + if ((p->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) || + p->vmp_cleaning || + p->vmp_laundry || + p->vmp_busy || + p->vmp_absent || + p->vmp_error || + p->vmp_fictitious || VM_PAGE_WIRED(p)) { /* * Page is already being cleaned or can't be cleaned. 
*/ continue; } + if (vm_compressor_low_on_space()) { + break; + } /* Throw to the pageout queue */ vm_page_lockspin_queues(); - if (vm_compressor_low_on_space()) { - vm_page_unlock_queues(); - break; - } - if (VM_PAGE_Q_THROTTLED(iq)) { iq->pgo_draining = TRUE; @@ -7619,15 +7686,15 @@ vm_object_pageout( goto ReScan; } - assert(!p->fictitious); - assert(!p->busy); - assert(!p->absent); - assert(!p->unusual); - assert(!p->error); + assert(!p->vmp_fictitious); + assert(!p->vmp_busy); + assert(!p->vmp_absent); + assert(!p->vmp_unusual); + assert(!p->vmp_error); assert(!VM_PAGE_WIRED(p)); - assert(!p->cleaning); + assert(!p->vmp_cleaning); - if (p->pmapped == TRUE) { + if (p->vmp_pmapped == TRUE) { int refmod_state; int pmap_options; @@ -7637,7 +7704,7 @@ vm_object_pageout( */ pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; - if (p->dirty || p->precious) { + if (p->vmp_dirty || p->vmp_precious) { /* * We already know it's been modified, * so tell pmap to account for it @@ -7653,7 +7720,7 @@ vm_object_pageout( } } - if (!p->dirty && !p->precious) { + if (!p->vmp_dirty && !p->vmp_precious) { vm_page_unlock_queues(); VM_PAGE_FREE(p); continue; @@ -7868,13 +7935,13 @@ vm_page_sleep(vm_object_t o, vm_page_t m, int interruptible) KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_START, o, m, 0, 0, 0); - if (o->io_tracking && ((m->busy == TRUE) || (m->cleaning == TRUE) || VM_PAGE_WIRED(m))) { + if (o->io_tracking && ((m->vmp_busy == TRUE) || (m->vmp_cleaning == TRUE) || VM_PAGE_WIRED(m))) { /* Indicates page is busy due to an I/O. Issue a reprioritize request if necessary. 
*/ vm_page_handle_prio_inversion(o,m); } - m->wanted = TRUE; + m->vmp_wanted = TRUE; ret = thread_sleep_vm_object(o, m, interruptible); KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_END, o, m, 0, 0, 0); return ret; @@ -7903,3 +7970,310 @@ io_reprioritize_thread(void *param __unused, wait_result_t wr __unused) IO_REPRIO_THREAD_CONTINUATION(); } #endif + +#if VM_OBJECT_ACCESS_TRACKING +void +vm_object_access_tracking( + vm_object_t object, + int *access_tracking_p, + uint32_t *access_tracking_reads_p, + uint32_t *access_tracking_writes_p) +{ + int access_tracking; + + access_tracking = !!*access_tracking_p; + + vm_object_lock(object); + *access_tracking_p = object->access_tracking; + if (access_tracking_reads_p) { + *access_tracking_reads_p = object->access_tracking_reads; + } + if (access_tracking_writes_p) { + *access_tracking_writes_p = object->access_tracking_writes; + } + object->access_tracking = access_tracking; + object->access_tracking_reads = 0; + object->access_tracking_writes = 0; + vm_object_unlock(object); + + if (access_tracking) { + vm_object_pmap_protect_options(object, + 0, + object->vo_size, + PMAP_NULL, + 0, + VM_PROT_NONE, + 0); + } +} +#endif /* VM_OBJECT_ACCESS_TRACKING */ + +void +vm_object_ledger_tag_ledgers( + vm_object_t object, + int *ledger_idx_volatile, + int *ledger_idx_nonvolatile, + int *ledger_idx_volatile_compressed, + int *ledger_idx_nonvolatile_compressed, + boolean_t *do_footprint) +{ + assert(object->shadow == VM_OBJECT_NULL); + + switch (object->vo_ledger_tag) { + case VM_OBJECT_LEDGER_TAG_NONE: + /* regular purgeable memory */ + assert(object->purgable != VM_PURGABLE_DENY); + *ledger_idx_volatile = task_ledgers.purgeable_volatile; + *ledger_idx_nonvolatile = task_ledgers.purgeable_nonvolatile; + *ledger_idx_volatile_compressed = task_ledgers.purgeable_volatile_compressed; + *ledger_idx_nonvolatile_compressed = task_ledgers.purgeable_nonvolatile_compressed; + *do_footprint = TRUE; + break; + case 
VM_OBJECT_LEDGER_TAG_NETWORK: + *ledger_idx_volatile = task_ledgers.network_volatile; + *ledger_idx_volatile_compressed = task_ledgers.network_volatile_compressed; + *ledger_idx_nonvolatile = task_ledgers.network_nonvolatile; + *ledger_idx_nonvolatile_compressed = task_ledgers.network_nonvolatile_compressed; + *do_footprint = FALSE; + break; + case VM_OBJECT_LEDGER_TAG_MEDIA: + default: + panic("%s: object %p has unsupported ledger_tag %d\n", + __FUNCTION__, object, object->vo_ledger_tag); + } +} + +kern_return_t +vm_object_ownership_change( + vm_object_t object, + int new_ledger_tag, + task_t new_owner, + boolean_t task_objq_locked) +{ + int old_ledger_tag; + task_t old_owner; + int resident_count, wired_count; + unsigned int compressed_count; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + int ledger_idx; + int ledger_idx_compressed; + boolean_t do_footprint; + + vm_object_lock_assert_exclusive(object); + assert(object->internal); + + old_ledger_tag = object->vo_ledger_tag; + old_owner = VM_OBJECT_OWNER(object); + + resident_count = object->resident_page_count - object->wired_page_count; + wired_count = object->wired_page_count; + compressed_count = vm_compressor_pager_get_count(object->pager); + + /* + * Deal with the old owner and/or ledger tag, if needed. + */ + if (old_owner != TASK_NULL && + ((old_owner != new_owner) /* new owner ... */ + || /* ... or ... */ + (old_ledger_tag && /* ... new ledger */ + old_ledger_tag != new_ledger_tag))) { + /* + * Take this object off of the old owner's ledgers. 
+ */ + vm_object_ledger_tag_ledgers(object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { + ledger_idx = ledger_idx_volatile; + ledger_idx_compressed = ledger_idx_volatile_compressed; + } else { + ledger_idx = ledger_idx_nonvolatile; + ledger_idx_compressed = ledger_idx_nonvolatile_compressed; + } + if (resident_count) { + /* + * Adjust the appropriate old owners's ledgers by the + * number of resident pages. + */ + ledger_debit(old_owner->ledger, + ledger_idx, + ptoa_64(resident_count)); + /* adjust old owner's footprint */ + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_debit(old_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(resident_count)); + } + } + if (wired_count) { + /* wired pages are always nonvolatile */ + ledger_debit(old_owner->ledger, + ledger_idx_nonvolatile, + ptoa_64(wired_count)); + if (do_footprint) { + ledger_debit(old_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(wired_count)); + } + } + if (compressed_count) { + /* + * Adjust the appropriate old owner's ledgers + * by the number of compressed pages. 
+ */ + ledger_debit(old_owner->ledger, + ledger_idx_compressed, + ptoa_64(compressed_count)); + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_debit(old_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(compressed_count)); + } + } + if (old_owner != new_owner) { + /* remove object from old_owner's list of owned objects */ + DTRACE_VM2(object_owner_remove, + vm_object_t, object, + task_t, new_owner); + if (!task_objq_locked) { + task_objq_lock(old_owner); + } + queue_remove(&old_owner->task_objq, object, + vm_object_t, task_objq); + switch (object->purgable) { + case VM_PURGABLE_NONVOLATILE: + case VM_PURGABLE_EMPTY: + vm_purgeable_nonvolatile_owner_update(old_owner, + -1); + break; + case VM_PURGABLE_VOLATILE: + vm_purgeable_volatile_owner_update(old_owner, + -1); + break; + default: + break; + } + if (!task_objq_locked) { + task_objq_unlock(old_owner); + } + } + } + + /* + * Switch to new ledger tag and/or owner. + */ + object->vo_ledger_tag = new_ledger_tag; + object->vo_owner = new_owner; + + if (new_owner == VM_OBJECT_OWNER_DISOWNED) { + assert(old_owner != kernel_task); + new_owner = kernel_task; + } + + /* + * Deal with the new owner and/or ledger tag, if needed. + */ + if (new_owner != TASK_NULL && + ((new_owner != old_owner) /* new owner ... */ + || /* ... or ... */ + (new_ledger_tag && /* ... new ledger */ + new_ledger_tag != old_ledger_tag))) { + /* + * Add this object to the new owner's ledgers. 
+ */ + vm_object_ledger_tag_ledgers(object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); + if (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) { + ledger_idx = ledger_idx_volatile; + ledger_idx_compressed = ledger_idx_volatile_compressed; + } else { + ledger_idx = ledger_idx_nonvolatile; + ledger_idx_compressed = ledger_idx_nonvolatile_compressed; + } + if (resident_count) { + /* + * Adjust the appropriate new owners's ledgers by the + * number of resident pages. + */ + ledger_credit(new_owner->ledger, + ledger_idx, + ptoa_64(resident_count)); + /* adjust new owner's footprint */ + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_credit(new_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(resident_count)); + } + } + if (wired_count) { + /* wired pages are always nonvolatile */ + ledger_credit(new_owner->ledger, + ledger_idx_nonvolatile, + ptoa_64(wired_count)); + if (do_footprint) { + ledger_credit(new_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(wired_count)); + } + } + if (compressed_count) { + /* + * Adjust the new owner's ledgers by the number of + * compressed pages. 
+ */ + ledger_credit(new_owner->ledger, + ledger_idx_compressed, + ptoa_64(compressed_count)); + if (do_footprint && + object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { + ledger_credit(new_owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(compressed_count)); + } + } + if (new_owner != old_owner) { + /* add object to new_owner's list of owned objects */ + DTRACE_VM2(object_owner_add, + vm_object_t, object, + task_t, new_owner); + task_objq_lock(new_owner); + queue_enter(&new_owner->task_objq, object, + vm_object_t, task_objq); + switch (object->purgable) { + case VM_PURGABLE_NONVOLATILE: + case VM_PURGABLE_EMPTY: + vm_purgeable_nonvolatile_owner_update(new_owner, + +1); + break; + case VM_PURGABLE_VOLATILE: + vm_purgeable_volatile_owner_update(new_owner, + +1); + break; + default: + break; + } + task_objq_unlock(new_owner); + } + } + + return KERN_SUCCESS; +} diff --git a/osfmk/vm/vm_object.h b/osfmk/vm/vm_object.h index 247437571..ccd1de547 100644 --- a/osfmk/vm/vm_object.h +++ b/osfmk/vm/vm_object.h @@ -105,7 +105,6 @@ extern btlog_t *vm_object_tracking_btlog; #endif /* VM_OBJECT_TRACKING */ struct vm_page; -struct vm_shared_region_slide_info; /* * Types defined: @@ -126,9 +125,10 @@ struct vm_object_fault_info { /* boolean_t */ stealth:1, /* boolean_t */ io_sync:1, /* boolean_t */ cs_bypass:1, + /* boolean_t */ pmap_cs_associated:1, /* boolean_t */ mark_zf_absent:1, /* boolean_t */ batch_pmap_op:1, - __vm_object_fault_info_unused_bits:26; + __vm_object_fault_info_unused_bits:25; int pmap_options; }; @@ -137,8 +137,7 @@ struct vm_object_fault_info { #define vo_cache_pages_to_scan vo_un1.vou_cache_pages_to_scan #define vo_shadow_offset vo_un2.vou_shadow_offset #define vo_cache_ts vo_un2.vou_cache_ts -#define vo_purgeable_owner vo_un2.vou_purgeable_owner -#define vo_slide_info vo_un2.vou_slide_info +#define vo_owner vo_un2.vou_owner struct vm_object { /* @@ -171,7 +170,7 @@ struct vm_object { int ref_count; /* Number of 
references */ unsigned int resident_page_count; /* number of resident pages */ - const unsigned int wired_page_count; /* number of wired pages + unsigned int wired_page_count; /* number of wired pages use VM_OBJECT_WIRED_PAGE_UPDATE macros to update */ unsigned int reusable_page_count; @@ -189,11 +188,10 @@ struct vm_object { clock_sec_t vou_cache_ts; /* age of an external object * present in cache */ - task_t vou_purgeable_owner; /* If the purg'a'ble bits below are set - * to volatile/emtpy, this is the task - * that owns this purgeable object. - */ - struct vm_shared_region_slide_info *vou_slide_info; + task_t vou_owner; /* If the object is purgeable + * or has a "ledger_tag", this + * is the task that owns it. + */ } vo_un2; memory_object_t pager; /* Where to get data */ @@ -348,7 +346,7 @@ struct vm_object { all_reusable:1, blocked_access:1, set_cache_attr:1, - object_slid:1, + object_is_shared_cache:1, purgeable_queue_type:2, purgeable_queue_group:3, io_tracking:1, @@ -359,7 +357,18 @@ struct vm_object { #else /* CONFIG_SECLUDED_MEMORY */ __object3_unused_bits:2, #endif /* CONFIG_SECLUDED_MEMORY */ - __object2_unused_bits:5; /* for expansion */ +#if VM_OBJECT_ACCESS_TRACKING + access_tracking:1, +#else /* VM_OBJECT_ACCESS_TRACKING */ + __unused_access_tracking:1, +#endif /* VM_OBJECT_ACCESS_TRACKING */ + vo_ledger_tag:2, + __object2_unused_bits:2; /* for expansion */ + +#if VM_OBJECT_ACCESS_TRACKING + uint32_t access_tracking_reads; + uint32_t access_tracking_writes; +#endif /* VM_OBJECT_ACCESS_TRACKING */ uint8_t scan_collisions; vm_tag_t wire_tag; @@ -387,6 +396,10 @@ struct vm_object { queue_chain_t objq; /* object queue - currently used for purgable queues */ queue_chain_t task_objq; /* objects owned by task - protected by task lock */ +#if !VM_TAG_ACTIVE_UPDATE + queue_chain_t wired_objq; +#endif /* !VM_TAG_ACTIVE_UPDATE */ + #if DEBUG void *purgeable_owner_bt[16]; task_t vo_purgeable_volatilizer; /* who made it volatile? 
*/ @@ -394,11 +407,26 @@ struct vm_object { #endif /* DEBUG */ }; +/* values for object->vo_ledger_tag */ +#define VM_OBJECT_LEDGER_TAG_NONE 0 +#define VM_OBJECT_LEDGER_TAG_NETWORK 1 +#define VM_OBJECT_LEDGER_TAG_MEDIA 2 +#define VM_OBJECT_LEDGER_TAG_RESERVED 3 + #define VM_OBJECT_PURGEABLE_FAULT_ERROR(object) \ ((object)->volatile_fault && \ ((object)->purgable == VM_PURGABLE_VOLATILE || \ (object)->purgable == VM_PURGABLE_EMPTY)) +#if VM_OBJECT_ACCESS_TRACKING +extern uint64_t vm_object_access_tracking_reads; +extern uint64_t vm_object_access_tracking_writes; +extern void vm_object_access_tracking(vm_object_t object, + int *access_tracking, + uint32_t *access_tracking_reads, + uint32_t *acess_tracking_writes); +#endif /* VM_OBJECT_ACCESS_TRACKING */ + extern vm_object_t kernel_object; /* the single kernel object */ @@ -421,30 +449,44 @@ extern lck_attr_t vm_map_lck_attr; #error VM_TAG_ACTIVE_UPDATE #endif +#if VM_TAG_ACTIVE_UPDATE +#define VM_OBJECT_WIRED_ENQUEUE(object) panic("VM_OBJECT_WIRED_ENQUEUE") +#define VM_OBJECT_WIRED_DEQUEUE(object) panic("VM_OBJECT_WIRED_DEQUEUE") +#else /* VM_TAG_ACTIVE_UPDATE */ +#define VM_OBJECT_WIRED_ENQUEUE(object) \ + MACRO_BEGIN \ + lck_spin_lock(&vm_objects_wired_lock); \ + assert(!(object)->wired_objq.next); \ + assert(!(object)->wired_objq.prev); \ + queue_enter(&vm_objects_wired, (object), \ + vm_object_t, wired_objq); \ + lck_spin_unlock(&vm_objects_wired_lock); \ + MACRO_END +#define VM_OBJECT_WIRED_DEQUEUE(object) \ + MACRO_BEGIN \ + if ((object)->wired_objq.next) { \ + lck_spin_lock(&vm_objects_wired_lock); \ + queue_remove(&vm_objects_wired, (object), \ + vm_object_t, wired_objq); \ + lck_spin_unlock(&vm_objects_wired_lock); \ + } \ + MACRO_END +#endif /* VM_TAG_ACTIVE_UPDATE */ + #define VM_OBJECT_WIRED(object, tag) \ MACRO_BEGIN \ assert(VM_KERN_MEMORY_NONE != (tag)); \ assert(VM_KERN_MEMORY_NONE == (object)->wire_tag); \ (object)->wire_tag = (tag); \ - if (!VM_TAG_ACTIVE_UPDATE \ - && ((object)->purgable == 
VM_PURGABLE_DENY)) \ - { \ - lck_spin_lock(&vm_objects_wired_lock); \ - assert(!(object)->objq.next); \ - assert(!(object)->objq.prev); \ - queue_enter(&vm_objects_wired, (object), vm_object_t, objq); \ - lck_spin_unlock(&vm_objects_wired_lock); \ + if (!VM_TAG_ACTIVE_UPDATE) { \ + VM_OBJECT_WIRED_ENQUEUE((object)); \ } \ MACRO_END #define VM_OBJECT_UNWIRED(object) \ MACRO_BEGIN \ - if (!VM_TAG_ACTIVE_UPDATE \ - && ((object)->purgable == VM_PURGABLE_DENY) && (object)->objq.next) \ - { \ - lck_spin_lock(&vm_objects_wired_lock); \ - queue_remove(&vm_objects_wired, (object), vm_object_t, objq); \ - lck_spin_unlock(&vm_objects_wired_lock); \ + if (!VM_TAG_ACTIVE_UPDATE) { \ + VM_OBJECT_WIRED_DEQUEUE((object)); \ } \ if (VM_KERN_MEMORY_NONE != (object)->wire_tag) { \ vm_tag_update_size((object)->wire_tag, -ptoa_64((object)->wired_page_count)); \ @@ -462,7 +504,7 @@ extern lck_attr_t vm_map_lck_attr; if (__wireddelta) { \ boolean_t __overflow __assert_only = \ os_add_overflow((object)->wired_page_count, __wireddelta, \ - (unsigned int *)(uintptr_t)&(object)->wired_page_count); \ + &(object)->wired_page_count); \ assert(!__overflow); \ if (!(object)->pageout && !(object)->no_tag_update) { \ if (__wireddelta > 0) { \ @@ -487,10 +529,10 @@ extern lck_attr_t vm_map_lck_attr; __wireddelta += delta; \ #define VM_OBJECT_WIRED_PAGE_ADD(object, m) \ - if (!m->private && !m->fictitious) __wireddelta++; + if (!(m)->vmp_private && !(m)->vmp_fictitious) __wireddelta++; #define VM_OBJECT_WIRED_PAGE_REMOVE(object, m) \ - if (!m->private && !m->fictitious) __wireddelta--; + if (!(m)->vmp_private && !(m)->vmp_fictitious) __wireddelta--; @@ -1140,4 +1182,27 @@ extern void vm_object_cache_add(vm_object_t); extern void vm_object_cache_remove(vm_object_t); extern int vm_object_cache_evict(int, int); +#define VM_OBJECT_OWNER_DISOWNED ((task_t) -1) +#define VM_OBJECT_OWNER(object) \ + ((((object)->purgable == VM_PURGABLE_DENY && \ + (object)->vo_ledger_tag == 0) || \ + (object)->vo_owner == 
TASK_NULL) \ + ? TASK_NULL /* not owned */ \ + : (((object)->vo_owner == VM_OBJECT_OWNER_DISOWNED) \ + ? kernel_task /* disowned -> kernel */ \ + : (object)->vo_owner)) /* explicit owner */ \ + +extern void vm_object_ledger_tag_ledgers( + vm_object_t object, + int *ledger_idx_volatile, + int *ledger_idx_nonvolatile, + int *ledger_idx_volatile_compressed, + int *ledger_idx_nonvolatile_compressed, + boolean_t *do_footprint); +extern kern_return_t vm_object_ownership_change( + vm_object_t object, + int ledger_tag, + task_t owner, + boolean_t task_objq_locked); + #endif /* _VM_VM_OBJECT_H_ */ diff --git a/osfmk/vm/vm_options.h b/osfmk/vm/vm_options.h index 92781d9da..47a456aa6 100644 --- a/osfmk/vm/vm_options.h +++ b/osfmk/vm/vm_options.h @@ -40,4 +40,8 @@ #define VM_OBJECT_TRACKING 0 #define VM_SCAN_FOR_SHADOW_CHAIN (DEVELOPMENT || DEBUG) +#define VM_OBJECT_ACCESS_TRACKING (DEVELOPMENT || DEBUG) + +#define VM_NAMED_ENTRY_LIST (DEVELOPMENT || DEBUG) + #endif /* __VM_VM_OPTIONS_H__ */ diff --git a/osfmk/vm/vm_page.h b/osfmk/vm/vm_page.h index 559af3f0b..f8fa9c025 100644 --- a/osfmk/vm/vm_page.h +++ b/osfmk/vm/vm_page.h @@ -158,10 +158,10 @@ extern char vm_page_non_speculative_pageable_states[]; extern char vm_page_active_or_inactive_states[]; -#define VM_PAGE_INACTIVE(m) (vm_page_inactive_states[m->vm_page_q_state]) -#define VM_PAGE_PAGEABLE(m) (vm_page_pageable_states[m->vm_page_q_state]) -#define VM_PAGE_NON_SPECULATIVE_PAGEABLE(m) (vm_page_non_speculative_pageable_states[m->vm_page_q_state]) -#define VM_PAGE_ACTIVE_OR_INACTIVE(m) (vm_page_active_or_inactive_states[m->vm_page_q_state]) +#define VM_PAGE_INACTIVE(m) (vm_page_inactive_states[m->vmp_q_state]) +#define VM_PAGE_PAGEABLE(m) (vm_page_pageable_states[m->vmp_q_state]) +#define VM_PAGE_NON_SPECULATIVE_PAGEABLE(m) (vm_page_non_speculative_pageable_states[m->vmp_q_state]) +#define VM_PAGE_ACTIVE_OR_INACTIVE(m) (vm_page_active_or_inactive_states[m->vmp_q_state]) #define VM_PAGE_NOT_ON_Q 0 /* page is not present on 
any queue, nor is it wired... mainly a transient state */ @@ -184,49 +184,45 @@ extern char vm_page_active_or_inactive_states[]; #define VM_PAGE_Q_STATE_ARRAY_SIZE (VM_PAGE_Q_STATE_LAST_VALID_VALUE+1) -#define pageq pageq_un.vm_page_pageq -#define snext pageq_un.vm_page_snext +/* + * The structure itself. See the block comment above for what (O) and (P) mean. + */ +#define vmp_pageq vmp_q_un.vmp_q_pageq +#define vmp_snext vmp_q_un.vmp_q_snext struct vm_page { union { - vm_page_queue_chain_t vm_page_pageq; /* queue info for FIFO queue or free list (P) */ - struct vm_page *vm_page_snext; - } pageq_un; + vm_page_queue_chain_t vmp_q_pageq; /* queue info for FIFO queue or free list (P) */ + struct vm_page *vmp_q_snext; + } vmp_q_un; - vm_page_queue_chain_t listq; /* all pages in same object (O) */ + vm_page_queue_chain_t vmp_listq; /* all pages in same object (O) */ #if CONFIG_BACKGROUND_QUEUE - vm_page_queue_chain_t vm_page_backgroundq; /* anonymous pages in the background pool (P) */ + vm_page_queue_chain_t vmp_backgroundq; /* anonymous pages in the background pool (P) */ #endif - vm_object_offset_t offset; /* offset into that object (O,P) */ - vm_page_object_t vm_page_object; /* which object am I in (O&P) */ + vm_object_offset_t vmp_offset; /* offset into that object (O,P) */ + vm_page_object_t vmp_object; /* which object am I in (O&P) */ /* - * The following word of flags is protected - * by the "page queues" lock. + * The following word of flags is always protected by the "page queues" lock. * - * we use the 'wire_count' field to store the local - * queue id if local queues are enabled... - * see the comments at 'vm_page_queues_remove' as to - * why this is safe to do + * We use 'vmp_wire_count' to store the local queue id if local queues are enabled. + * See the comments at 'vm_page_queues_remove' as to why this is safe to do. */ -#define local_id wire_count - unsigned int wire_count:16, /* how many wired down maps use me? 
(O&P) */ - vm_page_q_state:4, /* which q is the page on (P) */ - - vm_page_in_background:1, - vm_page_on_backgroundq:1, - /* boolean_t */ - gobbled:1, /* page used internally (P) */ - laundry:1, /* page is being cleaned now (P)*/ - no_cache:1, /* page is not to be cached and should - * be reused ahead of other pages (P) */ - private:1, /* Page should not be returned to - * the free list (P) */ - reference:1, /* page has been used (P) */ - - __unused_pageq_bits:5; /* 5 bits available here */ +#define vmp_local_id vmp_wire_count + unsigned int vmp_wire_count:16, /* how many wired down maps use me? (O&P) */ + vmp_q_state:4, /* which q is the page on (P) */ + vmp_in_background:1, + vmp_on_backgroundq:1, + vmp_gobbled:1, /* page used internally (P) */ + vmp_laundry:1, /* page is being cleaned now (P)*/ + vmp_no_cache:1, /* page is not to be cached and should */ + /* be reused ahead of other pages (P) */ + vmp_private:1, /* Page should not be returned to the free list (P) */ + vmp_reference:1, /* page has been used (P) */ + vmp_unused_page_bits:5; /* * MUST keep the 2 32 bit words used as bit fields @@ -236,62 +232,48 @@ struct vm_page { * they are protected by 2 different locks, this * is a real problem */ - vm_page_packed_t next_m; /* VP bucket link (O) */ + vm_page_packed_t vmp_next_m; /* VP bucket link (O) */ /* - * The following word of flags is protected - * by the "VM object" lock. - */ - unsigned int - /* boolean_t */ busy:1, /* page is in transit (O) */ - wanted:1, /* someone is waiting for page (O) */ - tabled:1, /* page is in VP table (O) */ - hashed:1, /* page is in vm_page_buckets[] - (O) + the bucket lock */ - fictitious:1, /* Physical page doesn't exist (O) */ - /* - * IMPORTANT: the "pmapped", "xpmapped" and "clustered" bits can be modified while holding the + * The following word of flags is protected by the "VM object" lock. 
+ * + * IMPORTANT: the "vmp_pmapped", "vmp_xpmapped" and "vmp_clustered" bits can be modified while holding the * VM object "shared" lock + the page lock provided through the pmap_lock_phys_page function. - * This is done in vm_fault_enter and the CONSUME_CLUSTERED macro. + * This is done in vm_fault_enter() and the CONSUME_CLUSTERED macro. * It's also ok to modify them behind just the VM object "exclusive" lock. */ - clustered:1, /* page is not the faulted page (O) or (O-shared AND pmap_page) */ - pmapped:1, /* page has been entered at some - * point into a pmap (O) or (O-shared AND pmap_page) */ - xpmapped:1, /* page has been entered with execute permission (O) - or (O-shared AND pmap_page) */ - - wpmapped:1, /* page has been entered at some - * point into a pmap for write (O) */ - free_when_done:1, /* page is to be freed once cleaning is completed (O) */ - absent:1, /* Data has been requested, but is - * not yet available (O) */ - error:1, /* Data manager was unable to provide - * data due to error (O) */ - dirty:1, /* Page must be cleaned (O) */ - cleaning:1, /* Page clean has begun (O) */ - precious:1, /* Page is precious; data must be - * returned even if clean (O) */ - overwriting:1, /* Request to unlock has been made - * without having data. (O) - * [See vm_fault_page_overwrite] */ - restart:1, /* Page was pushed higher in shadow - chain by copy_call-related pagers; - start again at top of chain */ - unusual:1, /* Page is absent, error, restart or - page locked */ - cs_validated:1, /* code-signing: page was checked */ - cs_tainted:1, /* code-signing: page is tainted */ - cs_nx:1, /* code-signing: page is nx */ - reusable:1, - lopage:1, - slid:1, - written_by_kernel:1, /* page was written by kernel (i.e. 
decompressed) */ - __unused_object_bits:7; /* 7 bits available here */ + unsigned int vmp_busy:1, /* page is in transit (O) */ + vmp_wanted:1, /* someone is waiting for page (O) */ + vmp_tabled:1, /* page is in VP table (O) */ + vmp_hashed:1, /* page is in vm_page_buckets[] (O) + the bucket lock */ + vmp_fictitious:1, /* Physical page doesn't exist (O) */ + vmp_clustered:1, /* page is not the faulted page (O) or (O-shared AND pmap_page) */ + vmp_pmapped:1, /* page has at some time been entered into a pmap (O) or */ + /* (O-shared AND pmap_page) */ + vmp_xpmapped:1, /* page has been entered with execute permission (O) or */ + /* (O-shared AND pmap_page) */ + vmp_wpmapped:1, /* page has been entered at some point into a pmap for write (O) */ + vmp_free_when_done:1, /* page is to be freed once cleaning is completed (O) */ + vmp_absent:1, /* Data has been requested, but is not yet available (O) */ + vmp_error:1, /* Data manager was unable to provide data due to error (O) */ + vmp_dirty:1, /* Page must be cleaned (O) */ + vmp_cleaning:1, /* Page clean has begun (O) */ + vmp_precious:1, /* Page is precious; data must be returned even if clean (O) */ + vmp_overwriting:1, /* Request to unlock has been made without having data. (O) */ + /* [See vm_fault_page_overwrite] */ + vmp_restart:1, /* Page was pushed higher in shadow chain by copy_call-related pagers */ + /* start again at top of chain */ + vmp_unusual:1, /* Page is absent, error, restart or page locked */ + vmp_cs_validated:1, /* code-signing: page was checked */ + vmp_cs_tainted:1, /* code-signing: page is tainted */ + vmp_cs_nx:1, /* code-signing: page is nx */ + vmp_reusable:1, + vmp_lopage:1, + vmp_written_by_kernel:1, /* page was written by kernel (i.e. 
decompressed) */ + vmp_unused_object_bits:8; #if !defined(__arm__) && !defined(__arm64__) - ppnum_t phys_page; /* Physical address of page, passed - * to pmap_enter (read-only) */ + ppnum_t vmp_phys_page; /* Physical page number of the page */ #endif }; @@ -309,7 +291,7 @@ extern unsigned int vm_first_phys_ppnum; struct vm_page_with_ppnum { struct vm_page vm_page_wo_ppnum; - ppnum_t phys_page; + ppnum_t vmp_phys_page; }; typedef struct vm_page_with_ppnum *vm_page_with_ppnum_t; @@ -319,13 +301,13 @@ static inline ppnum_t VM_PAGE_GET_PHYS_PAGE(vm_page_t m) if (m >= vm_page_array_beginning_addr && m < vm_page_array_ending_addr) return ((ppnum_t)((uintptr_t)(m - vm_page_array_beginning_addr) + vm_first_phys_ppnum)); else - return (((vm_page_with_ppnum_t)m)->phys_page); + return (((vm_page_with_ppnum_t)m)->vmp_phys_page); } #define VM_PAGE_SET_PHYS_PAGE(m, ppnum) \ MACRO_BEGIN \ if ((m) < vm_page_array_beginning_addr || (m) >= vm_page_array_ending_addr) \ - ((vm_page_with_ppnum_t)(m))->phys_page = ppnum; \ + ((vm_page_with_ppnum_t)(m))->vmp_phys_page = ppnum; \ assert(ppnum == VM_PAGE_GET_PHYS_PAGE(m)); \ MACRO_END @@ -340,10 +322,10 @@ struct vm_page_with_ppnum { typedef struct vm_page_with_ppnum *vm_page_with_ppnum_t; -#define VM_PAGE_GET_PHYS_PAGE(page) (page)->phys_page +#define VM_PAGE_GET_PHYS_PAGE(page) (page)->vmp_phys_page #define VM_PAGE_SET_PHYS_PAGE(page, ppnum) \ MACRO_BEGIN \ - (page)->phys_page = ppnum; \ + (page)->vmp_phys_page = ppnum; \ MACRO_END #define VM_PAGE_GET_CLUMP(m) ((VM_PAGE_GET_PHYS_PAGE(m)) >> vm_clump_shift) @@ -398,13 +380,13 @@ static inline uintptr_t vm_page_unpack_ptr(uintptr_t p) #define VM_PAGE_PACK_PTR(p) vm_page_pack_ptr((uintptr_t)(p)) #define VM_PAGE_UNPACK_PTR(p) vm_page_unpack_ptr((uintptr_t)(p)) -#define VM_PAGE_OBJECT(p) ((vm_object_t)(VM_PAGE_UNPACK_PTR(p->vm_page_object))) +#define VM_PAGE_OBJECT(p) ((vm_object_t)(VM_PAGE_UNPACK_PTR(p->vmp_object))) #define VM_PAGE_PACK_OBJECT(o) ((vm_page_object_t)(VM_PAGE_PACK_PTR(o))) 
#define VM_PAGE_ZERO_PAGEQ_ENTRY(p) \ MACRO_BEGIN \ - (p)->snext = 0; \ + (p)->vmp_snext = 0; \ MACRO_END @@ -560,7 +542,7 @@ MACRO_BEGIN __n = VM_PAGE_GET_PHYS_PAGE(elt) & vm_clump_mask; \ /* scan backward looking for a buddy page */ \ for(__i=0, __p=(elt)-1; __i<__n && __p>=vm_page_array_beginning_addr; __i++, __p--) { \ - if(__p->vm_page_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) { \ + if(__p->vmp_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) { \ if(__prev == 0) __prev = (vm_page_queue_entry_t) __p; \ __first = (vm_page_queue_entry_t) __p; \ __n_free++; \ @@ -568,7 +550,7 @@ MACRO_BEGIN } \ /* scan forward looking for a buddy page */ \ for(__i=__n+1, __p=(elt)+1; __ivm_page_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) { \ + if(__p->vmp_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) { \ __DEBUG_CHECK_BUDDIES(__check, __prev, __p, field); \ if(__prev == 0) __prev = (vm_page_queue_entry_t) VM_PAGE_UNPACK_PTR(__p->field.prev); \ __last = (vm_page_queue_entry_t) __p; \ @@ -841,14 +823,14 @@ MACRO_END #define VM_PAGE_PACK_PTR(p) (p) #define VM_PAGE_UNPACK_PTR(p) ((uintptr_t)(p)) -#define VM_PAGE_OBJECT(p) (vm_object_t)(p->vm_page_object) +#define VM_PAGE_OBJECT(p) (vm_object_t)(p->vmp_object) #define VM_PAGE_PACK_OBJECT(o) ((vm_page_object_t)(VM_PAGE_PACK_PTR(o))) #define VM_PAGE_ZERO_PAGEQ_ENTRY(p) \ MACRO_BEGIN \ - (p)->pageq.next = 0; \ - (p)->pageq.prev = 0; \ + (p)->vmp_pageq.next = 0; \ + (p)->vmp_pageq.prev = 0; \ MACRO_END #define VM_PAGE_CONVERT_TO_QUEUE_ENTRY(p) ((queue_entry_t)(p)) @@ -940,9 +922,9 @@ extern void vm_page_add_to_backgroundq(vm_page_t mem, boolean_t first); extern void vm_page_remove_from_backgroundq(vm_page_t mem); #endif -#define VM_PAGE_WIRED(m) ((m)->vm_page_q_state == VM_PAGE_IS_WIRED) -#define NEXT_PAGE(m) ((m)->snext) -#define NEXT_PAGE_PTR(m) (&(m)->snext) +#define VM_PAGE_WIRED(m) ((m)->vmp_q_state == VM_PAGE_IS_WIRED) +#define 
NEXT_PAGE(m) ((m)->vmp_snext) +#define NEXT_PAGE_PTR(m) (&(m)->vmp_snext) /* * XXX The unusual bit should not be necessary. Most of the bit @@ -1147,8 +1129,6 @@ unsigned int vm_page_free_min; /* When to wakeup pageout */ extern unsigned int vm_page_throttle_limit; /* When to throttle new page creation */ extern -uint32_t vm_page_creation_throttle; /* When to throttle new page creation */ -extern unsigned int vm_page_inactive_target;/* How many do we want inactive? */ #if CONFIG_SECLUDED_MEMORY extern @@ -1157,12 +1137,8 @@ unsigned int vm_page_secluded_target;/* How many do we want secluded? */ extern unsigned int vm_page_anonymous_min; /* When it's ok to pre-clean */ extern -unsigned int vm_page_inactive_min; /* When to wakeup pageout */ -extern unsigned int vm_page_free_reserved; /* How many pages reserved to do pageout */ extern -unsigned int vm_page_throttle_count; /* Count of page allocations throttled */ -extern unsigned int vm_page_gobble_count; extern unsigned int vm_page_stolen_count; /* Count of stolen pages not acccounted in zones */ @@ -1285,6 +1261,9 @@ extern void vm_page_free_unlocked( vm_page_t page, boolean_t remove_from_hash); +extern void vm_page_balance_inactive( + int max_to_move); + extern void vm_page_activate( vm_page_t page); @@ -1384,6 +1363,9 @@ extern void vm_page_validate_cs(vm_page_t page); extern void vm_page_validate_cs_mapped( vm_page_t page, const void *kaddr); +extern void vm_page_validate_cs_mapped_slow( + vm_page_t page, + const void *kaddr); extern void vm_page_validate_cs_mapped_chunk( vm_page_t page, const void *kaddr, @@ -1434,29 +1416,32 @@ extern void memorystatus_pages_update(unsigned int pages_avail); #endif /* CONFIG_JETSAM */ /* - * Functions implemented as macros. m->wanted and m->busy are - * protected by the object lock. + * Functions implemented as macros. m->vmp_wanted and m->vmp_busy are + * protected by the object lock. 
*/ #if CONFIG_EMBEDDED #define SET_PAGE_DIRTY(m, set_pmap_modified) \ MACRO_BEGIN \ vm_page_t __page__ = (m); \ - if (__page__->dirty == FALSE && (set_pmap_modified)) { \ + if (__page__->vmp_pmapped == TRUE && \ + __page__->vmp_wpmapped == TRUE && \ + __page__->vmp_dirty == FALSE && \ + (set_pmap_modified)) { \ pmap_set_modify(VM_PAGE_GET_PHYS_PAGE(__page__)); \ } \ - __page__->dirty = TRUE; \ + __page__->vmp_dirty = TRUE; \ MACRO_END #else /* CONFIG_EMBEDDED */ #define SET_PAGE_DIRTY(m, set_pmap_modified) \ MACRO_BEGIN \ vm_page_t __page__ = (m); \ - __page__->dirty = TRUE; \ + __page__->vmp_dirty = TRUE; \ MACRO_END #endif /* CONFIG_EMBEDDED */ #define PAGE_ASSERT_WAIT(m, interruptible) \ - (((m)->wanted = TRUE), \ + (((m)->vmp_wanted = TRUE), \ assert_wait((event_t) (m), (interruptible))) #if CONFIG_IOSCHED @@ -1464,23 +1449,23 @@ extern void memorystatus_pages_update(unsigned int pages_avail); vm_page_sleep(o, m, interruptible) #else #define PAGE_SLEEP(o, m, interruptible) \ - (((m)->wanted = TRUE), \ + (((m)->vmp_wanted = TRUE), \ thread_sleep_vm_object((o), (m), (interruptible))) #endif #define PAGE_WAKEUP_DONE(m) \ MACRO_BEGIN \ - (m)->busy = FALSE; \ - if ((m)->wanted) { \ - (m)->wanted = FALSE; \ + (m)->vmp_busy = FALSE; \ + if ((m)->vmp_wanted) { \ + (m)->vmp_wanted = FALSE; \ thread_wakeup((event_t) (m)); \ } \ MACRO_END #define PAGE_WAKEUP(m) \ MACRO_BEGIN \ - if ((m)->wanted) { \ - (m)->wanted = FALSE; \ + if ((m)->vmp_wanted) { \ + (m)->vmp_wanted = FALSE; \ thread_wakeup((event_t) (m)); \ } \ MACRO_END @@ -1523,7 +1508,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail); #if DEVELOPMENT || DEBUG #define VM_PAGE_SPECULATIVE_USED_ADD() \ MACRO_BEGIN \ - OSAddAtomic(1, &vm_page_speculative_used); \ + OSAddAtomic(1, &vm_page_speculative_used); \ MACRO_END #else #define VM_PAGE_SPECULATIVE_USED_ADD() @@ -1534,16 +1519,16 @@ extern void memorystatus_pages_update(unsigned int pages_avail); MACRO_BEGIN \ ppnum_t __phys_page; \ __phys_page = 
VM_PAGE_GET_PHYS_PAGE(mem); \ - pmap_lock_phys_page(__phys_page); \ - if (mem->clustered) { \ + pmap_lock_phys_page(__phys_page); \ + if (mem->vmp_clustered) { \ vm_object_t o; \ o = VM_PAGE_OBJECT(mem); \ assert(o); \ o->pages_used++; \ - mem->clustered = FALSE; \ + mem->vmp_clustered = FALSE; \ VM_PAGE_SPECULATIVE_USED_ADD(); \ } \ - pmap_unlock_phys_page(__phys_page); \ + pmap_unlock_phys_page(__phys_page); \ MACRO_END @@ -1610,8 +1595,8 @@ extern unsigned int vm_max_delayed_work_limit; #define VM_PAGE_ADD_DELAYED_WORK(dwp, mem, dw_cnt) \ MACRO_BEGIN \ - if (mem->busy == FALSE) { \ - mem->busy = TRUE; \ + if (mem->vmp_busy == FALSE) { \ + mem->vmp_busy = TRUE; \ if ( !(dwp->dw_mask & DW_vm_page_free)) \ dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); \ } \ @@ -1632,5 +1617,11 @@ extern void vm_page_enqueue_inactive(vm_page_t mem, boolean_t first); extern void vm_page_enqueue_active(vm_page_t mem, boolean_t first); extern void vm_page_check_pageable_safe(vm_page_t page); +#if CONFIG_SECLUDED_MEMORY +extern uint64_t secluded_shutoff_trigger; +extern void start_secluded_suppression(task_t); +extern void stop_secluded_suppression(task_t); +#endif /* CONFIG_SECLUDED_MEMORY */ + #endif /* _VM_VM_PAGE_H_ */ diff --git a/osfmk/vm/vm_pageout.c b/osfmk/vm/vm_pageout.c index 20eac579d..bf722548b 100644 --- a/osfmk/vm/vm_pageout.c +++ b/osfmk/vm/vm_pageout.c @@ -2,7 +2,7 @@ * Copyright (c) 2000-2014 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. 
- * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,34 +22,34 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. 
*/ @@ -114,13 +114,13 @@ #include #endif -extern int cs_debug; - #if UPL_DEBUG #include #endif -extern void m_drain(void); +extern int cs_debug; + +extern void mbuf_drain(boolean_t); #if VM_PRESSURE_EVENTS #if CONFIG_JETSAM @@ -135,24 +135,14 @@ extern uint64_t memorystatus_available_pages_critical; extern unsigned int memorystatus_frozen_count; extern unsigned int memorystatus_suspended_count; - extern vm_pressure_level_t memorystatus_vm_pressure_level; -int memorystatus_purge_on_warning = 2; -int memorystatus_purge_on_urgent = 5; -int memorystatus_purge_on_critical = 8; void vm_pressure_response(void); -boolean_t vm_pressure_thread_running = FALSE; extern void consider_vm_pressure_events(void); #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4 #endif /* VM_PRESSURE_EVENTS */ -boolean_t vm_pressure_changed = FALSE; - -#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */ -#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100 -#endif #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */ #ifdef CONFIG_EMBEDDED @@ -166,24 +156,20 @@ boolean_t vm_pressure_changed = FALSE; #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */ #endif -#ifndef VM_PAGEOUT_INACTIVE_RELIEF -#define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */ -#endif - #ifndef VM_PAGE_LAUNDRY_MAX #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */ #endif /* VM_PAGEOUT_LAUNDRY_MAX */ #ifndef VM_PAGEOUT_BURST_WAIT -#define VM_PAGEOUT_BURST_WAIT 10 /* milliseconds */ +#define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */ #endif /* VM_PAGEOUT_BURST_WAIT */ #ifndef VM_PAGEOUT_EMPTY_WAIT -#define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */ +#define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */ #endif /* VM_PAGEOUT_EMPTY_WAIT */ #ifndef VM_PAGEOUT_DEADLOCK_WAIT -#define VM_PAGEOUT_DEADLOCK_WAIT 300 /* 
milliseconds */ +#define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */ #endif /* VM_PAGEOUT_DEADLOCK_WAIT */ #ifndef VM_PAGEOUT_IDLE_WAIT @@ -191,22 +177,12 @@ boolean_t vm_pressure_changed = FALSE; #endif /* VM_PAGEOUT_IDLE_WAIT */ #ifndef VM_PAGEOUT_SWAP_WAIT -#define VM_PAGEOUT_SWAP_WAIT 50 /* milliseconds */ +#define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */ #endif /* VM_PAGEOUT_SWAP_WAIT */ -#ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED -#define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED 1000 /* maximum pages considered before we issue a pressure event */ -#endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */ - -#ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS -#define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS 5 /* seconds */ -#endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */ - -unsigned int vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS; -unsigned int vm_page_speculative_percentage = 5; #ifndef VM_PAGE_SPECULATIVE_TARGET -#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage)) +#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage)) #endif /* VM_PAGE_SPECULATIVE_TARGET */ @@ -223,11 +199,7 @@ unsigned int vm_page_speculative_percentage = 5; */ #ifndef VM_PAGE_INACTIVE_TARGET -#ifdef CONFIG_EMBEDDED -#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3) -#else #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2) -#endif #endif /* VM_PAGE_INACTIVE_TARGET */ /* @@ -289,6 +261,7 @@ unsigned int vm_page_speculative_percentage = 5; * we will make per call of vm_pageout_scan(). 
*/ #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000 + #ifndef VM_PAGE_REACTIVATE_LIMIT #ifdef CONFIG_EMBEDDED #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2) @@ -298,18 +271,8 @@ unsigned int vm_page_speculative_percentage = 5; #endif /* VM_PAGE_REACTIVATE_LIMIT */ #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000 - extern boolean_t hibernate_cleaning_in_progress; -/* - * Exported variable used to broadcast the activation of the pageout scan - * Working Set uses this to throttle its use of pmap removes. In this - * way, code which runs within memory in an uncontested context does - * not keep encountering soft faults. - */ - -unsigned int vm_pageout_scan_event_counter = 0; - /* * Forward declarations for internal routines. */ @@ -332,6 +295,7 @@ boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void); boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void); boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void); #endif + void vm_pageout_garbage_collect(int); static void vm_pageout_iothread_external(void); static void vm_pageout_iothread_internal(struct cq *cq); @@ -339,176 +303,48 @@ static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t extern void vm_pageout_continue(void); extern void vm_pageout_scan(void); + void vm_tests(void); /* forward */ -boolean_t vm_restricted_to_single_processor = FALSE; #if !CONFIG_EMBEDDED static boolean_t vm_pageout_waiter = FALSE; static boolean_t vm_pageout_running = FALSE; #endif /* !CONFIG_EMBEDDED */ -static thread_t vm_pageout_external_iothread = THREAD_NULL; -static thread_t vm_pageout_internal_iothread = THREAD_NULL; - -unsigned int vm_pageout_reserved_internal = 0; -unsigned int vm_pageout_reserved_really = 0; - -unsigned int vm_pageout_swap_wait = 0; -unsigned int vm_pageout_idle_wait = 0; /* milliseconds */ -unsigned int vm_pageout_empty_wait = 0; /* milliseconds */ -unsigned int vm_pageout_burst_wait = 0; /* milliseconds */ -unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */ -unsigned int 
vm_pageout_deadlock_relief = 0; -unsigned int vm_pageout_inactive_relief = 0; -unsigned int vm_pageout_burst_active_throttle = 0; -unsigned int vm_pageout_burst_inactive_throttle = 0; - -int vm_upl_wait_for_pages = 0; - - -/* - * These variables record the pageout daemon's actions: - * how many pages it looks at and what happens to those pages. - * No locking needed because only one thread modifies the variables. - */ - -unsigned int vm_pageout_active = 0; /* debugging */ -unsigned int vm_pageout_inactive = 0; /* debugging */ -unsigned int vm_pageout_inactive_throttled = 0; /* debugging */ -unsigned int vm_pageout_inactive_forced = 0; /* debugging */ -unsigned int vm_pageout_inactive_nolock = 0; /* debugging */ -unsigned int vm_pageout_inactive_avoid = 0; /* debugging */ -unsigned int vm_pageout_inactive_busy = 0; /* debugging */ -unsigned int vm_pageout_inactive_error = 0; /* debugging */ -unsigned int vm_pageout_inactive_absent = 0; /* debugging */ -unsigned int vm_pageout_inactive_notalive = 0; /* debugging */ -unsigned int vm_pageout_inactive_used = 0; /* debugging */ -unsigned int vm_pageout_cache_evicted = 0; /* debugging */ -unsigned int vm_pageout_inactive_clean = 0; /* debugging */ -unsigned int vm_pageout_speculative_clean = 0; /* debugging */ -unsigned int vm_pageout_speculative_dirty = 0; /* debugging */ - -unsigned int vm_pageout_freed_from_cleaned = 0; -unsigned int vm_pageout_freed_from_speculative = 0; -unsigned int vm_pageout_freed_from_inactive_clean = 0; -unsigned int vm_pageout_freed_after_compression = 0; - -extern uint32_t vm_compressor_pages_grabbed; -extern uint32_t c_segment_pages_compressed; - -unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0; - -unsigned int vm_pageout_cleaned_reclaimed = 0; /* debugging; how many cleaned pages are reclaimed by the pageout scan */ -unsigned int vm_pageout_cleaned_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */ 
-unsigned int vm_pageout_cleaned_reference_reactivated = 0; -unsigned int vm_pageout_cleaned_volatile_reactivated = 0; -unsigned int vm_pageout_cleaned_fault_reactivated = 0; -unsigned int vm_pageout_cleaned_commit_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */ -unsigned int vm_pageout_cleaned_busy = 0; -unsigned int vm_pageout_cleaned_nolock = 0; - -unsigned int vm_pageout_inactive_dirty_internal = 0; /* debugging */ -unsigned int vm_pageout_inactive_dirty_external = 0; /* debugging */ -unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */ -unsigned int vm_pageout_inactive_anonymous = 0; /* debugging */ -unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */ -unsigned int vm_pageout_purged_objects = 0; /* used for sysctl vm stats */ -unsigned int vm_stat_discard = 0; /* debugging */ -unsigned int vm_stat_discard_sent = 0; /* debugging */ -unsigned int vm_stat_discard_failure = 0; /* debugging */ -unsigned int vm_stat_discard_throttle = 0; /* debugging */ -unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */ -unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */ -unsigned int vm_pageout_skipped_external = 0; /* debugging */ - -unsigned int vm_pageout_scan_reclaimed_throttled = 0; -unsigned int vm_pageout_scan_active_throttled = 0; -unsigned int vm_pageout_scan_inactive_throttled_internal = 0; -unsigned int vm_pageout_scan_inactive_throttled_external = 0; -unsigned int vm_pageout_scan_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_swap_throttle = 0; /* debugging */ -unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */ -unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */ -unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */ -unsigned int 
vm_pageout_inactive_external_forced_jetsam_count = 0; /* debugging */ -unsigned int vm_pageout_scan_throttle_deferred = 0; /* debugging */ -unsigned int vm_pageout_scan_yield_unthrottled = 0; /* debugging */ -unsigned int vm_page_speculative_count_drifts = 0; -unsigned int vm_page_speculative_count_drift_max = 0; - -uint32_t vm_compressor_failed; - -/* - * Backing store throttle when BS is exhausted - */ -unsigned int vm_backing_store_low = 0; - -unsigned int vm_pageout_out_of_line = 0; -unsigned int vm_pageout_in_place = 0; - -unsigned int vm_page_steal_pageout_page = 0; - -struct vm_config vm_config; +#if DEVELOPMENT || DEBUG +struct vm_pageout_debug vm_pageout_debug; +#endif +struct vm_pageout_vminfo vm_pageout_vminfo; +struct vm_pageout_state vm_pageout_state; +struct vm_config vm_config; struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); -unsigned int vm_page_speculative_target = 0; - -vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL; +int vm_upl_wait_for_pages = 0; +vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL; boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL; -#if DEVELOPMENT || DEBUG -unsigned long vm_cs_validated_resets = 0; -#endif - int vm_debug_events = 0; #if CONFIG_MEMORYSTATUS -#if !CONFIG_JETSAM -extern boolean_t memorystatus_idle_exit_from_VM(void); -#endif extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async); -extern void memorystatus_on_pageout_scan_end(void); uint32_t vm_pageout_memorystatus_fb_factor_nr = 5; uint32_t vm_pageout_memorystatus_fb_factor_dr = 2; -#if DEVELOPMENT || DEBUG -uint32_t vm_grab_anon_overrides = 0; -uint32_t vm_grab_anon_nops = 0; -#endif #endif -#if MACH_CLUSTER_STATS -unsigned long vm_pageout_cluster_dirtied = 0; -unsigned long vm_pageout_cluster_cleaned = 0; -unsigned long vm_pageout_cluster_collisions = 0; 
-unsigned long vm_pageout_cluster_clusters = 0; -unsigned long vm_pageout_cluster_conversions = 0; -unsigned long vm_pageout_target_collisions = 0; -unsigned long vm_pageout_target_page_dirtied = 0; -unsigned long vm_pageout_target_page_freed = 0; -#define CLUSTER_STAT(clause) clause -#else /* MACH_CLUSTER_STATS */ -#define CLUSTER_STAT(clause) -#endif /* MACH_CLUSTER_STATS */ -#if DEVELOPMENT || DEBUG -vmct_stats_t vmct_stats; -#endif - -/* +/* * Routine: vm_pageout_object_terminate * Purpose: * Destroy the pageout_object, and perform all of the * required cleanup actions. - * + * * In/Out conditions: * The object must be locked, and will be returned locked. */ @@ -534,13 +370,13 @@ vm_pageout_object_terminate( p = (vm_page_t) vm_page_queue_first(&object->memq); - assert(p->private); - assert(p->free_when_done); - p->free_when_done = FALSE; - assert(!p->cleaning); - assert(!p->laundry); + assert(p->vmp_private); + assert(p->vmp_free_when_done); + p->vmp_free_when_done = FALSE; + assert(!p->vmp_cleaning); + assert(!p->vmp_laundry); - offset = p->offset; + offset = p->vmp_offset; VM_PAGE_FREE(p); p = VM_PAGE_NULL; @@ -550,15 +386,15 @@ vm_pageout_object_terminate( if(m == VM_PAGE_NULL) continue; - assert((m->dirty) || (m->precious) || - (m->busy && m->cleaning)); + assert((m->vmp_dirty) || (m->vmp_precious) || + (m->vmp_busy && m->vmp_cleaning)); /* * Handle the trusted pager throttle. * Also decrement the burst throttle (if external). */ vm_page_lock_queues(); - if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) + if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) vm_pageout_throttle_up(m); /* @@ -569,15 +405,12 @@ vm_pageout_object_terminate( * pages may have been modified between the selection as an * adjacent page and conversion to a target. 
*/ - if (m->free_when_done) { - assert(m->busy); - assert(m->vm_page_q_state == VM_PAGE_IS_WIRED); - assert(m->wire_count == 1); - m->cleaning = FALSE; - m->free_when_done = FALSE; -#if MACH_CLUSTER_STATS - if (m->wanted) vm_pageout_target_collisions++; -#endif + if (m->vmp_free_when_done) { + assert(m->vmp_busy); + assert(m->vmp_q_state == VM_PAGE_IS_WIRED); + assert(m->vmp_wire_count == 1); + m->vmp_cleaning = FALSE; + m->vmp_free_when_done = FALSE; /* * Revoke all access to the page. Since the object is * locked, and the page is busy, this prevents the page @@ -591,17 +424,15 @@ vm_pageout_object_terminate( if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } else { - m->dirty = FALSE; + m->vmp_dirty = FALSE; } - if (m->dirty) { - CLUSTER_STAT(vm_pageout_target_page_dirtied++;) + if (m->vmp_dirty) { vm_page_unwire(m, TRUE); /* reactivates */ VM_STAT_INCR(reactivations); PAGE_WAKEUP_DONE(m); } else { - CLUSTER_STAT(vm_pageout_target_page_freed++;) - vm_page_free(m);/* clears busy, etc. */ + vm_page_free(m); /* clears busy, etc. */ } vm_page_unlock_queues(); continue; @@ -612,19 +443,19 @@ vm_pageout_object_terminate( * If prep_pin_count is nonzero, then someone is using the * page, so make it active. */ - if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) && !m->private) { - if (m->reference) + if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) { + if (m->vmp_reference) vm_page_activate(m); else vm_page_deactivate(m); } - if (m->overwriting) { + if (m->vmp_overwriting) { /* * the (COPY_OUT_FROM == FALSE) request_page_list case */ - if (m->busy) { + if (m->vmp_busy) { /* - * We do not re-set m->dirty ! + * We do not re-set m->vmp_dirty ! * The page was busy so no extraneous activity * could have occurred. COPY_INTO is a read into the * new pages. 
CLEAN_IN_PLACE does actually write @@ -634,8 +465,8 @@ vm_pageout_object_terminate( */ pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m)); - m->busy = FALSE; - m->absent = FALSE; + m->vmp_busy = FALSE; + m->vmp_absent = FALSE; } else { /* * alternate (COPY_OUT_FROM == FALSE) request_page_list case @@ -645,28 +476,11 @@ vm_pageout_object_terminate( assert(VM_PAGE_WIRED(m)); vm_page_unwire(m, TRUE); /* reactivates */ } - m->overwriting = FALSE; + m->vmp_overwriting = FALSE; } else { - /* - * Set the dirty state according to whether or not the page was - * modified during the pageout. Note that we purposefully do - * NOT call pmap_clear_modify since the page is still mapped. - * If the page were to be dirtied between the 2 calls, this - * this fact would be lost. This code is only necessary to - * maintain statistics, since the pmap module is always - * consulted if m->dirty is false. - */ -#if MACH_CLUSTER_STATS - m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)); - - if (m->dirty) vm_pageout_cluster_dirtied++; - else vm_pageout_cluster_cleaned++; - if (m->wanted) vm_pageout_cluster_collisions++; -#else - m->dirty = FALSE; -#endif + m->vmp_dirty = FALSE; } - m->cleaning = FALSE; + m->vmp_cleaning = FALSE; /* * Wakeup any thread waiting for the page to be un-cleaning. @@ -705,14 +519,14 @@ vm_pageclean_setup( vm_object_t new_object, vm_object_offset_t new_offset) { - assert(!m->busy); + assert(!m->vmp_busy); #if 0 - assert(!m->cleaning); + assert(!m->vmp_cleaning); #endif XPR(XPR_VM_PAGEOUT, "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n", - VM_PAGE_OBJECT(m), m->offset, m, + VM_PAGE_OBJECT(m), m->vmp_offset, m, new_m, new_offset); pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m)); @@ -720,19 +534,19 @@ vm_pageclean_setup( /* * Mark original page as cleaning in place. 
*/ - m->cleaning = TRUE; + m->vmp_cleaning = TRUE; SET_PAGE_DIRTY(m, FALSE); - m->precious = FALSE; + m->vmp_precious = FALSE; /* * Convert the fictitious page to a private shadow of * the real page. */ - assert(new_m->fictitious); + assert(new_m->vmp_fictitious); assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr); - new_m->fictitious = FALSE; - new_m->private = TRUE; - new_m->free_when_done = TRUE; + new_m->vmp_fictitious = FALSE; + new_m->vmp_private = TRUE; + new_m->vmp_free_when_done = TRUE; VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m)); vm_page_lockspin_queues(); @@ -740,8 +554,8 @@ vm_pageclean_setup( vm_page_unlock_queues(); vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE); - assert(!new_m->wanted); - new_m->busy = FALSE; + assert(!new_m->vmp_wanted); + new_m->vmp_busy = FALSE; } /* @@ -762,7 +576,7 @@ vm_pageclean_setup( * Implementation: * Move this page to a completely new object. */ -void +void vm_pageout_initialize_page( vm_page_t m) { @@ -778,22 +592,22 @@ vm_pageout_initialize_page( object = VM_PAGE_OBJECT(m); - assert(m->busy); + assert(m->vmp_busy); assert(object->internal); /* * Verify that we really want to clean this page */ - assert(!m->absent); - assert(!m->error); - assert(m->dirty); + assert(!m->vmp_absent); + assert(!m->vmp_error); + assert(m->vmp_dirty); /* * Create a paging reference to let us play with the object. */ - paging_offset = m->offset + object->paging_offset; + paging_offset = m->vmp_offset + object->paging_offset; - if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) { + if (m->vmp_absent || m->vmp_error || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) { panic("reservation without pageout?"); /* alan */ VM_PAGE_FREE(m); @@ -803,7 +617,7 @@ vm_pageout_initialize_page( } /* - * If there's no pager, then we can't clean the page. This should + * If there's no pager, then we can't clean the page. 
This should * never happen since this should be a copy object and therefore not * an external object, so the pager should always be there. */ @@ -843,15 +657,6 @@ vm_pageout_initialize_page( vm_object_paging_end(object); } -#if MACH_CLUSTER_STATS -#define MAXCLUSTERPAGES 16 -struct { - unsigned long pages_in_cluster; - unsigned long pages_at_higher_offsets; - unsigned long pages_at_lower_offsets; -} cluster_stats[MAXCLUSTERPAGES]; -#endif /* MACH_CLUSTER_STATS */ - /* * vm_pageout_cluster: @@ -867,13 +672,21 @@ struct { * * The page must not be on any pageout queue. */ +#if DEVELOPMENT || DEBUG +vmct_stats_t vmct_stats; + int32_t vmct_active = 0; +uint64_t vm_compressor_epoch_start = 0; +uint64_t vm_compressor_epoch_stop = 0; + typedef enum vmct_state_t { VMCT_IDLE, VMCT_AWAKENED, VMCT_ACTIVE, } vmct_state_t; vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT]; +#endif + void vm_pageout_cluster(vm_page_t m) @@ -884,7 +697,7 @@ vm_pageout_cluster(vm_page_t m) XPR(XPR_VM_PAGEOUT, "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n", - object, m->offset, m, 0, 0); + object, m->vmp_offset, m, 0, 0); VM_PAGE_CHECK(m); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); @@ -893,9 +706,9 @@ vm_pageout_cluster(vm_page_t m) /* * Only a certain kind of page is appreciated here. 
*/ - assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m))); - assert(!m->cleaning && !m->laundry); - assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m))); + assert(!m->vmp_cleaning && !m->vmp_laundry); + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); /* * protect the object from collapse or termination @@ -905,20 +718,20 @@ vm_pageout_cluster(vm_page_t m) if (object->internal == TRUE) { assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); - m->busy = TRUE; + m->vmp_busy = TRUE; q = &vm_pageout_queue_internal; } else q = &vm_pageout_queue_external; - /* + /* * pgo_laundry count is tied to the laundry bit */ - m->laundry = TRUE; + m->vmp_laundry = TRUE; q->pgo_laundry++; - m->vm_page_q_state = VM_PAGE_ON_PAGEOUT_Q; - vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, pageq); + m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q; + vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, vmp_pageq); if (q->pgo_idle == TRUE) { q->pgo_idle = FALSE; @@ -928,10 +741,8 @@ vm_pageout_cluster(vm_page_t m) } -unsigned long vm_pageout_throttle_up_count = 0; - /* - * A page is back from laundry or we are stealing it back from + * A page is back from laundry or we are stealing it back from * the laundering state. See if there are some pages waiting to * go to laundry and if we can let some of them go now. 
* @@ -952,25 +763,25 @@ vm_pageout_throttle_up( LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); vm_object_lock_assert_exclusive(m_object); - vm_pageout_throttle_up_count++; - if (m_object->internal == TRUE) q = &vm_pageout_queue_internal; else q = &vm_pageout_queue_external; - if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { + if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { - vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_NOT_ON_Q; + vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_NOT_ON_Q; VM_PAGE_ZERO_PAGEQ_ENTRY(m); vm_object_activity_end(m_object); + + VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1); } - if (m->laundry == TRUE) { + if (m->vmp_laundry == TRUE) { - m->laundry = FALSE; + m->vmp_laundry = FALSE; q->pgo_laundry--; if (q->pgo_throttled == TRUE) { @@ -981,6 +792,7 @@ vm_pageout_throttle_up( q->pgo_draining = FALSE; thread_wakeup((event_t) (&q->pgo_laundry+1)); } + VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1); } } @@ -992,7 +804,7 @@ vm_pageout_throttle_up_batch( { LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - vm_pageout_throttle_up_count += batch_cnt; + VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt); q->pgo_laundry -= batch_cnt; @@ -1025,20 +837,64 @@ vm_pageout_throttle_up_batch( * also returns the number of pages the system still needs to reclaim at this * moment in time. 
*/ -#define VM_PAGEOUT_STAT_SIZE 31 +#if DEVELOPMENT || DEBUG +#define VM_PAGEOUT_STAT_SIZE (30 * 8) + 1 +#else +#define VM_PAGEOUT_STAT_SIZE (1 * 8) + 1 +#endif struct vm_pageout_stat { - unsigned int considered; - unsigned int reclaimed_clean; + unsigned long vm_page_active_count; + unsigned long vm_page_speculative_count; + unsigned long vm_page_inactive_count; + unsigned long vm_page_anonymous_count; + + unsigned long vm_page_free_count; + unsigned long vm_page_wire_count; + unsigned long vm_page_compressor_count; + + unsigned long vm_page_pages_compressed; + unsigned long vm_page_pageable_internal_count; + unsigned long vm_page_pageable_external_count; + unsigned long vm_page_xpmapped_external_count; + + unsigned int pages_grabbed; + unsigned int pages_freed; + unsigned int pages_compressed; unsigned int pages_grabbed_by_compressor; + unsigned int failed_compressions; + + unsigned int pages_evicted; + unsigned int pages_purged; + + unsigned int considered; + unsigned int considered_bq_internal; + unsigned int considered_bq_external; + + unsigned int skipped_external; + unsigned int filecache_min_reactivations; + + unsigned int freed_speculative; + unsigned int freed_cleaned; + unsigned int freed_internal; + unsigned int freed_external; + unsigned int cleaned_dirty_external; + unsigned int cleaned_dirty_internal; + + unsigned int inactive_referenced; + unsigned int inactive_nolock; + unsigned int reactivation_limit_exceeded; + unsigned int forced_inactive_reclaim; + unsigned int throttled_internal_q; unsigned int throttled_external_q; - unsigned int failed_compressions; -} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0,0,0,0,0,0,0}, }; + + unsigned int phantom_ghosts_found; + unsigned int phantom_ghosts_added; +} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, }; unsigned int vm_pageout_stat_now = 0; -unsigned int vm_memory_pressure = 0; #define VM_PAGEOUT_STAT_BEFORE(i) \ (((i) == 0) ? 
VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1) @@ -1046,15 +902,14 @@ unsigned int vm_memory_pressure = 0; (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1) #if VM_PAGE_BUCKETS_CHECK -int vm_page_buckets_check_interval = 10; /* in seconds */ +int vm_page_buckets_check_interval = 80; /* in eighths of a second */ #endif /* VM_PAGE_BUCKETS_CHECK */ -/* - * Called from compute_averages(). - */ + void -compute_memory_pressure( - __unused void *arg) +record_memory_pressure(void); +void +record_memory_pressure(void) { unsigned int vm_pageout_next; @@ -1066,21 +921,18 @@ compute_memory_pressure( } #endif /* VM_PAGE_BUCKETS_CHECK */ - vm_memory_pressure = - vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed_clean; + vm_pageout_state.vm_memory_pressure = + vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative + + vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned + + vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal + + vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external; - commpage_set_memory_pressure( vm_memory_pressure ); + commpage_set_memory_pressure( (unsigned int)vm_pageout_state.vm_memory_pressure ); /* move "now" forward */ vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now); - vm_pageout_stats[vm_pageout_next].considered = 0; - vm_pageout_stats[vm_pageout_next].reclaimed_clean = 0; - vm_pageout_stats[vm_pageout_next].throttled_internal_q = 0; - vm_pageout_stats[vm_pageout_next].throttled_external_q = 0; - vm_pageout_stats[vm_pageout_next].cleaned_dirty_external = 0; - vm_pageout_stats[vm_pageout_next].pages_compressed = 0; - vm_pageout_stats[vm_pageout_next].pages_grabbed_by_compressor = 0; - vm_pageout_stats[vm_pageout_next].failed_compressions = 0; + + bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat)); vm_pageout_stat_now = vm_pageout_next; } @@ -1089,8 +941,8 @@ compute_memory_pressure( /* * IMPORTANT * 
mach_vm_ctl_page_free_wanted() is called indirectly, via - * mach_vm_pressure_monitor(), when taking a stackshot. Therefore, - * it must be safe in the restricted stackshot context. Locks and/or + * mach_vm_pressure_monitor(), when taking a stackshot. Therefore, + * it must be safe in the restricted stackshot context. Locks and/or * blocking are not allowable. */ unsigned int @@ -1112,7 +964,7 @@ mach_vm_ctl_page_free_wanted(void) /* * IMPORTANT: - * mach_vm_pressure_monitor() is called when taking a stackshot, with + * mach_vm_pressure_monitor() is called when taking a stackshot, with * wait_for_pressure FALSE, so that code path must remain safe in the * restricted stackshot context. No blocking or locks are allowable. * on that code path. @@ -1128,7 +980,9 @@ mach_vm_pressure_monitor( wait_result_t wr; unsigned int vm_pageout_then, vm_pageout_now; unsigned int pages_reclaimed; + unsigned int units_of_monitor; + units_of_monitor = 8 * nsecs_monitored; /* * We don't take the vm_page_queue_lock here because we don't want * vm_pressure_monitor() to get in the way of the vm_pageout_scan() @@ -1174,10 +1028,13 @@ mach_vm_pressure_monitor( for (vm_pageout_then = VM_PAGEOUT_STAT_BEFORE(vm_pageout_now); vm_pageout_then != vm_pageout_now && - nsecs_monitored-- != 0; + units_of_monitor-- != 0; vm_pageout_then = VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) { - pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed_clean; + pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative; + pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned; + pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal; + pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external; } *pages_reclaimed_p = pages_reclaimed; @@ -1250,7 +1107,7 @@ vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount) */ if (m_object != l_object) { /* - * the object associated with candidate page is + * the object associated with candidate page is * 
different from the one we were just working * with... dump the lock if we still own it */ @@ -1266,7 +1123,7 @@ vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount) * page queues lock, we can only 'try' for this one. * if the 'try' fails, we need to do a mutex_pause * to allow the owner of the object lock a chance to - * run... + * run... */ if ( !vm_object_lock_try_scan(m_object)) { @@ -1287,21 +1144,21 @@ vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount) l_object = m_object; } - if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) { + if ( !m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) { /* * put it back on the head of its queue */ goto reenter_pg_on_q; } - if (m->pmapped == TRUE) { + if (m->vmp_pmapped == TRUE) { pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); disconnected_count++; } reenter_pg_on_q: - vm_page_queue_remove(q, m, vm_page_t, pageq); - vm_page_queue_enter(q, m, vm_page_t, pageq); + vm_page_queue_remove(q, m, vm_page_t, vmp_pageq); + vm_page_queue_enter(q, m, vm_page_t, vmp_pageq); qcount--; try_failed_count = 0; @@ -1383,7 +1240,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) iq = &vm_pageout_queue_internal; - + vm_page_lock_queues(); while (qcount && !vm_page_queue_empty(q)) { @@ -1397,12 +1254,12 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) l_object = NULL; } iq->pgo_draining = TRUE; - + assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE); vm_page_unlock_queues(); - + thread_block(THREAD_CONTINUE_NULL); - + vm_page_lock_queues(); delayed_unlock = 0; continue; @@ -1416,11 +1273,11 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) * already got the lock */ if (m_object != l_object) { - if ( !m_object->internal) + if ( !m_object->internal) goto reenter_pg_on_q; /* - * the object associated with candidate 
page is + * the object associated with candidate page is * different from the one we were just working * with... dump the lock if we still own it */ @@ -1436,7 +1293,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) * page queues lock, we can only 'try' for this one. * if the 'try' fails, we need to do a mutex_pause * to allow the owner of the object lock a chance to - * run... + * run... */ if ( !vm_object_lock_try_scan(m_object)) { @@ -1453,7 +1310,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) } l_object = m_object; } - if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) { + if ( !m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) { /* * page is not to be cleaned * put it back on the head of its queue @@ -1462,22 +1319,22 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) } phys_page = VM_PAGE_GET_PHYS_PAGE(m); - if (m->reference == FALSE && m->pmapped == TRUE) { + if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) { refmod_state = pmap_get_refmod(phys_page); - + if (refmod_state & VM_MEM_REFERENCED) - m->reference = TRUE; + m->vmp_reference = TRUE; if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } } - if (m->reference == TRUE) { - m->reference = FALSE; + if (m->vmp_reference == TRUE) { + m->vmp_reference = FALSE; pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); goto reenter_pg_on_q; } - if (m->pmapped == TRUE) { - if (m->dirty || m->precious) { + if (m->vmp_pmapped == TRUE) { + if (m->vmp_dirty || m->vmp_precious) { pmap_options = PMAP_OPTIONS_COMPRESSOR; } else { pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; @@ -1487,7 +1344,8 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) SET_PAGE_DIRTY(m, FALSE); } } - if ( !m->dirty && !m->precious) { + + if ( !m->vmp_dirty && !m->vmp_precious) { 
vm_page_unlock_queues(); VM_PAGE_FREE(m); vm_page_lock_queues(); @@ -1496,7 +1354,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) goto next_pg; } if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) { - + if (!m_object->pager_initialized) { vm_page_unlock_queues(); @@ -1531,8 +1389,8 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount) goto next_pg; reenter_pg_on_q: - vm_page_queue_remove(q, m, vm_page_t, pageq); - vm_page_queue_enter(q, m, vm_page_t, pageq); + vm_page_queue_remove(q, m, vm_page_t, vmp_pageq); + vm_page_queue_enter(q, m, vm_page_t, vmp_pageq); next_pg: qcount--; try_failed_count = 0; @@ -1572,11 +1430,11 @@ extern void vm_pageout_io_throttle(void); * "partially re-used", which could be expensive. \ */ \ assert(VM_PAGE_OBJECT((m)) == (obj)); \ - if ((m)->reusable || \ + if ((m)->vmp_reusable || \ (obj)->all_reusable) { \ vm_object_reuse_pages((obj), \ - (m)->offset, \ - (m)->offset + PAGE_SIZE_64, \ + (m)->vmp_offset, \ + (m)->vmp_offset + PAGE_SIZE_64, \ FALSE); \ } \ MACRO_END @@ -1594,27 +1452,19 @@ struct flow_control { mach_timespec_t ts; }; + #if CONFIG_BACKGROUND_QUEUE -uint64_t vm_pageout_skipped_bq_internal = 0; -uint64_t vm_pageout_considered_bq_internal = 0; -uint64_t vm_pageout_considered_bq_external = 0; uint64_t vm_pageout_rejected_bq_internal = 0; uint64_t vm_pageout_rejected_bq_external = 0; +uint64_t vm_pageout_skipped_bq_internal = 0; #endif -uint32_t vm_pageout_no_victim = 0; -uint32_t vm_pageout_considered_page = 0; -uint32_t vm_page_filecache_min = 0; - #define ANONS_GRABBED_LIMIT 2 -#if CONFIG_SECLUDED_MEMORY -extern vm_page_t vm_page_grab_secluded(void); -uint64_t vm_pageout_secluded_burst_count = 0; -#endif /* CONFIG_SECLUDED_MEMORY */ - +#if 0 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *); +#endif static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int); #define VM_PAGEOUT_PB_NO_ACTION 0 @@ -1622,20 +1472,21 @@ static void 
vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int * #define VM_PAGEOUT_PB_THREAD_YIELD 2 +#if 0 static void vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq) { if (*local_freeq) { vm_page_unlock_queues(); - VM_DEBUG_EVENT( + VM_DEBUG_CONSTANT_EVENT( vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, - vm_page_free_count, *local_freed, 0, 1); + vm_page_free_count, 0, 0, 1); vm_page_free_list(*local_freeq, TRUE); - VM_DEBUG_EVENT(vm_pageout_freelist,VM_PAGEOUT_FREELIST, DBG_FUNC_END, - vm_page_free_count, 0, 0, 1); + VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist,VM_PAGEOUT_FREELIST, DBG_FUNC_END, + vm_page_free_count, *local_freed, 0, 1); *local_freeq = NULL; *local_freed = 0; @@ -1646,6 +1497,7 @@ vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *loca } *delayed_unlock = 1; } +#endif static void @@ -1658,17 +1510,9 @@ vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock, vm_object_unlock(*object); *object = NULL; } - vm_pageout_scan_wants_object = VM_OBJECT_NULL; - if (*local_freeq) { - VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, - vm_page_free_count, *local_freed, 0, 2); - vm_page_free_list(*local_freeq, TRUE); - - VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, - vm_page_free_count, 0, 0, 2); *local_freeq = NULL; *local_freed = 0; @@ -1691,130 +1535,259 @@ vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock, } -int last_vm_pageout_freed_from_inactive_clean = 0; -int last_vm_pageout_freed_from_cleaned = 0; -int last_vm_pageout_freed_from_speculative = 0; -int last_vm_pageout_freed_after_compression = 0; -int last_vm_pageout_enqueued_cleaned_from_inactive_dirty = 0; -int last_vm_pageout_inactive_force_reclaim = 0; -int last_vm_pageout_scan_inactive_throttled_external = 0; -int last_vm_pageout_scan_inactive_throttled_internal = 0; -int last_vm_pageout_reactivation_limit_exceeded = 0; -int 
last_vm_pageout_considered_page = 0; -int last_vm_compressor_pages_grabbed = 0; -int last_vm_compressor_failed = 0; -int last_vm_pageout_skipped_external = 0; +static struct vm_pageout_vminfo last; + +uint64_t last_vm_page_pages_grabbed = 0; + +extern uint32_t c_segment_pages_compressed; +extern uint64_t shared_region_pager_reclaimed; +extern struct memory_object_pager_ops shared_region_pager_ops; void update_vm_info(void) { - int tmp1, tmp2, tmp3, tmp4; + uint64_t tmp; - if (!kdebug_enable) - return; - - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE, - vm_page_active_count, - vm_page_speculative_count, - vm_page_inactive_count, - vm_page_anonymous_count, - 0); + vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count; + vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count; + vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count; + vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE, - vm_page_free_count, - vm_page_wire_count, - VM_PAGE_COMPRESSOR_COUNT, - 0, 0); + vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count; + vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count; + vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE, - c_segment_pages_compressed, - vm_page_internal_count, - vm_page_external_count, - vm_page_xpmapped_external_count, - 0); + vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed; + vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count; + vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count; + 
vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count; + + + tmp = vm_pageout_vminfo.vm_pageout_considered_page; + vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page); + last.vm_pageout_considered_page = tmp; + + tmp = vm_pageout_vminfo.vm_pageout_compressions; + vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp - last.vm_pageout_compressions); + last.vm_pageout_compressions = tmp; + + tmp = vm_pageout_vminfo.vm_compressor_failed; + vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed); + last.vm_compressor_failed = tmp; + + tmp = vm_pageout_vminfo.vm_compressor_pages_grabbed; + vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp - last.vm_compressor_pages_grabbed); + last.vm_compressor_pages_grabbed = tmp; + + tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost; + vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost); + last.vm_phantom_cache_found_ghost = tmp; + + tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost; + vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost); + last.vm_phantom_cache_added_ghost = tmp; + + tmp = get_pages_grabbed_count(); + vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp - last_vm_page_pages_grabbed); + last_vm_page_pages_grabbed = tmp; + + tmp = vm_pageout_vminfo.vm_page_pages_freed; + vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed); + last.vm_page_pages_freed = tmp; + + + if (vm_pageout_stats[vm_pageout_stat_now].considered) { + tmp = vm_pageout_vminfo.vm_pageout_pages_evicted; + vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted); + last.vm_pageout_pages_evicted = 
tmp; - if ((vm_pageout_considered_page - last_vm_pageout_considered_page) == 0 && - (vm_pageout_enqueued_cleaned_from_inactive_dirty - last_vm_pageout_enqueued_cleaned_from_inactive_dirty == 0) && - (vm_pageout_freed_after_compression - last_vm_pageout_freed_after_compression == 0)) - return; + tmp = vm_pageout_vminfo.vm_pageout_pages_purged; + vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged); + last.vm_pageout_pages_purged = tmp; + tmp = vm_pageout_vminfo.vm_pageout_freed_speculative; + vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative); + last.vm_pageout_freed_speculative = tmp; - tmp1 = vm_pageout_considered_page; - tmp2 = vm_pageout_freed_from_speculative; - tmp3 = vm_pageout_freed_from_inactive_clean; + tmp = vm_pageout_vminfo.vm_pageout_freed_external; + vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external); + last.vm_pageout_freed_external = tmp; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE, - tmp1 - last_vm_pageout_considered_page, - tmp2 - last_vm_pageout_freed_from_speculative, - tmp3 - last_vm_pageout_freed_from_inactive_clean, - 0, 0); - - last_vm_pageout_considered_page = tmp1; - last_vm_pageout_freed_from_speculative = tmp2; - last_vm_pageout_freed_from_inactive_clean = tmp3; + tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced; + vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced); + last.vm_pageout_inactive_referenced = tmp; + tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external; + vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external); + last.vm_pageout_scan_inactive_throttled_external = tmp; - tmp1 = vm_pageout_scan_inactive_throttled_external; - tmp2 = 
vm_pageout_enqueued_cleaned_from_inactive_dirty; - tmp3 = vm_pageout_freed_from_cleaned; + tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external; + vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external); + last.vm_pageout_inactive_dirty_external = tmp; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE, - tmp1 - last_vm_pageout_scan_inactive_throttled_external, - tmp2 - last_vm_pageout_enqueued_cleaned_from_inactive_dirty, - tmp3 - last_vm_pageout_freed_from_cleaned, - 0, 0); + tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned; + vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned); + last.vm_pageout_freed_cleaned = tmp; - vm_pageout_stats[vm_pageout_stat_now].throttled_external_q += (tmp1 - last_vm_pageout_scan_inactive_throttled_external); - vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external += (tmp2 - last_vm_pageout_enqueued_cleaned_from_inactive_dirty); + tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock; + vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock); + last.vm_pageout_inactive_nolock = tmp; - last_vm_pageout_scan_inactive_throttled_external = tmp1; - last_vm_pageout_enqueued_cleaned_from_inactive_dirty = tmp2; - last_vm_pageout_freed_from_cleaned = tmp3; + tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal; + vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal); + last.vm_pageout_scan_inactive_throttled_internal = tmp; + tmp = vm_pageout_vminfo.vm_pageout_skipped_external; + vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external); + last.vm_pageout_skipped_external = tmp; - tmp1 = vm_pageout_scan_inactive_throttled_internal; - tmp2 = vm_pageout_freed_after_compression; - tmp3 = 
vm_compressor_pages_grabbed; - tmp4 = vm_pageout_skipped_external; + tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded; + vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded); + last.vm_pageout_reactivation_limit_exceeded = tmp; - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE, - tmp1 - last_vm_pageout_scan_inactive_throttled_internal, - tmp2 - last_vm_pageout_freed_after_compression, - tmp3 - last_vm_compressor_pages_grabbed, - tmp4 - last_vm_pageout_skipped_external, + tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim; + vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim); + last.vm_pageout_inactive_force_reclaim = tmp; + + tmp = vm_pageout_vminfo.vm_pageout_freed_internal; + vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal); + last.vm_pageout_freed_internal = tmp; + + tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal; + vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal); + last.vm_pageout_considered_bq_internal = tmp; + + tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external; + vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external); + last.vm_pageout_considered_bq_external = tmp; + + tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated; + vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated); + last.vm_pageout_filecache_min_reactivated = tmp; + + tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal; + vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal); + 
last.vm_pageout_inactive_dirty_internal = tmp; + } + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count, + vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count, + vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count, + vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count, + 0); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count, + vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count, + vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count, + 0, + 0); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed, + vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count, + vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count, + vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count, + 0); + + if (vm_pageout_stats[vm_pageout_stat_now].considered || + vm_pageout_stats[vm_pageout_stat_now].pages_compressed || + vm_pageout_stats[vm_pageout_stat_now].failed_compressions) { + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].considered, + vm_pageout_stats[vm_pageout_stat_now].freed_speculative, + vm_pageout_stats[vm_pageout_stat_now].freed_external, + vm_pageout_stats[vm_pageout_stat_now].inactive_referenced, + 0); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].throttled_external_q, + vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external, + vm_pageout_stats[vm_pageout_stat_now].freed_cleaned, + vm_pageout_stats[vm_pageout_stat_now].inactive_nolock, + 0); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE, + 
vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q, + vm_pageout_stats[vm_pageout_stat_now].pages_compressed, + vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor, + vm_pageout_stats[vm_pageout_stat_now].skipped_external, + 0); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded, + vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim, + vm_pageout_stats[vm_pageout_stat_now].failed_compressions, + vm_pageout_stats[vm_pageout_stat_now].freed_internal, + 0); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal, + vm_pageout_stats[vm_pageout_stat_now].considered_bq_external, + vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations, + vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal, + 0); + + } + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE, + vm_pageout_stats[vm_pageout_stat_now].pages_grabbed, + vm_pageout_stats[vm_pageout_stat_now].pages_freed, + vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found, + vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added, 0); - - vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q += (tmp1 - last_vm_pageout_scan_inactive_throttled_internal); - vm_pageout_stats[vm_pageout_stat_now].pages_compressed += (tmp2 - last_vm_pageout_freed_after_compression); - vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor += (tmp3 - last_vm_compressor_pages_grabbed); - last_vm_pageout_scan_inactive_throttled_internal = tmp1; - last_vm_pageout_freed_after_compression = tmp2; - last_vm_compressor_pages_grabbed = tmp3; - last_vm_pageout_skipped_external = tmp4; + record_memory_pressure(); +} + + +void +vm_page_balance_inactive(int max_to_move) +{ + vm_page_t m; + + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + vm_page_inactive_target = 
VM_PAGE_INACTIVE_TARGET(vm_page_active_count + + vm_page_inactive_count + + vm_page_speculative_count); + while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) { - if ((vm_pageout_reactivation_limit_exceeded - last_vm_pageout_reactivation_limit_exceeded) == 0 && - (vm_pageout_inactive_force_reclaim - last_vm_pageout_inactive_force_reclaim) == 0 && - (vm_compressor_failed - last_vm_compressor_failed) == 0) - return; + VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1); - tmp1 = vm_pageout_reactivation_limit_exceeded; - tmp2 = vm_pageout_inactive_force_reclaim; - tmp3 = vm_compressor_failed; + m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); + + assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); + assert(!m->vmp_laundry); + assert(VM_PAGE_OBJECT(m) != kernel_object); + assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE, - tmp1 - last_vm_pageout_reactivation_limit_exceeded, - tmp2 - last_vm_pageout_inactive_force_reclaim, - tmp3 - last_vm_compressor_failed, - 0, 0); + DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); - vm_pageout_stats[vm_pageout_stat_now].failed_compressions += (tmp3 - last_vm_compressor_failed); + /* + * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... + * + * a TLB flush isn't really needed here since at worst we'll miss the reference bit being + * updated in the PTE if a remote processor still has this mapping cached in its TLB when the + * new reference happens. 
If no futher references happen on the page after that remote TLB flushes + * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue + * by pageout_scan, which is just fine since the last reference would have happened quite far + * in the past (TLB caches don't hang around for very long), and of course could just as easily + * have happened before we moved the page + */ + if (m->vmp_pmapped == TRUE) + pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); - last_vm_pageout_reactivation_limit_exceeded = tmp1; - last_vm_pageout_inactive_force_reclaim = tmp2; - last_vm_compressor_failed = tmp3; + /* + * The page might be absent or busy, + * but vm_page_deactivate can handle that. + * FALSE indicates that we don't want a H/W clear reference + */ + vm_page_deactivate_internal(m, FALSE); + } } @@ -1828,7 +1801,6 @@ vm_pageout_scan(void) { unsigned int loop_count = 0; unsigned int inactive_burst_count = 0; - unsigned int active_burst_count = 0; unsigned int reactivated_this_call; unsigned int reactivate_limit; vm_page_t local_freeq = NULL; @@ -1842,7 +1814,6 @@ vm_pageout_scan(void) struct vm_speculative_age_q *sq; struct flow_control flow_control = { 0, { 0, 0 } }; boolean_t inactive_throttled = FALSE; - boolean_t try_failed; mach_timespec_t ts; unsigned int msecs = 0; vm_object_t object = NULL; @@ -1858,7 +1829,9 @@ vm_pageout_scan(void) #endif int cache_evict_throttle = 0; uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; + uint32_t inactive_external_count; int force_purge = 0; + int divisor; #define DELAY_SPECULATIVE_AGE 1000 int delay_speculative_age = 0; vm_object_t m_object = VM_OBJECT_NULL; @@ -1868,8 +1841,10 @@ vm_pageout_scan(void) #endif /* VM_PRESSURE_EVENTS */ VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START, - vm_pageout_speculative_clean, vm_pageout_inactive_clean, - vm_pageout_inactive_dirty_internal, 
vm_pageout_inactive_dirty_external); + vm_pageout_vminfo.vm_pageout_freed_speculative, + vm_pageout_state.vm_pageout_inactive_clean, + vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, + vm_pageout_vminfo.vm_pageout_inactive_dirty_external); flow_control.state = FCS_IDLE; iq = &vm_pageout_queue_internal; @@ -1880,9 +1855,12 @@ vm_pageout_scan(void) XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0); /* Ask the pmap layer to return any pages it no longer needs. */ - pmap_release_pages_fast(); + uint64_t pmap_wired_pages_freed = pmap_release_pages_fast(); vm_page_lock_queues(); + + vm_page_wire_count -= pmap_wired_pages_freed; + delayed_unlock = 1; /* @@ -1897,12 +1875,6 @@ vm_pageout_scan(void) vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; /* - * We want to gradually dribble pages from the active queue - * to the inactive queue. If we let the inactive queue get - * very small, and then suddenly dump many pages into it, - * those pages won't get a sufficient chance to be referenced - * before we start taking them from the inactive queue. - * * We must limit the rate at which we send pages to the pagers * so that we don't tie up too many pages in the I/O queues. * We implement a throttling mechanism using the laundry count @@ -1913,37 +1885,20 @@ vm_pageout_scan(void) * stalled waiting for memory, which only we can provide. */ - Restart: assert(object == NULL); assert(delayed_unlock != 0); - - /* - * Recalculate vm_page_inactivate_target. 
- */ - vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + - vm_page_inactive_count + - vm_page_speculative_count); vm_page_anonymous_min = vm_page_inactive_target / 20; + if (vm_pageout_state.vm_page_speculative_percentage > 50) + vm_pageout_state.vm_page_speculative_percentage = 50; + else if (vm_pageout_state.vm_page_speculative_percentage <= 0) + vm_pageout_state.vm_page_speculative_percentage = 1; - /* - * don't want to wake the pageout_scan thread up everytime we fall below - * the targets... set a low water mark at 0.25% below the target - */ - vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400); - - if (vm_page_speculative_percentage > 50) - vm_page_speculative_percentage = 50; - else if (vm_page_speculative_percentage <= 0) - vm_page_speculative_percentage = 1; - - vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + - vm_page_inactive_count); - - try_failed = FALSE; + vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + + vm_page_inactive_count); for (;;) { vm_page_t m; @@ -1963,53 +1918,33 @@ vm_pageout_scan(void) * Deal with secluded_q overflow. */ if (vm_page_secluded_count > vm_page_secluded_target) { - unsigned int secluded_overflow; vm_page_t secluded_page; - if (object != NULL) { - vm_object_unlock(object); - object = NULL; - vm_pageout_scan_wants_object = VM_OBJECT_NULL; - } /* * SECLUDED_AGING_BEFORE_ACTIVE: * Excess secluded pages go to the active queue and * will later go to the inactive queue. 
*/ - active_burst_count = MIN(vm_pageout_burst_active_throttle, - vm_page_secluded_count_inuse); - secluded_overflow = (vm_page_secluded_count - - vm_page_secluded_target); - while (secluded_overflow-- > 0 && - vm_page_secluded_count > vm_page_secluded_target) { - assert((vm_page_secluded_count_free + - vm_page_secluded_count_inuse) == - vm_page_secluded_count); - secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); - assert(secluded_page->vm_page_q_state == - VM_PAGE_ON_SECLUDED_Q); - vm_page_queues_remove(secluded_page, FALSE); - assert(!secluded_page->fictitious); - assert(!VM_PAGE_WIRED(secluded_page)); - if (secluded_page->vm_page_object == 0) { - /* transfer to free queue */ - assert(secluded_page->busy); - secluded_page->snext = local_freeq; - local_freeq = secluded_page; - local_freed++; - } else { - /* transfer to head of active queue */ - vm_page_enqueue_active(secluded_page, FALSE); - if (active_burst_count-- == 0) { - vm_pageout_secluded_burst_count++; - break; - } - } + assert((vm_page_secluded_count_free + + vm_page_secluded_count_inuse) == + vm_page_secluded_count); + secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); + assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); + + vm_page_queues_remove(secluded_page, FALSE); + assert(!secluded_page->vmp_fictitious); + assert(!VM_PAGE_WIRED(secluded_page)); + + if (secluded_page->vmp_object == 0) { + /* transfer to free queue */ + assert(secluded_page->vmp_busy); + secluded_page->vmp_snext = local_freeq; + local_freeq = secluded_page; + local_freed++; + } else { + /* transfer to head of active queue */ + vm_page_enqueue_active(secluded_page, FALSE); secluded_page = VM_PAGE_NULL; - - if (delayed_unlock++ > delayed_unlock_limit) { - vm_pageout_delayed_unlock(&delayed_unlock, &local_freed, &local_freeq); - } } } #endif /* CONFIG_SECLUDED_MEMORY */ @@ -2017,72 +1952,10 @@ vm_pageout_scan(void) assert(delayed_unlock); /* - * Move pages from active to inactive if 
we're below the target - */ - if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target) - goto done_moving_active_pages; - - if (object != NULL) { - vm_object_unlock(object); - object = NULL; - vm_pageout_scan_wants_object = VM_OBJECT_NULL; - } - /* - * Don't sweep through active queue more than the throttle - * which should be kept relatively low + * maintain our balance */ - active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count); - - VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START, - vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed); - - VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE, - vm_pageout_speculative_clean, vm_pageout_inactive_clean, - vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); - memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START); - - - while (!vm_page_queue_empty(&vm_page_queue_active) && active_burst_count--) { - - vm_pageout_active++; + vm_page_balance_inactive(1); - m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); - - assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q); - assert(!m->laundry); - assert(VM_PAGE_OBJECT(m) != kernel_object); - assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); - - DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); - - /* - * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... - * - * a TLB flush isn't really needed here since at worst we'll miss the reference bit being - * updated in the PTE if a remote processor still has this mapping cached in its TLB when the - * new reference happens. 
If no futher references happen on the page after that remote TLB flushes - * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue - * by pageout_scan, which is just fine since the last reference would have happened quite far - * in the past (TLB caches don't hang around for very long), and of course could just as easily - * have happened before we moved the page - */ - pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); - - /* - * The page might be absent or busy, - * but vm_page_deactivate can handle that. - * FALSE indicates that we don't want a H/W clear reference - */ - vm_page_deactivate_internal(m, FALSE); - - if (delayed_unlock++ > delayed_unlock_limit) { - vm_pageout_delayed_unlock(&delayed_unlock, &local_freed, &local_freeq); - } - } - - VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END, - vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target); - memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_END); /********************************************************************** * above this point we're playing with the active and secluded queues @@ -2090,15 +1963,15 @@ vm_pageout_scan(void) * and the inactive queue **********************************************************************/ -done_moving_active_pages: - if (vm_page_free_count + local_freed >= vm_page_free_target) { + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); /* * make sure the pageout I/O threads are running - * throttled in case there are still requests + * throttled in case there are still requests * in the laundry... since we have met our targets * we don't need the laundry to be cleaned in a timely * fashion... 
so let's avoid interfering with foreground @@ -2106,22 +1979,6 @@ vm_pageout_scan(void) */ vm_pageout_adjust_eq_iothrottle(eq, TRUE); - /* - * recalculate vm_page_inactivate_target - */ - vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + - vm_page_inactive_count + - vm_page_speculative_count); -#ifndef CONFIG_EMBEDDED - if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && - !vm_page_queue_empty(&vm_page_queue_active)) { - /* - * inactive target still not met... keep going - * until we get the queues balanced... - */ - continue; - } -#endif lck_mtx_lock(&vm_page_queue_free_lock); if ((vm_page_free_count >= vm_page_free_target) && @@ -2134,24 +1991,27 @@ vm_pageout_scan(void) assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, - vm_pageout_inactive, vm_pageout_inactive_used, 0, 0); + vm_pageout_state.vm_pageout_inactive, + vm_pageout_state.vm_pageout_inactive_used, 0, 0); VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, - vm_pageout_speculative_clean, vm_pageout_inactive_clean, - vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external); + vm_pageout_vminfo.vm_pageout_freed_speculative, + vm_pageout_state.vm_pageout_inactive_clean, + vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, + vm_pageout_vminfo.vm_pageout_inactive_dirty_external); return; } lck_mtx_unlock(&vm_page_queue_free_lock); } - + /* - * Before anything, we check if we have any ripe volatile + * Before anything, we check if we have any ripe volatile * objects around. If so, try to purge the first object. * If the purge fails, fall through to reclaim a page instead. * If the purge succeeds, go back to the top and reevalute * the new memory situation. 
*/ - + assert (available_for_purge>=0); force_purge = 0; /* no force-purging */ @@ -2161,11 +2021,11 @@ vm_pageout_scan(void) if (pressure_level > kVMPressureNormal) { if (pressure_level >= kVMPressureCritical) { - force_purge = memorystatus_purge_on_critical; + force_purge = vm_pageout_state.memorystatus_purge_on_critical; } else if (pressure_level >= kVMPressureUrgent) { - force_purge = memorystatus_purge_on_urgent; + force_purge = vm_pageout_state.memorystatus_purge_on_urgent; } else if (pressure_level >= kVMPressureWarning) { - force_purge = memorystatus_purge_on_warning; + force_purge = vm_pageout_state.memorystatus_purge_on_warning; } } #endif /* VM_PRESSURE_EVENTS */ @@ -2181,7 +2041,7 @@ vm_pageout_scan(void) VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0); if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) { - vm_pageout_purged_objects++; + VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1); VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0); memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); continue; @@ -2199,7 +2059,7 @@ vm_pageout_scan(void) struct vm_speculative_age_q *aq; boolean_t can_steal = FALSE; int num_scanned_queues; - + aq = &vm_page_queue_speculative[speculative_steal_index]; num_scanned_queues = 0; @@ -2210,7 +2070,7 @@ vm_pageout_scan(void) if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; - + aq = &vm_page_queue_speculative[speculative_steal_index]; } @@ -2220,33 +2080,26 @@ vm_pageout_scan(void) * queues but still haven't found one * that is not empty, even though * vm_page_speculative_count is not 0. - * - * report the anomaly... */ - printf("vm_pageout_scan: " - "all speculative queues empty " - "but count=%d. 
Re-adjusting.\n", - vm_page_speculative_count); - if (vm_page_speculative_count > vm_page_speculative_count_drift_max) - vm_page_speculative_count_drift_max = vm_page_speculative_count; - vm_page_speculative_count_drifts++; + if (!vm_page_queue_empty(&sq->age_q)) + continue; #if DEVELOPMENT || DEBUG panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count); -#endif /* DEVELOPMENT || DEBUG */ +#endif /* readjust... */ vm_page_speculative_count = 0; /* ... and continue */ continue; } - if (vm_page_speculative_count > vm_page_speculative_target || force_speculative_aging == TRUE) + if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) can_steal = TRUE; else { if (!delay_speculative_age) { mach_timespec_t ts_fully_aged; - ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000; - ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000) + ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000; + ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000) * 1000 * NSEC_PER_USEC; ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts); @@ -2272,27 +2125,26 @@ vm_pageout_scan(void) } force_speculative_aging = FALSE; -#if CONFIG_BACKGROUND_QUEUE - if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0 && - ((vm_page_background_mode == VM_PAGE_BG_DISABLED) || (vm_page_background_count <= vm_page_background_target))) -#else - if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) -#endif - { + if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) { + int pages_evicted; if (object != NULL) { vm_object_unlock(object); object = NULL; } + KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); + pages_evicted = vm_object_cache_evict(100, 10); + 
KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0); + if (pages_evicted) { - vm_pageout_cache_evicted += pages_evicted; + vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted; VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE, - vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0); + vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0); memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE); /* @@ -2307,6 +2159,8 @@ vm_pageout_scan(void) if (cache_evict_throttle) cache_evict_throttle--; + divisor = vm_pageout_state.vm_page_filecache_min_divisor; + #if CONFIG_JETSAM /* * don't let the filecache_min fall below 15% of available memory @@ -2319,22 +2173,24 @@ vm_pageout_scan(void) * throttled queue (which isn't counted as available) which * effectively disables this filter */ - if (vm_compressor_low_on_space()) - vm_page_filecache_min = 0; + if (vm_compressor_low_on_space() || divisor == 0) + vm_pageout_state.vm_page_filecache_min = 0; else - vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 7); + vm_pageout_state.vm_page_filecache_min = + ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; #else - if (vm_compressor_out_of_space()) - vm_page_filecache_min = 0; + if (vm_compressor_out_of_space() || divisor == 0) + vm_pageout_state.vm_page_filecache_min = 0; else { /* - * don't let the filecache_min fall below 33% of available memory... 
+ * don't let the filecache_min fall below the specified critical level */ - vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3); + vm_pageout_state.vm_page_filecache_min = + ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; } #endif if (vm_page_free_count < (vm_page_free_reserved / 4)) - vm_page_filecache_min = 0; + vm_pageout_state.vm_page_filecache_min = 0; exceeded_burst_throttle = FALSE; /* @@ -2346,28 +2202,23 @@ vm_pageout_scan(void) */ if (vm_page_queue_empty(&vm_page_queue_inactive) && vm_page_queue_empty(&vm_page_queue_anonymous) && + vm_page_queue_empty(&vm_page_queue_cleaned) && vm_page_queue_empty(&sq->age_q)) { - vm_pageout_scan_empty_throttle++; - msecs = vm_pageout_empty_wait; + VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1); + msecs = vm_pageout_state.vm_pageout_empty_wait; goto vm_pageout_scan_delay; - } else if (inactive_burst_count >= - MIN(vm_pageout_burst_inactive_throttle, + } else if (inactive_burst_count >= + MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle, (vm_page_inactive_count + vm_page_speculative_count))) { - vm_pageout_scan_burst_throttle++; - msecs = vm_pageout_burst_wait; + VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1); + msecs = vm_pageout_state.vm_pageout_burst_wait; exceeded_burst_throttle = TRUE; goto vm_pageout_scan_delay; - } else if (vm_page_free_count > (vm_page_free_reserved / 4) && - VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) { - vm_pageout_scan_swap_throttle++; - msecs = vm_pageout_swap_wait; - goto vm_pageout_scan_delay; - - } else if (VM_PAGE_Q_THROTTLED(iq) && + } else if (VM_PAGE_Q_THROTTLED(iq) && VM_DYNAMIC_PAGING_ENABLED()) { clock_sec_t sec; clock_nsec_t nsec; @@ -2375,36 +2226,36 @@ vm_pageout_scan(void) switch (flow_control.state) { case FCS_IDLE: - if ((vm_page_free_count + local_freed) < vm_page_free_target) { + if ((vm_page_free_count + local_freed) < vm_page_free_target && + vm_pageout_state.vm_restricted_to_single_processor == FALSE) { + /* + * since the compressor is running 
independently of vm_pageout_scan + * let's not wait for it just yet... as long as we have a healthy supply + * of filecache pages to work with, let's keep stealing those. + */ + inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, - VM_PAGEOUT_PB_THREAD_YIELD); - if (!VM_PAGE_Q_THROTTLED(iq)) { - vm_pageout_scan_yield_unthrottled++; - continue; - } - if (vm_page_pageable_external_count > vm_page_filecache_min && - !vm_page_queue_empty(&vm_page_queue_inactive)) { - anons_grabbed = ANONS_GRABBED_LIMIT; - vm_pageout_scan_throttle_deferred++; + if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min && + (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { + anons_grabbed = ANONS_GRABBED_LIMIT; + VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1); goto consider_inactive; } - if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && vm_page_active_count) - continue; } reset_deadlock_timer: - ts.tv_sec = vm_pageout_deadlock_wait / 1000; - ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; + ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000; + ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; clock_get_system_nanotime(&sec, &nsec); flow_control.ts.tv_sec = (unsigned int) sec; flow_control.ts.tv_nsec = nsec; ADD_MACH_TIMESPEC(&flow_control.ts, &ts); - + flow_control.state = FCS_DELAYED; - msecs = vm_pageout_deadlock_wait; + msecs = vm_pageout_state.vm_pageout_deadlock_wait; + vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++; break; - + case FCS_DELAYED: clock_get_system_nanotime(&sec, &nsec); ts.tv_sec = (unsigned int) sec; @@ -2413,7 +2264,7 @@ vm_pageout_scan(void) if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) { /* * the pageout thread for the default pager is potentially - * deadlocked 
since the + * deadlocked since the * default pager queue has been throttled for more than the * allowable time... we need to move some clean pages or dirty * pages belonging to the external pagers if they aren't throttled @@ -2421,14 +2272,15 @@ vm_pageout_scan(void) * blocked waiting for pages... we'll move one page for each of * these plus a fixed amount to break the logjam... once we're done * moving this number of pages, we'll re-enter the FSC_DELAYED state - * with a new timeout target since we have no way of knowing + * with a new timeout target since we have no way of knowing * whether we've broken the deadlock except through observation * of the queue associated with the default pager... we need to * stop moving pages and allow the system to run to see what * state it settles into. */ - vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged; - vm_pageout_scan_deadlock_detected++; + vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief + + vm_page_free_wanted + vm_page_free_wanted_privileged; + VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1); flow_control.state = FCS_DEADLOCK_DETECTED; thread_wakeup((event_t) &vm_pageout_garbage_collect); goto consider_inactive; @@ -2439,7 +2291,7 @@ vm_pageout_scan(void) * awakened immediately upon a laundry completion, * so we won't wait any longer than necessary */ - msecs = vm_pageout_idle_wait; + msecs = vm_pageout_state.vm_pageout_idle_wait; break; case FCS_DEADLOCK_DETECTED: @@ -2449,28 +2301,24 @@ vm_pageout_scan(void) } vm_pageout_scan_delay: - vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, + vm_pageout_scan_wants_object = VM_OBJECT_NULL; + + vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); - if (flow_control.state == FCS_DELAYED && - !VM_PAGE_Q_THROTTLED(iq)) { - flow_control.state = FCS_IDLE; - goto consider_inactive; 
- } - if (vm_page_free_count >= vm_page_free_target) { /* * we're here because * 1) someone else freed up some pages while we had * the queues unlocked above - * and we've hit one of the 3 conditions that + * and we've hit one of the 3 conditions that * cause us to pause the pageout scan thread * * since we already have enough free pages, * let's avoid stalling and return normally * * before we return, make sure the pageout I/O threads - * are running throttled in case there are still requests + * are running throttled in case there are still requests * in the laundry... since we have enough free pages * we don't need the laundry to be cleaned in a timely * fashion... so let's avoid interfering with foreground @@ -2493,7 +2341,7 @@ vm_pageout_scan(void) goto return_from_scan; } lck_mtx_unlock(&vm_page_queue_free_lock); - + if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) { /* * we're most likely about to block due to one of @@ -2523,20 +2371,26 @@ vm_pageout_scan(void) flow_control.state = FCS_IDLE; goto consider_inactive; } + if (flow_control.state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) { + flow_control.state = FCS_IDLE; + goto consider_inactive; + } + VM_CHECK_MEMORYSTATUS; if (flow_control.state != FCS_IDLE) - vm_pageout_scan_throttle++; - iq->pgo_throttled = TRUE; + VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1); + iq->pgo_throttled = TRUE; assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC); + counter(c_vm_pageout_scan_block++); vm_page_unlock_queues(); assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); - VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, + VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START); @@ -2561,18 +2415,16 @@ vm_pageout_scan(void) flow_control.state = FCS_IDLE; consider_inactive: - 
vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count), + vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count), vm_pageout_inactive_external_forced_reactivate_limit); loop_count++; inactive_burst_count++; - vm_pageout_inactive++; - + vm_pageout_state.vm_pageout_inactive++; /* * Choose a victim. */ while (1) { - uint32_t inactive_external_count; #if CONFIG_BACKGROUND_QUEUE page_from_bg_q = FALSE; @@ -2580,7 +2432,7 @@ vm_pageout_scan(void) m = NULL; m_object = VM_OBJECT_NULL; - + if (VM_DYNAMIC_PAGING_ENABLED()) { assert(vm_page_throttled_count == 0); assert(vm_page_queue_empty(&vm_page_queue_throttled)); @@ -2593,9 +2445,9 @@ vm_pageout_scan(void) */ if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); - - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); - + + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); + break; } @@ -2606,9 +2458,9 @@ vm_pageout_scan(void) if (!vm_page_queue_empty(&sq->age_q)) { m = (vm_page_t) vm_page_queue_first(&sq->age_q); - assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q); + assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); - if (!m->dirty || force_anonymous == FALSE) + if (!m->vmp_dirty || force_anonymous == FALSE) break; else m = NULL; @@ -2636,35 +2488,35 @@ vm_pageout_scan(void) } else if (force_anonymous == FALSE || bg_m_object->internal) { if (bg_m_object->internal && - ((vm_compressor_out_of_space() == TRUE) || - (vm_page_free_count < (vm_page_free_reserved / 4)))) { - - vm_pageout_skipped_bq_internal++; + (VM_PAGE_Q_THROTTLED(iq) || + vm_compressor_out_of_space() == TRUE || + vm_page_free_count < (vm_page_free_reserved / 4))) { + + vm_pageout_skipped_bq_internal++; } else { page_from_bg_q = TRUE; - + if (bg_m_object->internal) - vm_pageout_considered_bq_internal++; + vm_pageout_vminfo.vm_pageout_considered_bq_internal++; else - 
vm_pageout_considered_bq_external++; - + vm_pageout_vminfo.vm_pageout_considered_bq_external++; break; } } } #endif - - grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min); inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; - if ((vm_page_pageable_external_count < vm_page_filecache_min || force_anonymous == TRUE) || - ((inactive_external_count < vm_page_anonymous_count) && (inactive_external_count < (vm_page_pageable_external_count / 3)))) { + if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) || + (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { grab_anonymous = TRUE; anons_grabbed = 0; - vm_pageout_skipped_external++; + vm_pageout_vminfo.vm_pageout_skipped_external++; goto want_anonymous; } + grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min); + #if CONFIG_JETSAM /* If the file-backed pool has accumulated * significantly more pages than the jetsam @@ -2677,22 +2529,19 @@ vm_pageout_scan(void) */ if (grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) { if (vm_page_pageable_external_count > - vm_page_filecache_min) { + vm_pageout_state.vm_page_filecache_min) { if ((vm_page_pageable_external_count * vm_pageout_memorystatus_fb_factor_dr) > (memorystatus_available_pages_critical * vm_pageout_memorystatus_fb_factor_nr)) { grab_anonymous = FALSE; -#if DEVELOPMENT || DEBUG - vm_grab_anon_overrides++; -#endif + + VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1); } } -#if DEVELOPMENT || DEBUG if (grab_anonymous) { - vm_grab_anon_nops++; + VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1); } -#endif } #endif /* CONFIG_JETSAM */ @@ -2701,18 +2550,23 @@ vm_pageout_scan(void) if ( !vm_page_queue_empty(&vm_page_queue_inactive) ) { m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); - - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); + + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); anons_grabbed = 0; - if 
(vm_page_pageable_external_count < vm_page_filecache_min) { - if ((++reactivated_this_call % 100)) - goto must_activate_page; - /* - * steal 1% of the file backed pages even if - * we are under the limit that has been set - * for a healthy filecache - */ + if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) { + + if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) { + if ((++reactivated_this_call % 100)) { + vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++; + goto must_activate_page; + } + /* + * steal 1% of the file backed pages even if + * we are under the limit that has been set + * for a healthy filecache + */ + } } break; } @@ -2720,7 +2574,7 @@ vm_pageout_scan(void) if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) { m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); anons_grabbed++; break; @@ -2729,16 +2583,13 @@ vm_pageout_scan(void) /* * if we've gotten here, we have no victim page. * check to see if we've not finished balancing the queues - * or we have a page on the aged speculative queue that we + * or we have a page on the aged speculative queue that we * skipped due to force_anonymous == TRUE.. or we have * speculative pages that we can prematurely age... 
if * one of these cases we'll keep going, else panic */ force_anonymous = FALSE; - vm_pageout_no_victim++; - - if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) - goto done_with_inactivepage; + VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1); if (!vm_page_queue_empty(&sq->age_q)) goto done_with_inactivepage; @@ -2748,14 +2599,14 @@ vm_pageout_scan(void) goto done_with_inactivepage; } panic("vm_pageout: no victim"); - + /* NOTREACHED */ } assert(VM_PAGE_PAGEABLE(m)); m_object = VM_PAGE_OBJECT(m); force_anonymous = FALSE; - - page_prev_q_state = m->vm_page_q_state; + + page_prev_q_state = m->vmp_q_state; /* * we just found this page on one of our queues... * it can't also be on the pageout queue, so safe @@ -2763,14 +2614,13 @@ vm_pageout_scan(void) */ vm_page_queues_remove(m, TRUE); - assert(!m->laundry); - assert(!m->private); - assert(!m->fictitious); + assert(!m->vmp_laundry); + assert(!m->vmp_private); + assert(!m->vmp_fictitious); assert(m_object != kernel_object); assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); - vm_pageout_stats[vm_pageout_stat_now].considered++; - vm_pageout_considered_page++; + vm_pageout_vminfo.vm_pageout_considered_page++; DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); @@ -2781,14 +2631,13 @@ vm_pageout_scan(void) */ if (m_object != object) { /* - * the object associated with candidate page is + * the object associated with candidate page is * different from the one we were just working * with... 
dump the lock if we still own it */ if (object != NULL) { vm_object_unlock(object); object = NULL; - vm_pageout_scan_wants_object = VM_OBJECT_NULL; } /* * Try to lock object; since we've alread got the @@ -2803,86 +2652,74 @@ vm_pageout_scan(void) if (!vm_object_lock_try_scan(m_object)) { vm_page_t m_want = NULL; - vm_pageout_inactive_nolock++; + vm_pageout_vminfo.vm_pageout_inactive_nolock++; if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) - vm_pageout_cleaned_nolock++; + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1); pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); - m->reference = FALSE; - -#if !CONFIG_EMBEDDED - /* - * m->object must be stable since we hold the page queues lock... - * we can update the scan_collisions field sans the object lock - * since it is a separate field and this is the only spot that does - * a read-modify-write operation and it is never executed concurrently... - * we can asynchronously set this field to 0 when creating a UPL, so it - * is possible for the value to be a bit non-determistic, but that's ok - * since it's only used as a hint - */ - /* - * This is not used on EMBEDDED because having this variable set *could* lead - * us to self-cannibalize pages from m_object to fill a UPL for a pagein. - * And, there's a high probability that the object that vm_pageout_scan - * wants and collides on is a very popular object e.g. the shared cache on EMBEDDED. - * The older pages that we cannibalize from the shared cache could be really - * important text pages e.g. the system call stubs. - */ - m_object->scan_collisions = 1; -#endif /* !CONFIG_EMBEDDED */ + m->vmp_reference = FALSE; - if ( !vm_page_queue_empty(&sq->age_q) ) - m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); - else if ( !vm_page_queue_empty(&vm_page_queue_cleaned)) + if ( !m_object->object_is_shared_cache) { + /* + * don't apply this optimization if this is the shared cache + * object, it's too easy to get rid of very hot and important + * pages... 
+ * m->vmp_object must be stable since we hold the page queues lock... + * we can update the scan_collisions field sans the object lock + * since it is a separate field and this is the only spot that does + * a read-modify-write operation and it is never executed concurrently... + * we can asynchronously set this field to 0 when creating a UPL, so it + * is possible for the value to be a bit non-deterministic, but that's ok + * since it's only used as a hint + */ + m_object->scan_collisions = 1; + } + if ( !vm_page_queue_empty(&vm_page_queue_cleaned)) m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); - else if ( !vm_page_queue_empty(&vm_page_queue_inactive) && - (anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous))) - m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); + else if ( !vm_page_queue_empty(&sq->age_q)) + m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); + else if ( (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || + vm_page_queue_empty(&vm_page_queue_anonymous)) && + !vm_page_queue_empty(&vm_page_queue_inactive)) + m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); else if ( !vm_page_queue_empty(&vm_page_queue_anonymous)) m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); /* * this is the next object we're going to be interested in - * try to make sure its available after the mutex_yield + * try to make sure it's available after the mutex_pause * returns control */ if (m_want) vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want); - /* - * force us to dump any collected free pages - * and to pause before moving on - */ - try_failed = TRUE; - goto requeue_page; } object = m_object; vm_pageout_scan_wants_object = VM_OBJECT_NULL; - - try_failed = FALSE; } assert(m_object == object); assert(VM_PAGE_OBJECT(m) == m_object); - if (m->busy) { + if (m->vmp_busy) { /* * Somebody is already playing with this page. 
* Put it back on the appropriate queue * */ - vm_pageout_inactive_busy++; + VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1); if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) - vm_pageout_cleaned_busy++; + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1); requeue_page: if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) vm_page_enqueue_inactive(m, FALSE); else vm_page_activate(m); #if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG if (page_from_bg_q == TRUE) { if (m_object->internal) vm_pageout_rejected_bq_internal++; @@ -2890,6 +2727,31 @@ vm_pageout_scan(void) vm_pageout_rejected_bq_external++; } #endif +#endif + goto done_with_inactivepage; + } + + /* + * if (m->vmp_cleaning && !m->vmp_free_when_done) + * If already cleaning this page in place + * just leave if off the paging queues. + * We can leave the page mapped, and upl_commit_range + * will put it on the clean queue. + * + * if (m->vmp_free_when_done && !m->vmp_cleaning) + * an msync INVALIDATE is in progress... + * this page has been marked for destruction + * after it has been cleaned, + * but not yet gathered into a UPL + * where 'cleaning' will be set... + * just leave it off the paging queues + * + * if (m->vmp_free_when_done && m->vmp_clenaing) + * an msync INVALIDATE is in progress + * and the UPL has already gathered this page... 
+ * just leave it off the paging queues + */ + if (m->vmp_free_when_done || m->vmp_cleaning) { goto done_with_inactivepage; } @@ -2901,17 +2763,17 @@ vm_pageout_scan(void) * from reclaiming it - busy or cleaning - that we've already * dealt with */ - if (m->absent || m->error || !object->alive) { + if (m->vmp_absent || m->vmp_error || !object->alive) { - if (m->absent) - vm_pageout_inactive_absent++; + if (m->vmp_absent) + VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1); else if (!object->alive) - vm_pageout_inactive_notalive++; + VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1); else - vm_pageout_inactive_error++; -reclaim_page: + VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1); +reclaim_page: if (vm_pageout_deadlock_target) { - vm_pageout_scan_inactive_throttle_success++; + VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1); vm_pageout_deadlock_target--; } @@ -2922,10 +2784,16 @@ vm_pageout_scan(void) } else { DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL); } - assert(!m->cleaning); - assert(!m->laundry); + assert(!m->vmp_cleaning); + assert(!m->vmp_laundry); + + if (!object->internal && + object->pager != NULL && + object->pager->mo_pager_ops == &shared_region_pager_ops) { + shared_region_pager_reclaimed++; + } - m->busy = TRUE; + m->vmp_busy = TRUE; /* * remove page from object here since we're already @@ -2933,42 +2801,43 @@ vm_pageout_scan(void) * we'd normally do in vm_page_free_prepare_object * until 'vm_page_free_list' is called */ - if (m->tabled) + if (m->vmp_tabled) vm_page_remove(m, TRUE); - assert(m->pageq.next == 0 && m->pageq.prev == 0); - m->snext = local_freeq; + assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); + m->vmp_snext = local_freeq; local_freeq = m; local_freed++; - + if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) - vm_pageout_freed_from_speculative++; + vm_pageout_vminfo.vm_pageout_freed_speculative++; else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) - vm_pageout_freed_from_cleaned++; + 
vm_pageout_vminfo.vm_pageout_freed_cleaned++; + else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) + vm_pageout_vminfo.vm_pageout_freed_internal++; else - vm_pageout_freed_from_inactive_clean++; - - vm_pageout_stats[vm_pageout_stat_now].reclaimed_clean++; + vm_pageout_vminfo.vm_pageout_freed_external++; inactive_burst_count = 0; goto done_with_inactivepage; } - /* - * If the object is empty, the page must be reclaimed even - * if dirty or used. - * If the page belongs to a volatile object, we stick it back - * on. - */ if (object->copy == VM_OBJECT_NULL) { + /* + * No one else can have any interest in this page. + * If this is an empty purgable object, the page can be + * reclaimed even if dirty. + * If the page belongs to a volatile purgable object, we + * reactivate it if the compressor isn't active. + */ if (object->purgable == VM_PURGABLE_EMPTY) { - if (m->pmapped == TRUE) { + if (m->vmp_pmapped == TRUE) { /* unmap the page */ refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } } - if (m->dirty || m->precious) { + if (m->vmp_dirty || m->vmp_precious) { /* we saved the cost of cleaning this page ! 
*/ vm_page_purged_count++; } @@ -2997,7 +2866,7 @@ vm_pageout_scan(void) reactivated_this_call++; if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) - vm_pageout_cleaned_volatile_reactivated++; + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1); goto reactivate_page; } @@ -3010,66 +2879,33 @@ vm_pageout_scan(void) */ refmod_state = -1; - if (m->reference == FALSE && m->pmapped == TRUE) { + if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) { refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); - + if (refmod_state & VM_MEM_REFERENCED) - m->reference = TRUE; + m->vmp_reference = TRUE; if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } } - - /* - * if (m->cleaning && !m->free_when_done) - * If already cleaning this page in place and it hasn't - * been recently referenced, just pull off the queue. - * We can leave the page mapped, and upl_commit_range - * will put it on the clean queue. - * - * if (m->free_when_done && !m->cleaning) - * an msync INVALIDATE is in progress... - * this page has been marked for destruction - * after it has been cleaned, - * but not yet gathered into a UPL - * where 'cleaning' will be set... - * just leave it off the paging queues - * - * if (m->free_when_done && m->clenaing) - * an msync INVALIDATE is in progress - * and the UPL has already gathered this page... - * just leave it off the paging queues - */ - - /* - * page with m->free_when_done and still on the queues means that an - * MS_INVALIDATE is in progress on this page... leave it alone - */ - if (m->free_when_done) { - goto done_with_inactivepage; - } - - /* if cleaning, reactivate if referenced. 
otherwise, just pull off queue */ - if (m->cleaning) { - if (m->reference == TRUE) { - reactivated_this_call++; - goto reactivate_page; - } else { - goto done_with_inactivepage; - } - } - if (m->reference || m->dirty) { + if (m->vmp_reference || m->vmp_dirty) { /* deal with a rogue "reusable" page */ VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object); } + divisor = vm_pageout_state.vm_page_xpmapped_min_divisor; - if (!m->no_cache && + if (divisor == 0) + vm_pageout_state.vm_page_xpmapped_min = 0; + else + vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / divisor; + + if (!m->vmp_no_cache && #if CONFIG_BACKGROUND_QUEUE page_from_bg_q == FALSE && #endif - (m->reference || - (m->xpmapped && !object->internal && (vm_page_xpmapped_external_count < (vm_page_external_count / 4))))) { + (m->vmp_reference || (m->vmp_xpmapped && !object->internal && + (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) { /* * The page we pulled off the inactive list has * been referenced. It is possible for other @@ -3079,14 +2915,16 @@ vm_pageout_scan(void) * reactivations. 
*/ if (++reactivated_this_call >= reactivate_limit) { - vm_pageout_reactivation_limit_exceeded++; + vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++; } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) { - vm_pageout_inactive_force_reclaim++; + vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++; } else { uint32_t isinuse; if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) - vm_pageout_cleaned_reference_reactivated++; + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1); + + vm_pageout_vminfo.vm_pageout_inactive_referenced++; reactivate_page: if ( !object->internal && object->pager != MEMORY_OBJECT_NULL && vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) { @@ -3095,7 +2933,7 @@ vm_pageout_scan(void) * and it's not open via the filesystem */ vm_page_deactivate(m); - vm_pageout_inactive_deactivated++; + VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1); } else { must_activate_page: /* @@ -3106,25 +2944,27 @@ vm_pageout_scan(void) inactive_burst_count = 0; } #if CONFIG_BACKGROUND_QUEUE +#if DEVELOPMENT || DEBUG if (page_from_bg_q == TRUE) { if (m_object->internal) vm_pageout_rejected_bq_internal++; else vm_pageout_rejected_bq_external++; } +#endif #endif if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) - vm_pageout_cleaned_reactivated++; - vm_pageout_inactive_used++; + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); + vm_pageout_state.vm_pageout_inactive_used++; goto done_with_inactivepage; } - /* + /* * Make sure we call pmap_get_refmod() if it * wasn't already called just above, to update * the dirty bit. 
*/ - if ((refmod_state == -1) && !m->dirty && m->pmapped) { + if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) { refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); @@ -3134,18 +2974,18 @@ vm_pageout_scan(void) XPR(XPR_VM_PAGEOUT, "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n", - object, m->offset, m, 0,0); + object, m->vmp_offset, m, 0,0); /* * we've got a candidate page to steal... * - * m->dirty is up to date courtesy of the - * preceding check for m->reference... if - * we get here, then m->reference had to be + * m->vmp_dirty is up to date courtesy of the + * preceding check for m->vmp_reference... if + * we get here, then m->vmp_reference had to be * FALSE (or possibly "reactivate_limit" was * exceeded), but in either case we called * pmap_get_refmod() and updated both - * m->reference and m->dirty + * m->vmp_reference and m->vmp_dirty * * if it's dirty or precious we need to * see if the target queue is throtttled @@ -3155,7 +2995,7 @@ vm_pageout_scan(void) inactive_throttled = FALSE; - if (m->dirty || m->precious) { + if (m->vmp_dirty || m->vmp_precious) { if (object->internal) { if (VM_PAGE_Q_THROTTLED(iq)) inactive_throttled = TRUE; @@ -3165,18 +3005,18 @@ vm_pageout_scan(void) } throttle_inactive: if (!VM_DYNAMIC_PAGING_ENABLED() && - object->internal && m->dirty && + object->internal && m->vmp_dirty && (object->purgable == VM_PURGABLE_DENY || object->purgable == VM_PURGABLE_NONVOLATILE || object->purgable == VM_PURGABLE_VOLATILE)) { vm_page_check_pageable_safe(m); - assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); vm_page_queue_enter(&vm_page_queue_throttled, m, - vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; + vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; - vm_pageout_scan_reclaimed_throttled++; + 
VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1); inactive_burst_count = 0; goto done_with_inactivepage; @@ -3209,12 +3049,12 @@ vm_pageout_scan(void) * that we can try to find clean pages in the active/inactive queues before * deciding to jetsam a process */ - vm_pageout_scan_inactive_throttled_external++; + vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++; vm_page_check_pageable_safe(m); - assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); - vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); + vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; vm_page_active_count++; vm_page_pageable_external_count++; @@ -3238,13 +3078,13 @@ vm_pageout_scan(void) /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */ if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) { - vm_pageout_inactive_external_forced_jetsam_count++; + VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1); } - + VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); - vm_page_lock_queues(); + vm_page_lock_queues(); delayed_unlock = 1; } #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ @@ -3253,7 +3093,6 @@ vm_pageout_scan(void) inactive_burst_count = 0; goto done_with_inactivepage; } else { - vm_pageout_scan_inactive_throttled_internal++; goto must_activate_page; } } @@ -3264,17 +3103,17 @@ vm_pageout_scan(void) * we have the up-to-date modified state * * if we need to do a pmap_disconnect then we - * need to re-evaluate m->dirty since the pmap_disconnect - * provides the true state atomically... the + * need to re-evaluate m->vmp_dirty since the pmap_disconnect + * provides the true state atomically... 
the * page was still mapped up to the pmap_disconnect * and may have been dirtied at the last microsecond * * Note that if 'pmapped' is FALSE then the page is not * and has not been in any map, so there is no point calling - * pmap_disconnect(). m->dirty could have been set in anticipation + * pmap_disconnect(). m->vmp_dirty could have been set in anticipation * of likely usage of the page. */ - if (m->pmapped == TRUE) { + if (m->vmp_pmapped == TRUE) { int pmap_options; /* @@ -3291,7 +3130,7 @@ vm_pageout_scan(void) if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE || object->internal == FALSE) { pmap_options = 0; - } else if (m->dirty || m->precious) { + } else if (m->vmp_dirty || m->vmp_precious) { /* * VM knows that this page is dirty (or * precious) and needs to be compressed @@ -3319,8 +3158,9 @@ vm_pageout_scan(void) SET_PAGE_DIRTY(m, FALSE); } } + /* - * reset our count of pages that have been reclaimed + * reset our count of pages that have been reclaimed * since the last page was 'stolen' */ inactive_reclaim_run = 0; @@ -3328,18 +3168,10 @@ vm_pageout_scan(void) /* * If it's clean and not precious, we can free the page. */ - if (!m->dirty && !m->precious) { + if (!m->vmp_dirty && !m->vmp_precious) { - if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) - vm_pageout_speculative_clean++; - else { - if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) - vm_pageout_inactive_anonymous++; - else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) - vm_pageout_cleaned_reclaimed++; + vm_pageout_state.vm_pageout_inactive_clean++; - vm_pageout_inactive_clean++; - } /* * OK, at this point we have found a page we are going to free. 
*/ @@ -3365,7 +3197,7 @@ vm_pageout_scan(void) if (inactive_throttled == TRUE) goto throttle_inactive; - + #if VM_PRESSURE_EVENTS #if CONFIG_JETSAM @@ -3377,40 +3209,43 @@ vm_pageout_scan(void) */ #else /* CONFIG_JETSAM */ - + vm_pressure_response(); #endif /* CONFIG_JETSAM */ #endif /* VM_PRESSURE_EVENTS */ - + if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) - vm_pageout_speculative_dirty++; - else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) - vm_pageout_inactive_anonymous++; + VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1); if (object->internal) - vm_pageout_inactive_dirty_internal++; + vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++; else - vm_pageout_inactive_dirty_external++; + vm_pageout_vminfo.vm_pageout_inactive_dirty_external++; /* - * do NOT set the pageout bit! - * sure, we might need free pages, but this page is going to take time to become free - * anyway, so we may as well put it on the clean queue first and take it from there later - * if necessary. that way, we'll ensure we don't free up too much. -mj + * internal pages will go to the compressor... 
+ * external pages will go to the appropriate pager to be cleaned + * and upon completion will end up on 'vm_page_queue_cleaned' which + * is a preferred queue to steal from */ vm_pageout_cluster(m); + inactive_burst_count = 0; done_with_inactivepage: - if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) { + if (delayed_unlock++ > delayed_unlock_limit) { + int freed = local_freed; vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); - if (try_failed == TRUE) - lck_mtx_yield(&vm_page_queue_lock); + if (freed == 0) + lck_mtx_yield(&vm_page_queue_lock); + } else if (vm_pageout_scan_wants_object) { + vm_page_unlock_queues(); + mutex_pause(0); + vm_page_lock_queues(); } - /* * back to top of pageout scan loop */ @@ -3418,8 +3253,6 @@ vm_pageout_scan(void) } -int vm_page_free_count_init; - void vm_page_free_reserve( int pages) @@ -3439,7 +3272,7 @@ vm_page_free_reserve( else vm_page_free_reserved += pages; } - free_after_reserve = vm_page_free_count_init - vm_page_free_reserved; + free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved; vm_page_free_min = vm_page_free_reserved + VM_PAGE_FREE_MIN(free_after_reserve); @@ -3467,7 +3300,7 @@ void vm_pageout_continue(void) { DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); - vm_pageout_scan_event_counter++; + VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1); #if !CONFIG_EMBEDDED lck_mtx_lock(&vm_page_queue_free_lock); @@ -3530,11 +3363,11 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) vm_object_offset_t offset; memory_object_t pager; - /* On systems without a compressor, the external IO thread clears its + /* On systems with a compressor, the external IO thread clears its * VM privileged bit to accommodate large allocations (e.g. 
bulk UPL * creation) */ - if (vm_pageout_internal_iothread != THREAD_NULL) + if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) current_thread()->options &= ~TH_OPT_VMPRIV; vm_page_lockspin_queues(); @@ -3542,9 +3375,9 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) while ( !vm_page_queue_empty(&q->pgo_pending) ) { q->pgo_busy = TRUE; - vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); + vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, vmp_pageq); - assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q); + assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q); VM_PAGE_CHECK(m); /* * grab a snapshot of the object and offset this @@ -3556,12 +3389,9 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) * on this object which will keep it from terminating */ object = VM_PAGE_OBJECT(m); - offset = m->offset; + offset = m->vmp_offset; - if (object->object_slid) { - panic("slid page %p not allowed on this path\n", m); - } - m->vm_page_q_state = VM_PAGE_NOT_ON_Q; + m->vmp_q_state = VM_PAGE_NOT_ON_Q; VM_PAGE_ZERO_PAGEQ_ENTRY(m); vm_page_unlock_queues(); @@ -3571,7 +3401,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) m = vm_page_lookup(object, offset); if (m == NULL || - m->busy || m->cleaning || !m->laundry || (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { + m->vmp_busy || m->vmp_cleaning || !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) { /* * it's either the same page that someone else has * started cleaning (or it's finished cleaning or @@ -3595,7 +3425,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) * memory_object_destroy or vm_object_destroy, and * so there is nowhere for the page to go. */ - if (m->free_when_done) { + if (m->vmp_free_when_done) { /* * Just free the page... VM_PAGE_FREE takes * care of cleaning up all the state... 
@@ -3607,7 +3437,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) vm_pageout_throttle_up(m); vm_page_activate(m); - + vm_page_unlock_queues(); /* @@ -3643,7 +3473,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q) * any pageout clustering happens there */ memory_object_data_return(pager, - m->offset + object->paging_offset, + m->vmp_offset + object->paging_offset, PAGE_SIZE, NULL, NULL, @@ -3676,10 +3506,6 @@ uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by */ -#if DEVELOPMENT || DEBUG -uint64_t compressor_epoch_start, compressor_epoch_stop, compressor_threads_runtime; -#endif - void vm_pageout_iothread_internal_continue(struct cq *); void @@ -3693,14 +3519,14 @@ vm_pageout_iothread_internal_continue(struct cq *cq) vm_page_t local_freeq = NULL; int local_freed = 0; int local_batch_size; - int ncomps = 0; #if DEVELOPMENT || DEBUG + int ncomps = 0; boolean_t marked_active = FALSE; #endif KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0); q = cq->q; - local_batch_size = q->pgo_maxlaundry / (vm_compressor_thread_count * 2); + local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2); #if RECORD_THE_COMPRESSED_DATA if (q->pgo_laundry) @@ -3721,7 +3547,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq) vmct_state[cq->id] = VMCT_ACTIVE; marked_active = TRUE; if (vmct_active == 1) { - compressor_epoch_start = mach_absolute_time(); + vm_compressor_epoch_start = mach_absolute_time(); } } #endif @@ -3729,17 +3555,17 @@ vm_pageout_iothread_internal_continue(struct cq *cq) KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0); - while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) { + while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) { - vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq); - assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q); + 
vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, vmp_pageq); + assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q); VM_PAGE_CHECK(m); - - m->vm_page_q_state = VM_PAGE_NOT_ON_Q; + + m->vmp_q_state = VM_PAGE_NOT_ON_Q; VM_PAGE_ZERO_PAGEQ_ENTRY(m); - m->laundry = FALSE; + m->vmp_laundry = FALSE; - m->snext = local_q; + m->vmp_snext = local_q; local_q = m; local_cnt++; } @@ -3757,7 +3583,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq) vm_page_unlock_queues(); #if !RECORD_THE_COMPRESSED_DATA - if (pages_left_on_q >= local_batch_size && cq->id < (vm_compressor_thread_count - 1)) { + if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) { thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1)); } #endif @@ -3768,19 +3594,25 @@ vm_pageout_iothread_internal_continue(struct cq *cq) KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0); m = local_q; - local_q = m->snext; - m->snext = NULL; + local_q = m->vmp_snext; + m->vmp_snext = NULL; - if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m, FALSE) == KERN_SUCCESS) { + if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m) == KERN_SUCCESS) { +#if DEVELOPMENT || DEBUG ncomps++; - m->snext = local_freeq; +#endif + KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0); + + m->vmp_snext = local_freeq; local_freeq = m; local_freed++; if (local_freed >= MAX_FREE_BATCH) { - vm_pageout_freed_after_compression += local_freed; + + OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); vm_page_free_list(local_freeq, TRUE); + local_freeq = NULL; local_freed = 0; } @@ -3791,7 +3623,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq) int need_wakeup = 0; if (local_freeq) { - vm_pageout_freed_after_compression += local_freed; + OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); vm_page_free_list(local_freeq, TRUE); local_freeq = NULL; @@ -3821,7 +3653,7 @@ 
vm_pageout_iothread_internal_continue(struct cq *cq) #endif } if (local_freeq) { - vm_pageout_freed_after_compression += local_freed; + OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); vm_page_free_list(local_freeq, TRUE); local_freeq = NULL; @@ -3848,14 +3680,15 @@ vm_pageout_iothread_internal_continue(struct cq *cq) vmct_state[cq->id] = VMCT_IDLE; if (vmct_active == 0) { - compressor_epoch_stop = mach_absolute_time(); - assert(compressor_epoch_stop > compressor_epoch_start); + vm_compressor_epoch_stop = mach_absolute_time(); + assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start, + "Compressor epoch non-monotonic: 0x%llx -> 0x%llx", + vm_compressor_epoch_start, vm_compressor_epoch_stop); /* This interval includes intervals where one or more * compressor threads were pre-empted */ - vmct_stats.vmct_cthreads_total += compressor_epoch_stop - compressor_epoch_start; + vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start; } - } #endif vm_page_unlock_queues(); @@ -3881,7 +3714,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq) kern_return_t -vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, boolean_t object_locked_by_caller) +vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m) { vm_object_t object; memory_object_t pager; @@ -3890,16 +3723,13 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b object = VM_PAGE_OBJECT(m); - if (object->object_slid) { - panic("slid page %p not allowed on this path\n", m); - } - assert(!m->free_when_done); - assert(!m->laundry); + assert(!m->vmp_free_when_done); + assert(!m->vmp_laundry); pager = object->pager; - if (object_locked_by_caller == FALSE && (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)) { - + if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) { + KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0); vm_object_lock(object); @@ 
-3929,9 +3759,9 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b vm_page_lockspin_queues(); vm_page_activate(m); - vm_pageout_dirty_no_pager++; + VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1); vm_page_unlock_queues(); - + /* * And we are done with it. */ @@ -3941,36 +3771,31 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b return KERN_FAILURE; } vm_object_unlock(object); - + KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0); } assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL); - - if (object_locked_by_caller == FALSE) - assert(object->activity_in_progress > 0); + assert(object->activity_in_progress > 0); retval = vm_compressor_pager_put( pager, - m->offset + object->paging_offset, + m->vmp_offset + object->paging_offset, VM_PAGE_GET_PHYS_PAGE(m), current_chead, scratch_buf, &compressed_count_delta); - if (object_locked_by_caller == FALSE) { - vm_object_lock(object); + vm_object_lock(object); - assert(object->activity_in_progress > 0); - assert(VM_PAGE_OBJECT(m) == object); - } + assert(object->activity_in_progress > 0); + assert(VM_PAGE_OBJECT(m) == object); + assert( !VM_PAGE_WIRED(m)); vm_compressor_pager_count(pager, compressed_count_delta, FALSE, /* shared_lock */ object); - assert( !VM_PAGE_WIRED(m)); - if (retval == KERN_SUCCESS) { /* * If the object is purgeable, its owner's @@ -3979,15 +3804,16 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b * contributes to the owner's memory footprint, * so account for it as such. 
*/ - if (object->purgable != VM_PURGABLE_DENY && - object->vo_purgeable_owner != NULL) { - /* one more compressed purgeable page */ - vm_purgeable_compressed_update(object, - +1); + if ((object->purgable != VM_PURGABLE_DENY || + object->vo_ledger_tag) && + object->vo_owner != NULL) { + /* one more compressed purgeable/tagged page */ + vm_object_owner_compressed_update(object, + +1); } VM_STAT_INCR(compressions); - - if (m->tabled) + + if (m->vmp_tabled) vm_page_remove(m, TRUE); } else { @@ -3996,14 +3822,13 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b vm_page_lockspin_queues(); vm_page_activate(m); - vm_compressor_failed++; + vm_pageout_vminfo.vm_compressor_failed++; vm_page_unlock_queues(); } - if (object_locked_by_caller == FALSE) { - vm_object_activity_end(object); - vm_object_unlock(object); - } + vm_object_activity_end(object); + vm_object_unlock(object); + return retval; } @@ -4012,12 +3837,12 @@ static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority) { uint32_t policy; - + if (hibernate_cleaning_in_progress == TRUE) req_lowpriority = FALSE; if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) { - + vm_page_unlock_queues(); if (req_lowpriority == TRUE) { @@ -4078,7 +3903,7 @@ vm_pageout_iothread_internal(struct cq *cq) vm_page_unlock_queues(); - if (vm_restricted_to_single_processor == TRUE) + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) thread_vm_bind_group_add(); @@ -4092,7 +3917,7 @@ vm_pageout_iothread_internal(struct cq *cq) } kern_return_t -vm_set_buffer_cleanup_callout(boolean_t (*func)(int)) +vm_set_buffer_cleanup_callout(boolean_t (*func)(int)) { if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) { return KERN_SUCCESS; @@ -4141,7 +3966,7 @@ vm_pressure_response(void) if (memorystatus_manual_testing_on) { return; } - + old_level = memorystatus_vm_pressure_level; switch 
(memorystatus_vm_pressure_level) { @@ -4180,17 +4005,22 @@ vm_pressure_response(void) default: return; } - + if (new_level != -1) { memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level; - if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != new_level)) { - if (vm_pressure_thread_running == FALSE) { + if (new_level != old_level) { + VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE, + new_level, old_level, 0, 0); + } + + if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level)) { + if (vm_pageout_state.vm_pressure_thread_running == FALSE) { thread_wakeup(&vm_pressure_thread); } - if (old_level != new_level) { - thread_wakeup(&vm_pressure_changed); + if (old_level != memorystatus_vm_pressure_level) { + thread_wakeup(&vm_pageout_state.vm_pressure_changed); } } } @@ -4202,11 +4032,11 @@ kern_return_t mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) { #if CONFIG_EMBEDDED - + return KERN_FAILURE; #elif !VM_PRESSURE_EVENTS - + return KERN_FAILURE; #else /* VM_PRESSURE_EVENTS */ @@ -4221,7 +4051,7 @@ mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused un wait_result_t wr = 0; while (old_level == *pressure_level) { - wr = assert_wait((event_t) &vm_pressure_changed, + wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed, THREAD_INTERRUPTIBLE); if (wr == THREAD_WAITING) { wr = thread_block(THREAD_CONTINUE_NULL); @@ -4230,7 +4060,7 @@ mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused un return KERN_ABORTED; } if (wr == THREAD_AWAKENED) { - + old_level = memorystatus_vm_pressure_level; if (old_level != *pressure_level) { @@ -4256,11 +4086,12 @@ vm_pressure_thread(void) { static boolean_t thread_initialized = FALSE; if (thread_initialized == TRUE) { - vm_pressure_thread_running = TRUE; + vm_pageout_state.vm_pressure_thread_running = 
TRUE; consider_vm_pressure_events(); - vm_pressure_thread_running = FALSE; + vm_pageout_state.vm_pressure_thread_running = FALSE; } + thread_set_thread_name(current_thread(), "VM_pressure"); thread_initialized = TRUE; assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT); thread_block((thread_continue_t)vm_pressure_thread); @@ -4268,17 +4099,15 @@ vm_pressure_thread(void) { #endif /* VM_PRESSURE_EVENTS */ -uint32_t vm_pageout_considered_page_last = 0; - /* * called once per-second via "compute_averages" */ void compute_pageout_gc_throttle(__unused void *arg) { - if (vm_pageout_considered_page != vm_pageout_considered_page_last) { + if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) { - vm_pageout_considered_page_last = vm_pageout_considered_page; + vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page; thread_wakeup((event_t) &vm_pageout_garbage_collect); } @@ -4339,7 +4168,7 @@ vm_pageout_garbage_collect(int collect) stack_collect(); consider_machine_collect(); - m_drain(); + mbuf_drain(FALSE); do { if (consider_buffer_cache_collect != NULL) { @@ -4388,13 +4217,13 @@ vm_set_restrictions() if (hinfo.max_cpus <= 3) { /* - * on systems with a limited number of CPUS, bind the + * on systems with a limited number of CPUS, bind the * 4 major threads that can free memory and that tend to use * a fair bit of CPU under pressured conditions to a single processor. * This insures that these threads don't hog all of the available CPUs * (important for camera launch), while allowing them to run independently * w/r to locks... the 4 threads are - * vm_pageout_scan, vm_pageout_iothread_internal (compressor), + * vm_pageout_scan, vm_pageout_iothread_internal (compressor), * vm_compressor_swap_trigger_thread (minor and major compactions), * memorystatus_thread (jetsams). * @@ -4403,8 +4232,9 @@ vm_set_restrictions() * thread_bind_master... 
someday this should be replaced with a group * scheduling mechanism and KPI. */ - vm_restricted_to_single_processor = TRUE; - } + vm_pageout_state.vm_restricted_to_single_processor = TRUE; + } else + vm_pageout_state.vm_restricted_to_single_processor = FALSE; } void @@ -4428,7 +4258,7 @@ vm_pageout(void) if (!self->reserved_stack) self->reserved_stack = self->kernel_stack; - if (vm_restricted_to_single_processor == TRUE) + if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) thread_vm_bind_group_add(); splx(s); @@ -4439,43 +4269,63 @@ vm_pageout(void) * Initialize some paging parameters. */ - if (vm_pageout_swap_wait == 0) - vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT; - - if (vm_pageout_idle_wait == 0) - vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT; - - if (vm_pageout_burst_wait == 0) - vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT; - - if (vm_pageout_empty_wait == 0) - vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT; + vm_pageout_state.vm_pressure_thread_running = FALSE; + vm_pageout_state.vm_pressure_changed = FALSE; + vm_pageout_state.memorystatus_purge_on_warning = 2; + vm_pageout_state.memorystatus_purge_on_urgent = 5; + vm_pageout_state.memorystatus_purge_on_critical = 8; + vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS; + vm_pageout_state.vm_page_speculative_percentage = 5; + vm_pageout_state.vm_page_speculative_target = 0; + + vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL; + vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL; + + vm_pageout_state.vm_pageout_swap_wait = 0; + vm_pageout_state.vm_pageout_idle_wait = 0; + vm_pageout_state.vm_pageout_empty_wait = 0; + vm_pageout_state.vm_pageout_burst_wait = 0; + vm_pageout_state.vm_pageout_deadlock_wait = 0; + vm_pageout_state.vm_pageout_deadlock_relief = 0; + vm_pageout_state.vm_pageout_burst_inactive_throttle = 0; + + vm_pageout_state.vm_pageout_inactive = 0; + vm_pageout_state.vm_pageout_inactive_used = 0; + 
vm_pageout_state.vm_pageout_inactive_clean = 0; + + vm_pageout_state.vm_memory_pressure = 0; + vm_pageout_state.vm_page_filecache_min = 0; +#if CONFIG_JETSAM + vm_pageout_state.vm_page_filecache_min_divisor = 70; + vm_pageout_state.vm_page_xpmapped_min_divisor = 40; +#else + vm_pageout_state.vm_page_filecache_min_divisor = 27; + vm_pageout_state.vm_page_xpmapped_min_divisor = 36; +#endif + vm_pageout_state.vm_page_free_count_init = vm_page_free_count; - if (vm_pageout_deadlock_wait == 0) - vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT; + vm_pageout_state.vm_pageout_considered_page_last = 0; - if (vm_pageout_deadlock_relief == 0) - vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF; + if (vm_pageout_state.vm_pageout_swap_wait == 0) + vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT; - if (vm_pageout_inactive_relief == 0) - vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF; + if (vm_pageout_state.vm_pageout_idle_wait == 0) + vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT; - if (vm_pageout_burst_active_throttle == 0) - vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE; + if (vm_pageout_state.vm_pageout_burst_wait == 0) + vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT; - if (vm_pageout_burst_inactive_throttle == 0) - vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE; + if (vm_pageout_state.vm_pageout_empty_wait == 0) + vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT; - /* - * Set kernel task to low backing store privileged - * status - */ - task_lock(kernel_task); - kernel_task->priv_flags |= VM_BACKING_STORE_PRIV; - task_unlock(kernel_task); + if (vm_pageout_state.vm_pageout_deadlock_wait == 0) + vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT; - vm_page_free_count_init = vm_page_free_count; + if (vm_pageout_state.vm_pageout_deadlock_relief == 0) + vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF; + if 
(vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) + vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE; /* * even if we've already called vm_page_free_reserve * call it again here to insure that the targets are @@ -4514,16 +4364,16 @@ vm_pageout(void) /* internal pageout thread started when default pager registered first time */ /* external pageout and garbage collection threads started here */ - result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, + result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_VM, - &vm_pageout_external_iothread); + &vm_pageout_state.vm_pageout_external_iothread); if (result != KERN_SUCCESS) panic("vm_pageout_iothread_external: create failed"); - thread_deallocate(vm_pageout_external_iothread); + thread_deallocate(vm_pageout_state.vm_pageout_external_iothread); result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, - BASEPRI_DEFAULT, + BASEPRI_DEFAULT, &thread); if (result != KERN_SUCCESS) panic("vm_pageout_garbage_collect: create failed"); @@ -4547,7 +4397,7 @@ vm_pageout(void) bzero(&vm_config, sizeof(vm_config)); switch(vm_compressor_mode) { - + case VM_PAGER_DEFAULT: printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n"); @@ -4643,12 +4493,6 @@ vm_pageout(void) -#if CONFIG_EMBEDDED -int vm_compressor_thread_count = 1; -#else -int vm_compressor_thread_count = 2; -#endif - kern_return_t vm_pageout_internal_start(void) { @@ -4664,28 +4508,39 @@ vm_pageout_internal_start(void) assert(hinfo.max_cpus > 0); - PE_parse_boot_argn("vmcomp_threads", &vm_compressor_thread_count, sizeof(vm_compressor_thread_count)); - if (vm_compressor_thread_count >= hinfo.max_cpus) - vm_compressor_thread_count = hinfo.max_cpus - 1; - if (vm_compressor_thread_count <= 0) - vm_compressor_thread_count = 1; - else if (vm_compressor_thread_count > 
MAX_COMPRESSOR_THREAD_COUNT) - vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT; +#if CONFIG_EMBEDDED + vm_pageout_state.vm_compressor_thread_count = 1; +#else + if (hinfo.max_cpus > 4) + vm_pageout_state.vm_compressor_thread_count = 2; + else + vm_pageout_state.vm_compressor_thread_count = 1; +#endif + PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count, + sizeof(vm_pageout_state.vm_compressor_thread_count)); + + if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) + vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1; + if (vm_pageout_state.vm_compressor_thread_count <= 0) + vm_pageout_state.vm_compressor_thread_count = 1; + else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) + vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT; - vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX; + vm_pageout_queue_internal.pgo_maxlaundry = (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX; PE_parse_boot_argn("vmpgoi_maxlaundry", &vm_pageout_queue_internal.pgo_maxlaundry, sizeof(vm_pageout_queue_internal.pgo_maxlaundry)); - for (i = 0; i < vm_compressor_thread_count; i++) { + for (i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) { ciq[i].id = i; ciq[i].q = &vm_pageout_queue_internal; ciq[i].current_chead = NULL; ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE); - result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_VM, &vm_pageout_internal_iothread); + result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], + BASEPRI_VM, &vm_pageout_state.vm_pageout_internal_iothread); if (result == KERN_SUCCESS) - thread_deallocate(vm_pageout_internal_iothread); + thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread); else break; } @@ -4718,7 +4573,7 @@ 
upl_set_decmp_info(upl_t upl, upl_t src_upl) * This case should rarely happen and even if it does, it just means * that we might issue a spurious expedite which the driver is expected * to handle. - */ + */ upl_unlock(src_upl); return; } @@ -4729,7 +4584,7 @@ upl_set_decmp_info(upl_t upl, upl_t src_upl) upl->decmp_io_upl = (void *)src_upl; upl_unlock(src_upl); } -#endif /* CONFIG_IOSCHED */ +#endif /* CONFIG_IOSCHED */ #if UPL_DEBUG int upl_debug_enabled = 1; @@ -4773,11 +4628,12 @@ upl_create(int type, int flags, upl_size_t size) upl_lock_init(upl); upl->vector_upl = NULL; upl->associated_upl = NULL; + upl->upl_iodone = NULL; #if CONFIG_IOSCHED if (type & UPL_CREATE_IO_TRACKING) { upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO); } - + upl->upl_reprio_info = 0; upl->decmp_io_upl = 0; if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) { @@ -4786,7 +4642,7 @@ upl_create(int type, int flags, upl_size_t size) upl->upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * atop(size)); bzero(upl->upl_reprio_info, (sizeof(uint64_t) * atop(size))); upl->flags |= UPL_EXPEDITE_SUPPORTED; - if (curthread->decmp_upl != NULL) + if (curthread->decmp_upl != NULL) upl_set_decmp_info(upl, curthread->decmp_upl); } #endif @@ -4879,7 +4735,7 @@ upl_destroy(upl_t upl) if (upl->flags & UPL_INTERNAL) { kfree(upl, - sizeof(struct upl) + + sizeof(struct upl) + (sizeof(struct upl_page_info) * (size/PAGE_SIZE)) + page_field_size); } else { @@ -4891,13 +4747,17 @@ void upl_deallocate(upl_t upl) { upl_lock(upl); + if (--upl->ref_count == 0) { if(vector_upl_is_valid(upl)) vector_upl_deallocate(upl); - upl_unlock(upl); + upl_unlock(upl); + + if (upl->upl_iodone) + upl_callout_iodone(upl); + upl_destroy(upl); - } - else + } else upl_unlock(upl); } @@ -4908,7 +4768,7 @@ upl_mark_decmp(upl_t upl) if (upl->flags & UPL_TRACKED_BY_OBJECT) { upl->flags |= UPL_DECMP_REQ; upl->upl_creator->decmp_upl = (void *)upl; - } + } } void @@ -4917,7 +4777,7 
@@ upl_unmark_decmp(upl_t upl) if(upl && (upl->flags & UPL_DECMP_REQ)) { upl->upl_creator->decmp_upl = NULL; } -} +} #endif /* CONFIG_IOSCHED */ @@ -4937,22 +4797,9 @@ must_throttle_writes() } -#if DEVELOPMENT || DEBUG -/*/* - * Statistics about UPL enforcement of copy-on-write obligations. - */ -unsigned long upl_cow = 0; -unsigned long upl_cow_again = 0; -unsigned long upl_cow_pages = 0; -unsigned long upl_cow_again_pages = 0; - -unsigned long iopl_cow = 0; -unsigned long iopl_cow_pages = 0; -#endif - -/* - * Routine: vm_object_upl_request - * Purpose: +/* + * Routine: vm_object_upl_request + * Purpose: * Cause the population of a portion of a vm_object. * Depending on the nature of the request, the pages * returned may be contain valid data or be uninitialized. @@ -4963,7 +4810,7 @@ unsigned long iopl_cow_pages = 0; * IMPORTANT NOTE: The caller must still respect the relationship * between the vm_object and its backing memory object. The * caller MUST NOT substitute changes in the backing file - * without first doing a memory_object_lock_request on the + * without first doing a memory_object_lock_request on the * target range unless it is know that the pages are not * shared with another entity at the pager level. * Copy_in_to: @@ -4981,7 +4828,7 @@ unsigned long iopl_cow_pages = 0; * all mapped pages. Where a page does not exist * map a zero filled one. Leave pages busy in * the original object. If a page list structure - * is not specified, this call is a no-op. + * is not specified, this call is a no-op. * * Note: access of default pager objects has a rather interesting * twist. The caller of this routine, presumably the file system @@ -4989,7 +4836,7 @@ unsigned long iopl_cow_pages = 0; * against a default pager backed object. 
Only the default * pager will make requests on backing store related vm_objects * In this way the default pager can maintain the relationship - * between backing store files (abstract memory objects) and + * between backing store files (abstract memory objects) and * the vm_objects (cache objects), they support. * */ @@ -5013,9 +4860,6 @@ vm_object_upl_request( boolean_t hw_dirty; upl_t upl = NULL; unsigned int entry; -#if MACH_CLUSTER_STATS - boolean_t encountered_lrp = FALSE; -#endif vm_page_t alias_page = NULL; int refmod_state = 0; wpl_array_t lite_list = NULL; @@ -5026,7 +4870,10 @@ vm_object_upl_request( int dw_limit; int io_tracking_flag = 0; int grab_options; + int page_grab_count = 0; ppnum_t phys_page; + pmap_flush_context pmap_flush_context_storage; + boolean_t pmap_flushes_delayed = FALSE; if (cntrl_flags & ~UPL_VALID_FLAGS) { /* @@ -5040,6 +4887,7 @@ vm_object_upl_request( if (object->phys_contiguous) panic("vm_object_upl_request: contiguous object specified\n"); + VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0); if (size > MAX_UPL_SIZE_BYTES) size = MAX_UPL_SIZE_BYTES; @@ -5063,7 +4911,7 @@ vm_object_upl_request( user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl)); lite_list = (wpl_array_t) - (((uintptr_t)user_page_list) + + (((uintptr_t)user_page_list) + ((size/PAGE_SIZE) * sizeof(upl_page_info_t))); if (size == 0) { user_page_list = NULL; @@ -5091,7 +4939,7 @@ vm_object_upl_request( } } *upl_ptr = upl; - + if (user_page_list) user_page_list[0].device = FALSE; @@ -5157,10 +5005,9 @@ vm_object_upl_request( FALSE, /* should_return */ MEMORY_OBJECT_COPY_SYNC, VM_PROT_NO_CHANGE); -#if DEVELOPMENT || DEBUG - upl_cow++; - upl_cow_pages += size >> PAGE_SHIFT; -#endif + + VM_PAGEOUT_DEBUG(upl_cow, 1); + VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT)); } /* * remember which copy object we synchronized with @@ -5189,7 +5036,7 @@ vm_object_upl_request( 
vnode_pager_get_isSSD(object->pager, &isSSD); #endif vm_object_unlock(object); - + OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); if (isSSD == TRUE) @@ -5214,12 +5061,12 @@ vm_object_upl_request( upl->flags |= UPL_PAGE_SYNC_DONE; if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) || - dst_page->fictitious || - dst_page->absent || - dst_page->error || - dst_page->cleaning || + dst_page->vmp_fictitious || + dst_page->vmp_absent || + dst_page->vmp_error || + dst_page->vmp_cleaning || (VM_PAGE_WIRED(dst_page))) { - + if (user_page_list) user_page_list[entry].phys_addr = 0; @@ -5234,7 +5081,7 @@ vm_object_upl_request( * anyway... so we can eliminate an extra call into * the pmap layer by grabbing it here and recording it */ - if (dst_page->pmapped) + if (dst_page->vmp_pmapped) refmod_state = pmap_get_refmod(phys_page); else refmod_state = 0; @@ -5252,15 +5099,15 @@ vm_object_upl_request( /* * we're only asking for DIRTY pages to be returned */ - if (dst_page->laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) { + if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) { /* * if we were the page stolen by vm_pageout_scan to be - * cleaned (as opposed to a buddy being clustered in + * cleaned (as opposed to a buddy being clustered in * or this request is not being driven by a PAGEOUT cluster * then we only need to check for the page being dirty or * precious to decide whether to return it */ - if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED)) + if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) goto check_busy; goto dont_return; } @@ -5271,9 +5118,9 @@ vm_object_upl_request( * can't have been referenced recently... 
*/ if ( (hibernate_cleaning_in_progress == TRUE || - (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) || - (dst_page->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q))) && - ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) { + (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) || + (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) && + ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious) ) { goto check_busy; } dont_return: @@ -5281,7 +5128,7 @@ vm_object_upl_request( * if we reach here, we're not to return * the page... go on to the next one */ - if (dst_page->laundry == TRUE) { + if (dst_page->vmp_laundry == TRUE) { /* * if we get here, the page is not 'cleaning' (filtered out above). * since it has been referenced, remove it from the laundry @@ -5292,7 +5139,7 @@ vm_object_upl_request( vm_pageout_steal_laundry(dst_page, TRUE); vm_page_activate(dst_page); - + vm_page_unlock_queues(); } if (user_page_list) @@ -5300,9 +5147,9 @@ vm_object_upl_request( goto try_next_page; } -check_busy: - if (dst_page->busy) { - if (cntrl_flags & UPL_NOBLOCK) { +check_busy: + if (dst_page->vmp_busy) { + if (cntrl_flags & UPL_NOBLOCK) { if (user_page_list) user_page_list[entry].phys_addr = 0; dwp->dw_mask = 0; @@ -5317,11 +5164,11 @@ vm_object_upl_request( continue; } - if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { + if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { vm_page_lockspin_queues(); - if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) { + if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { /* * we've buddied up a page for a clustered pageout * that has already been moved to the pageout @@ -5333,24 +5180,8 @@ vm_object_upl_request( } vm_page_unlock_queues(); } -#if MACH_CLUSTER_STATS - /* - * pageout statistics gathering. 
count - * all the pages we will page out that - * were not counted in the initial - * vm_pageout_scan work - */ - if (dst_page->pageout) - encountered_lrp = TRUE; - if ((dst_page->dirty || (object->internal && dst_page->precious))) { - if (encountered_lrp) - CLUSTER_STAT(pages_at_higher_offsets++;) - else - CLUSTER_STAT(pages_at_lower_offsets++;) - } -#endif hw_dirty = refmod_state & VM_MEM_MODIFIED; - dirty = hw_dirty ? TRUE : dst_page->dirty; + dirty = hw_dirty ? TRUE : dst_page->vmp_dirty; if (phys_page > upl->highest_page) upl->highest_page = phys_page; @@ -5364,15 +5195,23 @@ vm_object_upl_request( assert(pg_num == (dst_offset-offset)/PAGE_SIZE); lite_list[pg_num>>5] |= 1 << (pg_num & 31); - if (hw_dirty) - pmap_clear_modify(phys_page); + if (hw_dirty) { + if (pmap_flushes_delayed == FALSE) { + pmap_flush_context_init(&pmap_flush_context_storage); + pmap_flushes_delayed = TRUE; + } + pmap_clear_refmod_options(phys_page, + VM_MEM_MODIFIED, + PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE, + &pmap_flush_context_storage); + } /* - * Mark original page as cleaning + * Mark original page as cleaning * in place. 
*/ - dst_page->cleaning = TRUE; - dst_page->precious = FALSE; + dst_page->vmp_cleaning = TRUE; + dst_page->vmp_precious = FALSE; } else { /* * use pageclean setup, it is more @@ -5383,21 +5222,21 @@ vm_object_upl_request( vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size); vm_object_unlock(upl->map_object); - alias_page->absent = FALSE; + alias_page->vmp_absent = FALSE; alias_page = NULL; } if (dirty) { SET_PAGE_DIRTY(dst_page, FALSE); } else { - dst_page->dirty = FALSE; + dst_page->vmp_dirty = FALSE; } if (!dirty) - dst_page->precious = TRUE; + dst_page->vmp_precious = TRUE; if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) { if ( !VM_PAGE_WIRED(dst_page)) - dst_page->free_when_done = TRUE; + dst_page->vmp_free_when_done = TRUE; } } else { if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) { @@ -5433,10 +5272,8 @@ vm_object_upl_request( MEMORY_OBJECT_COPY_SYNC, VM_PROT_NO_CHANGE); -#if DEVELOPMENT || DEBUG - upl_cow_again++; - upl_cow_again_pages += xfer_size >> PAGE_SHIFT; -#endif + VM_PAGEOUT_DEBUG(upl_cow_again, 1); + VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT)); } /* * remember the copy object we synced with @@ -5444,7 +5281,7 @@ vm_object_upl_request( last_copy_object = object->copy; } dst_page = vm_page_lookup(object, dst_offset); - + if (dst_page != VM_PAGE_NULL) { if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) { @@ -5456,11 +5293,11 @@ vm_object_upl_request( goto try_next_page; } - if (dst_page->fictitious) { + if (dst_page->vmp_fictitious) { panic("need corner case for fictitious page"); } - if (dst_page->busy || dst_page->cleaning) { + if (dst_page->vmp_busy || dst_page->vmp_cleaning) { /* * someone else is playing with the * page. We will have to wait. 
@@ -5469,14 +5306,14 @@ vm_object_upl_request( continue; } - if (dst_page->laundry) + if (dst_page->vmp_laundry) vm_pageout_steal_laundry(dst_page, FALSE); } else { if (object->private) { - /* - * This is a nasty wrinkle for users - * of upl who encounter device or - * private memory however, it is + /* + * This is a nasty wrinkle for users + * of upl who encounter device or + * private memory however, it is * unavoidable, only a fault can * resolve the actual backing * physical page by asking the @@ -5512,6 +5349,8 @@ vm_object_upl_request( * need to allocate a page */ dst_page = vm_page_grab_options(grab_options); + if (dst_page != VM_PAGE_NULL) + page_grab_count++; } if (dst_page == VM_PAGE_NULL) { if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) { @@ -5531,7 +5370,7 @@ vm_object_upl_request( * offset... */ vm_object_unlock(object); - + OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0); @@ -5547,19 +5386,19 @@ vm_object_upl_request( } vm_page_insert(dst_page, object, dst_offset); - dst_page->absent = TRUE; - dst_page->busy = FALSE; + dst_page->vmp_absent = TRUE; + dst_page->vmp_busy = FALSE; if (cntrl_flags & UPL_RET_ONLY_ABSENT) { /* * if UPL_RET_ONLY_ABSENT was specified, * than we're definitely setting up a - * upl for a clustered read/pagein + * upl for a clustered read/pagein * operation... 
mark the pages as clustered * so upl_commit_range can put them on the * speculative list */ - dst_page->clustered = TRUE; + dst_page->vmp_clustered = TRUE; if ( !(cntrl_flags & UPL_FILE_IO)) VM_STAT_INCR(pageins); @@ -5567,9 +5406,9 @@ vm_object_upl_request( } phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); - dst_page->overwriting = TRUE; + dst_page->vmp_overwriting = TRUE; - if (dst_page->pmapped) { + if (dst_page->vmp_pmapped) { if ( !(cntrl_flags & UPL_FILE_IO)) /* * eliminate all mappings from the @@ -5582,7 +5421,7 @@ vm_object_upl_request( refmod_state = 0; hw_dirty = refmod_state & VM_MEM_MODIFIED; - dirty = hw_dirty ? TRUE : dst_page->dirty; + dirty = hw_dirty ? TRUE : dst_page->vmp_dirty; if (cntrl_flags & UPL_SET_LITE) { unsigned int pg_num; @@ -5595,11 +5434,11 @@ vm_object_upl_request( pmap_clear_modify(phys_page); /* - * Mark original page as cleaning + * Mark original page as cleaning * in place. */ - dst_page->cleaning = TRUE; - dst_page->precious = FALSE; + dst_page->vmp_cleaning = TRUE; + dst_page->vmp_precious = FALSE; } else { /* * use pageclean setup, it is more @@ -5610,7 +5449,7 @@ vm_object_upl_request( vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size); vm_object_unlock(upl->map_object); - alias_page->absent = FALSE; + alias_page->vmp_absent = FALSE; alias_page = NULL; } @@ -5630,17 +5469,17 @@ vm_object_upl_request( */ upl->flags |= UPL_CLEAR_DIRTY; } - dst_page->dirty = dirty; + dst_page->vmp_dirty = dirty; if (!dirty) - dst_page->precious = TRUE; + dst_page->vmp_precious = TRUE; if ( !VM_PAGE_WIRED(dst_page)) { /* * deny access to the target page while * it is being worked on */ - dst_page->busy = TRUE; + dst_page->vmp_busy = TRUE; } else dwp->dw_mask |= DW_vm_page_wire; @@ -5648,8 +5487,8 @@ vm_object_upl_request( * We might be about to satisfy a fault which has been * requested. So no need for the "restart" bit. 
*/ - dst_page->restart = FALSE; - if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) { + dst_page->vmp_restart = FALSE; + if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) { /* * expect the page to be used */ @@ -5658,15 +5497,15 @@ vm_object_upl_request( if (cntrl_flags & UPL_PRECIOUS) { if (object->internal) { SET_PAGE_DIRTY(dst_page, FALSE); - dst_page->precious = FALSE; + dst_page->vmp_precious = FALSE; } else { - dst_page->precious = TRUE; + dst_page->vmp_precious = TRUE; } } else { - dst_page->precious = FALSE; + dst_page->vmp_precious = FALSE; } } - if (dst_page->busy) + if (dst_page->vmp_busy) upl->flags |= UPL_HAS_BUSY; if (phys_page > upl->highest_page) @@ -5674,19 +5513,19 @@ vm_object_upl_request( assert (!pmap_is_noencrypt(phys_page)); if (user_page_list) { user_page_list[entry].phys_addr = phys_page; - user_page_list[entry].free_when_done = dst_page->free_when_done; - user_page_list[entry].absent = dst_page->absent; - user_page_list[entry].dirty = dst_page->dirty; - user_page_list[entry].precious = dst_page->precious; + user_page_list[entry].free_when_done = dst_page->vmp_free_when_done; + user_page_list[entry].absent = dst_page->vmp_absent; + user_page_list[entry].dirty = dst_page->vmp_dirty; + user_page_list[entry].precious = dst_page->vmp_precious; user_page_list[entry].device = FALSE; user_page_list[entry].needed = FALSE; - if (dst_page->clustered == TRUE) - user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE; + if (dst_page->vmp_clustered == TRUE) + user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? 
TRUE : FALSE; else user_page_list[entry].speculative = FALSE; - user_page_list[entry].cs_validated = dst_page->cs_validated; - user_page_list[entry].cs_tainted = dst_page->cs_tainted; - user_page_list[entry].cs_nx = dst_page->cs_nx; + user_page_list[entry].cs_validated = dst_page->vmp_cs_validated; + user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted; + user_page_list[entry].cs_nx = dst_page->vmp_cs_nx; user_page_list[entry].mark = FALSE; } /* @@ -5700,9 +5539,9 @@ vm_object_upl_request( /* * someone is explicitly grabbing this page... * update clustered and speculative state - * + * */ - if (dst_page->clustered) + if (dst_page->vmp_clustered) VM_PAGE_CONSUME_CLUSTERED(dst_page); } try_next_page: @@ -5729,6 +5568,8 @@ vm_object_upl_request( if (alias_page != NULL) { VM_PAGE_FREE(alias_page); } + if (pmap_flushes_delayed == TRUE) + pmap_flush(&pmap_flush_context_storage); if (page_list_count != NULL) { if (upl->flags & UPL_INTERNAL) @@ -5741,12 +5582,14 @@ vm_object_upl_request( #endif vm_object_unlock(object); + VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); + return KERN_SUCCESS; } -/* +/* * Routine: vm_object_super_upl_request - * Purpose: + * Purpose: * Cause the population of a portion of a vm_object * in much the same way as memory_object_upl_request. 
* Depending on the nature of the request, the pages @@ -5904,6 +5747,7 @@ vm_map_create_upl( } if (!(caller_flags & UPL_COPYOUT_FROM) && + !entry->is_sub_map && !(entry->protection & VM_PROT_WRITE)) { vm_map_unlock_read(map); return KERN_PROTECTION_FAILURE; @@ -6117,8 +5961,8 @@ vm_map_create_upl( vm_map_reference(submap); vm_map_unlock_read(map); - ret = vm_map_create_upl(submap, - local_offset + (offset - local_start), + ret = vm_map_create_upl(submap, + local_offset + (offset - local_start), upl_size, upl, page_list, count, flags, tag); vm_map_deallocate(submap); @@ -6141,7 +5985,7 @@ vm_map_create_upl( ((offset - local_start) + local_offset) + local_object->vo_shadow_offset), - *upl_size, FALSE, + *upl_size, FALSE, MEMORY_OBJECT_DATA_SYNC, VM_PROT_NO_CHANGE); } @@ -6163,7 +6007,7 @@ vm_map_create_upl( ((offset - local_start) + local_offset)), (vm_object_size_t)*upl_size, - FALSE, + FALSE, MEMORY_OBJECT_DATA_SYNC, VM_PROT_NO_CHANGE); @@ -6251,7 +6095,7 @@ vm_map_create_upl( vm_map_unlock_read(map); - ret = vm_object_iopl_request(local_object, + ret = vm_object_iopl_request(local_object, ((vm_object_offset_t) ((offset - local_start) + local_offset)), *upl_size, @@ -6267,14 +6111,14 @@ vm_map_create_upl( /* * Internal routine to enter a UPL into a VM map. - * + * * JMM - This should just be doable through the standard * vm_map_enter() API. 
*/ kern_return_t vm_map_enter_upl( - vm_map_t map, - upl_t upl, + vm_map_t map, + upl_t upl, vm_map_offset_t *dst_addr) { vm_map_size_t size; @@ -6306,7 +6150,7 @@ vm_map_enter_upl( mapped++; } - if(mapped) { + if(mapped) { if(mapped != valid_upls) panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped\n", mapped, valid_upls); else { @@ -6360,7 +6204,7 @@ vm_map_enter_upl( wpl_array_t lite_list; if (upl->flags & UPL_INTERNAL) { - lite_list = (wpl_array_t) + lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl)) + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t))); } else { @@ -6399,13 +6243,13 @@ vm_map_enter_upl( } /* - * Convert the fictitious page to a private + * Convert the fictitious page to a private * shadow of the real page. */ - assert(alias_page->fictitious); - alias_page->fictitious = FALSE; - alias_page->private = TRUE; - alias_page->free_when_done = TRUE; + assert(alias_page->vmp_fictitious); + alias_page->vmp_fictitious = FALSE; + alias_page->vmp_private = TRUE; + alias_page->vmp_free_when_done = TRUE; /* * since m is a page in the upl it must * already be wired or BUSY, so it's @@ -6419,12 +6263,12 @@ vm_map_enter_upl( vm_page_lockspin_queues(); vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE); vm_page_unlock_queues(); - + vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE); - assert(!alias_page->wanted); - alias_page->busy = FALSE; - alias_page->absent = FALSE; + assert(!alias_page->vmp_wanted); + alias_page->vmp_busy = FALSE; + alias_page->vmp_absent = FALSE; } size -= PAGE_SIZE; offset += PAGE_SIZE_64; @@ -6438,7 +6282,7 @@ vm_map_enter_upl( offset = upl->offset - upl->map_object->paging_offset; size = upl->size; - + vm_object_reference(upl->map_object); if(!isVectorUPL) { @@ -6471,14 +6315,14 @@ vm_map_enter_upl( m = vm_page_lookup(upl->map_object, offset); if (m) { - m->pmapped = TRUE; + m->vmp_pmapped = TRUE; - /* CODE SIGNING ENFORCEMENT: page has been wpmapped, + /* CODE 
SIGNING ENFORCEMENT: page has been wpmapped, * but only in kernel space. If this was on a user map, * we'd have to set the wpmapped bit. */ - /* m->wpmapped = TRUE; */ + /* m->vmp_wpmapped = TRUE; */ assert(map->pmap == kernel_pmap); - + PMAP_ENTER(map->pmap, addr, m, VM_PROT_DEFAULT, VM_PROT_NONE, 0, TRUE, kr); assert(kr == KERN_SUCCESS); @@ -6497,7 +6341,7 @@ vm_map_enter_upl( upl->flags |= UPL_PAGE_LIST_MAPPED; upl->kaddr = (vm_offset_t) *dst_addr; assert(upl->kaddr == *dst_addr); - + if(isVectorUPL) goto process_upl_to_enter; @@ -6505,7 +6349,7 @@ vm_map_enter_upl( return KERN_SUCCESS; } - + /* * Internal routine to remove a UPL mapping from a VM map. * @@ -6518,7 +6362,7 @@ vm_map_enter_upl( */ kern_return_t vm_map_remove_upl( - vm_map_t map, + vm_map_t map, upl_t upl) { vm_address_t addr; @@ -6562,7 +6406,7 @@ vm_map_remove_upl( vm_offset_t v_upl_submap_dst_addr; vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr); - vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS); + vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_REMOVE_NO_FLAGS); vm_map_deallocate(v_upl_submap); upl_unlock(vector_upl); return KERN_SUCCESS; @@ -6570,7 +6414,7 @@ vm_map_remove_upl( upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ ); if(upl == NULL) - goto process_upl_to_remove; + goto process_upl_to_remove; } if (upl->flags & UPL_PAGE_LIST_MAPPED) { @@ -6582,18 +6426,17 @@ vm_map_remove_upl( upl->flags &= ~UPL_PAGE_LIST_MAPPED; upl->kaddr = (vm_offset_t) 0; - + if(!isVectorUPL) { upl_unlock(upl); - + vm_map_remove( map, vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)), vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)), - VM_MAP_NO_FLAGS); - + VM_MAP_REMOVE_NO_FLAGS); return KERN_SUCCESS; } else { @@ -6601,7 +6444,7 @@ vm_map_remove_upl( * If it's a Vectored UPL, we'll be removing the entire * submap anyways, so no need to remove individual UPL * element mappings from 
within the submap - */ + */ goto process_upl_to_remove; } } @@ -6613,13 +6456,13 @@ vm_map_remove_upl( kern_return_t upl_commit_range( - upl_t upl, - upl_offset_t offset, + upl_t upl, + upl_offset_t offset, upl_size_t size, int flags, upl_page_info_t *page_list, mach_msg_type_number_t count, - boolean_t *empty) + boolean_t *empty) { upl_size_t xfer_size, subupl_size = size; vm_object_t shadow_object; @@ -6685,7 +6528,7 @@ upl_commit_range( #if UPL_DEBUG if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) { (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES); - + upl->upl_commit_records[upl->upl_commit_index].c_beg = offset; upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size); @@ -6803,8 +6646,8 @@ upl_commit_range( if (nxt_page != VM_PAGE_NULL) { m = nxt_page; - nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq); - target_offset = m->offset; + nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq); + target_offset = m->vmp_offset; } pg_num = (unsigned int) (target_offset/PAGE_SIZE); assert(pg_num == target_offset/PAGE_SIZE); @@ -6820,7 +6663,7 @@ upl_commit_range( if (upl->flags & UPL_SHADOWED) { if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) { - t->free_when_done = FALSE; + t->vmp_free_when_done = FALSE; VM_PAGE_FREE(t); @@ -6833,8 +6676,8 @@ upl_commit_range( m_object = VM_PAGE_OBJECT(m); - if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { - assert(m->busy); + if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { + assert(m->vmp_busy); dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); goto commit_next_page; @@ -6846,12 +6689,12 @@ upl_commit_range( * Set the code signing bits according to * what the UPL says they should be. 
*/ - m->cs_validated = page_list[entry].cs_validated; - m->cs_tainted = page_list[entry].cs_tainted; - m->cs_nx = page_list[entry].cs_nx; + m->vmp_cs_validated = page_list[entry].cs_validated; + m->vmp_cs_tainted = page_list[entry].cs_tainted; + m->vmp_cs_nx = page_list[entry].cs_nx; } if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) - m->written_by_kernel = TRUE; + m->vmp_written_by_kernel = TRUE; if (upl->flags & UPL_IO_WIRE) { @@ -6861,10 +6704,10 @@ upl_commit_range( if (flags & UPL_COMMIT_SET_DIRTY) { SET_PAGE_DIRTY(m, FALSE); } else if (flags & UPL_COMMIT_CLEAR_DIRTY) { - m->dirty = FALSE; + m->vmp_dirty = FALSE; if (! (flags & UPL_COMMIT_CS_VALIDATED) && - m->cs_validated && !m->cs_tainted) { + m->vmp_cs_validated && !m->vmp_cs_tainted) { /* * CODE SIGNING: * This page is no longer dirty @@ -6872,15 +6715,10 @@ upl_commit_range( * so it will need to be * re-validated. */ - if (m->slid) { - panic("upl_commit_range(%p): page %p was slid\n", - upl, m); - } - assert(!m->slid); - m->cs_validated = FALSE; -#if DEVELOPMENT || DEBUG - vm_cs_validated_resets++; -#endif + m->vmp_cs_validated = FALSE; + + VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); } clear_refmod |= VM_MEM_MODIFIED; @@ -6896,33 +6734,33 @@ upl_commit_range( if (fast_path_possible) { assert(m_object->purgable != VM_PURGABLE_EMPTY); assert(m_object->purgable != VM_PURGABLE_VOLATILE); - if (m->absent) { - assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); - assert(m->wire_count == 0); - assert(m->busy); + if (m->vmp_absent) { + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); + assert(m->vmp_wire_count == 0); + assert(m->vmp_busy); - m->absent = FALSE; + m->vmp_absent = FALSE; dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); } else { - if (m->wire_count == 0) + if (m->vmp_wire_count == 0) panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object); - assert(m->vm_page_q_state == VM_PAGE_IS_WIRED); + assert(m->vmp_q_state == VM_PAGE_IS_WIRED); /* * XXX FBDP need to update 
some other * counters here (purgeable_wired_count) * (ledgers), ... */ - assert(m->wire_count > 0); - m->wire_count--; + assert(m->vmp_wire_count > 0); + m->vmp_wire_count--; - if (m->wire_count == 0) { - m->vm_page_q_state = VM_PAGE_NOT_ON_Q; + if (m->vmp_wire_count == 0) { + m->vmp_q_state = VM_PAGE_NOT_ON_Q; unwired_count++; } } - if (m->wire_count == 0) { - assert(m->pageq.next == 0 && m->pageq.prev == 0); + if (m->vmp_wire_count == 0) { + assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); if (last_local == VM_PAGE_NULL) { assert(first_local == VM_PAGE_NULL); @@ -6932,22 +6770,22 @@ upl_commit_range( } else { assert(first_local != VM_PAGE_NULL); - m->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); - first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m); + m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); + first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m); first_local = m; } local_queue_count++; if (throttle_page) { - m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; + m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; } else { if (flags & UPL_COMMIT_INACTIVATE) { if (shadow_object->internal) - m->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q; + m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q; else - m->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q; + m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q; } else - m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; + m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; } } } else { @@ -6955,11 +6793,11 @@ upl_commit_range( dwp->dw_mask |= DW_vm_page_deactivate_internal; clear_refmod |= VM_MEM_REFERENCED; } - if (m->absent) { + if (m->vmp_absent) { if (flags & UPL_COMMIT_FREE_ABSENT) dwp->dw_mask |= DW_vm_page_free; else { - m->absent = FALSE; + m->vmp_absent = FALSE; dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal)) @@ -6970,7 +6808,7 @@ upl_commit_range( } goto commit_next_page; } - assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); + 
assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); if (page_list) page_list[entry].phys_addr = 0; @@ -6983,18 +6821,18 @@ upl_commit_range( * change of state */ if (flags & UPL_COMMIT_CLEAR_DIRTY) { - m->dirty = FALSE; + m->vmp_dirty = FALSE; clear_refmod |= VM_MEM_MODIFIED; } - if (m->laundry) + if (m->vmp_laundry) dwp->dw_mask |= DW_vm_pageout_throttle_up; if (VM_PAGE_WIRED(m)) - m->free_when_done = FALSE; - + m->vmp_free_when_done = FALSE; + if (! (flags & UPL_COMMIT_CS_VALIDATED) && - m->cs_validated && !m->cs_tainted) { + m->vmp_cs_validated && !m->vmp_cs_tainted) { /* * CODE SIGNING: * This page is no longer dirty @@ -7002,27 +6840,22 @@ upl_commit_range( * so it will need to be * re-validated. */ - if (m->slid) { - panic("upl_commit_range(%p): page %p was slid\n", - upl, m); - } - assert(!m->slid); - m->cs_validated = FALSE; -#if DEVELOPMENT || DEBUG - vm_cs_validated_resets++; -#endif + m->vmp_cs_validated = FALSE; + + VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1); + pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); } - if (m->overwriting) { + if (m->vmp_overwriting) { /* * the (COPY_OUT_FROM == FALSE) request_page_list case */ - if (m->busy) { + if (m->vmp_busy) { #if CONFIG_PHANTOM_CACHE - if (m->absent && !m_object->internal) + if (m->vmp_absent && !m_object->internal) dwp->dw_mask |= DW_vm_phantom_cache_update; #endif - m->absent = FALSE; + m->vmp_absent = FALSE; dwp->dw_mask |= DW_clear_busy; } else { @@ -7035,37 +6868,34 @@ upl_commit_range( dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */ } - m->overwriting = FALSE; + m->vmp_overwriting = FALSE; } - m->cleaning = FALSE; + m->vmp_cleaning = FALSE; - if (m->free_when_done) { - /* + if (m->vmp_free_when_done) { + /* * With the clean queue enabled, UPL_PAGEOUT should - * no longer set the pageout bit. It's pages now go + * no longer set the pageout bit. It's pages now go * to the clean queue. 
*/ assert(!(flags & UPL_PAGEOUT)); assert(!m_object->internal); - m->free_when_done = FALSE; -#if MACH_CLUSTER_STATS - if (m->wanted) vm_pageout_target_collisions++; -#endif + m->vmp_free_when_done = FALSE; + if ((flags & UPL_COMMIT_SET_DIRTY) || - (m->pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) { + (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) { /* * page was re-dirtied after we started - * the pageout... reactivate it since + * the pageout... reactivate it since * we don't know whether the on-disk * copy matches what is now in memory */ SET_PAGE_DIRTY(m, FALSE); - + dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP; if (upl->flags & UPL_PAGEOUT) { - CLUSTER_STAT(vm_pageout_target_page_dirtied++;) VM_STAT_INCR(reactivations); DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); } @@ -7079,21 +6909,13 @@ upl_commit_range( } else { DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL); } - m->dirty = FALSE; - m->busy = TRUE; + m->vmp_dirty = FALSE; + m->vmp_busy = TRUE; dwp->dw_mask |= DW_vm_page_free; } goto commit_next_page; } -#if MACH_CLUSTER_STATS - if (m->wpmapped) - m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)); - - if (m->dirty) vm_pageout_cluster_dirtied++; - else vm_pageout_cluster_cleaned++; - if (m->wanted) vm_pageout_cluster_collisions++; -#endif /* * It is a part of the semantic of COPYOUT_FROM * UPLs that a commit implies cache sync @@ -7102,24 +6924,23 @@ upl_commit_range( * as well as clean */ if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) - m->precious = FALSE; + m->vmp_precious = FALSE; if (flags & UPL_COMMIT_SET_DIRTY) { SET_PAGE_DIRTY(m, FALSE); } else { - m->dirty = FALSE; + m->vmp_dirty = FALSE; } /* with the clean queue on, move *all* cleaned pages to the clean queue */ - if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) { + if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & 
UPL_PAGEOUT)) { pgpgout_count++; VM_STAT_INCR(pageouts); DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); dwp->dw_mask |= DW_enqueue_cleaned; - vm_pageout_enqueued_cleaned_from_inactive_dirty++; - } else if (should_be_throttled == TRUE && (m->vm_page_q_state == VM_PAGE_NOT_ON_Q)) { + } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) { /* * page coming back in from being 'frozen'... * it was dirty before it was frozen, so keep it so @@ -7130,14 +6951,14 @@ upl_commit_range( dwp->dw_mask |= DW_vm_page_activate; } else { - if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)) { + if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) { dwp->dw_mask |= DW_vm_page_deactivate_internal; clear_refmod |= VM_MEM_REFERENCED; } else if ( !VM_PAGE_PAGEABLE(m)) { - if (m->clustered || (flags & UPL_COMMIT_SPECULATE)) + if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) dwp->dw_mask |= DW_vm_page_speculate; - else if (m->reference) + else if (m->vmp_reference) dwp->dw_mask |= DW_vm_page_activate; else { dwp->dw_mask |= DW_vm_page_deactivate_internal; @@ -7172,13 +6993,13 @@ upl_commit_range( if (dw_count >= dw_limit) { vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); - + dwp = &dw_array[0]; dw_count = 0; } } else { if (dwp->dw_mask & DW_clear_busy) - m->busy = FALSE; + m->vmp_busy = FALSE; if (dwp->dw_mask & DW_PAGE_WAKEUP) PAGE_WAKEUP(m); @@ -7220,11 +7041,11 @@ upl_commit_range( if (vm_page_queue_empty(target_queue)) target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); else - first_target->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); + first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); - first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue); - last_local->pageq.next = 
VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target); + first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue); + last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target); /* * Adjust the global page counts. @@ -7249,7 +7070,7 @@ upl_commit_range( } else { vm_page_lockspin_queues(); } - if (unwired_count) { + if (unwired_count) { vm_page_wire_count -= unwired_count; VM_CHECK_MEMORYSTATUS; } @@ -7316,14 +7137,14 @@ upl_commit_range( vm_object_unlock(shadow_object); if (object != shadow_object) vm_object_unlock(object); - + if(!isVectorUPL) upl_unlock(upl); else { - /* + /* * If we completed our operations on an UPL that is * part of a Vectored UPL and if empty is TRUE, then - * we should go ahead and deallocate this UPL element. + * we should go ahead and deallocate this UPL element. * Then we check if this was the last of the UPL elements * within that Vectored UPL. If so, set empty to TRUE * so that in ubc_upl_commit_range or ubc_upl_commit, we @@ -7344,11 +7165,11 @@ upl_commit_range( kern_return_t upl_abort_range( - upl_t upl, - upl_offset_t offset, + upl_t upl, + upl_offset_t offset, upl_size_t size, int error, - boolean_t *empty) + boolean_t *empty) { upl_page_info_t *user_page_list = NULL; upl_size_t xfer_size, subupl_size = size; @@ -7403,7 +7224,7 @@ upl_abort_range( #if UPL_DEBUG if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) { (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES); - + upl->upl_commit_records[upl->upl_commit_index].c_beg = offset; upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size); upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1; @@ -7425,13 +7246,13 @@ upl_abort_range( return KERN_FAILURE; } if (upl->flags & UPL_INTERNAL) { - lite_list = (wpl_array_t) + lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl)) + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t))); user_page_list = (upl_page_info_t *) 
(((uintptr_t)upl) + sizeof(struct upl)); } else { - lite_list = (wpl_array_t) + lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl)); } object = upl->map_object; @@ -7494,7 +7315,7 @@ upl_abort_range( } if (upl->flags & UPL_SHADOWED) { if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) { - t->free_when_done = FALSE; + t->vmp_free_when_done = FALSE; VM_PAGE_FREE(t); @@ -7507,9 +7328,9 @@ upl_abort_range( if (m != VM_PAGE_NULL) { - assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); + assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); - if (m->absent) { + if (m->vmp_absent) { boolean_t must_free = TRUE; /* @@ -7518,22 +7339,22 @@ upl_abort_range( * be passed back to the pages customer */ if (error & UPL_ABORT_RESTART) { - m->restart = TRUE; - m->absent = FALSE; - m->unusual = TRUE; + m->vmp_restart = TRUE; + m->vmp_absent = FALSE; + m->vmp_unusual = TRUE; must_free = FALSE; } else if (error & UPL_ABORT_UNAVAILABLE) { - m->restart = FALSE; - m->unusual = TRUE; + m->vmp_restart = FALSE; + m->vmp_unusual = TRUE; must_free = FALSE; } else if (error & UPL_ABORT_ERROR) { - m->restart = FALSE; - m->absent = FALSE; - m->error = TRUE; - m->unusual = TRUE; + m->vmp_restart = FALSE; + m->vmp_absent = FALSE; + m->vmp_error = TRUE; + m->vmp_unusual = TRUE; must_free = FALSE; } - if (m->clustered && needed == FALSE) { + if (m->vmp_clustered && needed == FALSE) { /* * This page was a part of a speculative * read-ahead initiated by the kernel @@ -7545,14 +7366,14 @@ upl_abort_range( */ must_free = TRUE; } - m->cleaning = FALSE; + m->vmp_cleaning = FALSE; - if (m->overwriting && !m->busy) { + if (m->vmp_overwriting && !m->vmp_busy) { /* * this shouldn't happen since * this is an 'absent' page, but * it doesn't hurt to check for - * the 'alternate' method of + * the 'alternate' method of * stabilizing the page... 
* we will mark 'busy' to be cleared * in the following code which will @@ -7561,7 +7382,7 @@ upl_abort_range( */ dwp->dw_mask |= DW_vm_page_unwire; } - m->overwriting = FALSE; + m->vmp_overwriting = FALSE; dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); @@ -7570,10 +7391,10 @@ upl_abort_range( else dwp->dw_mask |= DW_vm_page_activate; } else { - /* + /* * Handle the trusted pager throttle. - */ - if (m->laundry) + */ + if (m->vmp_laundry) dwp->dw_mask |= DW_vm_pageout_throttle_up; if (upl->flags & UPL_ACCESS_BLOCKED) { @@ -7584,8 +7405,8 @@ upl_abort_range( */ dwp->dw_mask |= DW_clear_busy; } - if (m->overwriting) { - if (m->busy) + if (m->vmp_overwriting) { + if (m->vmp_busy) dwp->dw_mask |= DW_clear_busy; else { /* @@ -7599,10 +7420,10 @@ upl_abort_range( */ dwp->dw_mask |= DW_vm_page_unwire; } - m->overwriting = FALSE; + m->vmp_overwriting = FALSE; } - m->free_when_done = FALSE; - m->cleaning = FALSE; + m->vmp_free_when_done = FALSE; + m->vmp_cleaning = FALSE; if (error & UPL_ABORT_DUMP_PAGES) { pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); @@ -7613,7 +7434,7 @@ upl_abort_range( if (error & UPL_ABORT_REFERENCE) { /* * we've been told to explictly - * reference this page... for + * reference this page... for * file I/O, this is done by * implementing an LRU on the inactive q */ @@ -7637,13 +7458,13 @@ upl_abort_range( if (dw_count >= dw_limit) { vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count); - + dwp = &dw_array[0]; dw_count = 0; } } else { if (dwp->dw_mask & DW_clear_busy) - m->busy = FALSE; + m->vmp_busy = FALSE; if (dwp->dw_mask & DW_PAGE_WAKEUP) PAGE_WAKEUP(m); @@ -7707,14 +7528,14 @@ upl_abort_range( vm_object_unlock(shadow_object); if (object != shadow_object) vm_object_unlock(object); - + if(!isVectorUPL) upl_unlock(upl); else { - /* + /* * If we completed our operations on an UPL that is * part of a Vectored UPL and if empty is TRUE, then - * we should go ahead and deallocate this UPL element. 
+ * we should go ahead and deallocate this UPL element. * Then we check if this was the last of the UPL elements * within that Vectored UPL. If so, set empty to TRUE * so that in ubc_upl_abort_range or ubc_upl_abort, we @@ -7785,7 +7606,7 @@ iopl_valid_data( panic("iopl_valid_data: object == kernel or compressor"); if (object->purgable == VM_PURGABLE_VOLATILE || - object->purgable == VM_PURGABLE_EMPTY) + object->purgable == VM_PURGABLE_EMPTY) panic("iopl_valid_data: object %p purgable %d", object, object->purgable); @@ -7803,7 +7624,7 @@ iopl_valid_data( if (nxt_page != VM_PAGE_NULL) { m = nxt_page; - nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq); + nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq); } else { m = vm_page_lookup(object, offset); offset += PAGE_SIZE; @@ -7811,29 +7632,29 @@ iopl_valid_data( if (m == VM_PAGE_NULL) panic("iopl_valid_data: missing expected page at offset %lx", (long)offset); } - if (m->busy) { - if (!m->absent) + if (m->vmp_busy) { + if (!m->vmp_absent) panic("iopl_valid_data: busy page w/o absent"); - if (m->pageq.next || m->pageq.prev) + if (m->vmp_pageq.next || m->vmp_pageq.prev) panic("iopl_valid_data: busy+absent page on page queue"); - if (m->reusable) { + if (m->vmp_reusable) { panic("iopl_valid_data: %p is reusable", m); } - m->absent = FALSE; - m->dirty = TRUE; - assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q); - assert(m->wire_count == 0); - m->wire_count++; - assert(m->wire_count); - if (m->wire_count == 1) { - m->vm_page_q_state = VM_PAGE_IS_WIRED; + m->vmp_absent = FALSE; + m->vmp_dirty = TRUE; + assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); + assert(m->vmp_wire_count == 0); + m->vmp_wire_count++; + assert(m->vmp_wire_count); + if (m->vmp_wire_count == 1) { + m->vmp_q_state = VM_PAGE_IS_WIRED; wired_count++; } else { panic("iopl_valid_data: %p already wired\n", m); } - + PAGE_WAKEUP_DONE(m); } size -= PAGE_SIZE; @@ -7874,7 +7695,7 @@ vm_object_set_pmap_cache_attr( boolean_t 
vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t); -kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int); +kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*); @@ -7903,31 +7724,31 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us while (page_count--) { - if (dst_page->busy || - dst_page->fictitious || - dst_page->absent || - dst_page->error || - dst_page->cleaning || - dst_page->restart || - dst_page->laundry) { + if (dst_page->vmp_busy || + dst_page->vmp_fictitious || + dst_page->vmp_absent || + dst_page->vmp_error || + dst_page->vmp_cleaning || + dst_page->vmp_restart || + dst_page->vmp_laundry) { retval = FALSE; goto done; } - if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) { + if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) { retval = FALSE; goto done; } - dst_page->reference = TRUE; + dst_page->vmp_reference = TRUE; vm_page_wire(dst_page, tag, FALSE); if (!(cntrl_flags & UPL_COPYOUT_FROM)) { SET_PAGE_DIRTY(dst_page, FALSE); } - entry = (unsigned int)(dst_page->offset / PAGE_SIZE); + entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE); assert(entry >= 0 && entry < object->resident_page_count); lite_list[entry>>5] |= 1 << (entry & 31); - + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); if (phys_page > upl->highest_page) @@ -7935,10 +7756,10 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us if (user_page_list) { user_page_list[entry].phys_addr = phys_page; - user_page_list[entry].absent = dst_page->absent; - user_page_list[entry].dirty = dst_page->dirty; - user_page_list[entry].free_when_done = dst_page->free_when_done; - 
user_page_list[entry].precious = dst_page->precious; + user_page_list[entry].absent = dst_page->vmp_absent; + user_page_list[entry].dirty = dst_page->vmp_dirty; + user_page_list[entry].free_when_done = dst_page->vmp_free_when_done; + user_page_list[entry].precious = dst_page->vmp_precious; user_page_list[entry].device = FALSE; user_page_list[entry].speculative = FALSE; user_page_list[entry].cs_validated = FALSE; @@ -7953,7 +7774,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us VM_CHECK_MEMORYSTATUS; } - dst_page = (vm_page_t)vm_page_queue_next(&dst_page->listq); + dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq); } done: vm_page_unlock_queues(); @@ -7966,7 +7787,8 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us kern_return_t vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list, - wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset, int page_count) + wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset, + int page_count, int* page_grab_count) { vm_page_t dst_page; boolean_t no_zero_fill = FALSE; @@ -8002,7 +7824,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u #endif /* CONFIG_SECLUDED_MEMORY */ while (page_count--) { - + while ((dst_page = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) { @@ -8017,7 +7839,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u OSAddAtomic(-page_count, &vm_upl_wait_for_pages); VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1); - + ret = MACH_SEND_INTERRUPTED; goto done; } @@ -8028,19 +7850,19 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u if (no_zero_fill == FALSE) vm_page_zero_fill(dst_page); else - dst_page->absent = TRUE; + dst_page->vmp_absent = TRUE; 
- dst_page->reference = TRUE; + dst_page->vmp_reference = TRUE; if (!(cntrl_flags & UPL_COPYOUT_FROM)) { - SET_PAGE_DIRTY(dst_page, FALSE); - } - if (dst_page->absent == FALSE) { - assert(dst_page->vm_page_q_state == VM_PAGE_NOT_ON_Q); - assert(dst_page->wire_count == 0); - dst_page->wire_count++; - dst_page->vm_page_q_state = VM_PAGE_IS_WIRED; - assert(dst_page->wire_count); + SET_PAGE_DIRTY(dst_page, FALSE); + } + if (dst_page->vmp_absent == FALSE) { + assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q); + assert(dst_page->vmp_wire_count == 0); + dst_page->vmp_wire_count++; + dst_page->vmp_q_state = VM_PAGE_IS_WIRED; + assert(dst_page->vmp_wire_count); pages_wired++; PAGE_WAKEUP_DONE(dst_page); } @@ -8049,7 +7871,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update); lite_list[entry>>5] |= 1 << (entry & 31); - + phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); if (phys_page > upl->highest_page) @@ -8057,8 +7879,8 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u if (user_page_list) { user_page_list[entry].phys_addr = phys_page; - user_page_list[entry].absent = dst_page->absent; - user_page_list[entry].dirty = dst_page->dirty; + user_page_list[entry].absent = dst_page->vmp_absent; + user_page_list[entry].dirty = dst_page->vmp_dirty; user_page_list[entry].free_when_done = FALSE; user_page_list[entry].precious = FALSE; user_page_list[entry].device = FALSE; @@ -8087,25 +7909,41 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u } if (delayed_ledger_update) { task_t owner; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + boolean_t do_footprint; - owner = object->vo_purgeable_owner; + owner = VM_OBJECT_OWNER(object); assert(owner); + vm_object_ledger_tag_ledgers(object, + 
&ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); + /* more non-volatile bytes */ ledger_credit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - delayed_ledger_update); - /* more footprint */ - ledger_credit(owner->ledger, - task_ledgers.phys_footprint, + ledger_idx_nonvolatile, delayed_ledger_update); + if (do_footprint) { + /* more footprint */ + ledger_credit(owner->ledger, + task_ledgers.phys_footprint, + delayed_ledger_update); + } } + + assert(page_grab_count); + *page_grab_count = pages_inserted; + return (ret); } -unsigned int vm_object_iopl_request_sleep_for_cleaning = 0; - kern_return_t vm_object_iopl_request( @@ -8126,10 +7964,11 @@ vm_object_iopl_request( wpl_array_t lite_list = NULL; int no_zero_fill = FALSE; unsigned int size_in_pages; + int page_grab_count = 0; u_int32_t psize; kern_return_t ret; vm_prot_t prot; - struct vm_object_fault_info fault_info; + struct vm_object_fault_info fault_info = {}; struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT]; struct vm_page_delayed_work *dwp; int dw_count; @@ -8162,7 +8001,7 @@ vm_object_iopl_request( if (object->phys_contiguous) { if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) return KERN_INVALID_ADDRESS; - + if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) return KERN_INVALID_ADDRESS; } @@ -8178,6 +8017,8 @@ vm_object_iopl_request( if ((!object->internal) && (object->paging_offset != 0)) panic("vm_object_iopl_request: external object with non-zero paging offset\n"); + VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0); + #if CONFIG_IOSCHED || UPL_DEBUG if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) io_tracking_flag |= UPL_CREATE_IO_TRACKING; @@ -8218,6 +8059,14 @@ vm_object_iopl_request( user_page_list[0].device = FALSE; *upl_ptr = 
upl; + if (cntrl_flags & UPL_NOZEROFILLIO) { + DTRACE_VM4(upl_nozerofillio, + vm_object_t, object, + vm_object_offset_t, offset, + upl_size_t, size, + upl_t, upl); + } + upl->map_object = object; upl->size = size; @@ -8282,6 +8131,8 @@ vm_object_iopl_request( else *page_list_count = 1; } + + VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0); return KERN_SUCCESS; } if (object != kernel_object && object != compressor_object) { @@ -8338,10 +8189,8 @@ vm_object_iopl_request( FALSE, /* should_return */ MEMORY_OBJECT_COPY_SYNC, VM_PROT_NO_CHANGE); -#if DEVELOPMENT || DEBUG - iopl_cow++; - iopl_cow_pages += size >> PAGE_SHIFT; -#endif + VM_PAGEOUT_DEBUG(iopl_cow, 1); + VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT)); } if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) && object->purgable != VM_PURGABLE_VOLATILE && @@ -8393,8 +8242,8 @@ vm_object_iopl_request( ret = KERN_MEMORY_ERROR; goto return_err; } - ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages); - + ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count); + if (ret) { free_wired_pages = TRUE; goto return_err; @@ -8403,13 +8252,8 @@ vm_object_iopl_request( } fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info.user_tag = 0; fault_info.lo_offset = offset; fault_info.hi_offset = offset + xfer_size; - fault_info.no_cache = FALSE; - fault_info.stealth = FALSE; - fault_info.io_sync = FALSE; - fault_info.cs_bypass = FALSE; fault_info.mark_zf_absent = TRUE; fault_info.interruptible = interruptible; fault_info.batch_pmap_op = TRUE; @@ -8437,11 +8281,11 @@ vm_object_iopl_request( dst_page = vm_page_lookup(object, dst_offset); if (dst_page == VM_PAGE_NULL || - dst_page->busy || - dst_page->error || - dst_page->restart || - dst_page->absent || - dst_page->fictitious) { + dst_page->vmp_busy 
|| + dst_page->vmp_error || + dst_page->vmp_restart || + dst_page->vmp_absent || + dst_page->vmp_fictitious) { if (object == kernel_object) panic("vm_object_iopl_request: missing/bad page in kernel object\n"); @@ -8484,15 +8328,16 @@ vm_object_iopl_request( switch (result) { case VM_FAULT_SUCCESS: + page_grab_count++; - if ( !dst_page->absent) { + if ( !dst_page->vmp_absent) { PAGE_WAKEUP_DONE(dst_page); } else { /* * we only get back an absent page if we * requested that it not be zero-filled * because we are about to fill it via I/O - * + * * absent pages should be left BUSY * to prevent them from being faulted * into an address space before we've @@ -8509,11 +8354,11 @@ vm_object_iopl_request( vm_object_t local_object; local_object = VM_PAGE_OBJECT(top_page); - + /* * comparing 2 packed pointers */ - if (top_page->vm_page_object != dst_page->vm_page_object) { + if (top_page->vmp_object != dst_page->vmp_object) { vm_object_lock(local_object); VM_PAGE_FREE(top_page); vm_object_paging_end(local_object); @@ -8525,7 +8370,7 @@ vm_object_iopl_request( } vm_object_paging_end(object); break; - + case VM_FAULT_RETRY: vm_object_lock(object); break; @@ -8576,12 +8421,12 @@ vm_object_iopl_request( if (upl->flags & UPL_KERNEL_OBJECT) goto record_phys_addr; - if (dst_page->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { - dst_page->busy = TRUE; + if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { + dst_page->vmp_busy = TRUE; goto record_phys_addr; } - if (dst_page->cleaning) { + if (dst_page->vmp_cleaning) { /* * Someone else is cleaning this page in place. * In theory, we should be able to proceed and use this @@ -8592,11 +8437,11 @@ vm_object_iopl_request( * We'd better wait for the cleaning to complete and * then try again. 
*/ - vm_object_iopl_request_sleep_for_cleaning++; + VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1); PAGE_SLEEP(object, dst_page, THREAD_UNINT); continue; } - if (dst_page->laundry) + if (dst_page->vmp_laundry) vm_pageout_steal_laundry(dst_page, FALSE); if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) && @@ -8629,20 +8474,20 @@ vm_object_iopl_request( * it after we disconnect it... we want the fault * to find the new page being substituted. */ - if (dst_page->pmapped) + if (dst_page->vmp_pmapped) refmod = pmap_disconnect(phys_page); else refmod = 0; - if (!dst_page->absent) + if (!dst_page->vmp_absent) vm_page_copy(dst_page, low_page); - - low_page->reference = dst_page->reference; - low_page->dirty = dst_page->dirty; - low_page->absent = dst_page->absent; + + low_page->vmp_reference = dst_page->vmp_reference; + low_page->vmp_dirty = dst_page->vmp_dirty; + low_page->vmp_absent = dst_page->vmp_absent; if (refmod & VM_MEM_REFERENCED) - low_page->reference = TRUE; + low_page->vmp_reference = TRUE; if (refmod & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(low_page, FALSE); } @@ -8655,12 +8500,12 @@ vm_object_iopl_request( * BUSY... we don't need a PAGE_WAKEUP_DONE * here, because we've never dropped the object lock */ - if ( !dst_page->absent) - dst_page->busy = FALSE; + if ( !dst_page->vmp_absent) + dst_page->vmp_busy = FALSE; phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page); } - if ( !dst_page->busy) + if ( !dst_page->vmp_busy) dwp->dw_mask |= DW_vm_page_wire; if (cntrl_flags & UPL_BLOCK_ACCESS) { @@ -8670,8 +8515,8 @@ vm_object_iopl_request( * We'll also remove the mapping * of all these pages before leaving this routine. 
*/ - assert(!dst_page->fictitious); - dst_page->busy = TRUE; + assert(!dst_page->vmp_fictitious); + dst_page->vmp_busy = TRUE; } /* * expect the page to be used @@ -8680,15 +8525,15 @@ vm_object_iopl_request( dwp->dw_mask |= DW_set_reference; if (!(cntrl_flags & UPL_COPYOUT_FROM)) { - SET_PAGE_DIRTY(dst_page, TRUE); + SET_PAGE_DIRTY(dst_page, TRUE); } - if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) { + if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) { pmap_sync_page_attributes_phys(phys_page); - dst_page->written_by_kernel = FALSE; + dst_page->vmp_written_by_kernel = FALSE; } record_phys_addr: - if (dst_page->busy) + if (dst_page->vmp_busy) upl->flags |= UPL_HAS_BUSY; lite_list[entry>>5] |= 1 << (entry & 31); @@ -8698,28 +8543,28 @@ vm_object_iopl_request( if (user_page_list) { user_page_list[entry].phys_addr = phys_page; - user_page_list[entry].free_when_done = dst_page->free_when_done; - user_page_list[entry].absent = dst_page->absent; - user_page_list[entry].dirty = dst_page->dirty; - user_page_list[entry].precious = dst_page->precious; + user_page_list[entry].free_when_done = dst_page->vmp_free_when_done; + user_page_list[entry].absent = dst_page->vmp_absent; + user_page_list[entry].dirty = dst_page->vmp_dirty; + user_page_list[entry].precious = dst_page->vmp_precious; user_page_list[entry].device = FALSE; user_page_list[entry].needed = FALSE; - if (dst_page->clustered == TRUE) - user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE; + if (dst_page->vmp_clustered == TRUE) + user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? 
TRUE : FALSE; else user_page_list[entry].speculative = FALSE; - user_page_list[entry].cs_validated = dst_page->cs_validated; - user_page_list[entry].cs_tainted = dst_page->cs_tainted; - user_page_list[entry].cs_nx = dst_page->cs_nx; + user_page_list[entry].cs_validated = dst_page->vmp_cs_validated; + user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted; + user_page_list[entry].cs_nx = dst_page->vmp_cs_nx; user_page_list[entry].mark = FALSE; } if (object != kernel_object && object != compressor_object) { /* * someone is explicitly grabbing this page... * update clustered and speculative state - * + * */ - if (dst_page->clustered) + if (dst_page->vmp_clustered) VM_PAGE_CONSUME_CLUSTERED(dst_page); } skip_page: @@ -8732,7 +8577,7 @@ vm_object_iopl_request( if (dw_count >= dw_limit) { vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count); - + dwp = &dw_array[0]; dw_count = 0; } @@ -8767,6 +8612,7 @@ vm_object_iopl_request( object->blocked_access = TRUE; } + VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0); return KERN_SUCCESS; return_err: @@ -8781,7 +8627,7 @@ vm_object_iopl_request( panic("vm_object_iopl_request: Wired page missing. \n"); /* - * if we've already processed this page in an earlier + * if we've already processed this page in an earlier * dw_do_work, we need to undo the wiring... 
we will * leave the dirty and reference bits on if they * were set, since we don't have a good way of knowing @@ -8807,7 +8653,7 @@ vm_object_iopl_request( } vm_page_lock_queues(); - if (dst_page->absent || free_wired_pages == TRUE) { + if (dst_page->vmp_absent || free_wired_pages == TRUE) { vm_page_free(dst_page); need_unwire = FALSE; @@ -8832,6 +8678,7 @@ vm_object_iopl_request( vm_object_unlock(object); upl_destroy(upl); + VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0); return ret; } @@ -8847,7 +8694,7 @@ upl_transpose( if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) { return KERN_INVALID_ARGUMENT; } - + upls_locked = FALSE; /* @@ -8964,6 +8811,7 @@ boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, }; int vm_paging_max_index = 0; int vm_paging_page_waiter = 0; int vm_paging_page_waiter_total = 0; + unsigned long vm_paging_no_kernel_page = 0; unsigned long vm_paging_objects_mapped = 0; unsigned long vm_paging_pages_mapped = 0; @@ -9055,7 +8903,7 @@ vm_paging_map_object( #warn "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..." #endif - assert(page->busy); + assert(page->vmp_busy); /* * Use one of the pre-allocated kernel virtual addresses * and just enter the VM page in the kernel address space @@ -9116,12 +8964,12 @@ vm_paging_map_object( vm_paging_page_inuse[i] = TRUE; simple_unlock(&vm_paging_lock); - page->pmapped = TRUE; + page->vmp_pmapped = TRUE; /* * Keep the VM object locked over the PMAP_ENTER * and the actual use of the page by the kernel, - * or this pmap mapping might get undone by a + * or this pmap mapping might get undone by a * vm_object_pmap_protect() call... 
*/ PMAP_ENTER(kernel_pmap, @@ -9134,7 +8982,7 @@ vm_paging_map_object( kr); assert(kr == KERN_SUCCESS); vm_paging_objects_mapped++; - vm_paging_pages_mapped++; + vm_paging_pages_mapped++; *address = page_map_offset; *need_unmap = TRUE; @@ -9218,7 +9066,7 @@ vm_paging_map_object( printf("vm_paging_map_object: no page !?"); vm_object_unlock(object); kr = vm_map_remove(kernel_map, *address, *size, - VM_MAP_NO_FLAGS); + VM_MAP_REMOVE_NO_FLAGS); assert(kr == KERN_SUCCESS); *address = 0; *size = 0; @@ -9226,7 +9074,7 @@ vm_paging_map_object( vm_object_lock(object); return KERN_MEMORY_ERROR; } - page->pmapped = TRUE; + page->vmp_pmapped = TRUE; //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page))); PMAP_ENTER(kernel_pmap, @@ -9242,7 +9090,7 @@ vm_paging_map_object( kasan_notify_address(*address + page_map_offset, PAGE_SIZE); #endif } - + vm_paging_objects_mapped_slow++; vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64); @@ -9280,7 +9128,8 @@ vm_paging_unmap_object( if (object != VM_OBJECT_NULL) { vm_object_unlock(object); } - kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS); + kr = vm_map_remove(kernel_map, start, end, + VM_MAP_REMOVE_NO_FLAGS); if (object != VM_OBJECT_NULL) { vm_object_lock(object); } @@ -9309,7 +9158,7 @@ vm_paging_unmap_object( /* - * page->object must be locked + * page->vmp_object must be locked */ void vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked) @@ -9318,7 +9167,7 @@ vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked) vm_page_lockspin_queues(); } - page->free_when_done = FALSE; + page->vmp_free_when_done = FALSE; /* * need to drop the laundry count... 
* we may also need to remove it @@ -9329,8 +9178,6 @@ vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked) */ vm_pageout_throttle_up(page); - vm_page_steal_pageout_page++; - if (!queues_locked) { vm_page_unlock_queues(); } @@ -9352,11 +9199,11 @@ vector_upl_create(vm_offset_t upl_offset) vector_upl->invalid_upls=0; vector_upl->num_upls=0; vector_upl->pagelist = NULL; - + for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) { vector_upl->upl_iostates[i].size = 0; vector_upl->upl_iostates[i].offset = 0; - + } return upl; } @@ -9401,9 +9248,9 @@ vector_upl_is_valid(upl_t upl) boolean_t vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { vector_upl_t vector_upl = upl->vector_upl; - + if(vector_upl) { if(subupl) { if(io_size) { @@ -9422,12 +9269,12 @@ vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size) } if(i == vector_upl->num_upls) panic("Trying to remove sub-upl when none exists"); - + vector_upl->upl_elems[i] = NULL; - invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1); + invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1); if(invalid_upls == vector_upl->num_upls) return TRUE; - else + else return FALSE; } } @@ -9441,12 +9288,12 @@ vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size) panic("vector_upl_set_subupl was passed a NULL upl\n"); return FALSE; -} +} void vector_upl_set_pagelist(upl_t upl) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { uint32_t i=0; vector_upl_t vector_upl = upl->vector_upl; @@ -9454,7 +9301,7 @@ vector_upl_set_pagelist(upl_t upl) vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0; vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)); - + for(i=0; i < vector_upl->num_upls; i++) { cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE; 
bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size); @@ -9475,7 +9322,7 @@ vector_upl_set_pagelist(upl_t upl) upl_t vector_upl_subupl_byindex(upl_t upl, uint32_t index) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { vector_upl_t vector_upl = upl->vector_upl; if(vector_upl) { if(index < vector_upl->num_upls) @@ -9490,7 +9337,7 @@ vector_upl_subupl_byindex(upl_t upl, uint32_t index) upl_t vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { uint32_t i=0; vector_upl_t vector_upl = upl->vector_upl; @@ -9518,7 +9365,7 @@ vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_ else if(i) panic("Vector UPL offset miscalculation\n"); return subupl; - } + } } } else @@ -9532,7 +9379,7 @@ vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst { *v_upl_submap = NULL; - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { vector_upl_t vector_upl = upl->vector_upl; if(vector_upl) { *v_upl_submap = vector_upl->submap; @@ -9548,7 +9395,7 @@ vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst void vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { vector_upl_t vector_upl = upl->vector_upl; if(vector_upl) { vector_upl->submap = submap; @@ -9564,7 +9411,7 @@ vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr) void vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { uint32_t i = 0; vector_upl_t vector_upl = upl->vector_upl; @@ -9573,7 +9420,7 @@ vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t if(vector_upl->upl_elems[i] == subupl) break; } - 
+ if(i == vector_upl->num_upls) panic("setting sub-upl iostate when none exists"); @@ -9592,7 +9439,7 @@ vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t void vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { uint32_t i = 0; vector_upl_t vector_upl = upl->vector_upl; @@ -9601,7 +9448,7 @@ vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t if(vector_upl->upl_elems[i] == subupl) break; } - + if(i == vector_upl->num_upls) panic("getting sub-upl iostate when none exists"); @@ -9618,7 +9465,7 @@ vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t void vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size) { - if(vector_upl_is_valid(upl)) { + if(vector_upl_is_valid(upl)) { vector_upl_t vector_upl = upl->vector_upl; if(vector_upl) { if(index < vector_upl->num_upls) { @@ -9693,161 +9540,14 @@ upl_set_blkno( int i,j; if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) return; - - assert(upl->upl_reprio_info != 0); + + assert(upl->upl_reprio_info != 0); for(i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) { UPL_SET_REPRIO_INFO(upl, i, blkno, io_size); } } #endif -boolean_t -vm_page_is_slideable(vm_page_t m) -{ - boolean_t result = FALSE; - vm_shared_region_slide_info_t si; - vm_object_t m_object; - - m_object = VM_PAGE_OBJECT(m); - - vm_object_lock_assert_held(m_object); - - /* make sure our page belongs to the one object allowed to do this */ - if (!m_object->object_slid) { - goto done; - } - - si = m_object->vo_slide_info; - if (si == NULL) { - goto done; - } - - if(!m->slid && (si->start <= m->offset && si->end > m->offset)) { - result = TRUE; - } - -done: - return result; -} - -int vm_page_slide_counter = 0; -int vm_page_slide_errors = 0; -kern_return_t -vm_page_slide( - vm_page_t page, - vm_map_offset_t 
kernel_mapping_offset) -{ - kern_return_t kr; - vm_map_size_t kernel_mapping_size; - boolean_t kernel_mapping_needs_unmap; - vm_offset_t kernel_vaddr; - uint32_t pageIndex; - uint32_t slide_chunk; - vm_object_t page_object; - - page_object = VM_PAGE_OBJECT(page); - - assert(!page->slid); - assert(page_object->object_slid); - vm_object_lock_assert_exclusive(page_object); - - if (page->error) - return KERN_FAILURE; - - /* - * Take a paging-in-progress reference to keep the object - * alive even if we have to unlock it (in vm_paging_map_object() - * for example)... - */ - vm_object_paging_begin(page_object); - - if (kernel_mapping_offset == 0) { - /* - * The page hasn't already been mapped in kernel space - * by the caller. Map it now, so that we can access - * its contents and decrypt them. - */ - kernel_mapping_size = PAGE_SIZE; - kernel_mapping_needs_unmap = FALSE; - kr = vm_paging_map_object(page, - page_object, - page->offset, - VM_PROT_READ | VM_PROT_WRITE, - FALSE, - &kernel_mapping_size, - &kernel_mapping_offset, - &kernel_mapping_needs_unmap); - if (kr != KERN_SUCCESS) { - panic("vm_page_slide: " - "could not map page in kernel: 0x%x\n", - kr); - } - } else { - kernel_mapping_size = 0; - kernel_mapping_needs_unmap = FALSE; - } - kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset); - - /* - * Slide the pointers on the page. 
- */ - - /*assert that slide_file_info.start/end are page-aligned?*/ - - assert(!page->slid); - assert(page_object->object_slid); - - pageIndex = (uint32_t)((page->offset - - page_object->vo_slide_info->start) / - PAGE_SIZE_FOR_SR_SLIDE); - for (slide_chunk = 0; - slide_chunk < PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE; - slide_chunk++) { - kr = vm_shared_region_slide_page(page_object->vo_slide_info, - (kernel_vaddr + - (slide_chunk * - PAGE_SIZE_FOR_SR_SLIDE)), - (pageIndex + slide_chunk)); - if (kr != KERN_SUCCESS) { - break; - } - } - - vm_page_slide_counter++; - - /* - * Unmap the page from the kernel's address space, - */ - if (kernel_mapping_needs_unmap) { - vm_paging_unmap_object(page_object, - kernel_vaddr, - kernel_vaddr + PAGE_SIZE); - } - - page->dirty = FALSE; - pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED); - - if (kr != KERN_SUCCESS || cs_debug > 1) { - printf("vm_page_slide(%p): " - "obj %p off 0x%llx mobj %p moff 0x%llx\n", - page, - page_object, page->offset, - page_object->pager, - page->offset + page_object->paging_offset); - } - - if (kr == KERN_SUCCESS) { - page->slid = TRUE; - } else { - page->error = TRUE; - vm_page_slide_errors++; - } - - vm_object_paging_end(page_object); - - return kr; -} - void inline memoryshot(unsigned int event, unsigned int control) { if (vm_debug_events) { @@ -9917,12 +9617,12 @@ vm_countdirtypages(void) do { if (m ==(vm_page_t )0) break; - if(m->dirty) dpages++; - if(m->free_when_done) pgopages++; - if(m->precious) precpages++; + if(m->vmp_dirty) dpages++; + if(m->vmp_free_when_done) pgopages++; + if(m->vmp_precious) precpages++; assert(VM_PAGE_OBJECT(m) != kernel_object); - m = (vm_page_t) vm_page_queue_next(&m->pageq); + m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); if (m ==(vm_page_t )0) break; } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m)); @@ -9934,10 +9634,10 @@ vm_countdirtypages(void) if (m ==(vm_page_t )0) break; dpages++; - 
assert(m->dirty); - assert(!m->free_when_done); + assert(m->vmp_dirty); + assert(!m->vmp_free_when_done); assert(VM_PAGE_OBJECT(m) != kernel_object); - m = (vm_page_t) vm_page_queue_next(&m->pageq); + m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); if (m ==(vm_page_t )0) break; } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m)); @@ -9948,12 +9648,12 @@ vm_countdirtypages(void) do { if (m ==(vm_page_t )0) break; - if(m->dirty) dpages++; - if(m->free_when_done) pgopages++; - if(m->precious) precpages++; + if(m->vmp_dirty) dpages++; + if(m->vmp_free_when_done) pgopages++; + if(m->vmp_precious) precpages++; assert(VM_PAGE_OBJECT(m) != kernel_object); - m = (vm_page_t) vm_page_queue_next(&m->pageq); + m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); if (m ==(vm_page_t )0) break; } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m)); @@ -9970,12 +9670,12 @@ vm_countdirtypages(void) do { if(m == (vm_page_t )0) break; - if(m->dirty) dpages++; - if(m->free_when_done) pgopages++; - if(m->precious) precpages++; + if(m->vmp_dirty) dpages++; + if(m->vmp_free_when_done) pgopages++; + if(m->vmp_precious) precpages++; assert(VM_PAGE_OBJECT(m) != kernel_object); - m = (vm_page_t) vm_page_queue_next(&m->pageq); + m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); if(m == (vm_page_t )0) break; } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m)); @@ -9995,7 +9695,35 @@ int upl_get_cached_tier(upl_t upl) return (upl->upl_priority); return (-1); } -#endif /* CONFIG_IOSCHED */ +#endif /* CONFIG_IOSCHED */ + + +void upl_callout_iodone(upl_t upl) +{ + struct upl_io_completion *upl_ctx = upl->upl_iodone; + + if (upl_ctx) { + void (*iodone_func)(void *, int) = upl_ctx->io_done; + + assert(upl_ctx->io_done); + + (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error); + } +} + +void upl_set_iodone(upl_t upl, void *upl_iodone) +{ + upl->upl_iodone = (struct upl_io_completion *)upl_iodone; +} + +void 
upl_set_iodone_error(upl_t upl, int error) +{ + struct upl_io_completion *upl_ctx = upl->upl_iodone; + + if (upl_ctx) + upl_ctx->io_error = error; +} + ppnum_t upl_get_highest_page( upl_t upl) @@ -10025,7 +9753,7 @@ struct vnode * upl_lookup_vnode(upl_t upl) return vnode_pager_lookup_vnode(upl->map_object->pager); else return NULL; -} +} #if UPL_DEBUG kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2) @@ -10249,7 +9977,7 @@ vm_test_collapse_compressor(void) vm_map_remove(kernel_map, backing_offset, backing_offset + backing_size, - 0); + VM_MAP_REMOVE_NO_FLAGS); printf("VM_TEST_COLLAPSE_COMPRESSOR: " "unmapped backing_object %p [0x%llx:0x%llx]\n", backing_object, diff --git a/osfmk/vm/vm_pageout.h b/osfmk/vm/vm_pageout.h index a39763477..237205926 100644 --- a/osfmk/vm/vm_pageout.h +++ b/osfmk/vm/vm_pageout.h @@ -90,7 +90,7 @@ #define VM_PAGE_AVAILABLE_COUNT() ((unsigned int)(vm_page_cleaned_count)) /* externally manipulated counters */ -extern unsigned int vm_pageout_cleaned_reactivated, vm_pageout_cleaned_fault_reactivated, vm_pageout_cleaned_commit_reactivated; +extern unsigned int vm_pageout_cleaned_fault_reactivated; #if CONFIG_FREEZE extern boolean_t memorystatus_freeze_enabled; @@ -137,6 +137,8 @@ extern int vm_debug_events; #define VM_INFO5 0x10F #define VM_INFO6 0x110 #define VM_INFO7 0x111 +#define VM_INFO8 0x112 +#define VM_INFO9 0x113 #define VM_UPL_PAGE_WAIT 0x120 #define VM_IOPL_PAGE_WAIT 0x121 @@ -148,15 +150,23 @@ extern int vm_debug_events; #define VM_PAGE_EXPEDITE_NO_MEMORY 0x125 #endif +#define VM_PAGE_GRAB 0x126 +#define VM_PAGE_RELEASE 0x127 + #define VM_PRESSURE_EVENT 0x130 #define VM_EXECVE 0x131 #define VM_WAKEUP_COMPACTOR_SWAPPER 0x132 +#define VM_UPL_REQUEST 0x133 +#define VM_IOPL_REQUEST 0x134 +#define VM_KERN_REQUEST 0x135 #define VM_DATA_WRITE 0x140 +#define VM_PRESSURE_LEVEL_CHANGE 0x141 + #define VM_DEBUG_EVENT(name, event, control, arg1, arg2, arg3, arg4) \ MACRO_BEGIN \ - if (vm_debug_events) { \ + if 
(__improbable(vm_debug_events)) { \ KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, event)) | control, arg1, arg2, arg3, arg4, 0); \ } \ MACRO_END @@ -175,6 +185,10 @@ extern int upl_get_cached_tier( upl_t upl); #endif +extern void upl_set_iodone(upl_t, void *); +extern void upl_set_iodone_error(upl_t, int); +extern void upl_callout_iodone(upl_t); + extern ppnum_t upl_get_highest_page( upl_t upl); @@ -324,6 +338,14 @@ struct ucd { }; #endif +struct upl_io_completion { + + void *io_context; + void (*io_done)(void *, int); + + int io_error; +}; + struct upl { decl_lck_mtx_data(, Lock) /* Synchronization */ @@ -337,6 +359,7 @@ struct upl { ppnum_t highest_page; void* vector_upl; upl_t associated_upl; + struct upl_io_completion *upl_iodone; #if CONFIG_IOSCHED int upl_priority; uint64_t *upl_reprio_info; @@ -470,9 +493,6 @@ extern void vm_pageout_steal_laundry( vm_page_t page, boolean_t queues_locked); -extern boolean_t vm_page_is_slideable(vm_page_t m); - -extern kern_return_t vm_page_slide(vm_page_t page, vm_map_offset_t kernel_mapping_offset); #endif /* MACH_KERNEL_PRIVATE */ #if UPL_DEBUG @@ -536,9 +556,7 @@ extern void hibernate_create_paddr_map(void); extern void vm_set_restrictions(void); extern int vm_compressor_mode; -extern int vm_compressor_thread_count; -extern boolean_t vm_restricted_to_single_processor; -extern kern_return_t vm_pageout_compress_page(void **, char *, vm_page_t, boolean_t); +extern kern_return_t vm_pageout_compress_page(void **, char *, vm_page_t); extern void vm_pageout_anonymous_pages(void); extern void vm_pageout_disconnect_all_pages(void); @@ -574,6 +592,161 @@ extern struct vm_config vm_config; #endif /* KERNEL_PRIVATE */ #ifdef XNU_KERNEL_PRIVATE + +struct vm_pageout_state { + boolean_t vm_pressure_thread_running; + boolean_t vm_pressure_changed; + boolean_t vm_restricted_to_single_processor; + int vm_compressor_thread_count; + + unsigned int vm_page_speculative_q_age_ms; + unsigned int vm_page_speculative_percentage; + unsigned int 
vm_page_speculative_target; + + unsigned int vm_pageout_swap_wait; + unsigned int vm_pageout_idle_wait; /* milliseconds */ + unsigned int vm_pageout_empty_wait; /* milliseconds */ + unsigned int vm_pageout_burst_wait; /* milliseconds */ + unsigned int vm_pageout_deadlock_wait; /* milliseconds */ + unsigned int vm_pageout_deadlock_relief; + unsigned int vm_pageout_burst_inactive_throttle; + + unsigned int vm_pageout_inactive; + unsigned int vm_pageout_inactive_used; /* debugging */ + unsigned int vm_pageout_inactive_clean; /* debugging */ + + uint32_t vm_page_filecache_min; + uint32_t vm_page_filecache_min_divisor; + uint32_t vm_page_xpmapped_min; + uint32_t vm_page_xpmapped_min_divisor; + uint64_t vm_pageout_considered_page_last; + + int vm_page_free_count_init; + + unsigned int vm_memory_pressure; + + int memorystatus_purge_on_critical; + int memorystatus_purge_on_warning; + int memorystatus_purge_on_urgent; + + thread_t vm_pageout_external_iothread; + thread_t vm_pageout_internal_iothread; +}; + +extern struct vm_pageout_state vm_pageout_state; + +/* + * This structure is used to track the VM_INFO instrumentation + */ +struct vm_pageout_vminfo { + unsigned long vm_pageout_considered_page; + unsigned long vm_pageout_considered_bq_internal; + unsigned long vm_pageout_considered_bq_external; + unsigned long vm_pageout_skipped_external; + + unsigned long vm_pageout_pages_evicted; + unsigned long vm_pageout_pages_purged;; + unsigned long vm_pageout_freed_cleaned; + unsigned long vm_pageout_freed_speculative; + unsigned long vm_pageout_freed_external; + unsigned long vm_pageout_freed_internal; + unsigned long vm_pageout_inactive_dirty_internal; + unsigned long vm_pageout_inactive_dirty_external; + unsigned long vm_pageout_inactive_referenced; + unsigned long vm_pageout_reactivation_limit_exceeded; + unsigned long vm_pageout_inactive_force_reclaim; + unsigned long vm_pageout_inactive_nolock; + unsigned long vm_pageout_filecache_min_reactivated; + unsigned long 
vm_pageout_scan_inactive_throttled_internal; + unsigned long vm_pageout_scan_inactive_throttled_external; + + uint64_t vm_pageout_compressions; + uint64_t vm_compressor_pages_grabbed; + unsigned long vm_compressor_failed; + + unsigned long vm_page_pages_freed; + + unsigned long vm_phantom_cache_found_ghost; + unsigned long vm_phantom_cache_added_ghost; +}; + +extern struct vm_pageout_vminfo vm_pageout_vminfo; + + +#if DEVELOPMENT || DEBUG + +/* + * This structure records the pageout daemon's actions: + * how many pages it looks at and what happens to those pages. + * No locking needed because only one thread modifies the fields. + */ +struct vm_pageout_debug { + uint32_t vm_pageout_balanced; + uint32_t vm_pageout_scan_event_counter; + uint32_t vm_pageout_speculative_dirty; + + uint32_t vm_pageout_inactive_busy; + uint32_t vm_pageout_inactive_absent; + uint32_t vm_pageout_inactive_notalive; + uint32_t vm_pageout_inactive_error; + uint32_t vm_pageout_inactive_deactivated; + + uint32_t vm_pageout_enqueued_cleaned; + + uint32_t vm_pageout_cleaned_busy; + uint32_t vm_pageout_cleaned_nolock; + uint32_t vm_pageout_cleaned_reference_reactivated; + uint32_t vm_pageout_cleaned_volatile_reactivated; + uint32_t vm_pageout_cleaned_reactivated; /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */ + uint32_t vm_pageout_cleaned_fault_reactivated; + + uint32_t vm_pageout_dirty_no_pager; + uint32_t vm_pageout_purged_objects; + + uint32_t vm_pageout_scan_throttle; + uint32_t vm_pageout_scan_reclaimed_throttled; + uint32_t vm_pageout_scan_burst_throttle; + uint32_t vm_pageout_scan_empty_throttle; + uint32_t vm_pageout_scan_swap_throttle; + uint32_t vm_pageout_scan_deadlock_detected; + uint32_t vm_pageout_scan_inactive_throttle_success; + uint32_t vm_pageout_scan_throttle_deferred; + + uint32_t vm_pageout_inactive_external_forced_jetsam_count; + + uint32_t vm_grab_anon_overrides; + uint32_t vm_grab_anon_nops; + + uint32_t 
vm_pageout_no_victim; + unsigned long vm_pageout_throttle_up_count; + uint32_t vm_page_steal_pageout_page; + + uint32_t vm_cs_validated_resets; + uint32_t vm_object_iopl_request_sleep_for_cleaning; + uint32_t vm_page_slide_counter; + uint32_t vm_page_slide_errors; + uint32_t vm_page_throttle_count; + /* + * Statistics about UPL enforcement of copy-on-write obligations. + */ + unsigned long upl_cow; + unsigned long upl_cow_again; + unsigned long upl_cow_pages; + unsigned long upl_cow_again_pages; + unsigned long iopl_cow; + unsigned long iopl_cow_pages; +}; + +extern struct vm_pageout_debug vm_pageout_debug; + +#define VM_PAGEOUT_DEBUG(member, value) \ + MACRO_BEGIN \ + vm_pageout_debug.member += value; \ + MACRO_END +#else +#define VM_PAGEOUT_DEBUG(member, value) +#endif + #define MAX_COMPRESSOR_THREAD_COUNT 8 #if DEVELOPMENT || DEBUG diff --git a/osfmk/vm/vm_phantom_cache.c b/osfmk/vm/vm_phantom_cache.c index a075f53fa..95bdaa27e 100644 --- a/osfmk/vm/vm_phantom_cache.c +++ b/osfmk/vm/vm_phantom_cache.c @@ -39,7 +39,7 @@ uint32_t phantom_cache_thrashing_threshold_ssd = 1000; #if CONFIG_EMBEDDED uint32_t phantom_cache_thrashing_threshold = 500; #else -uint32_t phantom_cache_thrashing_threshold = 100; +uint32_t phantom_cache_thrashing_threshold = 50; #endif /* @@ -102,6 +102,7 @@ struct phantom_cache_stats { } phantom_cache_stats; + void vm_phantom_cache_init() { @@ -173,7 +174,7 @@ vm_phantom_cache_add_ghost(vm_page_t m) if (vm_phantom_cache_num_entries == 0) return; - pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; + pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; if (object->phantom_object_id == 0) { @@ -239,7 +240,7 @@ vm_phantom_cache_add_ghost(vm_page_t m) phantom_cache_stats.pcs_added_new_entry++; vpce->g_pages_held = pg_mask; - vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK; + vpce->g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & 
VM_GHOST_OFFSET_MASK; vpce->g_obj_id = object->phantom_object_id; ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset); @@ -247,6 +248,8 @@ vm_phantom_cache_add_ghost(vm_page_t m) vm_phantom_cache_hash[ghost_hash_index] = ghost_index; done: + vm_pageout_vminfo.vm_phantom_cache_added_ghost++; + if (object->phantom_isssd) OSAddAtomic(1, &sample_period_ghost_added_count_ssd); else @@ -270,7 +273,7 @@ vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask) */ return (NULL); } - g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK; + g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK; ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)]; @@ -314,13 +317,14 @@ vm_phantom_cache_update(vm_page_t m) if (vm_phantom_cache_num_entries == 0) return; - pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; + pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) { vpce->g_pages_held &= ~pg_mask; phantom_cache_stats.pcs_updated_phantom_state++; + vm_pageout_vminfo.vm_phantom_cache_found_ghost++; if (object->phantom_isssd) OSAddAtomic(1, &sample_period_ghost_found_count_ssd); diff --git a/osfmk/vm/vm_protos.h b/osfmk/vm/vm_protos.h index 8044286f5..cf4ad88ce 100644 --- a/osfmk/vm/vm_protos.h +++ b/osfmk/vm/vm_protos.h @@ -173,6 +173,19 @@ extern memory_object_t apple_protect_pager_setup( vm_object_offset_t crypto_end); #endif /* CONFIG_CODE_DECRYPTION */ +struct vm_shared_region_slide_info; +extern kern_return_t vm_map_shared_region( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_object_offset_t backing_offset, + struct vm_shared_region_slide_info *slide_info); +extern void shared_region_pager_bootstrap(void); +extern memory_object_t shared_region_pager_setup( + vm_object_t backing_object, + vm_object_offset_t backing_offset, + struct 
vm_shared_region_slide_info *slide_info); + struct vnode; extern void swapfile_pager_bootstrap(void); extern memory_object_t swapfile_pager_setup(struct vnode *vp); @@ -218,6 +231,11 @@ extern void *upl_get_internal_page_list( extern void vnode_setswapmount(struct vnode *); extern int64_t vnode_getswappin_avail(struct vnode *); +extern void vnode_pager_was_dirtied( + struct vnode *, + vm_object_offset_t, + vm_object_offset_t); + typedef int pager_return_t; extern pager_return_t vnode_pagein( struct vnode *, upl_t, @@ -294,6 +312,10 @@ extern kern_return_t vnode_pager_get_object_devvp( uintptr_t *); #endif +extern void vnode_pager_dirtied( + memory_object_t, + vm_object_offset_t, + vm_object_offset_t); extern kern_return_t vnode_pager_get_isinuse( memory_object_t, uint32_t *); @@ -462,12 +484,26 @@ extern boolean_t cs_validate_range(struct vnode *vp, const void *data, vm_size_t size, unsigned *result); +#if PMAP_CS +extern kern_return_t cs_associate_blob_with_mapping( + void *pmap, + vm_map_offset_t start, + vm_map_size_t size, + vm_object_offset_t offset, + void *blobs_p); +#endif /* PMAP_CS */ extern kern_return_t memory_entry_purgeable_control_internal( ipc_port_t entry_port, vm_purgable_t control, int *state); +extern kern_return_t memory_entry_access_tracking_internal( + ipc_port_t entry_port, + int *access_tracking, + uint32_t *access_tracking_reads, + uint32_t *access_tracking_writes); + extern kern_return_t mach_memory_entry_purgable_control( ipc_port_t entry_port, vm_purgable_t control, @@ -531,6 +567,11 @@ extern int proc_get_memstat_priority(struct proc*, boolean_t); /* returns TRUE if an object was purged, otherwise FALSE. 
*/ boolean_t vm_purgeable_object_purge_one_unlocked(int force_purge_below_group); void vm_purgeable_disown(task_t task); +void vm_purgeable_nonvolatile_owner_update(task_t owner, + int delta); +void vm_purgeable_volatile_owner_update(task_t owner, + int delta); + struct trim_list { uint64_t tl_offset; @@ -597,10 +638,23 @@ extern kern_return_t mach_make_memory_entry_internal( ipc_port_t *object_handle, ipc_port_t parent_handle); +#define roundup(x, y) ((((x) % (y)) == 0) ? \ + (x) : ((x) + ((y) - ((x) % (y))))) + #ifdef __cplusplus } #endif +/* + * Flags for the VM swapper/reclaimer. + * Used by vm_swap_consider_defragment() + * to force defrag/reclaim by the swap + * GC thread. + */ +#define VM_SWAP_FLAGS_NONE 0 +#define VM_SWAP_FLAGS_FORCE_DEFRAG 1 +#define VM_SWAP_FLAGS_FORCE_RECLAIM 2 + #endif /* _VM_VM_PROTOS_H_ */ #endif /* XNU_KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_purgeable.c b/osfmk/vm/vm_purgeable.c index 4606d9591..5fc967116 100644 --- a/osfmk/vm/vm_purgeable.c +++ b/osfmk/vm/vm_purgeable.c @@ -90,11 +90,6 @@ static token_idx_t vm_purgeable_token_remove_first(purgeable_q_t queue); static void vm_purgeable_stats_helper(vm_purgeable_stat_t *stat, purgeable_q_t queue, int group, task_t target_task); -void vm_purgeable_nonvolatile_owner_update(task_t owner, - int delta); -void vm_purgeable_volatile_owner_update(task_t owner, - int delta); - #if MACH_ASSERT static void @@ -724,8 +719,13 @@ vm_purgeable_object_find_and_lock( object_task_importance = 0; - owner = object->vo_purgeable_owner; - if (owner) { + /* + * We don't want to use VM_OBJECT_OWNER() here: we want to + * distinguish kernel-owned and disowned objects. + * Disowned objects have no owner and will have no importance... 
+ */ + owner = object->vo_owner; + if (owner != NULL && owner != VM_OBJECT_OWNER_DISOWNED) { #if CONFIG_EMBEDDED #if CONFIG_JETSAM object_task_importance = proc_get_memstat_priority((struct proc *)get_bsdtask_info(owner), TRUE); @@ -780,7 +780,7 @@ vm_purgeable_object_find_and_lock( object->purgeable_queue_type = PURGEABLE_Q_TYPE_MAX; object->purgeable_queue_group = 0; /* one less volatile object for this object's owner */ - vm_purgeable_volatile_owner_update(object->vo_purgeable_owner, -1); + vm_purgeable_volatile_owner_update(VM_OBJECT_OWNER(object), -1); #if DEBUG object->vo_purgeable_volatilizer = NULL; @@ -793,7 +793,7 @@ vm_purgeable_object_find_and_lock( purgeable_nonvolatile_count++; assert(purgeable_nonvolatile_count > 0); /* one more nonvolatile object for this object's owner */ - vm_purgeable_nonvolatile_owner_update(object->vo_purgeable_owner, +1); + vm_purgeable_nonvolatile_owner_update(VM_OBJECT_OWNER(object), +1); #if MACH_ASSERT queue->debug_count_objects--; @@ -891,6 +891,11 @@ vm_purgeable_object_purge_one( vm_object_t object = 0; purgeable_q_t queue, queue2; boolean_t forced_purge; + unsigned int resident_page_count; + + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)) | DBG_FUNC_START, + force_purge_below_group, flags, 0, 0, 0); /* Need the page queue lock since we'll be changing the token queue. */ LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); @@ -984,22 +989,29 @@ vm_purgeable_object_purge_one( * we have objects in a purgeable state */ lck_mtx_unlock(&vm_purgeable_queue_lock); + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)) | DBG_FUNC_END, + 0, 0, available_for_purge, 0, 0); + return FALSE; purge_now: assert(object); vm_page_unlock_queues(); /* Unlock for call to vm_object_purge() */ -// printf("%sPURGING object %p task %p importance %d queue %d group %d force_purge_below_group %d memorystatus_vm_pressure_level %d\n", forced_purge ? 
"FORCED " : "", object, object->vo_purgeable_owner, task_importance_estimate(object->vo_purgeable_owner), i, group, force_purge_below_group, memorystatus_vm_pressure_level); +// printf("%sPURGING object %p task %p importance %d queue %d group %d force_purge_below_group %d memorystatus_vm_pressure_level %d\n", forced_purge ? "FORCED " : "", object, object->vo_owner, task_importance_estimate(object->vo_owner), i, group, force_purge_below_group, memorystatus_vm_pressure_level); + resident_page_count = object->resident_page_count; (void) vm_object_purge(object, flags); assert(object->purgable == VM_PURGABLE_EMPTY); /* no change in purgeable accounting */ vm_object_unlock(object); vm_page_lock_queues(); - KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)), + vm_pageout_vminfo.vm_pageout_pages_purged += resident_page_count; + + KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)) | DBG_FUNC_END, VM_KERNEL_UNSLIDE_OR_PERM(object), /* purged object */ - 0, + resident_page_count, available_for_purge, 0, 0); @@ -1024,7 +1036,7 @@ vm_purgeable_object_add(vm_object_t object, purgeable_q_t queue, int group) purgeable_nonvolatile_count--; assert(purgeable_nonvolatile_count >= 0); /* one less nonvolatile object for this object's owner */ - vm_purgeable_nonvolatile_owner_update(object->vo_purgeable_owner, -1); + vm_purgeable_nonvolatile_owner_update(VM_OBJECT_OWNER(object), -1); if (queue->type == PURGEABLE_Q_TYPE_OBSOLETE) group = 0; @@ -1035,7 +1047,7 @@ vm_purgeable_object_add(vm_object_t object, purgeable_q_t queue, int group) else queue_enter_first(&queue->objq[group], object, vm_object_t, objq); /* first to die */ /* one more volatile object for this object's owner */ - vm_purgeable_volatile_owner_update(object->vo_purgeable_owner, +1); + vm_purgeable_volatile_owner_update(VM_OBJECT_OWNER(object), +1); object->purgeable_queue_type = queue->type; object->purgeable_queue_group = group; @@ -1043,7 +1055,8 @@ vm_purgeable_object_add(vm_object_t object, 
purgeable_q_t queue, int group) #if DEBUG assert(object->vo_purgeable_volatilizer == NULL); object->vo_purgeable_volatilizer = current_task(); - OSBacktrace(&object->purgeable_volatilizer_bt[0], 16); + OSBacktrace(&object->purgeable_volatilizer_bt[0], + ARRAY_COUNT(object->purgeable_volatilizer_bt)); #endif /* DEBUG */ #if MACH_ASSERT @@ -1089,21 +1102,19 @@ vm_purgeable_object_remove(vm_object_t object) object->objq.next = NULL; object->objq.prev = NULL; /* one less volatile object for this object's owner */ - vm_purgeable_volatile_owner_update(object->vo_purgeable_owner, -1); + vm_purgeable_volatile_owner_update(VM_OBJECT_OWNER(object), -1); #if DEBUG object->vo_purgeable_volatilizer = NULL; #endif /* DEBUG */ /* keep queue of non-volatile objects */ if (object->alive && !object->terminating) { - task_t owner; queue_enter(&purgeable_nonvolatile_queue, object, vm_object_t, objq); assert(purgeable_nonvolatile_count >= 0); purgeable_nonvolatile_count++; assert(purgeable_nonvolatile_count > 0); /* one more nonvolatile object for this object's owner */ - owner = object->vo_purgeable_owner; - vm_purgeable_nonvolatile_owner_update(owner, +1); + vm_purgeable_nonvolatile_owner_update(VM_OBJECT_OWNER(object), +1); } #if MACH_ASSERT @@ -1136,10 +1147,10 @@ vm_purgeable_stats_helper(vm_purgeable_stat_t *stat, purgeable_q_t queue, int gr for (object = (vm_object_t) queue_first(&queue->objq[group]); !queue_end(&queue->objq[group], (queue_entry_t) object); object = (vm_object_t) queue_next(&object->objq)) { - if (!target_task || object->vo_purgeable_owner == target_task) { - stat->count++; - stat->size += (object->resident_page_count * PAGE_SIZE); - } + if (!target_task || VM_OBJECT_OWNER(object) == target_task) { + stat->count++; + stat->size += (object->resident_page_count * PAGE_SIZE); + } } return; } @@ -1184,7 +1195,7 @@ vm_purgeable_account_volatile_queue( for (object = (vm_object_t) queue_first(&queue->objq[group]); !queue_end(&queue->objq[group], (queue_entry_t) 
object); object = (vm_object_t) queue_next(&object->objq)) { - if (object->vo_purgeable_owner == task) { + if (VM_OBJECT_OWNER(object) == task) { compressed_count = vm_compressor_pager_get_count(object->pager); acnt_info->pvm_volatile_compressed_count += compressed_count; acnt_info->pvm_volatile_count += (object->resident_page_count - object->wired_page_count); @@ -1226,7 +1237,7 @@ vm_purgeable_account( for (object = (vm_object_t) queue_first(nonvolatile_q); !queue_end(nonvolatile_q, (queue_entry_t) object); object = (vm_object_t) queue_next(&object->objq)) { - if (object->vo_purgeable_owner == task) { + if (VM_OBJECT_OWNER(object) == task) { state = object->purgable; compressed_count = vm_compressor_pager_get_count(object->pager); if (state == VM_PURGABLE_EMPTY) { @@ -1319,18 +1330,21 @@ vm_purgeable_disown( #if DEBUG assert(object->vo_purgeable_volatilizer == NULL); #endif /* DEBUG */ - assert(object->vo_purgeable_owner == task); + assert(object->vo_owner == task); if (!vm_object_lock_try(object)) { lck_mtx_unlock(&vm_purgeable_queue_lock); task_objq_unlock(task); mutex_pause(collisions++); goto again; } - vm_purgeable_accounting(object, - object->purgable, - TRUE, /* disown */ - TRUE);/* task_objq_lock is locked */ - assert(object->vo_purgeable_owner == NULL); + /* transfer ownership to the kernel */ + assert(VM_OBJECT_OWNER(object) != kernel_task); + vm_object_ownership_change( + object, + object->vo_ledger_tag, /* unchanged */ + VM_OBJECT_OWNER_DISOWNED, /* new owner */ + TRUE); /* old_owner->task_objq locked */ + assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED); vm_object_unlock(object); } @@ -1379,7 +1393,7 @@ vm_purgeable_queue_purge_task_owned( !queue_end(&queue->objq[group], (queue_entry_t) object); object = (vm_object_t) queue_next(&object->objq)) { - if (object->vo_purgeable_owner != task) { + if (object->vo_owner != task) { continue; } @@ -1401,7 +1415,7 @@ vm_purgeable_queue_purge_task_owned( object->purgeable_queue_type = PURGEABLE_Q_TYPE_MAX; 
object->purgeable_queue_group = 0; /* one less volatile object for this object's owner */ - assert(object->vo_purgeable_owner == task); + assert(object->vo_owner == task); vm_purgeable_volatile_owner_update(task, -1); #if DEBUG @@ -1413,7 +1427,7 @@ vm_purgeable_queue_purge_task_owned( purgeable_nonvolatile_count++; assert(purgeable_nonvolatile_count > 0); /* one more nonvolatile object for this object's owner */ - assert(object->vo_purgeable_owner == task); + assert(object->vo_owner == task); vm_purgeable_nonvolatile_owner_update(task, +1); /* unlock purgeable queues */ @@ -1477,45 +1491,32 @@ vm_purgeable_nonvolatile_enqueue( vm_object_t object, task_t owner) { - int page_count; - vm_object_lock_assert_exclusive(object); assert(object->purgable == VM_PURGABLE_NONVOLATILE); - assert(object->vo_purgeable_owner == NULL); + assert(object->vo_owner == NULL); lck_mtx_lock(&vm_purgeable_queue_lock); if (owner != NULL && owner->task_purgeable_disowning) { /* task is exiting and no longer tracking purgeable objects */ - owner = NULL; + owner = VM_OBJECT_OWNER_DISOWNED; + } + if (owner == NULL) { + owner = kernel_task; } - - object->vo_purgeable_owner = owner; #if DEBUG + OSBacktrace(&object->purgeable_owner_bt[0], + ARRAY_COUNT(object->purgeable_owner_bt)); object->vo_purgeable_volatilizer = NULL; #endif /* DEBUG */ - if (owner != NULL) { - task_objq_lock(owner); - queue_enter(&owner->task_objq, object, vm_object_t, task_objq); - task_objq_unlock(owner); - } -#if DEBUG - OSBacktrace(&object->purgeable_owner_bt[0], 16); -#endif /* DEBUG */ + vm_object_ownership_change(object, + object->vo_ledger_tag, /* tag unchanged */ + owner, + FALSE); /* task_objq_locked */ - page_count = object->resident_page_count; - if (owner != NULL && page_count != 0) { - ledger_credit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - ptoa(page_count)); - ledger_credit(owner->ledger, - task_ledgers.phys_footprint, - ptoa(page_count)); - } - assert(object->objq.next == NULL); 
assert(object->objq.prev == NULL); @@ -1524,9 +1525,6 @@ vm_purgeable_nonvolatile_enqueue( assert(purgeable_nonvolatile_count >= 0); purgeable_nonvolatile_count++; assert(purgeable_nonvolatile_count > 0); - /* one more nonvolatile object for this object's owner */ - assert(object->vo_purgeable_owner == owner); - vm_purgeable_nonvolatile_owner_update(owner, +1); lck_mtx_unlock(&vm_purgeable_queue_lock); vm_object_lock_assert_exclusive(object); @@ -1540,7 +1538,7 @@ vm_purgeable_nonvolatile_dequeue( vm_object_lock_assert_exclusive(object); - owner = object->vo_purgeable_owner; + owner = VM_OBJECT_OWNER(object); #if DEBUG assert(object->vo_purgeable_volatilizer == NULL); #endif /* DEBUG */ @@ -1549,10 +1547,14 @@ vm_purgeable_nonvolatile_dequeue( * Update the owner's ledger to stop accounting * for this object. */ - vm_purgeable_accounting(object, - object->purgable, - TRUE, /* disown */ - FALSE); /* is task_objq locked? */ + /* transfer ownership to the kernel */ + assert(VM_OBJECT_OWNER(object) != kernel_task); + vm_object_ownership_change( + object, + object->vo_ledger_tag, /* unchanged */ + VM_OBJECT_OWNER_DISOWNED, /* new owner */ + FALSE); /* old_owner->task_objq locked */ + assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED); } lck_mtx_lock(&vm_purgeable_queue_lock); @@ -1573,28 +1575,32 @@ vm_purgeable_nonvolatile_dequeue( void vm_purgeable_accounting( vm_object_t object, - vm_purgable_t old_state, - boolean_t disown, - boolean_t task_objq_locked) + vm_purgable_t old_state) { task_t owner; int resident_page_count; int wired_page_count; int compressed_page_count; - boolean_t disown_on_the_fly; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + boolean_t do_footprint; vm_object_lock_assert_exclusive(object); + assert(object->purgable != VM_PURGABLE_DENY); - owner = object->vo_purgeable_owner; - if (owner == NULL) + owner = VM_OBJECT_OWNER(object); + if (owner == NULL || + 
object->purgable == VM_PURGABLE_DENY) return; - if (!disown && owner->task_purgeable_disowning) { - /* task is disowning its purgeable objects: help it */ - disown_on_the_fly = TRUE; - } else { - disown_on_the_fly = FALSE; - } + vm_object_ledger_tag_ledgers(object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); resident_page_count = object->resident_page_count; wired_page_count = object->wired_page_count; @@ -1610,121 +1616,57 @@ vm_purgeable_accounting( old_state == VM_PURGABLE_EMPTY) { /* less volatile bytes in ledger */ ledger_debit(owner->ledger, - task_ledgers.purgeable_volatile, - ptoa(resident_page_count - wired_page_count)); + ledger_idx_volatile, + ptoa_64(resident_page_count - wired_page_count)); /* less compressed volatile bytes in ledger */ ledger_debit(owner->ledger, - task_ledgers.purgeable_volatile_compressed, - ptoa(compressed_page_count)); - - if (disown || !object->alive || object->terminating) { - /* wired pages were accounted as "non-volatile"... */ - ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - ptoa(wired_page_count)); - /* ... and in phys_footprint */ - ledger_debit(owner->ledger, - task_ledgers.phys_footprint, - ptoa(wired_page_count)); - - /* no more accounting for this dead object */ - if (! task_objq_locked) { - task_objq_lock(owner); - } - if (!disown_on_the_fly && - (object->purgeable_queue_type == - PURGEABLE_Q_TYPE_MAX)) { - /* - * Not on a volatile queue: must be empty - * or emptying. - */ - vm_purgeable_nonvolatile_owner_update(owner,-1); - } else { - /* on a volatile queue */ - vm_purgeable_volatile_owner_update(owner, -1); - } - task_objq_lock_assert_owned(owner); - queue_remove(&owner->task_objq, object, vm_object_t, task_objq); - object->vo_purgeable_owner = NULL; -#if DEBUG - object->vo_purgeable_volatilizer = NULL; -#endif /* DEBUG */ - if (! 
task_objq_locked) { - task_objq_unlock(owner); - } - return; - } + ledger_idx_volatile_compressed, + ptoa_64(compressed_page_count)); /* more non-volatile bytes in ledger */ ledger_credit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - ptoa(resident_page_count - wired_page_count)); + ledger_idx_nonvolatile, + ptoa_64(resident_page_count - wired_page_count)); /* more compressed non-volatile bytes in ledger */ ledger_credit(owner->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - ptoa(compressed_page_count)); - /* more footprint */ - ledger_credit(owner->ledger, - task_ledgers.phys_footprint, - ptoa(resident_page_count - + compressed_page_count - - wired_page_count)); + ledger_idx_nonvolatile_compressed, + ptoa_64(compressed_page_count)); + if (do_footprint) { + /* more footprint */ + ledger_credit(owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(resident_page_count + + compressed_page_count + - wired_page_count)); + } } else if (old_state == VM_PURGABLE_NONVOLATILE) { /* less non-volatile bytes in ledger */ ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - ptoa(resident_page_count - wired_page_count)); + ledger_idx_nonvolatile, + ptoa_64(resident_page_count - wired_page_count)); /* less compressed non-volatile bytes in ledger */ ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - ptoa(compressed_page_count)); - /* less footprint */ - ledger_debit(owner->ledger, - task_ledgers.phys_footprint, - ptoa(resident_page_count - + compressed_page_count - - wired_page_count)); - - if (disown || !object->alive || object->terminating) { - /* wired pages still accounted as "non-volatile" */ - ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - ptoa(wired_page_count)); + ledger_idx_nonvolatile_compressed, + ptoa_64(compressed_page_count)); + if (do_footprint) { + /* less footprint */ ledger_debit(owner->ledger, task_ledgers.phys_footprint, - ptoa(wired_page_count)); - - /* no more accounting for 
this dead object */ - if (! task_objq_locked) { - task_objq_lock(owner); - } - /* one less "non-volatile" object for the owner */ - if (!disown_on_the_fly) { - assert(object->purgeable_queue_type == - PURGEABLE_Q_TYPE_MAX); - } - vm_purgeable_nonvolatile_owner_update(owner, -1); - task_objq_lock_assert_owned(owner); - queue_remove(&owner->task_objq, object, vm_object_t, task_objq); - object->vo_purgeable_owner = NULL; -#if DEBUG - object->vo_purgeable_volatilizer = NULL; -#endif /* DEBUG */ - if (! task_objq_locked) { - task_objq_unlock(owner); - } - return; + ptoa_64(resident_page_count + + compressed_page_count + - wired_page_count)); } + /* more volatile bytes in ledger */ ledger_credit(owner->ledger, - task_ledgers.purgeable_volatile, - ptoa(resident_page_count - wired_page_count)); + ledger_idx_volatile, + ptoa_64(resident_page_count - wired_page_count)); /* more compressed volatile bytes in ledger */ ledger_credit(owner->ledger, - task_ledgers.purgeable_volatile_compressed, - ptoa(compressed_page_count)); + ledger_idx_volatile_compressed, + ptoa_64(compressed_page_count)); } else { panic("vm_purgeable_accounting(%p): " "unexpected old_state=%d\n", @@ -1775,53 +1717,72 @@ vm_purgeable_volatile_owner_update( } void -vm_purgeable_compressed_update( +vm_object_owner_compressed_update( vm_object_t object, int delta) { - task_t owner; + task_t owner; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + boolean_t do_footprint; vm_object_lock_assert_exclusive(object); + owner = VM_OBJECT_OWNER(object); + if (delta == 0 || !object->internal || - object->purgable == VM_PURGABLE_DENY || - object->vo_purgeable_owner == NULL) { - /* not an owned purgeable VM object: nothing to update */ + (object->purgable == VM_PURGABLE_DENY && + ! 
object->vo_ledger_tag) || + owner == NULL) { + /* not an owned purgeable (or tagged) VM object: nothing to update */ return; } - owner = object->vo_purgeable_owner; + vm_object_ledger_tag_ledgers(object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); switch (object->purgable) { case VM_PURGABLE_DENY: - break; + /* not purgeable: must be ledger-tagged */ + assert(object->vo_ledger_tag != VM_OBJECT_LEDGER_TAG_NONE); + /* fallthru */ case VM_PURGABLE_NONVOLATILE: if (delta > 0) { ledger_credit(owner->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - ptoa(delta)); - ledger_credit(owner->ledger, - task_ledgers.phys_footprint, - ptoa(delta)); + ledger_idx_nonvolatile_compressed, + ptoa_64(delta)); + if (do_footprint) { + ledger_credit(owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(delta)); + } } else { ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - ptoa(-delta)); - ledger_debit(owner->ledger, - task_ledgers.phys_footprint, - ptoa(-delta)); + ledger_idx_nonvolatile_compressed, + ptoa_64(-delta)); + if (do_footprint) { + ledger_debit(owner->ledger, + task_ledgers.phys_footprint, + ptoa_64(-delta)); + } } break; case VM_PURGABLE_VOLATILE: case VM_PURGABLE_EMPTY: if (delta > 0) { ledger_credit(owner->ledger, - task_ledgers.purgeable_volatile_compressed, - ptoa(delta)); + ledger_idx_volatile_compressed, + ptoa_64(delta)); } else { ledger_debit(owner->ledger, - task_ledgers.purgeable_volatile_compressed, - ptoa(-delta)); + ledger_idx_volatile_compressed, + ptoa_64(-delta)); } break; default: diff --git a/osfmk/vm/vm_purgeable_internal.h b/osfmk/vm/vm_purgeable_internal.h index 010c2ee22..5015ada14 100644 --- a/osfmk/vm/vm_purgeable_internal.h +++ b/osfmk/vm/vm_purgeable_internal.h @@ -123,11 +123,9 @@ uint64_t vm_purgeable_purge_task_owned(task_t task); void vm_purgeable_nonvolatile_enqueue(vm_object_t object, task_t task); void 
vm_purgeable_nonvolatile_dequeue(vm_object_t object); void vm_purgeable_accounting(vm_object_t object, - vm_purgable_t old_state, - boolean_t disown, - boolean_t task_objq_locked); -void vm_purgeable_compressed_update(vm_object_t object, - int delta); + vm_purgable_t old_state); +void vm_object_owner_compressed_update(vm_object_t object, + int delta); #define PURGEABLE_LOOP_MAX 64 diff --git a/osfmk/vm/vm_resident.c b/osfmk/vm/vm_resident.c index b34f1b2d5..748c42754 100644 --- a/osfmk/vm/vm_resident.c +++ b/osfmk/vm/vm_resident.c @@ -106,6 +106,7 @@ #include + char vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; char vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; char vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; @@ -113,6 +114,7 @@ char vm_page_active_or_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE]; #if CONFIG_SECLUDED_MEMORY struct vm_page_secluded_data vm_page_secluded; +void secluded_suppression_init(void); #endif /* CONFIG_SECLUDED_MEMORY */ boolean_t hibernate_cleaning_in_progress = FALSE; @@ -329,8 +331,6 @@ vm_locks_array_t vm_page_locks; decl_lck_mtx_data(,vm_page_alloc_lock) lck_mtx_ext_t vm_page_alloc_lock_ext; -unsigned int io_throttle_zero_fill; - unsigned int vm_page_local_q_count = 0; unsigned int vm_page_local_q_soft_limit = 250; unsigned int vm_page_local_q_hard_limit = 500; @@ -377,9 +377,12 @@ vm_page_queue_head_t vm_page_queue_throttled __attribute__((aligned(VM_PACKED_PO queue_head_t vm_objects_wired; +void vm_update_darkwake_mode(boolean_t); + #if CONFIG_BACKGROUND_QUEUE vm_page_queue_head_t vm_page_queue_background __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); uint32_t vm_page_background_target; +uint32_t vm_page_background_target_snapshot; uint32_t vm_page_background_count; uint64_t vm_page_background_promoted_count; @@ -430,7 +433,6 @@ unsigned int vm_page_speculative_used = 0; vm_page_queue_head_t vm_page_queue_cleaned __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT))); unsigned int 
vm_page_cleaned_count = 0; -unsigned int vm_pageout_enqueued_cleaned = 0; uint64_t max_valid_dma_address = 0xffffffffffffffffULL; ppnum_t max_valid_low_ppnum = 0xffffffff; @@ -450,9 +452,7 @@ unsigned int vm_page_inactive_target = 0; unsigned int vm_page_secluded_target = 0; #endif /* CONFIG_SECLUDED_MEMORY */ unsigned int vm_page_anonymous_min = 0; -unsigned int vm_page_inactive_min = 0; unsigned int vm_page_free_reserved = 0; -unsigned int vm_page_throttle_count = 0; /* @@ -687,57 +687,56 @@ vm_page_bootstrap( bzero(m, sizeof (*m)); #if CONFIG_BACKGROUND_QUEUE - m->vm_page_backgroundq.next = 0; - m->vm_page_backgroundq.prev = 0; - m->vm_page_in_background = FALSE; - m->vm_page_on_backgroundq = FALSE; + m->vmp_backgroundq.next = 0; + m->vmp_backgroundq.prev = 0; + m->vmp_in_background = FALSE; + m->vmp_on_backgroundq = FALSE; #endif VM_PAGE_ZERO_PAGEQ_ENTRY(m); - m->listq.next = 0; - m->listq.prev = 0; - m->next_m = 0; + m->vmp_listq.next = 0; + m->vmp_listq.prev = 0; + m->vmp_next_m = 0; - m->vm_page_object = 0; /* reset later */ - m->offset = (vm_object_offset_t) -1; /* reset later */ + m->vmp_object = 0; /* reset later */ + m->vmp_offset = (vm_object_offset_t) -1; /* reset later */ - m->wire_count = 0; - m->vm_page_q_state = VM_PAGE_NOT_ON_Q; - m->laundry = FALSE; - m->reference = FALSE; - m->gobbled = FALSE; - m->private = FALSE; - m->__unused_pageq_bits = 0; + m->vmp_wire_count = 0; + m->vmp_q_state = VM_PAGE_NOT_ON_Q; + m->vmp_laundry = FALSE; + m->vmp_reference = FALSE; + m->vmp_gobbled = FALSE; + m->vmp_private = FALSE; + m->vmp_unused_page_bits = 0; #if !defined(__arm__) && !defined(__arm64__) VM_PAGE_SET_PHYS_PAGE(m, 0); /* reset later */ #endif - m->busy = TRUE; - m->wanted = FALSE; - m->tabled = FALSE; - m->hashed = FALSE; - m->fictitious = FALSE; - m->pmapped = FALSE; - m->wpmapped = FALSE; - m->free_when_done = FALSE; - m->absent = FALSE; - m->error = FALSE; - m->dirty = FALSE; - m->cleaning = FALSE; - m->precious = FALSE; - m->clustered = FALSE; - 
m->overwriting = FALSE; - m->restart = FALSE; - m->unusual = FALSE; - m->cs_validated = FALSE; - m->cs_tainted = FALSE; - m->cs_nx = FALSE; - m->no_cache = FALSE; - m->reusable = FALSE; - m->slid = FALSE; - m->xpmapped = FALSE; - m->written_by_kernel = FALSE; - m->__unused_object_bits = 0; + m->vmp_busy = TRUE; + m->vmp_wanted = FALSE; + m->vmp_tabled = FALSE; + m->vmp_hashed = FALSE; + m->vmp_fictitious = FALSE; + m->vmp_pmapped = FALSE; + m->vmp_wpmapped = FALSE; + m->vmp_free_when_done = FALSE; + m->vmp_absent = FALSE; + m->vmp_error = FALSE; + m->vmp_dirty = FALSE; + m->vmp_cleaning = FALSE; + m->vmp_precious = FALSE; + m->vmp_clustered = FALSE; + m->vmp_overwriting = FALSE; + m->vmp_restart = FALSE; + m->vmp_unusual = FALSE; + m->vmp_cs_validated = FALSE; + m->vmp_cs_tainted = FALSE; + m->vmp_cs_nx = FALSE; + m->vmp_no_cache = FALSE; + m->vmp_reusable = FALSE; + m->vmp_xpmapped = FALSE; + m->vmp_written_by_kernel = FALSE; + m->vmp_unused_object_bits = 0; /* * Initialize the page queues. @@ -1112,6 +1111,7 @@ int secluded_for_filecache = 2; /* filecache can use seclude memory */ #if 11 int secluded_for_fbdp = 0; #endif +uint64_t secluded_shutoff_trigger = 0; #endif /* CONFIG_SECLUDED_MEMORY */ @@ -1238,28 +1238,52 @@ pmap_startup( &secluded_for_fbdp, sizeof (secluded_for_fbdp)); #endif -#endif /* CONFIG_SECLUDED_MEMORY */ - // -debug code remove - if (2 == vm_himemory_mode) { - // free low -> high so high is preferred - for (i = 1; i <= pages_initialized; i++) { - if(fill) fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i - 1]), fillval); /* Fill the page with a know value if requested at boot */ - vm_page_release_startup(&vm_pages[i - 1]); - } + /* + * On small devices, allow a large app to effectively suppress + * secluded memory until it exits. + */ + if (max_mem <= 1 * 1024 * 1024 * 1024 && vm_page_secluded_target != 0) { + + /* + * Get an amount from boot-args, else use 500MB. 
+ * 500MB was chosen from a Peace daemon tentpole test which used munch + * to induce jetsam thrashing of false idle daemons. + */ + int secluded_shutoff_mb; + if (PE_parse_boot_argn("secluded_shutoff_mb", &secluded_shutoff_mb, + sizeof (secluded_shutoff_mb))) + secluded_shutoff_trigger = (uint64_t)secluded_shutoff_mb * 1024 * 1024; + else + secluded_shutoff_trigger = 500 * 1024 * 1024; + + if (secluded_shutoff_trigger != 0) + secluded_suppression_init(); } - else - // debug code remove- + +#endif /* CONFIG_SECLUDED_MEMORY */ /* - * Release pages in reverse order so that physical pages + * By default release pages in reverse order so that physical pages * initially get allocated in ascending addresses. This keeps * the devices (which must address physical memory) happy if * they require several consecutive pages. + * + * For debugging, you can reverse this ordering and/or fill + * all pages with a known value. */ - for (i = pages_initialized; i > 0; i--) { - if(fill) fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i - 1]), fillval); /* Fill the page with a know value if requested at boot */ - vm_page_release_startup(&vm_pages[i - 1]); + if (vm_himemory_mode == 2) { + for (i = 0; i < pages_initialized; i++) { + if (fill) + fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]), fillval); + vm_page_release_startup(&vm_pages[i]); + } + } else { + for (i = pages_initialized; i-- > 0; ) { + if (fill) + fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]), fillval); + vm_page_release_startup(&vm_pages[i]); + } } VM_CHECK_MEMORYSTATUS; @@ -1276,7 +1300,7 @@ pmap_startup( queue_iterate(&vm_page_queue_free[i].qhead, xx, vm_page_t, - pageq) { /* BRINGUP */ + vmp_pageq) { /* BRINGUP */ j++; /* (BRINGUP) */ if(j > vm_page_free_count) { /* (BRINGUP) */ panic("pmap_startup: too many pages, xx = %08X, xxl = %08X\n", xx, xxl); @@ -1388,7 +1412,7 @@ vm_page_create( == VM_PAGE_NULL) vm_page_more_fictitious(); - m->fictitious = FALSE; + m->vmp_fictitious = FALSE; pmap_clear_noencrypt(phys_page); 
vm_page_pages++; @@ -1451,6 +1475,11 @@ vm_page_insert_internal( lck_spin_t *bucket_lock; int hash_id; task_t owner; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + boolean_t do_footprint; XPR(XPR_VM_PAGE, "vm_page_insert, object 0x%X offset 0x%X page 0x%X\n", @@ -1465,7 +1494,7 @@ vm_page_insert_internal( assert(page_aligned(offset)); - assert(!VM_PAGE_WIRED(mem) || mem->private || mem->fictitious || (tag != VM_KERN_MEMORY_NONE)); + assert(!VM_PAGE_WIRED(mem) || mem->vmp_private || mem->vmp_fictitious || (tag != VM_KERN_MEMORY_NONE)); /* the vm_submap_object is only a placeholder for submaps */ assert(object != vm_submap_object); @@ -1480,10 +1509,10 @@ vm_page_insert_internal( if (insert_in_hash == TRUE) { #if DEBUG || VM_PAGE_CHECK_BUCKETS - if (mem->tabled || mem->vm_page_object) + if (mem->vmp_tabled || mem->vmp_object) panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) " "already in (obj=%p,off=0x%llx)", - mem, object, offset, VM_PAGE_OBJECT(mem), mem->offset); + mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset); #endif if (object->internal && (offset >= object->vo_size)) { panic("vm_page_insert_internal: (page=%p,obj=%p,off=0x%llx,size=0x%llx) inserted at offset past object bounds", @@ -1496,8 +1525,8 @@ vm_page_insert_internal( * Record the object/offset pair in this page */ - mem->vm_page_object = VM_PAGE_PACK_OBJECT(object); - mem->offset = offset; + mem->vmp_object = VM_PAGE_PACK_OBJECT(object); + mem->vmp_offset = offset; #if CONFIG_SECLUDED_MEMORY if (object->eligible_for_secluded) { @@ -1514,7 +1543,7 @@ vm_page_insert_internal( lck_spin_lock(bucket_lock); - mem->next_m = bucket->page_list; + mem->vmp_next_m = bucket->page_list; bucket->page_list = VM_PAGE_PACK_PTR(mem); assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))); @@ -1522,7 +1551,7 @@ vm_page_insert_internal( if (++bucket->cur_count > bucket->hi_count) bucket->hi_count = 
bucket->cur_count; #endif /* MACH_PAGE_HASH_STATS */ - mem->hashed = TRUE; + mem->vmp_hashed = TRUE; lck_spin_unlock(bucket_lock); } @@ -1538,9 +1567,9 @@ vm_page_insert_internal( /* * Now link into the object's list of backed pages. */ - vm_page_queue_enter(&object->memq, mem, vm_page_t, listq); + vm_page_queue_enter(&object->memq, mem, vm_page_t, vmp_listq); object->memq_hint = mem; - mem->tabled = TRUE; + mem->vmp_tabled = TRUE; /* * Show that the object has one more resident page. @@ -1548,7 +1577,7 @@ vm_page_insert_internal( object->resident_page_count++; if (VM_PAGE_WIRED(mem)) { - assert(mem->wire_count > 0); + assert(mem->vmp_wire_count > 0); VM_OBJECT_WIRED_PAGE_UPDATE_START(object); VM_OBJECT_WIRED_PAGE_ADD(object, mem); VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag); @@ -1575,18 +1604,26 @@ vm_page_insert_internal( * a different physical page during a physically-contiguous * allocation. */ - assert(!mem->reusable); + assert(!mem->vmp_reusable); if (object->all_reusable) { OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count); } - if (object->purgable == VM_PURGABLE_DENY) { + if (object->purgable == VM_PURGABLE_DENY && + ! 
object->vo_ledger_tag) { owner = TASK_NULL; } else { - owner = object->vo_purgeable_owner; + owner = VM_OBJECT_OWNER(object); + vm_object_ledger_tag_ledgers(object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); } if (owner && (object->purgable == VM_PURGABLE_NONVOLATILE || + object->purgable == VM_PURGABLE_DENY || VM_PAGE_WIRED(mem))) { if (delayed_ledger_update) @@ -1594,12 +1631,14 @@ vm_page_insert_internal( else { /* more non-volatile bytes */ ledger_credit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - PAGE_SIZE); - /* more footprint */ - ledger_credit(owner->ledger, - task_ledgers.phys_footprint, + ledger_idx_nonvolatile, PAGE_SIZE); + if (do_footprint) { + /* more footprint */ + ledger_credit(owner->ledger, + task_ledgers.phys_footprint, + PAGE_SIZE); + } } } else if (owner && @@ -1608,7 +1647,7 @@ vm_page_insert_internal( assert(! VM_PAGE_WIRED(mem)); /* more volatile bytes */ ledger_credit(owner->ledger, - task_ledgers.purgeable_volatile, + ledger_idx_volatile, PAGE_SIZE); } @@ -1619,7 +1658,7 @@ vm_page_insert_internal( OSAddAtomic(+1, &vm_page_purgeable_count); } } else if (object->purgable == VM_PURGABLE_EMPTY && - mem->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) { + mem->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) { /* * This page belongs to a purged VM object but hasn't * been purged (because it was "busy"). 
@@ -1683,10 +1722,10 @@ vm_page_replace( #endif vm_object_lock_assert_exclusive(object); #if DEBUG || VM_PAGE_CHECK_BUCKETS - if (mem->tabled || mem->vm_page_object) + if (mem->vmp_tabled || mem->vmp_object) panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) " "already in (obj=%p,off=0x%llx)", - mem, object, offset, VM_PAGE_OBJECT(mem), mem->offset); + mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset); #endif LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); @@ -1695,8 +1734,8 @@ vm_page_replace( /* * Record the object/offset pair in this page */ - mem->vm_page_object = VM_PAGE_PACK_OBJECT(object); - mem->offset = offset; + mem->vmp_object = VM_PAGE_PACK_OBJECT(object); + mem->vmp_offset = offset; /* * Insert it into the object_object/offset hash table, @@ -1717,29 +1756,29 @@ vm_page_replace( /* * compare packed object pointers */ - if (m->vm_page_object == mem->vm_page_object && m->offset == offset) { + if (m->vmp_object == mem->vmp_object && m->vmp_offset == offset) { /* * Remove old page from hash list */ - *mp = m->next_m; - m->hashed = FALSE; - m->next_m = VM_PAGE_PACK_PTR(NULL); + *mp = m->vmp_next_m; + m->vmp_hashed = FALSE; + m->vmp_next_m = VM_PAGE_PACK_PTR(NULL); found_m = m; break; } - mp = &m->next_m; + mp = &m->vmp_next_m; } while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp)))); - mem->next_m = bucket->page_list; + mem->vmp_next_m = bucket->page_list; } else { - mem->next_m = VM_PAGE_PACK_PTR(NULL); + mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL); } /* * insert new page at head of hash list */ bucket->page_list = VM_PAGE_PACK_PTR(mem); - mem->hashed = TRUE; + mem->vmp_hashed = TRUE; lck_spin_unlock(bucket_lock); @@ -1774,18 +1813,23 @@ vm_page_remove( int hash_id; task_t owner; vm_object_t m_object; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + int do_footprint; m_object = VM_PAGE_OBJECT(mem); XPR(XPR_VM_PAGE, "vm_page_remove, object 0x%X 
offset 0x%X page 0x%X\n", - m_object, mem->offset, + m_object, mem->vmp_offset, mem, 0,0); vm_object_lock_assert_exclusive(m_object); - assert(mem->tabled); - assert(!mem->cleaning); - assert(!mem->laundry); + assert(mem->vmp_tabled); + assert(!mem->vmp_cleaning); + assert(!mem->vmp_laundry); if (VM_PAGE_PAGEABLE(mem)) { LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); @@ -1801,7 +1845,7 @@ vm_page_remove( /* * Remove from the object_object/offset hash table */ - hash_id = vm_page_hash(m_object, mem->offset); + hash_id = vm_page_hash(m_object, mem->vmp_offset); bucket = &vm_page_buckets[hash_id]; bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK]; @@ -1810,21 +1854,21 @@ vm_page_remove( if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) { /* optimize for common case */ - bucket->page_list = mem->next_m; + bucket->page_list = mem->vmp_next_m; } else { vm_page_packed_t *prev; - for (prev = &this->next_m; + for (prev = &this->vmp_next_m; (this = (vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem; - prev = &this->next_m) + prev = &this->vmp_next_m) continue; - *prev = this->next_m; + *prev = this->vmp_next_m; } #if MACH_PAGE_HASH_STATS bucket->cur_count--; #endif /* MACH_PAGE_HASH_STATS */ - mem->hashed = FALSE; - this->next_m = VM_PAGE_PACK_PTR(NULL); + mem->vmp_hashed = FALSE; + this->vmp_next_m = VM_PAGE_PACK_PTR(NULL); lck_spin_unlock(bucket_lock); } /* @@ -1851,30 +1895,32 @@ vm_page_remove( assert(vm_page_external_count); OSAddAtomic(-1, &vm_page_external_count); - if (mem->xpmapped) { + if (mem->vmp_xpmapped) { assert(vm_page_xpmapped_external_count); OSAddAtomic(-1, &vm_page_xpmapped_external_count); } } - if (!m_object->internal && (m_object->objq.next || m_object->objq.prev)) { + if (!m_object->internal && + m_object->cached_list.next && + m_object->cached_list.prev) { if (m_object->resident_page_count == 0) vm_object_cache_remove(m_object); } if (VM_PAGE_WIRED(mem)) { - assert(mem->wire_count > 0); + 
assert(mem->vmp_wire_count > 0); VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object); VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem); VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag); } assert(m_object->resident_page_count >= m_object->wired_page_count); - if (mem->reusable) { + if (mem->vmp_reusable) { assert(m_object->reusable_page_count > 0); m_object->reusable_page_count--; assert(m_object->reusable_page_count <= m_object->resident_page_count); - mem->reusable = FALSE; + mem->vmp_reusable = FALSE; OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count); vm_page_stats_reusable.reused_remove++; } else if (m_object->all_reusable) { @@ -1882,29 +1928,39 @@ vm_page_remove( vm_page_stats_reusable.reused_remove++; } - if (m_object->purgable == VM_PURGABLE_DENY) { + if (m_object->purgable == VM_PURGABLE_DENY && + ! m_object->vo_ledger_tag) { owner = TASK_NULL; } else { - owner = m_object->vo_purgeable_owner; + owner = VM_OBJECT_OWNER(m_object); + vm_object_ledger_tag_ledgers(m_object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); } if (owner && (m_object->purgable == VM_PURGABLE_NONVOLATILE || + m_object->purgable == VM_PURGABLE_DENY || VM_PAGE_WIRED(mem))) { /* less non-volatile bytes */ ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - PAGE_SIZE); - /* less footprint */ - ledger_debit(owner->ledger, - task_ledgers.phys_footprint, + ledger_idx_nonvolatile, PAGE_SIZE); + if (do_footprint) { + /* less footprint */ + ledger_debit(owner->ledger, + task_ledgers.phys_footprint, + PAGE_SIZE); + } } else if (owner && (m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY)) { assert(! 
VM_PAGE_WIRED(mem)); /* less volatile bytes */ ledger_debit(owner->ledger, - task_ledgers.purgeable_volatile, + ledger_idx_volatile, PAGE_SIZE); } if (m_object->purgable == VM_PURGABLE_VOLATILE) { @@ -1920,9 +1976,9 @@ vm_page_remove( if (m_object->set_cache_attr == TRUE) pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0); - mem->tabled = FALSE; - mem->vm_page_object = 0; - mem->offset = (vm_object_offset_t) -1; + mem->vmp_tabled = FALSE; + mem->vmp_object = 0; + mem->vmp_offset = (vm_object_offset_t) -1; } @@ -1971,8 +2027,8 @@ kdp_vm_page_lookup( panic("panic: kdp_vm_page_lookup done outside of kernel debugger"); } - vm_page_queue_iterate(&object->memq, cur_page, vm_page_t, listq) { - if (cur_page->offset == offset) { + vm_page_queue_iterate(&object->memq, cur_page, vm_page_t, vmp_listq) { + if (cur_page->vmp_offset == offset) { return cur_page; } num_traversed++; @@ -2014,13 +2070,13 @@ vm_page_lookup( if (mem != VM_PAGE_NULL) { assert(VM_PAGE_OBJECT(mem) == object); - if (mem->offset == offset) { + if (mem->vmp_offset == offset) { #if DEBUG_VM_PAGE_LOOKUP OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint); #endif return (mem); } - qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->listq); + qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->vmp_listq); if (! vm_page_queue_end(&object->memq, qe)) { vm_page_t next_page; @@ -2028,7 +2084,7 @@ vm_page_lookup( next_page = (vm_page_t)((uintptr_t)qe); assert(VM_PAGE_OBJECT(next_page) == object); - if (next_page->offset == offset) { + if (next_page->vmp_offset == offset) { object->memq_hint = next_page; /* new hint */ #if DEBUG_VM_PAGE_LOOKUP OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_next); @@ -2036,7 +2092,7 @@ vm_page_lookup( return (next_page); } } - qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->listq); + qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->vmp_listq); if (! 
vm_page_queue_end(&object->memq, qe)) { vm_page_t prev_page; @@ -2044,7 +2100,7 @@ vm_page_lookup( prev_page = (vm_page_t)((uintptr_t)qe); assert(VM_PAGE_OBJECT(prev_page) == object); - if (prev_page->offset == offset) { + if (prev_page->vmp_offset == offset) { object->memq_hint = prev_page; /* new hint */ #if DEBUG_VM_PAGE_LOOKUP OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_prev); @@ -2086,10 +2142,10 @@ vm_page_lookup( while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) { - if (mem->offset == offset) + if (mem->vmp_offset == offset) break; - mem = (vm_page_t)vm_page_queue_next(&mem->listq); + mem = (vm_page_t)vm_page_queue_next(&mem->vmp_listq); } if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) mem = NULL; @@ -2104,7 +2160,7 @@ vm_page_lookup( for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; - mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m))) { + mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m))) { #if 0 /* * we don't hold the page queue lock @@ -2112,7 +2168,7 @@ vm_page_lookup( */ VM_PAGE_CHECK(mem); #endif - if ((mem->vm_page_object == packed_object) && (mem->offset == offset)) + if ((mem->vmp_object == packed_object) && (mem->vmp_offset == offset)) break; } lck_spin_unlock(bucket_lock); @@ -2171,7 +2227,7 @@ vm_page_rename( mem, 0,0); /* - * Changes to mem->object require the page lock because + * Changes to mem->vmp_object require the page lock because * the pageout daemon uses that lock to get the object. */ vm_page_lockspin_queues(); @@ -2179,7 +2235,7 @@ vm_page_rename( internal_to_external = FALSE; external_to_internal = FALSE; - if (mem->vm_page_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) { + if (mem->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) { /* * it's much easier to get the vm_page_pageable_xxx accounting correct * if we first move the page to the active queue... 
it's going to end @@ -2257,7 +2313,7 @@ vm_page_init( */ pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED); #endif - mem->lopage = lopage; + mem->vmp_lopage = lopage; } /* @@ -2280,7 +2336,7 @@ vm_page_grab_fictitious_common( if ((m = (vm_page_t)zget(vm_page_zone))) { vm_page_init(m, phys_addr, FALSE); - m->fictitious = TRUE; + m->vmp_fictitious = TRUE; c_vm_page_grab_fictitious++; } else @@ -2317,8 +2373,8 @@ void vm_page_release_fictitious( vm_page_t m) { - assert((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) || (m->vm_page_q_state == VM_PAGE_IS_WIRED)); - assert(m->fictitious); + assert((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || (m->vmp_q_state == VM_PAGE_IS_WIRED)); + assert(m->vmp_fictitious); assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr || VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr); @@ -2421,6 +2477,58 @@ vm_pool_low(void) return( vm_page_free_count <= vm_page_free_reserved ); } +boolean_t vm_darkwake_mode = FALSE; + +/* + * vm_update_darkwake_mode(): + * + * Tells the VM that the system is in / out of darkwake. + * + * Today, the VM only lowers/raises the background queue target + * so as to favor consuming more/less background pages when + * darkwake is ON/OFF. + * + * We might need to do more things in the future. + */ + +void +vm_update_darkwake_mode(boolean_t darkwake_mode) +{ + LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); + + vm_page_lockspin_queues(); + + if (vm_darkwake_mode == darkwake_mode) { + /* + * No change.
+ */ + vm_page_unlock_queues(); + return; + } + + vm_darkwake_mode = darkwake_mode; + + if (vm_darkwake_mode == TRUE) { +#if CONFIG_BACKGROUND_QUEUE + + /* save background target to restore later */ + vm_page_background_target_snapshot = vm_page_background_target; + + /* target is set to 0...no protection for background pages */ + vm_page_background_target = 0; + +#endif /* CONFIG_BACKGROUND_QUEUE */ + + } else if (vm_darkwake_mode == FALSE) { +#if CONFIG_BACKGROUND_QUEUE + + if (vm_page_background_target_snapshot) { + vm_page_background_target = vm_page_background_target_snapshot; + } +#endif /* CONFIG_BACKGROUND_QUEUE */ + } + vm_page_unlock_queues(); +} #if CONFIG_BACKGROUND_QUEUE @@ -2430,17 +2538,21 @@ vm_page_update_background_state(vm_page_t mem) if (vm_page_background_mode == VM_PAGE_BG_DISABLED) return; - if (mem->vm_page_in_background == FALSE) + if (mem->vmp_in_background == FALSE) return; + task_t my_task = current_task(); + + if (my_task) { + if (task_get_darkwake_mode(my_task)) { + return; + } + } + #if BACKGROUNDQ_BASED_ON_QOS if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY) return; #else - task_t my_task; - - my_task = current_task(); - if (my_task) { if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG)) return; @@ -2448,7 +2560,7 @@ vm_page_update_background_state(vm_page_t mem) #endif vm_page_lockspin_queues(); - mem->vm_page_in_background = FALSE; + mem->vmp_in_background = FALSE; vm_page_background_promoted_count++; vm_page_remove_from_backgroundq(mem); @@ -2463,18 +2575,23 @@ vm_page_assign_background_state(vm_page_t mem) if (vm_page_background_mode == VM_PAGE_BG_DISABLED) return; + task_t my_task = current_task(); + + if (my_task) { + if (task_get_darkwake_mode(my_task)) { + mem->vmp_in_background = TRUE; + return; + } + } + #if BACKGROUNDQ_BASED_ON_QOS if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY) - mem->vm_page_in_background = TRUE; + 
mem->vmp_in_background = TRUE; else - mem->vm_page_in_background = FALSE; + mem->vmp_in_background = FALSE; #else - task_t my_task; - - my_task = current_task(); - if (my_task) - mem->vm_page_in_background = proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG); + mem->vmp_in_background = proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG); #endif } @@ -2487,12 +2604,12 @@ vm_page_remove_from_backgroundq( LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - if (mem->vm_page_on_backgroundq) { - vm_page_queue_remove(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq); + if (mem->vmp_on_backgroundq) { + vm_page_queue_remove(&vm_page_queue_background, mem, vm_page_t, vmp_backgroundq); - mem->vm_page_backgroundq.next = 0; - mem->vm_page_backgroundq.prev = 0; - mem->vm_page_on_backgroundq = FALSE; + mem->vmp_backgroundq.next = 0; + mem->vmp_backgroundq.prev = 0; + mem->vmp_on_backgroundq = FALSE; vm_page_background_count--; @@ -2503,8 +2620,8 @@ vm_page_remove_from_backgroundq( else vm_page_background_external_count--; } else { - assert(VM_PAGE_UNPACK_PTR(mem->vm_page_backgroundq.next) == (uintptr_t)NULL && - VM_PAGE_UNPACK_PTR(mem->vm_page_backgroundq.prev) == (uintptr_t)NULL); + assert(VM_PAGE_UNPACK_PTR(mem->vmp_backgroundq.next) == (uintptr_t)NULL && + VM_PAGE_UNPACK_PTR(mem->vmp_backgroundq.prev) == (uintptr_t)NULL); } } @@ -2521,7 +2638,7 @@ vm_page_add_to_backgroundq( if (vm_page_background_mode == VM_PAGE_BG_DISABLED) return; - if (mem->vm_page_on_backgroundq == FALSE) { + if (mem->vmp_on_backgroundq == FALSE) { m_object = VM_PAGE_OBJECT(mem); @@ -2529,10 +2646,10 @@ vm_page_add_to_backgroundq( return; if (first == TRUE) - vm_page_queue_enter_first(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq); + vm_page_queue_enter_first(&vm_page_queue_background, mem, vm_page_t, vmp_backgroundq); else - vm_page_queue_enter(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq); - mem->vm_page_on_backgroundq = 
TRUE; + vm_page_queue_enter(&vm_page_queue_background, mem, vm_page_t, vmp_backgroundq); + mem->vmp_on_backgroundq = TRUE; vm_page_background_count++; @@ -2543,7 +2660,7 @@ vm_page_add_to_backgroundq( } } -#endif +#endif /* CONFIG_BACKGROUND_QUEUE */ /* * this is an interface to support bring-up of drivers @@ -2576,10 +2693,10 @@ vm_page_grablo(void) vm_page_queue_remove_first(&vm_lopage_queue_free, mem, vm_page_t, - pageq); + vmp_pageq); assert(vm_lopage_free_count); - assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); - mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; + assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; vm_lopage_free_count--; vm_lopages_allocated_q++; @@ -2603,26 +2720,31 @@ vm_page_grablo(void) return (VM_PAGE_NULL); } - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); - mem->busy = TRUE; + mem->vmp_busy = TRUE; vm_page_lockspin_queues(); - mem->gobbled = FALSE; + mem->vmp_gobbled = FALSE; vm_page_gobble_count--; vm_page_wire_count--; vm_lopages_allocated_cpm_success++; vm_page_unlock_queues(); } - assert(mem->busy); - assert(!mem->pmapped); - assert(!mem->wpmapped); + assert(mem->vmp_busy); + assert(!mem->vmp_pmapped); + assert(!mem->vmp_wpmapped); assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); VM_PAGE_ZERO_PAGEQ_ENTRY(mem); + disable_preemption(); + PROCESSOR_DATA(current_processor(), page_grab_count) += 1; + VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, 0, 1, 0, 0); + enable_preemption(); + return (mem); } @@ -2672,7 +2794,7 @@ vm_page_grab_options( if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) { return_page_from_cpu_list: - assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q); + assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q); #if HIBERNATION if (hibernate_rebuild_needed) { @@ -2680,20 +2802,21 @@ vm_page_grab_options( } #endif /* HIBERNATION */ PROCESSOR_DATA(current_processor(), page_grab_count) += 1; - 
PROCESSOR_DATA(current_processor(), free_pages) = mem->snext; + PROCESSOR_DATA(current_processor(), free_pages) = mem->vmp_snext; + VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); enable_preemption(); VM_PAGE_ZERO_PAGEQ_ENTRY(mem); - mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; - - assert(mem->listq.next == 0 && mem->listq.prev == 0); - assert(mem->tabled == FALSE); - assert(mem->vm_page_object == 0); - assert(!mem->laundry); - assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); - assert(mem->busy); - assert(!mem->pmapped); - assert(!mem->wpmapped); + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; + + assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0); + assert(mem->vmp_tabled == FALSE); + assert(mem->vmp_object == 0); + assert(!mem->vmp_laundry); + assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)); + assert(mem->vmp_busy); + assert(!mem->vmp_pmapped); + assert(!mem->vmp_wpmapped); assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); #if CONFIG_BACKGROUND_QUEUE @@ -2737,7 +2860,7 @@ vm_page_grab_options( /* ... but can we try and grab from the secluded queue? 
*/ if (vm_page_secluded_count > 0 && ((grab_options & VM_PAGE_GRAB_SECLUDED) || - task_can_use_secluded_mem(current_task()))) { + task_can_use_secluded_mem(current_task(), TRUE))) { mem = vm_page_grab_secluded(); if (grab_options & VM_PAGE_GRAB_SECLUDED) { vm_page_secluded.grab_for_iokit++; @@ -2747,6 +2870,12 @@ vm_page_grab_options( } if (mem) { VM_CHECK_MEMORYSTATUS; + + disable_preemption(); + PROCESSOR_DATA(current_processor(), page_grab_count) += 1; + VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); + enable_preemption(); + return mem; } } @@ -2807,16 +2936,16 @@ vm_page_grab_options( vm_page_queue_remove_first_with_clump(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq, + vmp_pageq, clump_end); #else vm_page_queue_remove_first(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); #endif - assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_Q); + assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q); VM_PAGE_ZERO_PAGEQ_ENTRY(mem); @@ -2842,20 +2971,20 @@ vm_page_grab_options( if (head == NULL) head = mem; else - tail->snext = mem; + tail->vmp_snext = mem; tail = mem; - assert(mem->listq.next == 0 && mem->listq.prev == 0); - assert(mem->tabled == FALSE); - assert(mem->vm_page_object == 0); - assert(!mem->laundry); + assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0); + assert(mem->vmp_tabled == FALSE); + assert(mem->vmp_object == 0); + assert(!mem->vmp_laundry); - mem->vm_page_q_state = VM_PAGE_ON_FREE_LOCAL_Q; + mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q; - assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); - assert(mem->busy); - assert(!mem->pmapped); - assert(!mem->wpmapped); + assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)); + assert(mem->vmp_busy); + assert(!mem->vmp_pmapped); + assert(!mem->vmp_wpmapped); assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem))); } #if defined (__x86_64__) && (DEVELOPMENT || DEBUG) @@ -2868,18 +2997,19 
@@ vm_page_grab_options( panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__); } #endif /* HIBERNATION */ - PROCESSOR_DATA(current_processor(), free_pages) = head->snext; + PROCESSOR_DATA(current_processor(), free_pages) = head->vmp_snext; PROCESSOR_DATA(current_processor(), start_color) = color; /* * satisfy this request */ PROCESSOR_DATA(current_processor(), page_grab_count) += 1; + VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0); mem = head; - assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q); + assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q); VM_PAGE_ZERO_PAGEQ_ENTRY(mem); - mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; enable_preemption(); } @@ -2893,9 +3023,7 @@ vm_page_grab_options( * We don't have the counts locked ... if they change a little, * it doesn't really matter. */ - if ((vm_page_free_count < vm_page_free_min) || - ((vm_page_free_count < vm_page_free_target) && - ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) + if (vm_page_free_count < vm_page_free_min) thread_wakeup((event_t) &vm_page_free_wanted); VM_CHECK_MEMORYSTATUS; @@ -2936,7 +3064,7 @@ vm_page_grab_secluded(void) /* can we grab from the secluded queue? */ if (vm_page_secluded_count > vm_page_secluded_target || (vm_page_secluded_count > 0 && - task_can_use_secluded_mem(current_task()))) { + task_can_use_secluded_mem(current_task(), TRUE))) { /* OK */ } else { /* can't grab from secluded queue... 
*/ @@ -2955,29 +3083,29 @@ vm_page_grab_secluded(void) assert(!vm_page_queue_empty(&vm_page_queue_secluded)); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); mem = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); - assert(mem->vm_page_q_state == VM_PAGE_ON_SECLUDED_Q); + assert(mem->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); vm_page_queues_remove(mem, TRUE); object = VM_PAGE_OBJECT(mem); - assert(!mem->fictitious); + assert(!mem->vmp_fictitious); assert(!VM_PAGE_WIRED(mem)); if (object == VM_OBJECT_NULL) { /* free for grab! */ vm_page_unlock_queues(); vm_page_secluded.grab_success_free++; - assert(mem->busy); - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(mem->vmp_busy); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL); - assert(mem->pageq.next == 0); - assert(mem->pageq.prev == 0); - assert(mem->listq.next == 0); - assert(mem->listq.prev == 0); + assert(mem->vmp_pageq.next == 0); + assert(mem->vmp_pageq.prev == 0); + assert(mem->vmp_listq.next == 0); + assert(mem->vmp_listq.prev == 0); #if CONFIG_BACKGROUND_QUEUE - assert(mem->vm_page_on_backgroundq == 0); - assert(mem->vm_page_backgroundq.next == 0); - assert(mem->vm_page_backgroundq.prev == 0); + assert(mem->vmp_on_backgroundq == 0); + assert(mem->vmp_backgroundq.next == 0); + assert(mem->vmp_backgroundq.prev == 0); #endif /* CONFIG_BACKGROUND_QUEUE */ return mem; } @@ -2993,24 +3121,24 @@ vm_page_grab_secluded(void) vm_page_unlock_queues(); return VM_PAGE_NULL; } - if (mem->busy || - mem->cleaning || - mem->laundry) { + if (mem->vmp_busy || + mem->vmp_cleaning || + mem->vmp_laundry) { /* can't steal page in this state... 
*/ vm_object_unlock(object); vm_page_secluded.grab_failure_state++; goto reactivate_secluded_page; } - mem->busy = TRUE; + mem->vmp_busy = TRUE; refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem)); if (refmod_state & VM_MEM_REFERENCED) { - mem->reference = TRUE; + mem->vmp_reference = TRUE; } if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(mem, FALSE); } - if (mem->dirty || mem->precious) { + if (mem->vmp_dirty || mem->vmp_precious) { /* can't grab a dirty page; re-activate */ // printf("SECLUDED: dirty page %p\n", mem); PAGE_WAKEUP_DONE(mem); @@ -3018,7 +3146,7 @@ vm_page_grab_secluded(void) vm_object_unlock(object); goto reactivate_secluded_page; } - if (mem->reference) { + if (mem->vmp_reference) { /* it's been used but we do need to grab a page... */ } @@ -3029,22 +3157,22 @@ vm_page_grab_secluded(void) vm_object_unlock(object); object = VM_OBJECT_NULL; if (vm_page_free_verify) { - assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); + assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)); } pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)); vm_page_secluded.grab_success_other++; - assert(mem->busy); - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(mem->vmp_busy); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL); - assert(mem->pageq.next == 0); - assert(mem->pageq.prev == 0); - assert(mem->listq.next == 0); - assert(mem->listq.prev == 0); + assert(mem->vmp_pageq.next == 0); + assert(mem->vmp_pageq.prev == 0); + assert(mem->vmp_listq.next == 0); + assert(mem->vmp_listq.prev == 0); #if CONFIG_BACKGROUND_QUEUE - assert(mem->vm_page_on_backgroundq == 0); - assert(mem->vm_page_backgroundq.next == 0); - assert(mem->vm_page_backgroundq.prev == 0); + assert(mem->vmp_on_backgroundq == 0); + assert(mem->vmp_backgroundq.next == 0); + assert(mem->vmp_backgroundq.prev == 0); #endif /* CONFIG_BACKGROUND_QUEUE */ return mem; @@ -3075,9 +3203,9 @@ 
vm_page_release( LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); } - assert(!mem->private && !mem->fictitious); + assert(!mem->vmp_private && !mem->vmp_fictitious); if (vm_page_free_verify) { - assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); + assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)); } // dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 5); /* (TEST/DEBUG) */ @@ -3085,18 +3213,18 @@ vm_page_release( lck_mtx_lock_spin(&vm_page_queue_free_lock); - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); - assert(mem->busy); - assert(!mem->laundry); - assert(mem->vm_page_object == 0); - assert(mem->pageq.next == 0 && mem->pageq.prev == 0); - assert(mem->listq.next == 0 && mem->listq.prev == 0); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); + assert(mem->vmp_busy); + assert(!mem->vmp_laundry); + assert(mem->vmp_object == 0); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0); + assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0); #if CONFIG_BACKGROUND_QUEUE - assert(mem->vm_page_backgroundq.next == 0 && - mem->vm_page_backgroundq.prev == 0 && - mem->vm_page_on_backgroundq == FALSE); + assert(mem->vmp_backgroundq.next == 0 && + mem->vmp_backgroundq.prev == 0 && + mem->vmp_on_backgroundq == FALSE); #endif - if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && + if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) && vm_lopage_free_count < vm_lopage_free_limit && VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) { /* @@ -3107,14 +3235,14 @@ vm_page_release( vm_page_queue_enter_first(&vm_lopage_queue_free, mem, vm_page_t, - pageq); + vmp_pageq); vm_lopage_free_count++; if (vm_lopage_free_count >= vm_lopage_free_limit) vm_lopage_refill = FALSE; - mem->vm_page_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; - mem->lopage = TRUE; + mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; + mem->vmp_lopage = TRUE; #if CONFIG_SECLUDED_MEMORY } else if 
(vm_page_free_count > vm_page_free_reserved && vm_page_secluded_count < vm_page_secluded_target && @@ -3131,13 +3259,13 @@ vm_page_release( lck_mtx_lock_spin(&vm_page_queue_free_lock); } } - mem->lopage = FALSE; + mem->vmp_lopage = FALSE; LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); vm_page_queue_enter_first(&vm_page_queue_secluded, mem, vm_page_t, - pageq); - mem->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + vmp_pageq); + mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; vm_page_secluded_count++; vm_page_secluded_count_free++; if (!page_queues_locked) { @@ -3150,20 +3278,20 @@ vm_page_release( } #endif /* CONFIG_SECLUDED_MEMORY */ } else { - mem->lopage = FALSE; - mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; + mem->vmp_lopage = FALSE; + mem->vmp_q_state = VM_PAGE_ON_FREE_Q; color = VM_PAGE_GET_COLOR(mem); #if defined(__x86_64__) vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); #else vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); #endif vm_page_free_count++; /* @@ -3202,6 +3330,10 @@ vm_page_release( need_wakeup = 1; } } + vm_pageout_vminfo.vm_page_pages_freed++; + + VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, 1, 0, 0, 0); + lck_mtx_unlock(&vm_page_queue_free_lock); if (need_priv_wakeup) @@ -3230,32 +3362,32 @@ vm_page_release_startup( if (vm_lopage_free_count < vm_lopage_free_limit && VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) { - mem->lopage = TRUE; - mem->vm_page_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; + mem->vmp_lopage = TRUE; + mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q; vm_lopage_free_count++; queue_free = &vm_lopage_queue_free; #if CONFIG_SECLUDED_MEMORY } else if (vm_page_secluded_count < vm_page_secluded_target) { - mem->lopage = FALSE; - mem->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + mem->vmp_lopage = FALSE; + mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; vm_page_secluded_count++; vm_page_secluded_count_free++; queue_free = 
&vm_page_queue_secluded; #endif /* CONFIG_SECLUDED_MEMORY */ } else { - mem->lopage = FALSE; - mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; + mem->vmp_lopage = FALSE; + mem->vmp_q_state = VM_PAGE_ON_FREE_Q; vm_page_free_count++; queue_free = &vm_page_queue_free[VM_PAGE_GET_COLOR(mem)].qhead; } - if (mem->vm_page_q_state == VM_PAGE_ON_FREE_Q) { + if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) { #if defined(__x86_64__) - vm_page_queue_enter_clump(queue_free, mem, vm_page_t, pageq); + vm_page_queue_enter_clump(queue_free, mem, vm_page_t, vmp_pageq); #else - vm_page_queue_enter(queue_free, mem, vm_page_t, pageq); + vm_page_queue_enter(queue_free, mem, vm_page_t, vmp_pageq); #endif } else - vm_page_queue_enter_first(queue_free, mem, vm_page_t, pageq); + vm_page_queue_enter_first(queue_free, mem, vm_page_t, vmp_pageq); } /* @@ -3302,7 +3434,7 @@ vm_page_wait( wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible); #if CONFIG_SECLUDED_MEMORY } else if (secluded_for_apps && - task_can_use_secluded_mem(current_task())) { + task_can_use_secluded_mem(current_task(), FALSE)) { #if 00 /* XXX FBDP: need pageq lock for this... 
*/ /* XXX FBDP: might wait even if pages available, */ @@ -3332,7 +3464,7 @@ vm_page_wait( thread_wakeup((event_t)&vm_page_free_wanted); if (wait_result == THREAD_WAITING) { - VM_DEBUG_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START, + VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START, vm_page_free_wanted_privileged, vm_page_free_wanted, #if CONFIG_SECLUDED_MEMORY @@ -3342,8 +3474,8 @@ vm_page_wait( #endif /* CONFIG_SECLUDED_MEMORY */ 0); wait_result = thread_block(THREAD_CONTINUE_NULL); - VM_DEBUG_EVENT(vm_page_wait_block, - VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); + VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, + VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0); } return (wait_result == THREAD_AWAKENED); @@ -3435,15 +3567,15 @@ vm_page_free_prepare_queues( VM_PAGE_CHECK(mem); - assert(mem->vm_page_q_state != VM_PAGE_ON_FREE_Q); - assert(!mem->cleaning); + assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q); + assert(!mem->vmp_cleaning); m_object = VM_PAGE_OBJECT(mem); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); if (m_object) { vm_object_lock_assert_exclusive(m_object); } - if (mem->laundry) { + if (mem->vmp_laundry) { /* * We may have to free a page while it's being laundered * if we lost its pager (due to a forced unmount, for example). 
@@ -3458,7 +3590,7 @@ vm_page_free_prepare_queues( vm_page_queues_remove(mem, TRUE); if (VM_PAGE_WIRED(mem)) { - assert(mem->wire_count > 0); + assert(mem->vmp_wire_count > 0); if (m_object) { @@ -3476,10 +3608,22 @@ vm_page_free_prepare_queues( } if ((m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) && - m_object->vo_purgeable_owner != TASK_NULL) { - task_t owner; - - owner = m_object->vo_purgeable_owner; + m_object->vo_owner != TASK_NULL) { + task_t owner; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + boolean_t do_footprint; + + owner = VM_OBJECT_OWNER(m_object); + vm_object_ledger_tag_ledgers( + m_object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); /* * While wired, this page was accounted * as "non-volatile" but it should now @@ -3487,26 +3631,28 @@ vm_page_free_prepare_queues( */ /* one less "non-volatile"... */ ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - PAGE_SIZE); - /* ... and "phys_footprint" */ - ledger_debit(owner->ledger, - task_ledgers.phys_footprint, + ledger_idx_nonvolatile, PAGE_SIZE); + if (do_footprint) { + /* ... 
and "phys_footprint" */ + ledger_debit(owner->ledger, + task_ledgers.phys_footprint, + PAGE_SIZE); + } /* one more "volatile" */ ledger_credit(owner->ledger, - task_ledgers.purgeable_volatile, + ledger_idx_volatile, PAGE_SIZE); } } - if (!mem->private && !mem->fictitious) + if (!mem->vmp_private && !mem->vmp_fictitious) vm_page_wire_count--; - mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; - mem->wire_count = 0; - assert(!mem->gobbled); - } else if (mem->gobbled) { - if (!mem->private && !mem->fictitious) + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; + mem->vmp_wire_count = 0; + assert(!mem->vmp_gobbled); + } else if (mem->vmp_gobbled) { + if (!mem->vmp_private && !mem->vmp_fictitious) vm_page_wire_count--; vm_page_gobble_count--; } @@ -3518,27 +3664,27 @@ vm_page_free_prepare_object( vm_page_t mem, boolean_t remove_from_hash) { - if (mem->tabled) + if (mem->vmp_tabled) vm_page_remove(mem, remove_from_hash); /* clears tabled, object, offset */ PAGE_WAKEUP(mem); /* clears wanted */ - if (mem->private) { - mem->private = FALSE; - mem->fictitious = TRUE; + if (mem->vmp_private) { + mem->vmp_private = FALSE; + mem->vmp_fictitious = TRUE; VM_PAGE_SET_PHYS_PAGE(mem, vm_page_fictitious_addr); } - if ( !mem->fictitious) { - assert(mem->pageq.next == 0); - assert(mem->pageq.prev == 0); - assert(mem->listq.next == 0); - assert(mem->listq.prev == 0); + if ( !mem->vmp_fictitious) { + assert(mem->vmp_pageq.next == 0); + assert(mem->vmp_pageq.prev == 0); + assert(mem->vmp_listq.next == 0); + assert(mem->vmp_listq.prev == 0); #if CONFIG_BACKGROUND_QUEUE - assert(mem->vm_page_backgroundq.next == 0); - assert(mem->vm_page_backgroundq.prev == 0); + assert(mem->vmp_backgroundq.next == 0); + assert(mem->vmp_backgroundq.prev == 0); #endif /* CONFIG_BACKGROUND_QUEUE */ - assert(mem->next_m == 0); - vm_page_init(mem, VM_PAGE_GET_PHYS_PAGE(mem), mem->lopage); + assert(mem->vmp_next_m == 0); + vm_page_init(mem, VM_PAGE_GET_PHYS_PAGE(mem), mem->vmp_lopage); } } @@ -3557,7 +3703,7 @@ vm_page_free( { 
vm_page_free_prepare(mem); - if (mem->fictitious) { + if (mem->vmp_fictitious) { vm_page_release_fictitious(mem); } else { vm_page_release(mem, @@ -3577,7 +3723,7 @@ vm_page_free_unlocked( vm_page_free_prepare_object(mem, remove_from_hash); - if (mem->fictitious) { + if (mem->vmp_fictitious) { vm_page_release_fictitious(mem); } else { vm_page_release(mem, FALSE); /* page queues are not locked */ @@ -3621,27 +3767,27 @@ vm_page_free_list( */ while (mem && pg_count < 64) { - assert((mem->vm_page_q_state == VM_PAGE_NOT_ON_Q) || - (mem->vm_page_q_state == VM_PAGE_IS_WIRED)); + assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) || + (mem->vmp_q_state == VM_PAGE_IS_WIRED)); #if CONFIG_BACKGROUND_QUEUE - assert(mem->vm_page_backgroundq.next == 0 && - mem->vm_page_backgroundq.prev == 0 && - mem->vm_page_on_backgroundq == FALSE); + assert(mem->vmp_backgroundq.next == 0 && + mem->vmp_backgroundq.prev == 0 && + mem->vmp_on_backgroundq == FALSE); #endif - nxt = mem->snext; - mem->snext = NULL; - assert(mem->pageq.prev == 0); + nxt = mem->vmp_snext; + mem->vmp_snext = NULL; + assert(mem->vmp_pageq.prev == 0); - if (vm_page_free_verify && !mem->fictitious && !mem->private) { - assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem))); + if (vm_page_free_verify && !mem->vmp_fictitious && !mem->vmp_private) { + assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem)); } if (prepare_object == TRUE) vm_page_free_prepare_object(mem, TRUE); - if (!mem->fictitious) { - assert(mem->busy); + if (!mem->vmp_fictitious) { + assert(mem->vmp_busy); - if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) && + if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) && vm_lopage_free_count < vm_lopage_free_limit && VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) { vm_page_release(mem, FALSE); /* page queues are not locked */ @@ -3661,7 +3807,7 @@ vm_page_free_list( * cause trouble because the page is not actually * in the free queue yet... 
*/ - mem->snext = local_freeq; + mem->vmp_snext = local_freeq; local_freeq = mem; pg_count++; @@ -3689,30 +3835,33 @@ vm_page_free_list( while (mem) { int color; - nxt = mem->snext; + nxt = mem->vmp_snext; - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); - assert(mem->busy); - mem->lopage = FALSE; - mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); + assert(mem->vmp_busy); + mem->vmp_lopage = FALSE; + mem->vmp_q_state = VM_PAGE_ON_FREE_Q; color = VM_PAGE_GET_COLOR(mem); #if defined(__x86_64__) vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); #else vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); #endif mem = nxt; } + vm_pageout_vminfo.vm_page_pages_freed += pg_count; vm_page_free_count += pg_count; avail_free_count = vm_page_free_count; + VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, pg_count, 0, 0, 0); + if (vm_page_free_wanted_privileged > 0 && avail_free_count > 0) { if (avail_free_count < vm_page_free_wanted_privileged) { @@ -3825,7 +3974,7 @@ vm_page_wire( m_object = VM_PAGE_OBJECT(mem); -// dbgLog(current_thread(), mem->offset, m_object, 1); /* (TEST/DEBUG) */ +// dbgLog(current_thread(), mem->vmp_offset, m_object, 1); /* (TEST/DEBUG) */ VM_PAGE_CHECK(mem); if (m_object) { @@ -3844,13 +3993,13 @@ vm_page_wire( LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); if ( !VM_PAGE_WIRED(mem)) { - if (mem->laundry) + if (mem->vmp_laundry) vm_pageout_steal_laundry(mem, TRUE); vm_page_queues_remove(mem, TRUE); - assert(mem->wire_count == 0); - mem->vm_page_q_state = VM_PAGE_IS_WIRED; + assert(mem->vmp_wire_count == 0); + mem->vmp_q_state = VM_PAGE_IS_WIRED; if (m_object) { @@ -3867,22 +4016,36 @@ vm_page_wire( } if ((m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) && - m_object->vo_purgeable_owner != TASK_NULL) { - task_t owner; - - owner = 
m_object->vo_purgeable_owner; + m_object->vo_owner != TASK_NULL) { + task_t owner; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + boolean_t do_footprint; + + owner = VM_OBJECT_OWNER(m_object); + vm_object_ledger_tag_ledgers( + m_object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); /* less volatile bytes */ ledger_debit(owner->ledger, - task_ledgers.purgeable_volatile, + ledger_idx_volatile, PAGE_SIZE); /* more not-quite-volatile bytes */ ledger_credit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - PAGE_SIZE); - /* more footprint */ - ledger_credit(owner->ledger, - task_ledgers.phys_footprint, + ledger_idx_nonvolatile, PAGE_SIZE); + if (do_footprint) { + /* more footprint */ + ledger_credit(owner->ledger, + task_ledgers.phys_footprint, + PAGE_SIZE); + } } if (m_object->all_reusable) { /* @@ -3890,34 +4053,34 @@ vm_page_wire( * in "all_reusable" VM objects, so nothing * to do here. */ - } else if (mem->reusable) { + } else if (mem->vmp_reusable) { /* * This page is not "re-usable" when it's * wired, so adjust its state and the * accounting. 
*/ vm_object_reuse_pages(m_object, - mem->offset, - mem->offset+PAGE_SIZE_64, + mem->vmp_offset, + mem->vmp_offset+PAGE_SIZE_64, FALSE); } } - assert(!mem->reusable); + assert(!mem->vmp_reusable); - if (!mem->private && !mem->fictitious && !mem->gobbled) + if (!mem->vmp_private && !mem->vmp_fictitious && !mem->vmp_gobbled) vm_page_wire_count++; - if (mem->gobbled) + if (mem->vmp_gobbled) vm_page_gobble_count--; - mem->gobbled = FALSE; + mem->vmp_gobbled = FALSE; if (check_memorystatus == TRUE) { VM_CHECK_MEMORYSTATUS; } } - assert(!mem->gobbled); - assert(mem->vm_page_q_state == VM_PAGE_IS_WIRED); - mem->wire_count++; - if (__improbable(mem->wire_count == 0)) { + assert(!mem->vmp_gobbled); + assert(mem->vmp_q_state == VM_PAGE_IS_WIRED); + mem->vmp_wire_count++; + if (__improbable(mem->vmp_wire_count == 0)) { panic("vm_page_wire(%p): wire_count overflow", mem); } VM_PAGE_CHECK(mem); @@ -3940,23 +4103,23 @@ vm_page_unwire( m_object = VM_PAGE_OBJECT(mem); -// dbgLog(current_thread(), mem->offset, m_object, 0); /* (TEST/DEBUG) */ +// dbgLog(current_thread(), mem->vmp_offset, m_object, 0); /* (TEST/DEBUG) */ VM_PAGE_CHECK(mem); assert(VM_PAGE_WIRED(mem)); - assert(mem->wire_count > 0); - assert(!mem->gobbled); + assert(mem->vmp_wire_count > 0); + assert(!mem->vmp_gobbled); assert(m_object != VM_OBJECT_NULL); vm_object_lock_assert_exclusive(m_object); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - if (--mem->wire_count == 0) { + if (--mem->vmp_wire_count == 0) { - mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object); VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem); VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag); - if (!mem->private && !mem->fictitious) { + if (!mem->vmp_private && !mem->vmp_fictitious) { vm_page_wire_count--; } @@ -3969,25 +4132,39 @@ vm_page_unwire( } if ((m_object->purgable == VM_PURGABLE_VOLATILE || m_object->purgable == VM_PURGABLE_EMPTY) && - 
m_object->vo_purgeable_owner != TASK_NULL) { - task_t owner; - - owner = m_object->vo_purgeable_owner; + m_object->vo_owner != TASK_NULL) { + task_t owner; + int ledger_idx_volatile; + int ledger_idx_nonvolatile; + int ledger_idx_volatile_compressed; + int ledger_idx_nonvolatile_compressed; + boolean_t do_footprint; + + owner = VM_OBJECT_OWNER(m_object); + vm_object_ledger_tag_ledgers( + m_object, + &ledger_idx_volatile, + &ledger_idx_nonvolatile, + &ledger_idx_volatile_compressed, + &ledger_idx_nonvolatile_compressed, + &do_footprint); /* more volatile bytes */ ledger_credit(owner->ledger, - task_ledgers.purgeable_volatile, + ledger_idx_volatile, PAGE_SIZE); /* less not-quite-volatile bytes */ ledger_debit(owner->ledger, - task_ledgers.purgeable_nonvolatile, - PAGE_SIZE); - /* less footprint */ - ledger_debit(owner->ledger, - task_ledgers.phys_footprint, + ledger_idx_nonvolatile, PAGE_SIZE); + if (do_footprint) { + /* less footprint */ + ledger_debit(owner->ledger, + task_ledgers.phys_footprint, + PAGE_SIZE); + } } assert(m_object != kernel_object); - assert(mem->pageq.next == 0 && mem->pageq.prev == 0); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0); if (queueit == TRUE) { if (m_object->purgable == VM_PURGABLE_EMPTY) { @@ -4042,15 +4219,15 @@ vm_page_deactivate_internal( * inactive queue. Note wired pages should not have * their reference bit cleared. */ - assert ( !(m->absent && !m->unusual)); + assert ( !(m->vmp_absent && !m->vmp_unusual)); - if (m->gobbled) { /* can this happen? */ + if (m->vmp_gobbled) { /* can this happen? */ assert( !VM_PAGE_WIRED(m)); - if (!m->private && !m->fictitious) + if (!m->vmp_private && !m->vmp_fictitious) vm_page_wire_count--; vm_page_gobble_count--; - m->gobbled = FALSE; + m->vmp_gobbled = FALSE; } /* * if this page is currently on the pageout queue, we can't do the @@ -4060,29 +4237,29 @@ vm_page_deactivate_internal( * reference which is held on the object while the page is in the pageout queue... 
* just let the normal laundry processing proceed */ - if (m->laundry || m->private || m->fictitious || - (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || - (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) || + if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious || + (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) || VM_PAGE_WIRED(m)) { return; } - if (!m->absent && clear_hw_reference == TRUE) + if (!m->vmp_absent && clear_hw_reference == TRUE) pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m)); - m->reference = FALSE; - m->no_cache = FALSE; + m->vmp_reference = FALSE; + m->vmp_no_cache = FALSE; if ( !VM_PAGE_INACTIVE(m)) { vm_page_queues_remove(m, FALSE); if (!VM_DYNAMIC_PAGING_ENABLED() && - m->dirty && m_object->internal && + m->vmp_dirty && m_object->internal && (m_object->purgable == VM_PURGABLE_DENY || m_object->purgable == VM_PURGABLE_NONVOLATILE || m_object->purgable == VM_PURGABLE_VOLATILE)) { vm_page_check_pageable_safe(m); - vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; + vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; } else { if (m_object->named && m_object->ref_count == 1) { @@ -4115,17 +4292,17 @@ void vm_page_enqueue_cleaned(vm_page_t m) assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - assert( !(m->absent && !m->unusual)); + assert( !(m->vmp_absent && !m->vmp_unusual)); if (VM_PAGE_WIRED(m)) { return; } - if (m->gobbled) { - if (!m->private && !m->fictitious) + if (m->vmp_gobbled) { + if (!m->vmp_private && !m->vmp_fictitious) vm_page_wire_count--; vm_page_gobble_count--; - m->gobbled = FALSE; + m->vmp_gobbled = FALSE; } /* * if this page is currently on the pageout queue, we can't do the @@ -4135,16 +4312,16 @@ void vm_page_enqueue_cleaned(vm_page_t m) * reference which is 
held on the object while the page is in the pageout queue... * just let the normal laundry processing proceed */ - if (m->laundry || m->private || m->fictitious || - (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || - (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { + if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious || + (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || + (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) { return; } vm_page_queues_remove(m, FALSE); vm_page_check_pageable_safe(m); - vm_page_queue_enter(&vm_page_queue_cleaned, m, vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q; + vm_page_queue_enter(&vm_page_queue_cleaned, m, vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q; vm_page_cleaned_count++; vm_page_inactive_count++; @@ -4154,10 +4331,10 @@ void vm_page_enqueue_cleaned(vm_page_t m) vm_page_pageable_external_count++; } #if CONFIG_BACKGROUND_QUEUE - if (m->vm_page_in_background) + if (m->vmp_in_background) vm_page_add_to_backgroundq(m, TRUE); #endif - vm_pageout_enqueued_cleaned++; + VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1); } /* @@ -4182,14 +4359,14 @@ vm_page_activate( #endif assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - assert( !(m->absent && !m->unusual)); + assert( !(m->vmp_absent && !m->vmp_unusual)); - if (m->gobbled) { + if (m->vmp_gobbled) { assert( !VM_PAGE_WIRED(m)); - if (!m->private && !m->fictitious) + if (!m->vmp_private && !m->vmp_fictitious) vm_page_wire_count--; vm_page_gobble_count--; - m->gobbled = FALSE; + m->vmp_gobbled = FALSE; } /* * if this page is currently on the pageout queue, we can't do the @@ -4199,17 +4376,17 @@ vm_page_activate( * reference which is held on the object while the page is in the pageout queue... 
* just let the normal laundry processing proceed */ - if (m->laundry || m->private || m->fictitious || - (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || - (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) + if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious || + (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) return; #if DEBUG - if (m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q) + if (m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q) panic("vm_page_activate: already active"); #endif - if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) { + if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) { DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL); } @@ -4219,12 +4396,12 @@ vm_page_activate( if ( !VM_PAGE_WIRED(m)) { vm_page_check_pageable_safe(m); if (!VM_DYNAMIC_PAGING_ENABLED() && - m->dirty && m_object->internal && + m->vmp_dirty && m_object->internal && (m_object->purgable == VM_PURGABLE_DENY || m_object->purgable == VM_PURGABLE_NONVOLATILE || m_object->purgable == VM_PURGABLE_VOLATILE)) { - vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q; + vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; vm_page_throttled_count++; } else { #if CONFIG_SECLUDED_MEMORY @@ -4233,8 +4410,8 @@ vm_page_activate( num_tasks_can_use_secluded_mem == 0 && m_object->eligible_for_secluded) { vm_page_queue_enter(&vm_page_queue_secluded, m, - vm_page_t, pageq); - m->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q; + vm_page_t, vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q; vm_page_secluded_count++; vm_page_secluded_count_inuse++; assert(!m_object->internal); @@ -4243,8 +4420,8 @@ vm_page_activate( #endif /* CONFIG_SECLUDED_MEMORY */ vm_page_enqueue_active(m, FALSE); } - m->reference = TRUE; - m->no_cache = FALSE; + m->vmp_reference = TRUE; + m->vmp_no_cache = FALSE; } VM_PAGE_CHECK(m); } 
@@ -4272,7 +4449,7 @@ vm_page_speculate( assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - assert( !(m->absent && !m->unusual)); + assert( !(m->vmp_absent && !m->vmp_unusual)); assert(m_object->internal == FALSE); /* @@ -4283,9 +4460,9 @@ vm_page_speculate( * reference which is held on the object while the page is in the pageout queue... * just let the normal laundry processing proceed */ - if (m->laundry || m->private || m->fictitious || - (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || - (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) + if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious || + (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) return; vm_page_queues_remove(m, FALSE); @@ -4309,9 +4486,8 @@ vm_page_speculate( /* * set the timer to begin a new group */ - aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000; - aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; - + aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000; + aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; ADD_MACH_TIMESPEC(&aq->age_ts, &ts); } else { aq = &vm_page_queue_speculative[speculative_age_index]; @@ -4333,14 +4509,13 @@ vm_page_speculate( if (!vm_page_queue_empty(&aq->age_q)) vm_page_speculate_ageit(aq); - aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000; - aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; - + aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000; + aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC; ADD_MACH_TIMESPEC(&aq->age_ts, &ts); } } - vm_page_enqueue_tail(&aq->age_q, &m->pageq); - m->vm_page_q_state = VM_PAGE_ON_SPECULATIVE_Q; + vm_page_enqueue_tail(&aq->age_q, &m->vmp_pageq); + m->vmp_q_state = VM_PAGE_ON_SPECULATIVE_Q; 
vm_page_speculative_count++; vm_page_pageable_external_count++; @@ -4376,19 +4551,19 @@ vm_page_speculate_ageit(struct vm_speculative_age_q *aq) sq->age_q.prev = aq->age_q.prev; t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next); - t->pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q); + t->vmp_pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q); t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev); - t->pageq.next = VM_PAGE_PACK_PTR(&sq->age_q); + t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q); } else { t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev); - t->pageq.next = aq->age_q.next; + t->vmp_pageq.next = aq->age_q.next; t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next); - t->pageq.prev = sq->age_q.prev; + t->vmp_pageq.prev = sq->age_q.prev; t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev); - t->pageq.next = VM_PAGE_PACK_PTR(&sq->age_q); + t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q); sq->age_q.prev = aq->age_q.prev; } @@ -4405,6 +4580,23 @@ vm_page_lru( assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); + + if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q) { + /* + * we don't need to do all the other work that + * vm_page_queues_remove and vm_page_enqueue_inactive + * bring along for the ride + */ + assert(!m->vmp_laundry); + assert(!m->vmp_private); + + m->vmp_no_cache = FALSE; + + vm_page_queue_remove(&vm_page_queue_inactive, m, vm_page_t, vmp_pageq); + vm_page_queue_enter(&vm_page_queue_inactive, m, vm_page_t, vmp_pageq); + + return; + } /* * if this page is currently on the pageout queue, we can't do the * vm_page_queues_remove (which doesn't handle the pageout queue case) @@ -4413,13 +4605,13 @@ vm_page_lru( * reference which is held on the object while the page is in the pageout queue... 
* just let the normal laundry processing proceed */ - if (m->laundry || m->private || - (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || - (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) || + if (m->vmp_laundry || m->vmp_private || + (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || + (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) || VM_PAGE_WIRED(m)) return; - m->no_cache = FALSE; + m->vmp_no_cache = FALSE; vm_page_queues_remove(m, FALSE); @@ -4448,9 +4640,9 @@ vm_page_reactivate_all_throttled(void) /* * Switch "throttled" pages to "active". */ - vm_page_queue_iterate(&vm_page_queue_throttled, m, vm_page_t, pageq) { + vm_page_queue_iterate(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq) { VM_PAGE_CHECK(m); - assert(m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q); + assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q); m_object = VM_PAGE_OBJECT(m); @@ -4461,10 +4653,10 @@ vm_page_reactivate_all_throttled(void) extra_external_count++; } - m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; + m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; VM_PAGE_CHECK(m); #if CONFIG_BACKGROUND_QUEUE - if (m->vm_page_in_background) + if (m->vmp_in_background) vm_page_add_to_backgroundq(m, FALSE); #endif } @@ -4481,11 +4673,11 @@ vm_page_reactivate_all_throttled(void) if (vm_page_queue_empty(&vm_page_queue_active)) { vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled); } else { - first_active->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled); + first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled); } vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled); - first_throttled->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active); - last_throttled->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active); + first_throttled->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active); + last_throttled->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active); #if DEBUG printf("reactivated %d throttled 
pages\n", vm_page_throttled_count); @@ -4541,20 +4733,20 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks) */ assert(!vm_page_queue_empty(&lq->vpl_queue)); - vm_page_queue_iterate(&lq->vpl_queue, m, vm_page_t, pageq) { + vm_page_queue_iterate(&lq->vpl_queue, m, vm_page_t, vmp_pageq) { VM_PAGE_CHECK(m); vm_page_check_pageable_safe(m); - assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q); - assert(!m->fictitious); + assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q); + assert(!m->vmp_fictitious); - if (m->local_id != lid) + if (m->vmp_local_id != lid) panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m); - m->local_id = 0; - m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; + m->vmp_local_id = 0; + m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; VM_PAGE_CHECK(m); #if CONFIG_BACKGROUND_QUEUE - if (m->vm_page_in_background) + if (m->vmp_in_background) vm_page_add_to_backgroundq(m, FALSE); #endif count++; @@ -4572,11 +4764,11 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks) if (vm_page_queue_empty(&vm_page_queue_active)) { vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); } else { - first_active->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); + first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); } vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); - first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active); - last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active); + first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active); + last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active); vm_page_queue_init(&lq->vpl_queue); /* @@ -4593,6 +4785,8 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks) if (nolocks == FALSE) { VPL_UNLOCK(&lq->vpl_lock); + + vm_page_balance_inactive(count / 4); vm_page_unlock_queues(); } } @@ -4655,7 
+4849,7 @@ vm_page_zero_fill( { XPR(XPR_VM_PAGE, "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n", - VM_PAGE_OBJECT(m), m->offset, m, 0,0); + VM_PAGE_OBJECT(m), m->vmp_offset, m, 0,0); #if 0 /* * we don't hold the page queue lock @@ -4714,8 +4908,8 @@ vm_page_copy( XPR(XPR_VM_PAGE, "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n", - src_m_object, src_m->offset, - VM_PAGE_OBJECT(dest_m), dest_m->offset, + src_m_object, src_m->vmp_offset, + VM_PAGE_OBJECT(dest_m), dest_m->vmp_offset, 0); #if 0 /* @@ -4740,33 +4934,22 @@ vm_page_copy( #if DEVELOPMENT || DEBUG DTRACE_VM4(codesigned_copy, vm_object_t, src_m_object, - vm_object_offset_t, src_m->offset, - int, src_m->cs_validated, - int, src_m->cs_tainted); + vm_object_offset_t, src_m->vmp_offset, + int, src_m->vmp_cs_validated, + int, src_m->vmp_cs_tainted); #endif /* DEVELOPMENT || DEBUG */ } - if (vm_page_is_slideable(src_m)) { - boolean_t was_busy = src_m->busy; - src_m->busy = TRUE; - (void) vm_page_slide(src_m, 0); - assert(src_m->busy); - if (!was_busy) { - PAGE_WAKEUP_DONE(src_m); - } - } - /* * Propagate the cs_tainted bit to the copy page. Do not propagate * the cs_validated bit. */ - dest_m->cs_tainted = src_m->cs_tainted; - if (dest_m->cs_tainted) { + dest_m->vmp_cs_tainted = src_m->vmp_cs_tainted; + if (dest_m->vmp_cs_tainted) { vm_page_copy_cs_tainted++; } - dest_m->slid = src_m->slid; - dest_m->error = src_m->error; /* sliding src_m might have failed... */ + dest_m->vmp_error = src_m->vmp_error; /* sliding src_m might have failed... 
*/ pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m)); } @@ -4777,45 +4960,45 @@ _vm_page_print( { printf("vm_page %p: \n", p); printf(" pageq: next=%p prev=%p\n", - (vm_page_t)VM_PAGE_UNPACK_PTR(p->pageq.next), - (vm_page_t)VM_PAGE_UNPACK_PTR(p->pageq.prev)); + (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next), + (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.prev)); printf(" listq: next=%p prev=%p\n", - (vm_page_t)(VM_PAGE_UNPACK_PTR(p->listq.next)), - (vm_page_t)(VM_PAGE_UNPACK_PTR(p->listq.prev))); - printf(" next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->next_m))); - printf(" object=%p offset=0x%llx\n",VM_PAGE_OBJECT(p), p->offset); - printf(" wire_count=%u\n", p->wire_count); - printf(" q_state=%u\n", p->vm_page_q_state); + (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.next)), + (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.prev))); + printf(" next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m))); + printf(" object=%p offset=0x%llx\n",VM_PAGE_OBJECT(p), p->vmp_offset); + printf(" wire_count=%u\n", p->vmp_wire_count); + printf(" q_state=%u\n", p->vmp_q_state); printf(" %slaundry, %sref, %sgobbled, %sprivate\n", - (p->laundry ? "" : "!"), - (p->reference ? "" : "!"), - (p->gobbled ? "" : "!"), - (p->private ? "" : "!")); + (p->vmp_laundry ? "" : "!"), + (p->vmp_reference ? "" : "!"), + (p->vmp_gobbled ? "" : "!"), + (p->vmp_private ? "" : "!")); printf(" %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n", - (p->busy ? "" : "!"), - (p->wanted ? "" : "!"), - (p->tabled ? "" : "!"), - (p->fictitious ? "" : "!"), - (p->pmapped ? "" : "!"), - (p->wpmapped ? "" : "!")); + (p->vmp_busy ? "" : "!"), + (p->vmp_wanted ? "" : "!"), + (p->vmp_tabled ? "" : "!"), + (p->vmp_fictitious ? "" : "!"), + (p->vmp_pmapped ? "" : "!"), + (p->vmp_wpmapped ? "" : "!")); printf(" %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n", - (p->free_when_done ? "" : "!"), - (p->absent ? "" : "!"), - (p->error ? 
"" : "!"), - (p->dirty ? "" : "!"), - (p->cleaning ? "" : "!"), - (p->precious ? "" : "!"), - (p->clustered ? "" : "!")); + (p->vmp_free_when_done ? "" : "!"), + (p->vmp_absent ? "" : "!"), + (p->vmp_error ? "" : "!"), + (p->vmp_dirty ? "" : "!"), + (p->vmp_cleaning ? "" : "!"), + (p->vmp_precious ? "" : "!"), + (p->vmp_clustered ? "" : "!")); printf(" %soverwriting, %srestart, %sunusual\n", - (p->overwriting ? "" : "!"), - (p->restart ? "" : "!"), - (p->unusual ? "" : "!")); + (p->vmp_overwriting ? "" : "!"), + (p->vmp_restart ? "" : "!"), + (p->vmp_unusual ? "" : "!")); printf(" %scs_validated, %scs_tainted, %scs_nx, %sno_cache\n", - (p->cs_validated ? "" : "!"), - (p->cs_tainted ? "" : "!"), - (p->cs_nx ? "" : "!"), - (p->no_cache ? "" : "!")); + (p->vmp_cs_validated ? "" : "!"), + (p->vmp_cs_tainted ? "" : "!"), + (p->vmp_cs_nx ? "" : "!"), + (p->vmp_no_cache ? "" : "!")); printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p)); } @@ -4880,28 +5063,28 @@ vm_page_verify_free_list( vm_page_queue_iterate(vm_page_queue, m, vm_page_t, - pageq) { + vmp_pageq) { if (m == look_for_page) { found_page = TRUE; } - if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.prev) != prev_m) + if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev) != prev_m) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p\n", - color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.prev), prev_m); - if ( ! m->busy ) + color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev), prev_m); + if ( ! 
m->vmp_busy ) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy\n", color, npages, m); if (color != (unsigned int) -1) { if (VM_PAGE_GET_COLOR(m) != color) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u\n", color, npages, m, VM_PAGE_GET_COLOR(m), color); - if (m->vm_page_q_state != VM_PAGE_ON_FREE_Q) + if (m->vmp_q_state != VM_PAGE_ON_FREE_Q) panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d\n", - color, npages, m, m->vm_page_q_state); + color, npages, m, m->vmp_q_state); } else { - if (m->vm_page_q_state != VM_PAGE_ON_FREE_LOCAL_Q) + if (m->vmp_q_state != VM_PAGE_ON_FREE_LOCAL_Q) panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d\n", - npages, m, m->vm_page_q_state); + npages, m, m->vmp_q_state); } ++npages; prev_m = m; @@ -5150,25 +5333,25 @@ vm_page_find_contiguous( for(npages = 0; npages < contig_pages; npages++, last_idx++) { - assert(vm_pages[last_idx].gobbled == FALSE); + assert(vm_pages[last_idx].vmp_gobbled == FALSE); - vm_pages[last_idx].gobbled = TRUE; + vm_pages[last_idx].vmp_gobbled = TRUE; vm_page_gobble_count++; - assert(1 == vm_pages[last_idx].wire_count); + assert(1 == vm_pages[last_idx].vmp_wire_count); /* * Gobbled pages are counted as wired pages. So no need to drop * the global wired page count. Just the page's wire count is fine. */ - vm_pages[last_idx].wire_count--; - vm_pages[last_idx].vm_page_q_state = VM_PAGE_NOT_ON_Q; + vm_pages[last_idx].vmp_wire_count--; + vm_pages[last_idx].vmp_q_state = VM_PAGE_NOT_ON_Q; } } last_idx = start_idx + contig_pages - 1; - vm_pages[last_idx].snext = NULL; + vm_pages[last_idx].vmp_snext = NULL; printf("Using preallocated buffer: Requested size (pages):%d... 
index range: %d-%d...freeing %llu pages\n", contig_pages, start_idx, last_idx, PREALLOCATED_CONTIG_BUFFER_PAGES_COUNT - contig_pages); @@ -5222,8 +5405,8 @@ vm_page_find_contiguous( scanned++; m = &vm_pages[page_idx]; - assert(!m->fictitious); - assert(!m->private); + assert(!m->vmp_fictitious); + assert(!m->vmp_private); if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) { /* no more low pages... */ @@ -5235,9 +5418,9 @@ vm_page_find_contiguous( */ RESET_STATE_OF_RUN(); - } else if (VM_PAGE_WIRED(m) || m->gobbled || - m->laundry || m->wanted || - m->cleaning || m->overwriting || m->free_when_done) { + } else if (VM_PAGE_WIRED(m) || m->vmp_gobbled || + m->vmp_laundry || m->vmp_wanted || + m->vmp_cleaning || m->vmp_overwriting || m->vmp_free_when_done) { /* * page is in a transient state * or a state we don't want to deal @@ -5246,14 +5429,14 @@ vm_page_find_contiguous( */ RESET_STATE_OF_RUN(); - } else if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) || - (m->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q) || - (m->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) || - (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { + } else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || + (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) || + (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) || + (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) { /* * page needs to be on one of our queues (other then the pageout or special free queues) * or it needs to belong to the compressor pool (which is now indicated - * by vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out + * by vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out * from the check for VM_PAGE_NOT_ON_Q) * in order for it to be stable behind the * locks we hold at this point... 
@@ -5262,7 +5445,7 @@ vm_page_find_contiguous( */ RESET_STATE_OF_RUN(); - } else if ((m->vm_page_q_state != VM_PAGE_ON_FREE_Q) && (!m->tabled || m->busy)) { + } else if ((m->vmp_q_state != VM_PAGE_ON_FREE_Q) && (!m->vmp_tabled || m->vmp_busy)) { /* * pages on the free list are always 'busy' * so we couldn't test for 'busy' in the check @@ -5292,7 +5475,7 @@ vm_page_find_contiguous( prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m); VM_PAGE_CHECK(m); - if (m->vm_page_q_state == VM_PAGE_ON_FREE_Q) { + if (m->vmp_q_state == VM_PAGE_ON_FREE_Q) { free_considered++; } else { /* @@ -5305,7 +5488,7 @@ vm_page_find_contiguous( * into a substitute page. */ #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL - if (m->pmapped || m->dirty || m->precious) { + if (m->vmp_pmapped || m->vmp_dirty || m->vmp_precious) { substitute_needed++; } #else @@ -5413,10 +5596,10 @@ vm_page_find_contiguous( m1 = &vm_pages[start_idx++]; #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL - assert(m1->vm_page_q_state == VM_PAGE_ON_FREE_Q); + assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q); #endif - if (m1->vm_page_q_state == VM_PAGE_ON_FREE_Q) { + if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) { unsigned int color; color = VM_PAGE_GET_COLOR(m1); @@ -5426,7 +5609,7 @@ vm_page_find_contiguous( vm_page_queue_remove(&vm_page_queue_free[color].qhead, m1, vm_page_t, - pageq); + vmp_pageq); VM_PAGE_ZERO_PAGEQ_ENTRY(m1); #if MACH_ASSERT @@ -5437,8 +5620,8 @@ vm_page_find_contiguous( * does not get considered for another * concurrent physically-contiguous allocation. 
*/ - m1->vm_page_q_state = VM_PAGE_NOT_ON_Q; - assert(m1->busy); + m1->vmp_q_state = VM_PAGE_NOT_ON_Q; + assert(m1->vmp_busy); vm_page_free_count--; } @@ -5467,16 +5650,16 @@ vm_page_find_contiguous( */ m1 = &vm_pages[cur_idx--]; - if (m1->vm_page_object == 0) { + if (m1->vmp_object == 0) { /* * page has already been removed from * the free list in the 1st pass */ - assert(m1->vm_page_q_state == VM_PAGE_NOT_ON_Q); - assert(m1->offset == (vm_object_offset_t) -1); - assert(m1->busy); - assert(!m1->wanted); - assert(!m1->laundry); + assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q); + assert(m1->vmp_offset == (vm_object_offset_t) -1); + assert(m1->vmp_busy); + assert(!m1->vmp_wanted); + assert(!m1->vmp_laundry); } else { vm_object_t object; int refmod; @@ -5485,7 +5668,7 @@ vm_page_find_contiguous( if (abort_run == TRUE) continue; - assert(m1->vm_page_q_state != VM_PAGE_NOT_ON_Q); + assert(m1->vmp_q_state != VM_PAGE_NOT_ON_Q); object = VM_PAGE_OBJECT(m1); @@ -5498,10 +5681,10 @@ vm_page_find_contiguous( locked_object = object; } if (locked_object == VM_OBJECT_NULL || - (VM_PAGE_WIRED(m1) || m1->gobbled || - m1->laundry || m1->wanted || - m1->cleaning || m1->overwriting || m1->free_when_done || m1->busy) || - (m1->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) { + (VM_PAGE_WIRED(m1) || m1->vmp_gobbled || + m1->vmp_laundry || m1->vmp_wanted || + m1->vmp_cleaning || m1->vmp_overwriting || m1->vmp_free_when_done || m1->vmp_busy) || + (m1->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) { if (locked_object) { vm_object_unlock(locked_object); @@ -5515,11 +5698,11 @@ vm_page_find_contiguous( disconnected = FALSE; reusable = FALSE; - if ((m1->reusable || + if ((m1->vmp_reusable || object->all_reusable) && - (m1->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) && - !m1->dirty && - !m1->reference) { + (m1->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) && + !m1->vmp_dirty && + !m1->vmp_reference) { /* reusable page... 
*/ refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1)); disconnected = TRUE; @@ -5532,10 +5715,10 @@ vm_page_find_contiguous( } } - if ((m1->pmapped && + if ((m1->vmp_pmapped && ! reusable) || - m1->dirty || - m1->precious) { + m1->vmp_dirty || + m1->vmp_precious) { vm_object_offset_t offset; m2 = vm_page_grab(); @@ -5550,7 +5733,7 @@ vm_page_find_contiguous( continue; } if (! disconnected) { - if (m1->pmapped) + if (m1->vmp_pmapped) refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1)); else refmod = 0; @@ -5560,32 +5743,32 @@ vm_page_find_contiguous( pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2)); /* copy the page's state */ assert(!VM_PAGE_WIRED(m1)); - assert(m1->vm_page_q_state != VM_PAGE_ON_FREE_Q); - assert(m1->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q); - assert(!m1->laundry); - m2->reference = m1->reference; - assert(!m1->gobbled); - assert(!m1->private); - m2->no_cache = m1->no_cache; - m2->xpmapped = 0; - assert(!m1->busy); - assert(!m1->wanted); - assert(!m1->fictitious); - m2->pmapped = m1->pmapped; /* should flush cache ? */ - m2->wpmapped = m1->wpmapped; - assert(!m1->free_when_done); - m2->absent = m1->absent; - m2->error = m1->error; - m2->dirty = m1->dirty; - assert(!m1->cleaning); - m2->precious = m1->precious; - m2->clustered = m1->clustered; - assert(!m1->overwriting); - m2->restart = m1->restart; - m2->unusual = m1->unusual; - m2->cs_validated = m1->cs_validated; - m2->cs_tainted = m1->cs_tainted; - m2->cs_nx = m1->cs_nx; + assert(m1->vmp_q_state != VM_PAGE_ON_FREE_Q); + assert(m1->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q); + assert(!m1->vmp_laundry); + m2->vmp_reference = m1->vmp_reference; + assert(!m1->vmp_gobbled); + assert(!m1->vmp_private); + m2->vmp_no_cache = m1->vmp_no_cache; + m2->vmp_xpmapped = 0; + assert(!m1->vmp_busy); + assert(!m1->vmp_wanted); + assert(!m1->vmp_fictitious); + m2->vmp_pmapped = m1->vmp_pmapped; /* should flush cache ? 
*/ + m2->vmp_wpmapped = m1->vmp_wpmapped; + assert(!m1->vmp_free_when_done); + m2->vmp_absent = m1->vmp_absent; + m2->vmp_error = m1->vmp_error; + m2->vmp_dirty = m1->vmp_dirty; + assert(!m1->vmp_cleaning); + m2->vmp_precious = m1->vmp_precious; + m2->vmp_clustered = m1->vmp_clustered; + assert(!m1->vmp_overwriting); + m2->vmp_restart = m1->vmp_restart; + m2->vmp_unusual = m1->vmp_unusual; + m2->vmp_cs_validated = m1->vmp_cs_validated; + m2->vmp_cs_tainted = m1->vmp_cs_tainted; + m2->vmp_cs_nx = m1->vmp_cs_nx; /* * If m1 had really been reusable, @@ -5594,14 +5777,13 @@ vm_page_find_contiguous( * bit and assert that m2 is not * marked as "reusable". */ - // m2->reusable = m1->reusable; - assert(!m2->reusable); + // m2->vmp_reusable = m1->vmp_reusable; + assert(!m2->vmp_reusable); - // assert(!m1->lopage); - m2->slid = m1->slid; + // assert(!m1->vmp_lopage); - if (m1->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) - m2->vm_page_q_state = VM_PAGE_USED_BY_COMPRESSOR; + if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) + m2->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR; /* * page may need to be flushed if @@ -5609,7 +5791,7 @@ vm_page_find_contiguous( * that is going to be used by a device * that doesn't support coherency */ - m2->written_by_kernel = TRUE; + m2->vmp_written_by_kernel = TRUE; /* * make sure we clear the ref/mod state @@ -5620,11 +5802,11 @@ vm_page_find_contiguous( pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2), VM_MEM_MODIFIED | VM_MEM_REFERENCED); if (refmod & VM_MEM_REFERENCED) - m2->reference = TRUE; + m2->vmp_reference = TRUE; if (refmod & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m2, TRUE); } - offset = m1->offset; + offset = m1->vmp_offset; /* * completely cleans up the state @@ -5641,11 +5823,11 @@ vm_page_find_contiguous( */ vm_page_insert_internal(m2, locked_object, offset, VM_KERN_MEMORY_NONE, TRUE, TRUE, FALSE, FALSE, NULL); - if (m2->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) { - m2->pmapped = TRUE; - m2->wpmapped = TRUE; + if (m2->vmp_q_state == 
VM_PAGE_USED_BY_COMPRESSOR) { + m2->vmp_pmapped = TRUE; + m2->vmp_wpmapped = TRUE; - PMAP_ENTER(kernel_pmap, m2->offset, m2, + PMAP_ENTER(kernel_pmap, m2->vmp_offset, m2, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, kr); assert(kr == KERN_SUCCESS); @@ -5653,7 +5835,7 @@ vm_page_find_contiguous( compressed_pages++; } else { - if (m2->reference) + if (m2->vmp_reference) vm_page_activate(m2); else vm_page_deactivate(m2); @@ -5661,7 +5843,7 @@ vm_page_find_contiguous( PAGE_WAKEUP_DONE(m2); } else { - assert(m1->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR); + assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); /* * completely cleans up the state @@ -5680,7 +5862,7 @@ vm_page_find_contiguous( vm_page_assign_background_state(m1); #endif VM_PAGE_ZERO_PAGEQ_ENTRY(m1); - m1->snext = m; + m1->vmp_snext = m; m = m1; } if (locked_object) { @@ -5746,14 +5928,14 @@ vm_page_find_contiguous( for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) { - assert(m1->vm_page_q_state == VM_PAGE_NOT_ON_Q); - assert(m1->wire_count == 0); + assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q); + assert(m1->vmp_wire_count == 0); if (wire == TRUE) { - m1->wire_count++; - m1->vm_page_q_state = VM_PAGE_IS_WIRED; + m1->vmp_wire_count++; + m1->vmp_q_state = VM_PAGE_IS_WIRED; } else - m1->gobbled = TRUE; + m1->vmp_gobbled = TRUE; } if (wire == FALSE) vm_page_gobble_count += npages; @@ -5850,10 +6032,8 @@ cpm_allocate( /* * determine need for wakeups */ - if ((vm_page_free_count < vm_page_free_min) || - ((vm_page_free_count < vm_page_free_target) && - ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min))) - thread_wakeup((event_t) &vm_page_free_wanted); + if (vm_page_free_count < vm_page_free_min) + thread_wakeup((event_t) &vm_page_free_wanted); VM_CHECK_MEMORYSTATUS; @@ -5949,18 +6129,18 @@ vm_page_do_delayed_work( if (dwp->dw_mask & DW_vm_page_free) { vm_page_free_prepare_queues(m); - assert(m->pageq.next == 0 && m->pageq.prev == 0); + assert(m->vmp_pageq.next == 0 && 
m->vmp_pageq.prev == 0); /* * Add this page to our list of reclaimed pages, * to be freed later. */ - m->snext = local_free_q; + m->vmp_snext = local_free_q; local_free_q = m; } else { if (dwp->dw_mask & DW_vm_page_deactivate_internal) vm_page_deactivate_internal(m, FALSE); else if (dwp->dw_mask & DW_vm_page_activate) { - if (m->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) { + if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) { vm_page_activate(m); } } @@ -5981,30 +6161,29 @@ vm_page_do_delayed_work( * this page has been touched since it got cleaned; let's activate it * if it hasn't already been */ - vm_pageout_enqueued_cleaned++; - vm_pageout_cleaned_reactivated++; - vm_pageout_cleaned_commit_reactivated++; + VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1); + VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); - if (m->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) + if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) vm_page_activate(m); } else { - m->reference = FALSE; + m->vmp_reference = FALSE; vm_page_enqueue_cleaned(m); } } else if (dwp->dw_mask & DW_vm_page_lru) vm_page_lru(m); else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) { - if (m->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q) + if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) vm_page_queues_remove(m, TRUE); } if (dwp->dw_mask & DW_set_reference) - m->reference = TRUE; + m->vmp_reference = TRUE; else if (dwp->dw_mask & DW_clear_reference) - m->reference = FALSE; + m->vmp_reference = FALSE; if (dwp->dw_mask & DW_move_page) { - if (m->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q) { + if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) { vm_page_queues_remove(m, FALSE); assert(VM_PAGE_OBJECT(m) != kernel_object); @@ -6013,7 +6192,7 @@ vm_page_do_delayed_work( } } if (dwp->dw_mask & DW_clear_busy) - m->busy = FALSE; + m->vmp_busy = FALSE; if (dwp->dw_mask & DW_PAGE_WAKEUP) PAGE_WAKEUP(m); @@ -6053,7 +6232,7 @@ vm_page_alloc_list( return (KERN_RESOURCE_SHORTAGE); } - mem->snext = lo_page_list; + mem->vmp_snext = lo_page_list; lo_page_list = mem; } *list 
= lo_page_list; @@ -6064,19 +6243,19 @@ vm_page_alloc_list( void vm_page_set_offset(vm_page_t page, vm_object_offset_t offset) { - page->offset = offset; + page->vmp_offset = offset; } vm_page_t vm_page_get_next(vm_page_t page) { - return (page->snext); + return (page->vmp_snext); } vm_object_offset_t vm_page_get_offset(vm_page_t page) { - return (page->offset); + return (page->vmp_offset); } ppnum_t @@ -6261,12 +6440,12 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount) l_object = m_object; } } - if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error) { + if ( !m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error) { /* * page is not to be cleaned * put it back on the head of its queue */ - if (m->cleaning) + if (m->vmp_cleaning) hibernate_stats.hibernate_skipped_cleaning++; else hibernate_stats.hibernate_skipped_transient++; @@ -6282,7 +6461,7 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount) goto reenter_pg_on_q; } } - if ( !m->dirty && m->pmapped) { + if ( !m->vmp_dirty && m->vmp_pmapped) { refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); if ((refmod_state & VM_MEM_MODIFIED)) { @@ -6291,12 +6470,12 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount) } else refmod_state = 0; - if ( !m->dirty) { + if ( !m->vmp_dirty) { /* * page is not to be cleaned * put it back on the head of its queue */ - if (m->precious) + if (m->vmp_precious) hibernate_stats.hibernate_skipped_precious++; goto reenter_pg_on_q; @@ -6380,8 +6559,8 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount) goto next_pg; reenter_pg_on_q: - vm_page_queue_remove(q, m, vm_page_t, pageq); - vm_page_queue_enter(q, m, vm_page_t, pageq); + vm_page_queue_remove(q, m, vm_page_t, vmp_pageq); + vm_page_queue_enter(q, m, vm_page_t, vmp_pageq); hibernate_stats.hibernate_reentered_on_q++; next_pg: @@ -6429,7 +6608,7 @@ hibernate_flush_dirty_pages(int pass) 
vm_page_queue_iterate(&aq->age_q, m, vm_page_t, - pageq) + vmp_pageq) { qcount++; } @@ -6573,7 +6752,7 @@ hibernate_free_gobble_pages(void) m = (vm_page_t) hibernate_gobble_queue; while(m) { - next = m->snext; + next = m->vmp_snext; vm_page_free(m); count++; m = next; @@ -6593,7 +6772,7 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight) do { - if (m->private) + if (m->vmp_private) panic("hibernate_consider_discard: private"); object = VM_PAGE_OBJECT(m); @@ -6607,38 +6786,38 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight) if (!preflight) hibernate_stats.cd_found_wired++; break; } - if (m->precious) { + if (m->vmp_precious) { if (!preflight) hibernate_stats.cd_found_precious++; break; } - if (m->busy || !object->alive) { + if (m->vmp_busy || !object->alive) { /* * Somebody is playing with this page. */ if (!preflight) hibernate_stats.cd_found_busy++; break; } - if (m->absent || m->unusual || m->error) { + if (m->vmp_absent || m->vmp_unusual || m->vmp_error) { /* * If it's unusual in anyway, ignore it */ if (!preflight) hibernate_stats.cd_found_unusual++; break; } - if (m->cleaning) { + if (m->vmp_cleaning) { if (!preflight) hibernate_stats.cd_found_cleaning++; break; } - if (m->laundry) { + if (m->vmp_laundry) { if (!preflight) hibernate_stats.cd_found_laundry++; break; } - if (!m->dirty) + if (!m->vmp_dirty) { refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m)); if (refmod_state & VM_MEM_REFERENCED) - m->reference = TRUE; + m->vmp_reference = TRUE; if (refmod_state & VM_MEM_MODIFIED) { SET_PAGE_DIRTY(m, FALSE); } @@ -6647,7 +6826,7 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight) /* * If it's clean or purgeable we can discard the page on wakeup. 
*/ - discard = (!m->dirty) + discard = (!m->vmp_dirty) || (VM_PURGABLE_VOLATILE == object->purgable) || (VM_PURGABLE_EMPTY == object->purgable); @@ -6655,7 +6834,7 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight) if (discard == FALSE) { if (!preflight) hibernate_stats.cd_found_dirty++; - } else if (m->xpmapped && m->reference && !object->internal) { + } else if (m->vmp_xpmapped && m->vmp_reference && !object->internal) { if (hibernate_stats.cd_found_xpmapped < HIBERNATE_XPMAPPED_LIMIT) { if (!preflight) hibernate_stats.cd_found_xpmapped++; @@ -6680,7 +6859,7 @@ hibernate_discard_page(vm_page_t m) { vm_object_t m_object; - if (m->absent || m->unusual || m->error) + if (m->vmp_absent || m->vmp_unusual || m->vmp_error) /* * If it's unusual in anyway, ignore */ @@ -6696,16 +6875,16 @@ hibernate_discard_page(vm_page_t m) makes sure these locks are uncontended before sleep */ #endif /* MACH_ASSERT || DEBUG */ - if (m->pmapped == TRUE) + if (m->vmp_pmapped == TRUE) { __unused int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); } - if (m->laundry) + if (m->vmp_laundry) panic("hibernate_discard_page(%p) laundry", m); - if (m->private) + if (m->vmp_private) panic("hibernate_discard_page(%p) private", m); - if (m->fictitious) + if (m->vmp_fictitious) panic("hibernate_discard_page(%p) fictitious", m); if (VM_PURGABLE_VOLATILE == m_object->purgable) @@ -6873,16 +7052,16 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); } - m = m->snext; + m = m->vmp_snext; } if (!preflight) for( i = 0; i < real_ncpus; i++ ) { if (cpu_data_ptr[i] && cpu_data_ptr[i]->cpu_processor) { - for (m = PROCESSOR_DATA(cpu_data_ptr[i]->cpu_processor, free_pages); m; m = m->snext) + for (m = PROCESSOR_DATA(cpu_data_ptr[i]->cpu_processor, free_pages); m; m = m->vmp_snext) { - assert(m->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q); + 
assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q); pages--; count_wire--; @@ -6900,9 +7079,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, vm_page_queue_iterate(&vm_page_queue_free[i].qhead, m, vm_page_t, - pageq) + vmp_pageq) { - assert(m->vm_page_q_state == VM_PAGE_ON_FREE_Q); + assert(m->vmp_q_state == VM_PAGE_ON_FREE_Q); pages--; count_wire--; @@ -6918,9 +7097,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, vm_page_queue_iterate(&vm_lopage_queue_free, m, vm_page_t, - pageq) + vmp_pageq) { - assert(m->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); + assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q); pages--; count_wire--; @@ -6935,9 +7114,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled); while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q); + assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q); - next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) @@ -6958,15 +7137,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); - next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - if (m->dirty) + if (m->vmp_dirty) count_discard_purgeable++; else 
count_discard_inactive++; @@ -6983,15 +7162,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); - next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - if (m->dirty) + if (m->vmp_dirty) count_discard_purgeable++; else count_discard_cleaned++; @@ -7008,15 +7187,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q); + assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); - next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - if (m->dirty) + if (m->vmp_dirty) count_discard_purgeable++; else count_discard_active++; @@ -7033,15 +7212,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); - next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); + next = 
(vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) { if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m)); - if (m->dirty) + if (m->vmp_dirty) count_discard_purgeable++; else count_discard_inactive++; @@ -7061,9 +7240,12 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q); while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q); + assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); + assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q, + "Bad page: %p (0x%x:0x%x) on queue %d has state: %d (Discard: %d, Preflight: %d)", + m, m->vmp_pageq.next, m->vmp_pageq.prev, i, m->vmp_q_state, discard, preflight); - next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); discard = FALSE; if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && hibernate_consider_discard(m, preflight)) @@ -7081,9 +7263,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, } } - vm_page_queue_iterate(&compressor_object->memq, m, vm_page_t, listq) + vm_page_queue_iterate(&compressor_object->memq, m, vm_page_t, vmp_listq) { - assert(m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR); + assert(m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR); count_compressor++; count_wire--; @@ -7196,12 +7378,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); - next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + next = 
(vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { - if (m->dirty) + if (m->vmp_dirty) count_discard_purgeable++; else count_discard_inactive++; @@ -7215,9 +7397,9 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q); while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q); + assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); - next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { count_discard_speculative++; @@ -7230,12 +7412,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); - next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { - if (m->dirty) + if (m->vmp_dirty) count_discard_purgeable++; else count_discard_inactive++; @@ -7248,12 +7430,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q); + assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); - next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { - if (m->dirty) + if (m->vmp_dirty) 
count_discard_purgeable++; else count_discard_active++; @@ -7265,12 +7447,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m)) { - assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); + assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); - next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next); + next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next); if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m))) { - if (m->dirty) + if (m->vmp_dirty) count_discard_purgeable++; else count_discard_cleaned++; @@ -7411,17 +7593,17 @@ hibernate_hash_insert_page(vm_page_t mem) m_object = VM_PAGE_OBJECT(mem); - assert(mem->hashed); + assert(mem->vmp_hashed); assert(m_object); - assert(mem->offset != (vm_object_offset_t) -1); + assert(mem->vmp_offset != (vm_object_offset_t) -1); /* * Insert it into the object_object/offset hash table */ - hash_id = vm_page_hash(m_object, mem->offset); + hash_id = vm_page_hash(m_object, mem->vmp_offset); bucket = &vm_page_buckets[hash_id]; - mem->next_m = bucket->page_list; + mem->vmp_next_m = bucket->page_list; bucket->page_list = VM_PAGE_PACK_PTR(mem); } @@ -7437,20 +7619,20 @@ hibernate_free_range(int sindx, int eindx) vm_page_init(mem, hibernate_lookup_paddr(sindx), FALSE); - mem->lopage = FALSE; - mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; + mem->vmp_lopage = FALSE; + mem->vmp_q_state = VM_PAGE_ON_FREE_Q; color = VM_PAGE_GET_COLOR(mem); #if defined(__x86_64__) vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); #else vm_page_queue_enter(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); #endif vm_page_free_count++; @@ -7488,18 +7670,18 @@ hibernate_rebuild_vm_structs(void) * Without this random data in these vm_pages[] can trip the buddy search */ for (i = 
hibernate_teardown_last_valid_compact_indx+1; i < eindx; ++i) - vm_pages[i].vm_page_q_state = VM_PAGE_NOT_ON_Q; + vm_pages[i].vmp_q_state = VM_PAGE_NOT_ON_Q; for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) { mem = &vm_pages[cindx]; - assert(mem->vm_page_q_state != VM_PAGE_ON_FREE_Q); + assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q); /* * hibernate_teardown_vm_structs leaves the location where * this vm_page_t must be located in "next". */ - tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m)); - mem->next_m = VM_PAGE_PACK_PTR(NULL); + tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m)); + mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL); sindx = (int)(tmem - &vm_pages[0]); @@ -7511,7 +7693,7 @@ hibernate_rebuild_vm_structs(void) *tmem = *mem; mem = tmem; } - if (mem->hashed) + if (mem->vmp_hashed) hibernate_hash_insert_page(mem); /* * the 'hole' between this vm_page_t and the previous @@ -7533,9 +7715,9 @@ hibernate_rebuild_vm_structs(void) * vm_page_t's that were created on the fly (i.e. 
fictitious) */ for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) { - mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m)); + mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m)); - mem->next_m = 0; + mem->vmp_next_m = 0; hibernate_hash_insert_page(mem); } hibernate_rebuild_hash_list = NULL; @@ -7583,12 +7765,12 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l bucket = &vm_page_buckets[i]; for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) { - assert(mem->hashed); + assert(mem->vmp_hashed); - mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m)); + mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m)); if (mem < &vm_pages[0] || mem >= &vm_pages[vm_pages_count]) { - mem->next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list); + mem->vmp_next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list); hibernate_rebuild_hash_list = mem; } } @@ -7604,18 +7786,18 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l mem = &vm_pages[i]; - if (mem->vm_page_q_state == VM_PAGE_ON_FREE_Q) { + if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) { unsigned int color; - assert(mem->busy); - assert(!mem->lopage); + assert(mem->vmp_busy); + assert(!mem->vmp_lopage); color = VM_PAGE_GET_COLOR(mem); vm_page_queue_remove(&vm_page_queue_free[color].qhead, mem, vm_page_t, - pageq); + vmp_pageq); VM_PAGE_ZERO_PAGEQ_ENTRY(mem); @@ -7623,7 +7805,7 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l hibernate_teardown_found_free_pages++; - if (vm_pages[compact_target_indx].vm_page_q_state != VM_PAGE_ON_FREE_Q) + if (vm_pages[compact_target_indx].vmp_q_state != VM_PAGE_ON_FREE_Q) compact_target_indx = i; } else { /* @@ -7632,15 +7814,15 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l * as an indicator to the rebuild function that * we don't have to move it */ - mem->next_m = VM_PAGE_PACK_PTR(mem); + 
mem->vmp_next_m = VM_PAGE_PACK_PTR(mem); - if (vm_pages[compact_target_indx].vm_page_q_state == VM_PAGE_ON_FREE_Q) { + if (vm_pages[compact_target_indx].vmp_q_state == VM_PAGE_ON_FREE_Q) { /* * we've got a hole to fill, so * move this vm_page_t to it's new home */ vm_pages[compact_target_indx] = *mem; - mem->vm_page_q_state = VM_PAGE_ON_FREE_Q; + mem->vmp_q_state = VM_PAGE_ON_FREE_Q; hibernate_teardown_last_valid_compact_indx = compact_target_indx; compact_target_indx++; @@ -7706,7 +7888,7 @@ vm_page_info( for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); m != VM_PAGE_NULL; - m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->next_m))) + m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->vmp_next_m))) bucket_count++; lck_spin_unlock(bucket_lock); @@ -7773,22 +7955,22 @@ vm_page_buckets_check(void) while (p != VM_PAGE_NULL) { p_object = VM_PAGE_OBJECT(p); - if (!p->hashed) { + if (!p->vmp_hashed) { panic("BUCKET_CHECK: page %p (%p,0x%llx) " "hash %d in bucket %d at %p " "is not hashed\n", - p, p_object, p->offset, + p, p_object, p->vmp_offset, p_hash, i, bucket); } - p_hash = vm_page_hash(p_object, p->offset); + p_hash = vm_page_hash(p_object, p->vmp_offset); if (p_hash != i) { panic("BUCKET_CHECK: corruption in bucket %d " "at %p: page %p object %p offset 0x%llx " "hash %d\n", - i, bucket, p, p_object, p->offset, + i, bucket, p, p_object, p->vmp_offset, p_hash); } - p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->next_m)); + p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)); } lck_spin_unlock(bucket_lock); } @@ -7828,44 +8010,44 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - if (mem->vm_page_q_state == VM_PAGE_NOT_ON_Q) + if (mem->vmp_q_state == VM_PAGE_NOT_ON_Q) { - assert(mem->pageq.next == 0 && mem->pageq.prev == 0); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0); #if CONFIG_BACKGROUND_QUEUE if (remove_from_backgroundq == TRUE) { vm_page_remove_from_backgroundq(mem); } - if 
(mem->vm_page_on_backgroundq) { - assert(mem->vm_page_backgroundq.next != 0); - assert(mem->vm_page_backgroundq.prev != 0); + if (mem->vmp_on_backgroundq) { + assert(mem->vmp_backgroundq.next != 0); + assert(mem->vmp_backgroundq.prev != 0); } else { - assert(mem->vm_page_backgroundq.next == 0); - assert(mem->vm_page_backgroundq.prev == 0); + assert(mem->vmp_backgroundq.next == 0); + assert(mem->vmp_backgroundq.prev == 0); } #endif /* CONFIG_BACKGROUND_QUEUE */ return; } - if (mem->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) + if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { - assert(mem->pageq.next == 0 && mem->pageq.prev == 0); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0); #if CONFIG_BACKGROUND_QUEUE - assert(mem->vm_page_backgroundq.next == 0 && - mem->vm_page_backgroundq.prev == 0 && - mem->vm_page_on_backgroundq == FALSE); + assert(mem->vmp_backgroundq.next == 0 && + mem->vmp_backgroundq.prev == 0 && + mem->vmp_on_backgroundq == FALSE); #endif return; } - if (mem->vm_page_q_state == VM_PAGE_IS_WIRED) { + if (mem->vmp_q_state == VM_PAGE_IS_WIRED) { /* * might put these guys on a list for debugging purposes * if we do, we'll need to remove this assert */ - assert(mem->pageq.next == 0 && mem->pageq.prev == 0); + assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0); #if CONFIG_BACKGROUND_QUEUE - assert(mem->vm_page_backgroundq.next == 0 && - mem->vm_page_backgroundq.prev == 0 && - mem->vm_page_on_backgroundq == FALSE); + assert(mem->vmp_backgroundq.next == 0 && + mem->vmp_backgroundq.prev == 0 && + mem->vmp_on_backgroundq == FALSE); #endif return; } @@ -7873,19 +8055,19 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) assert(m_object != compressor_object); assert(m_object != kernel_object); assert(m_object != vm_submap_object); - assert(!mem->fictitious); + assert(!mem->vmp_fictitious); - switch(mem->vm_page_q_state) { + switch(mem->vmp_q_state) { case VM_PAGE_ON_ACTIVE_LOCAL_Q: { struct vpl *lq; 
- lq = &vm_page_local_q[mem->local_id].vpl_un.vpl; + lq = &vm_page_local_q[mem->vmp_local_id].vpl_un.vpl; VPL_LOCK(&lq->vpl_lock); vm_page_queue_remove(&lq->vpl_queue, - mem, vm_page_t, pageq); - mem->local_id = 0; + mem, vm_page_t, vmp_pageq); + mem->vmp_local_id = 0; lq->vpl_count--; if (m_object->internal) { lq->vpl_internal_count--; @@ -7899,7 +8081,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) case VM_PAGE_ON_ACTIVE_Q: { vm_page_queue_remove(&vm_page_queue_active, - mem, vm_page_t, pageq); + mem, vm_page_t, vmp_pageq); vm_page_active_count--; break; } @@ -7910,9 +8092,11 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) vm_page_inactive_count--; vm_page_queue_remove(&vm_page_queue_anonymous, - mem, vm_page_t, pageq); + mem, vm_page_t, vmp_pageq); vm_page_anonymous_count--; + vm_purgeable_q_advance_all(); + vm_page_balance_inactive(3); break; } @@ -7922,8 +8106,9 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) vm_page_inactive_count--; vm_page_queue_remove(&vm_page_queue_inactive, - mem, vm_page_t, pageq); + mem, vm_page_t, vmp_pageq); vm_purgeable_q_advance_all(); + vm_page_balance_inactive(3); break; } @@ -7933,8 +8118,9 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) vm_page_inactive_count--; vm_page_queue_remove(&vm_page_queue_cleaned, - mem, vm_page_t, pageq); + mem, vm_page_t, vmp_pageq); vm_page_cleaned_count--; + vm_page_balance_inactive(3); break; } @@ -7943,7 +8129,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) assert(m_object->internal == TRUE); vm_page_queue_remove(&vm_page_queue_throttled, - mem, vm_page_t, pageq); + mem, vm_page_t, vmp_pageq); vm_page_throttled_count--; was_pageable = FALSE; break; @@ -7953,8 +8139,9 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) { assert(m_object->internal == FALSE); - 
vm_page_remque(&mem->pageq); + vm_page_remque(&mem->vmp_pageq); vm_page_speculative_count--; + vm_page_balance_inactive(3); break; } @@ -7962,7 +8149,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) case VM_PAGE_ON_SECLUDED_Q: { vm_page_queue_remove(&vm_page_queue_secluded, - mem, vm_page_t, pageq); + mem, vm_page_t, vmp_pageq); vm_page_secluded_count--; if (m_object == VM_OBJECT_NULL) { vm_page_secluded_count_free--; @@ -7980,7 +8167,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) default: { /* - * if (mem->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) + * if (mem->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) * NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue... * the caller is responsible for determing if the page is on that queue, and if so, must * either first remove it (it needs both the page queues lock and the object lock to do @@ -7989,13 +8176,13 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq) * we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q * or any of the undefined states */ - panic("vm_page_queues_remove - bad page q_state (%p, %d)\n", mem, mem->vm_page_q_state); + panic("vm_page_queues_remove - bad page q_state (%p, %d)\n", mem, mem->vmp_q_state); break; } } VM_PAGE_ZERO_PAGEQ_ENTRY(mem); - mem->vm_page_q_state = VM_PAGE_NOT_ON_Q; + mem->vmp_q_state = VM_PAGE_NOT_ON_Q; #if CONFIG_BACKGROUND_QUEUE if (remove_from_backgroundq == TRUE) @@ -8017,9 +8204,9 @@ vm_page_remove_internal(vm_page_t page) if (page == __object->memq_hint) { vm_page_t __new_hint; vm_page_queue_entry_t __qe; - __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->listq); + __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->vmp_listq); if (vm_page_queue_end(&__object->memq, __qe)) { - __qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->listq); + __qe = 
(vm_page_queue_entry_t)vm_page_queue_prev(&page->vmp_listq); if (vm_page_queue_end(&__object->memq, __qe)) { __qe = NULL; } @@ -8027,7 +8214,7 @@ vm_page_remove_internal(vm_page_t page) __new_hint = (vm_page_t)((uintptr_t) __qe); __object->memq_hint = __new_hint; } - vm_page_queue_remove(&__object->memq, page, vm_page_t, listq); + vm_page_queue_remove(&__object->memq, page, vm_page_t, vmp_listq); #if CONFIG_SECLUDED_MEMORY if (__object->eligible_for_secluded) { vm_page_secluded.eligible_for_secluded--; @@ -8043,28 +8230,28 @@ vm_page_enqueue_inactive(vm_page_t mem, boolean_t first) m_object = VM_PAGE_OBJECT(mem); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - assert(!mem->fictitious); - assert(!mem->laundry); - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(!mem->vmp_fictitious); + assert(!mem->vmp_laundry); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); vm_page_check_pageable_safe(mem); if (m_object->internal) { - mem->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q; + mem->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q; if (first == TRUE) - vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vm_page_t, pageq); + vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vm_page_t, vmp_pageq); else - vm_page_queue_enter(&vm_page_queue_anonymous, mem, vm_page_t, pageq); + vm_page_queue_enter(&vm_page_queue_anonymous, mem, vm_page_t, vmp_pageq); vm_page_anonymous_count++; vm_page_pageable_internal_count++; } else { - mem->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q; + mem->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q; if (first == TRUE) - vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, pageq); + vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, vmp_pageq); else - vm_page_queue_enter(&vm_page_queue_inactive, mem, vm_page_t, pageq); + vm_page_queue_enter(&vm_page_queue_inactive, mem, vm_page_t, vmp_pageq); vm_page_pageable_external_count++; } @@ -8072,7 +8259,7 @@ vm_page_enqueue_inactive(vm_page_t 
mem, boolean_t first) token_new_pagecount++; #if CONFIG_BACKGROUND_QUEUE - if (mem->vm_page_in_background) + if (mem->vmp_in_background) vm_page_add_to_backgroundq(mem, FALSE); #endif } @@ -8085,16 +8272,16 @@ vm_page_enqueue_active(vm_page_t mem, boolean_t first) m_object = VM_PAGE_OBJECT(mem); LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); - assert(!mem->fictitious); - assert(!mem->laundry); - assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q); + assert(!mem->vmp_fictitious); + assert(!mem->vmp_laundry); + assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q); vm_page_check_pageable_safe(mem); - mem->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q; + mem->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; if (first == TRUE) - vm_page_queue_enter_first(&vm_page_queue_active, mem, vm_page_t, pageq); + vm_page_queue_enter_first(&vm_page_queue_active, mem, vm_page_t, vmp_pageq); else - vm_page_queue_enter(&vm_page_queue_active, mem, vm_page_t, pageq); + vm_page_queue_enter(&vm_page_queue_active, mem, vm_page_t, vmp_pageq); vm_page_active_count++; if (m_object->internal) { @@ -8104,9 +8291,10 @@ vm_page_enqueue_active(vm_page_t mem, boolean_t first) } #if CONFIG_BACKGROUND_QUEUE - if (mem->vm_page_in_background) + if (mem->vmp_in_background) vm_page_add_to_backgroundq(mem, FALSE); #endif + vm_page_balance_inactive(3); } /* @@ -8189,7 +8377,8 @@ vm_tag_bt(void) retaddr = *(frameptr + 1); - if ((retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) + if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text)) + || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top)) { site = OSKextGetAllocationSiteForCaller(retaddr); break; @@ -8544,6 +8733,7 @@ kern_allocation_name_get_vm_tag(kern_allocation_name_t allocation) return (vm_tag_alloc(allocation)); } +#if ! 
VM_TAG_ACTIVE_UPDATE static void vm_page_count_object(mach_memory_info_t * info, unsigned int __unused num_info, vm_object_t object) { @@ -8577,46 +8767,19 @@ static void vm_page_iterate_objects(mach_memory_info_t * info, unsigned int num_info, vm_page_iterate_proc proc) { - purgeable_q_t volatile_q; - queue_head_t * nonvolatile_q; vm_object_t object; - int group; lck_spin_lock(&vm_objects_wired_lock); queue_iterate(&vm_objects_wired, object, vm_object_t, - objq) + wired_objq) { proc(info, num_info, object); } lck_spin_unlock(&vm_objects_wired_lock); - - lck_mtx_lock(&vm_purgeable_queue_lock); - nonvolatile_q = &purgeable_nonvolatile_queue; - for (object = (vm_object_t) queue_first(nonvolatile_q); - !queue_end(nonvolatile_q, (queue_entry_t) object); - object = (vm_object_t) queue_next(&object->objq)) - { - proc(info, num_info, object); - } - - volatile_q = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE]; - vm_page_iterate_purgeable_objects(info, num_info, proc, volatile_q, 0); - - volatile_q = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO]; - for (group = 0; group < NUM_VOLATILE_GROUPS; group++) - { - vm_page_iterate_purgeable_objects(info, num_info, proc, volatile_q, group); - } - - volatile_q = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO]; - for (group = 0; group < NUM_VOLATILE_GROUPS; group++) - { - vm_page_iterate_purgeable_objects(info, num_info, proc, volatile_q, group); - } - lck_mtx_unlock(&vm_purgeable_queue_lock); } +#endif /* ! VM_TAG_ACTIVE_UPDATE */ static uint64_t process_account(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes, boolean_t iterated) @@ -8854,7 +9017,9 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone vm_page_t page; int stackIdx, count; +#if ! VM_TAG_ACTIVE_UPDATE vm_page_iterate_objects(info, num_info, &vm_page_count_object); +#endif /* ! 
VM_TAG_ACTIVE_UPDATE */ map = kernel_map; stackIdx = 0; @@ -8973,3 +9138,60 @@ vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen) return (kmodId); } + + +#if CONFIG_SECLUDED_MEMORY +/* + * Note that there's no locking around other accesses to vm_page_secluded_target. + * That should be OK, since these are the only place where it can be changed after + * initialization. Other users (like vm_pageout) may see the wrong value briefly, + * but will eventually get the correct value. This brief mismatch is OK as pageout + * and page freeing will auto-adjust the vm_page_secluded_count to match the target + * over time. + */ +unsigned int vm_page_secluded_suppress_cnt = 0; +unsigned int vm_page_secluded_save_target; + + +lck_grp_attr_t secluded_suppress_slock_grp_attr; +lck_grp_t secluded_suppress_slock_grp; +lck_attr_t secluded_suppress_slock_attr; +lck_spin_t secluded_suppress_slock; + +void +secluded_suppression_init(void) +{ + lck_grp_attr_setdefault(&secluded_suppress_slock_grp_attr); + lck_grp_init(&secluded_suppress_slock_grp, + "secluded_suppress_slock", &secluded_suppress_slock_grp_attr); + lck_attr_setdefault(&secluded_suppress_slock_attr); + lck_spin_init(&secluded_suppress_slock, + &secluded_suppress_slock_grp, &secluded_suppress_slock_attr); +} + +void +start_secluded_suppression(task_t task) +{ + if (task->task_suppressed_secluded) + return; + lck_spin_lock(&secluded_suppress_slock); + if (!task->task_suppressed_secluded && vm_page_secluded_suppress_cnt++ == 0) { + task->task_suppressed_secluded = TRUE; + vm_page_secluded_save_target = vm_page_secluded_target; + vm_page_secluded_target = 0; + } + lck_spin_unlock(&secluded_suppress_slock); +} + +void +stop_secluded_suppression(task_t task) +{ + lck_spin_lock(&secluded_suppress_slock); + if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) { + task->task_suppressed_secluded = FALSE; + vm_page_secluded_target = vm_page_secluded_save_target; + } + 
lck_spin_unlock(&secluded_suppress_slock); +} + +#endif /* CONFIG_SECLUDED_MEMORY */ diff --git a/osfmk/vm/vm_shared_region.c b/osfmk/vm/vm_shared_region.c index f7018253b..e94960ddc 100644 --- a/osfmk/vm/vm_shared_region.c +++ b/osfmk/vm/vm_shared_region.c @@ -112,6 +112,17 @@ #include #include +#if defined (__arm__) || defined(__arm64__) +#include +#endif + +/* + * the following codes are used in the subclass + * of the DBG_MACH_SHAREDREGION class + */ +#define PROCESS_SHARED_CACHE_LAYOUT 0x00 + + /* "dyld" uses this to figure out what the kernel supports */ int shared_region_version = 3; @@ -124,6 +135,8 @@ int shared_region_persistence = 0; /* no by default */ /* delay before reclaiming an unused shared region */ int shared_region_destroy_delay = 120; /* in seconds */ +struct vm_shared_region *init_task_shared_region = NULL; + #ifndef CONFIG_EMBEDDED /* * Only one cache gets to slide on Desktop, since we can't @@ -152,11 +165,20 @@ static void vm_shared_region_reference_locked(vm_shared_region_t shared_region); static vm_shared_region_t vm_shared_region_create( void *root_dir, cpu_type_t cputype, + cpu_subtype_t cpu_subtype, boolean_t is_64bit); static void vm_shared_region_destroy(vm_shared_region_t shared_region); static void vm_shared_region_timeout(thread_call_param_t param0, thread_call_param_t param1); +kern_return_t vm_shared_region_slide_mapping( + vm_shared_region_t sr, + mach_vm_size_t slide_info_size, + mach_vm_offset_t start, + mach_vm_size_t size, + mach_vm_offset_t slid_mapping, + uint32_t slide, + memory_object_control_t); /* forward */ static int __commpage_setup = 0; #if defined(__i386__) || defined(__x86_64__) @@ -289,6 +311,30 @@ vm_shared_region_mem_entry( return shared_region->sr_mem_entry; } +vm_map_t +vm_shared_region_vm_map( + vm_shared_region_t shared_region) +{ + ipc_port_t sr_handle; + vm_named_entry_t sr_mem_entry; + vm_map_t sr_map; + + SHARED_REGION_TRACE_DEBUG( + ("shared_region: -> vm_map(%p)\n", + (void 
*)VM_KERNEL_ADDRPERM(shared_region))); + assert(shared_region->sr_ref_count > 1); + + sr_handle = shared_region->sr_mem_entry; + sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject; + sr_map = sr_mem_entry->backing.map; + assert(sr_mem_entry->is_sub_map); + + SHARED_REGION_TRACE_DEBUG( + ("shared_region: vm_map(%p) <- %p\n", + (void *)VM_KERNEL_ADDRPERM(shared_region), + (void *)VM_KERNEL_ADDRPERM(sr_map))); + return sr_map; +} uint32_t vm_shared_region_get_slide( vm_shared_region_t shared_region) @@ -379,15 +425,17 @@ vm_shared_region_t vm_shared_region_lookup( void *root_dir, cpu_type_t cputype, + cpu_subtype_t cpu_subtype, boolean_t is_64bit) { vm_shared_region_t shared_region; vm_shared_region_t new_shared_region; SHARED_REGION_TRACE_DEBUG( - ("shared_region: -> lookup(root=%p,cpu=%d,64bit=%d)\n", + ("shared_region: -> lookup(root=%p,cpu=<%d,%d>,64bit=%d)\n", - (void *)VM_KERNEL_ADDRPERM(root_dir), cputype, is_64bit)); + (void *)VM_KERNEL_ADDRPERM(root_dir), + cputype, cpu_subtype, is_64bit)); shared_region = NULL; new_shared_region = NULL; @@ -400,6 +448,7 @@ vm_shared_region_lookup( sr_q) { assert(shared_region->sr_ref_count > 0); if (shared_region->sr_cpu_type == cputype && + shared_region->sr_cpu_subtype == cpu_subtype && shared_region->sr_root_dir == root_dir && shared_region->sr_64bit == is_64bit) { /* found a match ! 
*/ @@ -412,6 +461,7 @@ vm_shared_region_lookup( vm_shared_region_unlock(); new_shared_region = vm_shared_region_create(root_dir, cputype, + cpu_subtype, is_64bit); /* do the lookup again, in case we lost a race */ vm_shared_region_lock(); @@ -442,9 +492,9 @@ vm_shared_region_lookup( } SHARED_REGION_TRACE_DEBUG( - ("shared_region: lookup(root=%p,cpu=%d,64bit=%d) <- %p\n", + ("shared_region: lookup(root=%p,cpu=<%d,%d>,64bit=%d) <- %p\n", (void *)VM_KERNEL_ADDRPERM(root_dir), - cputype, is_64bit, + cputype, cpu_subtype, is_64bit, (void *)VM_KERNEL_ADDRPERM(shared_region))); assert(shared_region->sr_ref_count > 0); @@ -612,6 +662,7 @@ static vm_shared_region_t vm_shared_region_create( void *root_dir, cpu_type_t cputype, + cpu_subtype_t cpu_subtype, boolean_t is_64bit) { kern_return_t kr; @@ -623,9 +674,10 @@ vm_shared_region_create( mach_vm_offset_t base_address, pmap_nesting_start; mach_vm_size_t size, pmap_nesting_size; - SHARED_REGION_TRACE_DEBUG( - ("shared_region: -> create(root=%p,cpu=%d,64bit=%d)\n", - (void *)VM_KERNEL_ADDRPERM(root_dir), cputype, is_64bit)); + SHARED_REGION_TRACE_INFO( + ("shared_region: -> create(root=%p,cpu=<%d,%d>,64bit=%d)\n", + (void *)VM_KERNEL_ADDRPERM(root_dir), + cputype, cpu_subtype, is_64bit)); base_address = 0; size = 0; @@ -776,6 +828,7 @@ vm_shared_region_create( shared_region->sr_pmap_nesting_start = pmap_nesting_start; shared_region->sr_pmap_nesting_size = pmap_nesting_size; shared_region->sr_cpu_type = cputype; + shared_region->sr_cpu_subtype = cpu_subtype; shared_region->sr_64bit = is_64bit; shared_region->sr_root_dir = root_dir; @@ -799,17 +852,20 @@ vm_shared_region_create( si->slide_info_size = 0; si->slide_info_entry = NULL; - /* Initialize UUID */ + /* Initialize UUID and other metadata */ memset(&shared_region->sr_uuid, '\0', sizeof(shared_region->sr_uuid)); shared_region->sr_uuid_copied = FALSE; + shared_region->sr_images_count = 0; + shared_region->sr_images = NULL; done: if (shared_region) { SHARED_REGION_TRACE_INFO( 
- ("shared_region: create(root=%p,cpu=%d,64bit=%d," + ("shared_region: create(root=%p,cpu=<%d,%d>,64bit=%d," "base=0x%llx,size=0x%llx) <- " "%p mem=(%p,%p) map=%p pmap=%p\n", (void *)VM_KERNEL_ADDRPERM(root_dir), - cputype, is_64bit, (long long)base_address, + cputype, cpu_subtype, is_64bit, + (long long)base_address, (long long)size, (void *)VM_KERNEL_ADDRPERM(shared_region), (void *)VM_KERNEL_ADDRPERM(mem_entry_port), @@ -818,10 +874,11 @@ vm_shared_region_create( (void *)VM_KERNEL_ADDRPERM(sub_map->pmap))); } else { SHARED_REGION_TRACE_INFO( - ("shared_region: create(root=%p,cpu=%d,64bit=%d," + ("shared_region: create(root=%p,cpu=<%d,%d>,64bit=%d," "base=0x%llx,size=0x%llx) <- NULL", (void *)VM_KERNEL_ADDRPERM(root_dir), - cputype, is_64bit, (long long)base_address, + cputype, cpu_subtype, is_64bit, + (long long)base_address, (long long)size)); } return shared_region; @@ -839,10 +896,11 @@ vm_shared_region_destroy( vm_map_t map; SHARED_REGION_TRACE_INFO( - ("shared_region: -> destroy(%p) (root=%p,cpu=%d,64bit=%d)\n", + ("shared_region: -> destroy(%p) (root=%p,cpu=<%d,%d>,64bit=%d)\n", (void *)VM_KERNEL_ADDRPERM(shared_region), (void *)VM_KERNEL_ADDRPERM(shared_region->sr_root_dir), shared_region->sr_cpu_type, + shared_region->sr_cpu_subtype, shared_region->sr_64bit)); assert(shared_region->sr_ref_count == 0); @@ -1091,9 +1149,12 @@ vm_shared_region_map_file( vm_object_size_t obj_size; struct shared_file_mapping_np *mapping_to_slide = NULL; mach_vm_offset_t first_mapping = (mach_vm_offset_t) -1; + mach_vm_offset_t slid_mapping = (mach_vm_offset_t) -1; vm_map_offset_t lowest_unnestable_addr = 0; vm_map_kernel_flags_t vmk_flags; - + mach_vm_offset_t sfm_min_address = ~0; + mach_vm_offset_t sfm_max_address = 0; + struct _dyld_cache_header sr_cache_header; #if __arm64__ if ((shared_region->sr_64bit || @@ -1170,6 +1231,14 @@ vm_shared_region_map_file( mappings[i].sfm_max_prot, mappings[i].sfm_init_prot)); + if (mappings[i].sfm_address < sfm_min_address) { + 
sfm_min_address = mappings[i].sfm_address; + } + + if ((mappings[i].sfm_address + mappings[i].sfm_size) > sfm_max_address) { + sfm_max_address = mappings[i].sfm_address + mappings[i].sfm_size; + } + if (mappings[i].sfm_init_prot & VM_PROT_ZF) { /* zero-filled memory */ map_port = MACH_PORT_NULL; @@ -1268,6 +1337,11 @@ vm_shared_region_map_file( first_mapping = target_address; } + if ((slid_mapping == (mach_vm_offset_t) -1) && + (mapping_to_slide == &mappings[i])) { + slid_mapping = target_address; + } + /* * Record the lowest writable address in this * sub map, to log any unexpected unnesting below @@ -1343,6 +1417,7 @@ vm_shared_region_map_file( mapping_to_slide->sfm_size, slide_start, slide_size, + slid_mapping, file_control); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( @@ -1375,35 +1450,89 @@ vm_shared_region_map_file( vm_shared_region_lock(); assert(shared_region->sr_ref_count > 1); assert(shared_region->sr_mapping_in_progress); + /* set "sr_first_mapping"; dyld uses it to validate the shared cache */ if (kr == KERN_SUCCESS && shared_region->sr_first_mapping == (mach_vm_offset_t) -1) { shared_region->sr_first_mapping = first_mapping; } - - /* copy in the shared region UUID to the shared region structure */ + /* + * copy in the shared region UUID to the shared region structure. + * we do this indirectly by first copying in the shared cache header + * and then copying the UUID from there because we'll need to look + * at other content from the shared cache header. 
+ */ if (kr == KERN_SUCCESS && !shared_region->sr_uuid_copied) { - int error = copyin((shared_region->sr_base_address + shared_region->sr_first_mapping + - offsetof(struct _dyld_cache_header, uuid)), - (char *)&shared_region->sr_uuid, - sizeof(shared_region->sr_uuid)); - if (error == 0) { + int error = copyin((shared_region->sr_base_address + shared_region->sr_first_mapping), + (char *)&sr_cache_header, + sizeof(sr_cache_header)); + if (error == 0) { + memcpy(&shared_region->sr_uuid, &sr_cache_header.uuid, sizeof(shared_region->sr_uuid)); shared_region->sr_uuid_copied = TRUE; - } else { + } else { #if DEVELOPMENT || DEBUG - panic("shared_region: copyin_UUID(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx " - "offset:0x%016llx size:0x%016llx) failed with %d\n", + panic("shared_region: copyin shared_cache_header(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx " + "offset:0 size:0x%016llx) failed with %d\n", (long long)shared_region->sr_base_address, (long long)shared_region->sr_first_mapping, - (long long)offsetof(struct _dyld_cache_header, uuid), - (long long)sizeof(shared_region->sr_uuid), + (long long)sizeof(sr_cache_header), error); #endif /* DEVELOPMENT || DEBUG */ shared_region->sr_uuid_copied = FALSE; } } + /* + * If the shared cache is associated with the init task (and is therefore the system shared cache), + * check whether it is a custom built shared cache and copy in the shared cache layout accordingly. 
+ */ + boolean_t is_init_task = (task_pid(current_task()) == 1); + if (shared_region->sr_uuid_copied && is_init_task) { + /* Copy in the shared cache layout if we're running with a locally built shared cache */ + if (sr_cache_header.locallyBuiltCache) { + KDBG((MACHDBG_CODE(DBG_MACH_SHAREDREGION, PROCESS_SHARED_CACHE_LAYOUT)) | DBG_FUNC_START); + size_t image_array_length = (sr_cache_header.imagesTextCount * sizeof(struct _dyld_cache_image_text_info)); + struct _dyld_cache_image_text_info *sr_image_layout = kalloc(image_array_length); + int error = copyin((shared_region->sr_base_address + shared_region->sr_first_mapping + + sr_cache_header.imagesTextOffset), (char *)sr_image_layout, image_array_length); + if (error == 0) { + shared_region->sr_images = kalloc(sr_cache_header.imagesTextCount * sizeof(struct dyld_uuid_info_64)); + for (size_t index = 0; index < sr_cache_header.imagesTextCount; index++) { + memcpy((char *)&shared_region->sr_images[index].imageUUID, (char *)&sr_image_layout[index].uuid, + sizeof(shared_region->sr_images[index].imageUUID)); + shared_region->sr_images[index].imageLoadAddress = sr_image_layout[index].loadAddress; + } + + assert(sr_cache_header.imagesTextCount < UINT32_MAX); + shared_region->sr_images_count = (uint32_t) sr_cache_header.imagesTextCount; + } else { +#if DEVELOPMENT || DEBUG + panic("shared_region: copyin shared_cache_layout(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx " + "offset:0x%016llx size:0x%016llx) failed with %d\n", + (long long)shared_region->sr_base_address, + (long long)shared_region->sr_first_mapping, + (long long)sr_cache_header.imagesTextOffset, + (long long)image_array_length, + error); +#endif /* DEVELOPMENT || DEBUG */ + } + KDBG((MACHDBG_CODE(DBG_MACH_SHAREDREGION, PROCESS_SHARED_CACHE_LAYOUT)) | DBG_FUNC_END, shared_region->sr_images_count); + kfree(sr_image_layout, image_array_length); + sr_image_layout = NULL; + } + init_task_shared_region = shared_region; + } + + if (kr == KERN_SUCCESS) { + /* + * 
If we succeeded, we know the bounds of the shared region. + * Trim our pmaps to only cover this range (if applicable to + * this platform). + */ + pmap_trim(current_map()->pmap, sr_map->pmap, sfm_min_address, sfm_min_address, sfm_max_address - sfm_min_address); + } + /* we're done working on that shared region */ shared_region->sr_mapping_in_progress = FALSE; thread_wakeup((event_t) &shared_region->sr_mapping_in_progress); @@ -1418,6 +1547,38 @@ vm_shared_region_map_file( return kr; } +/* + * Retrieve a task's shared region and grab an extra reference to + * make sure it doesn't disappear while the caller is using it. + * The caller is responsible for consuming that extra reference if + * necessary. + * + * This also tries to trim the pmap for the shared region. + */ +vm_shared_region_t +vm_shared_region_trim_and_get(task_t task) +{ + vm_shared_region_t shared_region; + ipc_port_t sr_handle; + vm_named_entry_t sr_mem_entry; + vm_map_t sr_map; + + /* Get the shared region and the map. */ + shared_region = vm_shared_region_get(task); + if (shared_region == NULL) { + return NULL; + } + + sr_handle = shared_region->sr_mem_entry; + sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject; + sr_map = sr_mem_entry->backing.map; + + /* Trim the pmap if possible. */ + pmap_trim(task->map->pmap, sr_map->pmap, 0, 0, 0); + + return shared_region; +} + /* * Enter the appropriate shared region into "map" for "task". 
* This involves looking up the shared region (and possibly creating a new @@ -1430,7 +1591,8 @@ vm_shared_region_enter( struct task *task, boolean_t is_64bit, void *fsroot, - cpu_type_t cpu) + cpu_type_t cpu, + cpu_subtype_t cpu_subtype) { kern_return_t kr; vm_shared_region_t shared_region; @@ -1443,29 +1605,28 @@ vm_shared_region_enter( SHARED_REGION_TRACE_DEBUG( ("shared_region: -> " - "enter(map=%p,task=%p,root=%p,cpu=%d,64bit=%d)\n", + "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d)\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit)); + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit)); /* lookup (create if needed) the shared region for this environment */ - shared_region = vm_shared_region_lookup(fsroot, cpu, is_64bit); + shared_region = vm_shared_region_lookup(fsroot, cpu, cpu_subtype, is_64bit); if (shared_region == NULL) { /* this should not happen ! */ SHARED_REGION_TRACE_ERROR( ("shared_region: -> " - "enter(map=%p,task=%p,root=%p,cpu=%d,64bit=%d): " + "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d): " "lookup failed !\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit)); + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit)); //panic("shared_region_enter: lookup failed\n"); return KERN_FAILURE; } - /* let the task use that shared region */ - vm_shared_region_set(task, shared_region); - kr = KERN_SUCCESS; /* no need to lock since this data is never modified */ sr_address = shared_region->sr_base_address; @@ -1511,23 +1672,24 @@ vm_shared_region_enter( VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( - ("shared_region: enter(%p,%p,%p,%d,%d): " + ("shared_region: enter(%p,%p,%p,%d,%d,%d): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, is_64bit, 
+ cpu, cpu_subtype, is_64bit, (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); goto done; } SHARED_REGION_TRACE_DEBUG( - ("shared_region: enter(%p,%p,%p,%d,%d): " + ("shared_region: enter(%p,%p,%p,%d,%d,%d): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit, + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit, (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); sr_offset += mapping_size; @@ -1564,23 +1726,24 @@ vm_shared_region_enter( VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( - ("shared_region: enter(%p,%p,%p,%d,%d): " + ("shared_region: enter(%p,%p,%p,%d,%d,%d): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, is_64bit, + cpu, cpu_subtype, is_64bit, (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); goto done; } SHARED_REGION_TRACE_DEBUG( - ("shared_region: enter(%p,%p,%p,%d,%d): " + ("shared_region: enter(%p,%p,%p,%d,%d,%d): " "nested vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit, + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit, (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); } @@ -1604,23 +1767,24 @@ vm_shared_region_enter( VM_INHERIT_SHARE); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( - ("shared_region: enter(%p,%p,%p,%d,%d): " + ("shared_region: enter(%p,%p,%p,%d,%d,%d): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu, is_64bit, + cpu, cpu_subtype, 
is_64bit, (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); goto done; } SHARED_REGION_TRACE_DEBUG( - ("shared_region: enter(%p,%p,%p,%d,%d): " + ("shared_region: enter(%p,%p,%p,%d,%d,%d): " "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit, + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit, (long long)target_address, (long long)mapping_size, (void *)VM_KERNEL_ADDRPERM(sr_handle), kr)); sr_offset += mapping_size; @@ -1629,11 +1793,21 @@ vm_shared_region_enter( assert(sr_size == 0); done: + if (kr == KERN_SUCCESS) { + /* let the task use that shared region */ + vm_shared_region_set(task, shared_region); + } else { + /* drop our reference since we're not using it */ + vm_shared_region_deallocate(shared_region); + vm_shared_region_set(task, NULL); + } + SHARED_REGION_TRACE_DEBUG( - ("shared_region: enter(%p,%p,%p,%d,%d) <- 0x%x\n", + ("shared_region: enter(%p,%p,%p,%d,%d,%d) <- 0x%x\n", (void *)VM_KERNEL_ADDRPERM(map), (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit, kr)); + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, cpu_subtype, is_64bit, kr)); return kr; } @@ -1672,32 +1846,45 @@ vm_shared_region_sliding_valid(uint32_t slide) } kern_return_t -vm_shared_region_slide_init( - vm_shared_region_t sr, - mach_vm_size_t slide_info_size, - mach_vm_offset_t start, - mach_vm_size_t size, - uint32_t slide, - memory_object_control_t sr_file_control) +vm_shared_region_slide_mapping( + vm_shared_region_t sr, + mach_vm_size_t slide_info_size, + mach_vm_offset_t start, + mach_vm_size_t size, + mach_vm_offset_t slid_mapping, + uint32_t slide, + memory_object_control_t sr_file_control) { - kern_return_t kr = KERN_SUCCESS; - vm_object_t object = VM_OBJECT_NULL; - vm_object_offset_t offset = 0; - vm_shared_region_slide_info_t si = vm_shared_region_get_slide_info(sr); - 
vm_offset_t slide_info_entry; - - vm_map_t map = NULL, cur_map = NULL; - boolean_t is_map_locked = FALSE; + kern_return_t kr; + vm_object_t object; + vm_shared_region_slide_info_t si; + vm_offset_t slide_info_entry; + vm_map_entry_t slid_entry, tmp_entry; + struct vm_map_entry tmp_entry_store; + memory_object_t sr_pager; + vm_map_t sr_map; + int vm_flags; + vm_map_kernel_flags_t vmk_flags; + vm_map_offset_t map_addr; + + tmp_entry = VM_MAP_ENTRY_NULL; + sr_pager = MEMORY_OBJECT_NULL; + object = VM_OBJECT_NULL; + slide_info_entry = 0; assert(sr->sr_slide_in_progress); assert(!sr->sr_slid); - assert(si->slide_object == NULL); + + si = vm_shared_region_get_slide_info(sr); + assert(si->slide_object == VM_OBJECT_NULL); assert(si->slide_info_entry == NULL); + if (sr_file_control == MEMORY_OBJECT_CONTROL_NULL) { + return KERN_INVALID_ARGUMENT; + } if (slide_info_size > SANE_SLIDE_INFO_SIZE) { printf("Slide_info_size too large: %lx\n", (uintptr_t)slide_info_size); - kr = KERN_FAILURE; - return kr; + return KERN_FAILURE; } kr = kmem_alloc(kernel_map, @@ -1707,93 +1894,117 @@ vm_shared_region_slide_init( return kr; } - if (sr_file_control != MEMORY_OBJECT_CONTROL_NULL) { + object = memory_object_control_to_vm_object(sr_file_control); + if (object == VM_OBJECT_NULL || object->internal) { + object = VM_OBJECT_NULL; + kr = KERN_INVALID_ADDRESS; + goto done; + } - object = memory_object_control_to_vm_object(sr_file_control); - vm_object_reference(object); - offset = start; + vm_object_lock(object); + vm_object_reference_locked(object); /* for si->slide_object */ + object->object_is_shared_cache = TRUE; + vm_object_unlock(object); - vm_object_lock(object); - } else { - /* - * Remove this entire "else" block and all "map" references - * once we get rid of the shared_region_slide_np() - * system call. 
- */ - vm_map_entry_t entry = VM_MAP_ENTRY_NULL; - map = current_map(); - vm_map_lock_read(map); - is_map_locked = TRUE; - Retry: - cur_map = map; - if(!vm_map_lookup_entry(map, start, &entry)) { - kr = KERN_INVALID_ARGUMENT; - } else { - vm_object_t shadow_obj = VM_OBJECT_NULL; - - if (entry->is_sub_map == TRUE) { - map = VME_SUBMAP(entry); - start -= entry->vme_start; - start += VME_OFFSET(entry); - vm_map_lock_read(map); - vm_map_unlock_read(cur_map); - goto Retry; - } else { - object = VME_OBJECT(entry); - offset = ((start - entry->vme_start) + - VME_OFFSET(entry)); - } - - vm_object_lock(object); - while (object->shadow != VM_OBJECT_NULL) { - shadow_obj = object->shadow; - vm_object_lock(shadow_obj); - vm_object_unlock(object); - object = shadow_obj; - } - } + si->slide_info_entry = (vm_shared_region_slide_info_entry_t)slide_info_entry; + si->slide_info_size = slide_info_size; + + assert(slid_mapping != (mach_vm_offset_t) -1); + si->slid_address = slid_mapping + sr->sr_base_address; + si->slide_object = object; + si->start = start; + si->end = si->start + size; + si->slide = slide; + + /* find the shared region's map entry to slide */ + sr_map = vm_shared_region_vm_map(sr); + vm_map_lock_read(sr_map); + if (!vm_map_lookup_entry(sr_map, + slid_mapping, + &slid_entry)) { + /* no mapping there: release the read lock taken above */ + vm_map_unlock_read(sr_map); + kr = KERN_INVALID_ARGUMENT; + goto done; + } + /* + * We might want to clip the entry to cover only the portion that + * needs sliding (offsets si->start to si->end in the shared cache + * file at the bottom of the shadow chain). + * In practice, it seems to cover the entire DATA segment... 
+ */ + tmp_entry_store = *slid_entry; + tmp_entry = &tmp_entry_store; + slid_entry = VM_MAP_ENTRY_NULL; + /* extra ref to keep object alive while map is unlocked */ + vm_object_reference(VME_OBJECT(tmp_entry)); + vm_map_unlock_read(sr_map); + + /* create a "shared_region" sliding pager */ + sr_pager = shared_region_pager_setup(VME_OBJECT(tmp_entry), + VME_OFFSET(tmp_entry), + si); + if (sr_pager == NULL) { + kr = KERN_RESOURCE_SHORTAGE; + goto done; } - - if (object->internal == TRUE) { - kr = KERN_INVALID_ADDRESS; - } else if (object->object_slid) { - /* Can only be slid once */ - printf("%s: found vm_object %p already slid?\n", __FUNCTION__, object); - kr = KERN_FAILURE; - } else { - si->slide_info_entry = (vm_shared_region_slide_info_entry_t)slide_info_entry; - si->slide_info_size = slide_info_size; - si->slide_object = object; - si->start = offset; - si->end = si->start + size; - si->slide = slide; + /* map that pager over the portion of the mapping that needs sliding */ + vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE; + vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + vmk_flags.vmkf_overwrite_immutable = TRUE; + map_addr = tmp_entry->vme_start; + kr = vm_map_enter_mem_object(sr_map, + &map_addr, + (tmp_entry->vme_end - + tmp_entry->vme_start), + (mach_vm_offset_t) 0, + vm_flags, + vmk_flags, + VM_KERN_MEMORY_NONE, + (ipc_port_t)(uintptr_t) sr_pager, + 0, + TRUE, + tmp_entry->protection, + tmp_entry->max_protection, + tmp_entry->inheritance); + assertf(kr == KERN_SUCCESS, "kr = 0x%x\n", kr); + assertf(map_addr == tmp_entry->vme_start, + "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n", + (uint64_t)map_addr, + (uint64_t) tmp_entry->vme_start, + tmp_entry); + + /* success! */ + kr = KERN_SUCCESS; +done: + if (sr_pager) { /* - * If we want to have this region get deallocated/freed - * then we will have to make sure that we msync(..MS_INVALIDATE..) - * the pages associated with this shared region. Those pages would - * have been slid with an older slide value. 
- */ - - /* - * Pointers in object are held without references; they - * are disconnected at the time that we destroy the - * shared region, and since the shared region holds - * a reference on the object, no references in the other - * direction are required. + * Release the sr_pager reference obtained by + * shared_region_pager_setup(). + * The mapping (if it succeeded) is now holding a reference on + * the memory object. */ - object->object_slid = TRUE; - object->vo_slide_info = si; + memory_object_deallocate(sr_pager); + sr_pager = MEMORY_OBJECT_NULL; } - - vm_object_unlock(object); - if (is_map_locked == TRUE) { - vm_map_unlock_read(map); + if (tmp_entry) { + /* release extra ref on tmp_entry's VM object */ + vm_object_deallocate(VME_OBJECT(tmp_entry)); + tmp_entry = VM_MAP_ENTRY_NULL; } if (kr != KERN_SUCCESS) { - kmem_free(kernel_map, slide_info_entry, slide_info_size); + /* cleanup */ + if (slide_info_entry) { + kmem_free(kernel_map, slide_info_entry, slide_info_size); + slide_info_entry = 0; + } + if (si->slide_object) { + vm_object_deallocate(si->slide_object); + si->slide_object = VM_OBJECT_NULL; + } } return kr; } @@ -1858,6 +2069,66 @@ vm_shared_region_slide_sanity_check_v2(vm_shared_region_slide_info_entry_v2_t s_ return KERN_SUCCESS; } +static kern_return_t +vm_shared_region_slide_sanity_check_v3(vm_shared_region_slide_info_entry_v3_t s_info, mach_vm_size_t slide_info_size) +{ + if (s_info->page_size != PAGE_SIZE_FOR_SR_SLIDE) { + printf("vm_shared_region_slide_sanity_check_v3: s_info->page_size != PAGE_SIZE_FOR_SR_SL 0x%llx != 0x%llx\n", (uint64_t)s_info->page_size, (uint64_t)PAGE_SIZE_FOR_SR_SLIDE); + return KERN_FAILURE; + } + + uint32_t page_starts_count = s_info->page_starts_count; + mach_vm_size_t num_trailing_entries = page_starts_count; + mach_vm_size_t trailing_size = num_trailing_entries << 1; + mach_vm_size_t required_size = sizeof(*s_info) + trailing_size; + if (required_size < sizeof(*s_info)) { + 
printf("vm_shared_region_slide_sanity_check_v3: required_size != sizeof(*s_info) 0x%llx != 0x%llx\n", (uint64_t)required_size, (uint64_t)sizeof(*s_info)); + return KERN_FAILURE; + } + + if (required_size > slide_info_size) { + printf("vm_shared_region_slide_sanity_check_v3: required_size != slide_info_size 0x%llx != 0x%llx\n", (uint64_t)required_size, (uint64_t)slide_info_size); + return KERN_FAILURE; + } + + return KERN_SUCCESS; +} + +static kern_return_t +vm_shared_region_slide_sanity_check_v4(vm_shared_region_slide_info_entry_v4_t s_info, mach_vm_size_t slide_info_size) +{ + if (s_info->page_size != PAGE_SIZE_FOR_SR_SLIDE) { + return KERN_FAILURE; + } + + /* Ensure that the slide info doesn't reference any data outside of its bounds. */ + + uint32_t page_starts_count = s_info->page_starts_count; + uint32_t page_extras_count = s_info->page_extras_count; + mach_vm_size_t num_trailing_entries = page_starts_count + page_extras_count; + if (num_trailing_entries < page_starts_count) { + return KERN_FAILURE; + } + + /* Scale by sizeof(uint16_t). Hard-coding the size simplifies the overflow check. 
*/ + mach_vm_size_t trailing_size = num_trailing_entries << 1; + if (trailing_size >> 1 != num_trailing_entries) { + return KERN_FAILURE; + } + + mach_vm_size_t required_size = sizeof(*s_info) + trailing_size; + if (required_size < sizeof(*s_info)) { + return KERN_FAILURE; + } + + if (required_size > slide_info_size) { + return KERN_FAILURE; + } + + return KERN_SUCCESS; +} + + kern_return_t vm_shared_region_slide_sanity_check(vm_shared_region_t sr) { @@ -1880,6 +2151,10 @@ vm_shared_region_slide_sanity_check(vm_shared_region_t sr) kr = vm_shared_region_slide_sanity_check_v1(&s_info->v1); } else if (s_info->version == 2) { kr = vm_shared_region_slide_sanity_check_v2(&s_info->v2, si->slide_info_size); + } else if (s_info->version == 3) { + kr = vm_shared_region_slide_sanity_check_v3(&s_info->v3, si->slide_info_size); + } else if (s_info->version == 4) { + kr = vm_shared_region_slide_sanity_check_v4(&s_info->v4, si->slide_info_size); } else { goto fail; } @@ -1894,11 +2169,6 @@ vm_shared_region_slide_sanity_check(vm_shared_region_t sr) (vm_offset_t) si->slide_info_entry, (vm_size_t) si->slide_info_size); - vm_object_lock(si->slide_object); - si->slide_object->object_slid = FALSE; - si->slide_object->vo_slide_info = NULL; - vm_object_unlock(si->slide_object); - vm_object_deallocate(si->slide_object); si->slide_object = NULL; si->start = 0; @@ -1918,7 +2188,7 @@ vm_shared_region_slide_page_v1(vm_shared_region_slide_info_t si, vm_offset_t vad uint32_t i=0, j=0; uint8_t b = 0; uint32_t slide = si->slide; - int is_64 = task_has_64BitAddr(current_task()); + int is_64 = task_has_64Bit_addr(current_task()); vm_shared_region_slide_info_entry_v1_t s_info = &si->slide_info_entry->v1; toc = (uint16_t*)((uintptr_t)s_info + s_info->toc_offset); @@ -2148,14 +2418,206 @@ vm_shared_region_slide_page_v2(vm_shared_region_slide_info_t si, vm_offset_t vad return KERN_SUCCESS; } + +static kern_return_t +vm_shared_region_slide_page_v3(vm_shared_region_slide_info_t si, vm_offset_t vaddr, 
__unused mach_vm_offset_t uservaddr, uint32_t pageIndex) +{ + vm_shared_region_slide_info_entry_v3_t s_info = &si->slide_info_entry->v3; + const uint32_t slide_amount = si->slide; + + uint8_t *page_content = (uint8_t *)vaddr; + uint16_t page_entry; + + if (pageIndex >= s_info->page_starts_count) { + printf("vm_shared_region_slide_page() did not find page start in slide info: pageIndex=%u, count=%u\n", + pageIndex, s_info->page_starts_count); + return KERN_FAILURE; + } + page_entry = s_info->page_starts[pageIndex]; + + if (page_entry == DYLD_CACHE_SLIDE_V3_PAGE_ATTR_NO_REBASE) { + return KERN_SUCCESS; + } + + uint8_t* rebaseLocation = page_content; + uint64_t delta = page_entry; + do { + rebaseLocation += delta; + uint64_t value; + memcpy(&value, rebaseLocation, sizeof(value)); + delta = ( (value & 0x3FF8000000000000) >> 51) * sizeof(uint64_t); + + // A pointer is one of : + // { + // uint64_t pointerValue : 51; + // uint64_t offsetToNextPointer : 11; + // uint64_t isBind : 1 = 0; + // uint64_t authenticated : 1 = 0; + // } + // { + // uint32_t offsetFromSharedCacheBase; + // uint16_t diversityData; + // uint16_t hasAddressDiversity : 1; + // uint16_t hasDKey : 1; + // uint16_t hasBKey : 1; + // uint16_t offsetToNextPointer : 11; + // uint16_t isBind : 1; + // uint16_t authenticated : 1 = 1; + // } + + bool isBind = (value & (1ULL << 62)) != 0; /* test the bit itself: the masked value is 0 or 1<<62, never 1 */ + if (isBind) { + return KERN_FAILURE; + } + + bool isAuthenticated = (value & (1ULL << 63)) != 0; + + if (isAuthenticated) { + // The new value for a rebase is the low 32-bits of the threaded value plus the slide. + value = (value & 0xFFFFFFFF) + slide_amount; + // Add in the offset from the mach_header + const uint64_t value_add = s_info->value_add; + value += value_add; + + } else { + // The new value for a rebase is the low 51-bits of the threaded value plus the slide. + // Regular pointer which needs to fit in 51-bits of value. 
+ // C++ RTTI uses the top bit, so we'll allow the whole top-byte + // and the bottom 43-bits to be fit in to 51-bits. + uint64_t top8Bits = value & 0x0007F80000000000ULL; + uint64_t bottom43Bits = value & 0x000007FFFFFFFFFFULL; + uint64_t targetValue = ( top8Bits << 13 ) | bottom43Bits; + value = targetValue + slide_amount; + } + + memcpy(rebaseLocation, &value, sizeof(value)); + } while (delta != 0); + + return KERN_SUCCESS; +} + +static kern_return_t +rebase_chainv4( + uint8_t *page_content, + uint16_t start_offset, + uint32_t slide_amount, + vm_shared_region_slide_info_entry_v4_t s_info) +{ + const uint32_t last_page_offset = PAGE_SIZE_FOR_SR_SLIDE - sizeof(uint32_t); + + const uint32_t delta_mask = (uint32_t)(s_info->delta_mask); + const uint32_t value_mask = ~delta_mask; + const uint32_t value_add = (uint32_t)(s_info->value_add); + const uint32_t delta_shift = __builtin_ctzll(delta_mask) - 2; + + uint32_t page_offset = start_offset; + uint32_t delta = 1; + + while (delta != 0 && page_offset <= last_page_offset) { + uint8_t *loc; + uint32_t value; + + loc = page_content + page_offset; + memcpy(&value, loc, sizeof(value)); + delta = (value & delta_mask) >> delta_shift; + value &= value_mask; + + if ( (value & 0xFFFF8000) == 0 ) { + // small positive non-pointer, use as-is + } else if ( (value & 0x3FFF8000) == 0x3FFF8000 ) { + // small negative non-pointer + value |= 0xC0000000; + } else { + // pointer that needs rebasing + value += value_add; + value += slide_amount; + } + memcpy(loc, &value, sizeof(value)); + page_offset += delta; + } + + /* If the offset went past the end of the page, then the slide data is invalid. 
*/ + if (page_offset > last_page_offset) { + return KERN_FAILURE; + } + return KERN_SUCCESS; +} + +static kern_return_t +vm_shared_region_slide_page_v4(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex) +{ + vm_shared_region_slide_info_entry_v4_t s_info = &si->slide_info_entry->v4; + const uint32_t slide_amount = si->slide; + + const uint16_t *page_starts = (uint16_t *)((uintptr_t)s_info + s_info->page_starts_offset); + const uint16_t *page_extras = (uint16_t *)((uintptr_t)s_info + s_info->page_extras_offset); + + uint8_t *page_content = (uint8_t *)vaddr; + uint16_t page_entry; + + if (pageIndex >= s_info->page_starts_count) { + printf("vm_shared_region_slide_page() did not find page start in slide info: pageIndex=%u, count=%u\n", + pageIndex, s_info->page_starts_count); + return KERN_FAILURE; + } + page_entry = page_starts[pageIndex]; + + if (page_entry == DYLD_CACHE_SLIDE4_PAGE_NO_REBASE) { + return KERN_SUCCESS; + } + + if (page_entry & DYLD_CACHE_SLIDE4_PAGE_USE_EXTRA) { + uint16_t chain_index = page_entry & DYLD_CACHE_SLIDE4_PAGE_INDEX; + uint16_t info; + + do { + uint16_t page_start_offset; + kern_return_t kr; + + if (chain_index >= s_info->page_extras_count) { + printf("vm_shared_region_slide_page() out-of-bounds extras index: index=%u, count=%u\n", + chain_index, s_info->page_extras_count); + return KERN_FAILURE; + } + info = page_extras[chain_index]; + page_start_offset = (info & DYLD_CACHE_SLIDE4_PAGE_INDEX) << DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT; + + kr = rebase_chainv4(page_content, page_start_offset, slide_amount, s_info); + if (kr != KERN_SUCCESS) { + return KERN_FAILURE; + } + + chain_index++; + } while (!(info & DYLD_CACHE_SLIDE4_PAGE_EXTRA_END)); + } else { + const uint32_t page_start_offset = page_entry << DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT; + kern_return_t kr; + + kr = rebase_chainv4(page_content, page_start_offset, slide_amount, s_info); + if (kr != KERN_SUCCESS) { + return KERN_FAILURE; + } + } + + return KERN_SUCCESS; +} + 
+ + kern_return_t -vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex) +vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, mach_vm_offset_t uservaddr, uint32_t pageIndex) { if (si->slide_info_entry->version == 1) { return vm_shared_region_slide_page_v1(si, vaddr, pageIndex); - } else { + } else if (si->slide_info_entry->version == 2) { return vm_shared_region_slide_page_v2(si, vaddr, pageIndex); - } + } else if (si->slide_info_entry->version == 3) { + return vm_shared_region_slide_page_v3(si, vaddr, uservaddr, pageIndex); + } else if (si->slide_info_entry->version == 4) { + return vm_shared_region_slide_page_v4(si, vaddr, pageIndex); + } else { + return KERN_FAILURE; + } } /******************************************************************************/ @@ -2452,6 +2914,7 @@ vm_shared_region_slide(uint32_t slide, mach_vm_size_t entry_size, mach_vm_offset_t slide_start, mach_vm_size_t slide_size, + mach_vm_offset_t slid_mapping, memory_object_control_t sr_file_control) { void *slide_info_entry = NULL; @@ -2496,7 +2959,14 @@ vm_shared_region_slide(uint32_t slide, sr->sr_slide_in_progress = TRUE; vm_shared_region_unlock(); - if((error = vm_shared_region_slide_init(sr, slide_size, entry_start_address, entry_size, slide, sr_file_control))) { + error = vm_shared_region_slide_mapping(sr, + slide_size, + entry_start_address, + entry_size, + slid_mapping, + slide, + sr_file_control); + if (error) { printf("slide_info initialization failed with kr=%d\n", error); goto done; } diff --git a/osfmk/vm/vm_shared_region.h b/osfmk/vm/vm_shared_region.h index 34becaefb..f57b3c891 100644 --- a/osfmk/vm/vm_shared_region.h +++ b/osfmk/vm/vm_shared_region.h @@ -57,6 +57,9 @@ extern int shared_region_debug; #endif /* DEBUG */ extern int shared_region_trace_level; + +extern struct vm_shared_region *init_task_shared_region; + #define SHARED_REGION_TRACE_NONE_LVL 0 /* no trace */ #define 
SHARED_REGION_TRACE_ERROR_LVL 1 /* trace abnormal events */ #define SHARED_REGION_TRACE_INFO_LVL 2 /* trace all events */ @@ -136,15 +139,52 @@ struct vm_shared_region_slide_info_entry_v2 { #define DYLD_CACHE_SLIDE_PAGE_VALUE 0x3FFF // bitwise negation of DYLD_CACHE_SLIDE_PAGE_ATTRS #define DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT 2 +typedef struct vm_shared_region_slide_info_entry_v3 *vm_shared_region_slide_info_entry_v3_t; +struct vm_shared_region_slide_info_entry_v3 +{ + uint32_t version; // currently 3 + uint32_t page_size; // currently 4096 (may also be 16384) + uint32_t page_starts_count; + uint64_t value_add; + uint16_t page_starts[/* page_starts_count */]; +}; + +#define DYLD_CACHE_SLIDE_V3_PAGE_ATTR_NO_REBASE 0xFFFF // page has no rebasing + + +typedef struct vm_shared_region_slide_info_entry_v4 *vm_shared_region_slide_info_entry_v4_t; +struct vm_shared_region_slide_info_entry_v4 { + uint32_t version; // currently 4 + uint32_t page_size; // currently 4096 (may also be 16384) + uint32_t page_starts_offset; + uint32_t page_starts_count; + uint32_t page_extras_offset; + uint32_t page_extras_count; + uint64_t delta_mask; // which (contiguous) set of bits contains the delta to the next rebase location (0xC0000000) + uint64_t value_add; // base address of cache + // uint16_t page_starts[page_starts_count]; + // uint16_t page_extras[page_extras_count]; +}; + +#define DYLD_CACHE_SLIDE4_PAGE_NO_REBASE 0xFFFF // page has no rebasing +#define DYLD_CACHE_SLIDE4_PAGE_INDEX 0x7FFF // index into starts or extras +#define DYLD_CACHE_SLIDE4_PAGE_USE_EXTRA 0x8000 // index is into extras array (not starts array) +#define DYLD_CACHE_SLIDE4_PAGE_EXTRA_END 0x8000 // last chain entry for page + + + typedef union vm_shared_region_slide_info_entry *vm_shared_region_slide_info_entry_t; union vm_shared_region_slide_info_entry { uint32_t version; struct vm_shared_region_slide_info_entry_v1 v1; struct vm_shared_region_slide_info_entry_v2 v2; + struct vm_shared_region_slide_info_entry_v3 v3; 
+ struct vm_shared_region_slide_info_entry_v4 v4; }; typedef struct vm_shared_region_slide_info *vm_shared_region_slide_info_t; struct vm_shared_region_slide_info { + mach_vm_address_t slid_address; mach_vm_offset_t start; mach_vm_offset_t end; uint32_t slide; @@ -159,6 +199,7 @@ struct vm_shared_region { queue_chain_t sr_q; void *sr_root_dir; cpu_type_t sr_cpu_type; + cpu_subtype_t sr_cpu_subtype; boolean_t sr_64bit; boolean_t sr_mapping_in_progress; boolean_t sr_slide_in_progress; @@ -174,10 +215,13 @@ struct vm_shared_region { struct vm_shared_region_slide_info sr_slide_info; uuid_t sr_uuid; boolean_t sr_uuid_copied; + uint32_t sr_images_count; + struct dyld_uuid_info_64 *sr_images; }; extern kern_return_t vm_shared_region_slide_page(vm_shared_region_slide_info_t si, - vm_offset_t vaddr, + vm_offset_t vaddr, + mach_vm_offset_t uservaddr, uint32_t pageIndex); extern vm_shared_region_slide_info_t vm_shared_region_get_slide_info(vm_shared_region_t sr); #else /* !MACH_KERNEL_PRIVATE */ @@ -195,12 +239,15 @@ extern kern_return_t vm_shared_region_enter( struct task *task, boolean_t is_64bit, void *fsroot, - cpu_type_t cpu); + cpu_type_t cpu, + cpu_subtype_t cpu_subtype); extern kern_return_t vm_shared_region_remove( struct _vm_map *map, struct task *task); extern vm_shared_region_t vm_shared_region_get( struct task *task); +extern vm_shared_region_t vm_shared_region_trim_and_get( + struct task *task); extern void vm_shared_region_deallocate( struct vm_shared_region *shared_region); extern mach_vm_offset_t vm_shared_region_base_address( @@ -209,6 +256,8 @@ extern mach_vm_size_t vm_shared_region_size( struct vm_shared_region *shared_region); extern ipc_port_t vm_shared_region_mem_entry( struct vm_shared_region *shared_region); +extern vm_map_t vm_shared_region_vm_map( + struct vm_shared_region *shared_region); extern uint32_t vm_shared_region_get_slide( vm_shared_region_t shared_region); extern void vm_shared_region_set( @@ -217,6 +266,7 @@ extern void 
vm_shared_region_set( extern vm_shared_region_t vm_shared_region_lookup( void *root_dir, cpu_type_t cpu, + cpu_subtype_t cpu_subtype, boolean_t is_64bit); extern kern_return_t vm_shared_region_start_address( struct vm_shared_region *shared_region, @@ -238,12 +288,6 @@ extern kern_return_t vm_shared_region_map_file( user_addr_t slide_size); extern kern_return_t vm_shared_region_sliding_valid(uint32_t slide); extern kern_return_t vm_shared_region_slide_sanity_check(vm_shared_region_t sr); -extern kern_return_t vm_shared_region_slide_init(vm_shared_region_t sr, - mach_vm_size_t slide_info_size, - mach_vm_offset_t start, - mach_vm_size_t size, - uint32_t slide, - memory_object_control_t); extern void* vm_shared_region_get_slide_info_entry(vm_shared_region_t sr); extern void vm_commpage_init(void); extern void vm_commpage_text_init(void); @@ -259,6 +303,7 @@ int vm_shared_region_slide(uint32_t, mach_vm_size_t, mach_vm_offset_t, mach_vm_size_t, + mach_vm_offset_t, memory_object_control_t); #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/vm/vm_shared_region_pager.c b/osfmk/vm/vm_shared_region_pager.c new file mode 100644 index 000000000..773233d24 --- /dev/null +++ b/osfmk/vm/vm_shared_region_pager.c @@ -0,0 +1,1146 @@ +/* + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * SHARED REGION MEMORY PAGER + * + * This external memory manager (EMM) handles mappings of a dyld shared cache + * in shared regions, applying any necessary modifications (sliding, + * pointer signing, ...). + * + * It mostly handles page-in requests (from memory_object_data_request()) by + * getting the original data from its backing VM object, itself backed by + * the dyld shared cache file, modifying it if needed and providing it to VM. + * + * The modified pages will never be dirtied, so the memory manager doesn't + * need to handle page-out requests (from memory_object_data_return()). The + * pages need to be mapped copy-on-write, so that the originals stay clean. + * + * We don't expect to have to handle a large number of shared cache files, + * so the data structures are very simple (simple linked list) for now. 
+ */ + +/* forward declarations */ +void shared_region_pager_reference(memory_object_t mem_obj); +void shared_region_pager_deallocate(memory_object_t mem_obj); +kern_return_t shared_region_pager_init(memory_object_t mem_obj, + memory_object_control_t control, + memory_object_cluster_size_t pg_size); +kern_return_t shared_region_pager_terminate(memory_object_t mem_obj); +kern_return_t shared_region_pager_data_request(memory_object_t mem_obj, + memory_object_offset_t offset, + memory_object_cluster_size_t length, + vm_prot_t protection_required, + memory_object_fault_info_t fault_info); +kern_return_t shared_region_pager_data_return(memory_object_t mem_obj, + memory_object_offset_t offset, + memory_object_cluster_size_t data_cnt, + memory_object_offset_t *resid_offset, + int *io_error, + boolean_t dirty, + boolean_t kernel_copy, + int upl_flags); +kern_return_t shared_region_pager_data_initialize(memory_object_t mem_obj, + memory_object_offset_t offset, + memory_object_cluster_size_t data_cnt); +kern_return_t shared_region_pager_data_unlock(memory_object_t mem_obj, + memory_object_offset_t offset, + memory_object_size_t size, + vm_prot_t desired_access); +kern_return_t shared_region_pager_synchronize(memory_object_t mem_obj, + memory_object_offset_t offset, + memory_object_size_t length, + vm_sync_t sync_flags); +kern_return_t shared_region_pager_map(memory_object_t mem_obj, + vm_prot_t prot); +kern_return_t shared_region_pager_last_unmap(memory_object_t mem_obj); + +/* + * Vector of VM operations for this EMM. + * These routines are invoked by VM via the memory_object_*() interfaces. 
+ */ +const struct memory_object_pager_ops shared_region_pager_ops = { + shared_region_pager_reference, + shared_region_pager_deallocate, + shared_region_pager_init, + shared_region_pager_terminate, + shared_region_pager_data_request, + shared_region_pager_data_return, + shared_region_pager_data_initialize, + shared_region_pager_data_unlock, + shared_region_pager_synchronize, + shared_region_pager_map, + shared_region_pager_last_unmap, + NULL, /* data_reclaim */ + "shared_region" +}; + +/* + * The "shared_region_pager" describes a memory object backed by + * the "shared_region" EMM. + */ +typedef struct shared_region_pager { + /* mandatory generic header */ + struct memory_object sc_pgr_hdr; + + /* pager-specific data */ + queue_chain_t pager_queue; /* next & prev pagers */ + unsigned int ref_count; /* reference count */ + boolean_t is_ready; /* is this pager ready ? */ + boolean_t is_mapped; /* is this mem_obj mapped ? */ + vm_object_t backing_object; /* VM obj for shared cache */ + vm_object_offset_t backing_offset; /* added to the request offset when faulting on backing_object */ + struct vm_shared_region_slide_info *scp_slide_info; /* slide info for this pager (data_request reads its slid_address) */ +} *shared_region_pager_t; +#define SHARED_REGION_PAGER_NULL ((shared_region_pager_t) NULL) + +/* + * List of memory objects managed by this EMM. + * The list is protected by the "shared_region_pager_lock" lock. + */ +int shared_region_pager_count = 0; /* number of pagers */ +int shared_region_pager_count_mapped = 0; /* number of mapped pagers */ +queue_head_t shared_region_pager_queue; +decl_lck_mtx_data(,shared_region_pager_lock) + +/* + * Maximum number of unmapped pagers we're willing to keep around. + */ +int shared_region_pager_cache_limit = 0; + +/* + * Statistics & counters. 
+ */ +int shared_region_pager_count_max = 0; +int shared_region_pager_count_unmapped_max = 0; +int shared_region_pager_num_trim_max = 0; +int shared_region_pager_num_trim_total = 0; + + +lck_grp_t shared_region_pager_lck_grp; +lck_grp_attr_t shared_region_pager_lck_grp_attr; +lck_attr_t shared_region_pager_lck_attr; + +uint64_t shared_region_pager_copied = 0; +uint64_t shared_region_pager_slid = 0; +uint64_t shared_region_pager_slid_error = 0; +uint64_t shared_region_pager_reclaimed = 0; + +/* internal prototypes */ +shared_region_pager_t shared_region_pager_create( + vm_object_t backing_object, + vm_object_offset_t backing_offset, + struct vm_shared_region_slide_info *slide_info); +shared_region_pager_t shared_region_pager_lookup(memory_object_t mem_obj); +void shared_region_pager_dequeue(shared_region_pager_t pager); +void shared_region_pager_deallocate_internal(shared_region_pager_t pager, + boolean_t locked); +void shared_region_pager_terminate_internal(shared_region_pager_t pager); +void shared_region_pager_trim(void); + + +#if DEBUG +int shared_region_pagerdebug = 0; +#define PAGER_ALL 0xffffffff +#define PAGER_INIT 0x00000001 +#define PAGER_PAGEIN 0x00000002 + +#define PAGER_DEBUG(LEVEL, A) \ + MACRO_BEGIN \ + if ((shared_region_pagerdebug & (LEVEL)) == (LEVEL)) { \ + printf A; \ + } \ + MACRO_END +#else +#define PAGER_DEBUG(LEVEL, A) +#endif + + +void +shared_region_pager_bootstrap(void) +{ + lck_grp_attr_setdefault(&shared_region_pager_lck_grp_attr); + lck_grp_init(&shared_region_pager_lck_grp, "shared_region", &shared_region_pager_lck_grp_attr); + lck_attr_setdefault(&shared_region_pager_lck_attr); + lck_mtx_init(&shared_region_pager_lock, &shared_region_pager_lck_grp, &shared_region_pager_lck_attr); + queue_init(&shared_region_pager_queue); +} + +/* + * shared_region_pager_init() + * + * Initialize the memory object and makes it ready to be used and mapped. 
+ */ +kern_return_t +shared_region_pager_init( + memory_object_t mem_obj, + memory_object_control_t control, +#if !DEBUG + __unused +#endif + memory_object_cluster_size_t pg_size) +{ + shared_region_pager_t pager; + kern_return_t kr; + memory_object_attr_info_data_t attributes; + + PAGER_DEBUG(PAGER_ALL, + ("shared_region_pager_init: %p, %p, %x\n", + mem_obj, control, pg_size)); + + if (control == MEMORY_OBJECT_CONTROL_NULL) + return KERN_INVALID_ARGUMENT; + + pager = shared_region_pager_lookup(mem_obj); + + memory_object_control_reference(control); + + pager->sc_pgr_hdr.mo_control = control; + + attributes.copy_strategy = MEMORY_OBJECT_COPY_DELAY; + /* attributes.cluster_size = (1 << (CLUSTER_SHIFT + PAGE_SHIFT));*/ + attributes.cluster_size = (1 << (PAGE_SHIFT)); + attributes.may_cache_object = FALSE; + attributes.temporary = TRUE; + + kr = memory_object_change_attributes( + control, + MEMORY_OBJECT_ATTRIBUTE_INFO, + (memory_object_info_t) &attributes, + MEMORY_OBJECT_ATTR_INFO_COUNT); + if (kr != KERN_SUCCESS) + panic("shared_region_pager_init: " + "memory_object_change_attributes() failed"); + +#if CONFIG_SECLUDED_MEMORY + if (secluded_for_filecache) { +#if 00 + /* + * XXX FBDP do we want this in the secluded pool? + * Ideally, we'd want the shared region used by Camera to + * NOT be in the secluded pool, but all other shared regions + * in the secluded pool... + */ + memory_object_mark_eligible_for_secluded(control, TRUE); +#endif /* 00 */ + } +#endif /* CONFIG_SECLUDED_MEMORY */ + + return KERN_SUCCESS; +} + +/* + * shared_region_data_return() + * + * Handles page-out requests from VM. This should never happen since + * the pages provided by this EMM are not supposed to be dirty or dirtied + * and VM should simply discard the contents and reclaim the pages if it + * needs to. 
+ */ +kern_return_t +shared_region_pager_data_return( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_cluster_size_t data_cnt, + __unused memory_object_offset_t *resid_offset, + __unused int *io_error, + __unused boolean_t dirty, + __unused boolean_t kernel_copy, + __unused int upl_flags) +{ + panic("shared_region_pager_data_return: should never get called"); + return KERN_FAILURE; +} + +kern_return_t +shared_region_pager_data_initialize( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_cluster_size_t data_cnt) +{ + panic("shared_region_pager_data_initialize: should never get called"); + return KERN_FAILURE; +} + +kern_return_t +shared_region_pager_data_unlock( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_size_t size, + __unused vm_prot_t desired_access) +{ + return KERN_FAILURE; +} + +/* + * shared_region_pager_data_request() + * + * Handles page-in requests from VM. 
+ */ +int shared_region_pager_data_request_debug = 0; +kern_return_t +shared_region_pager_data_request( + memory_object_t mem_obj, + memory_object_offset_t offset, + memory_object_cluster_size_t length, +#if !DEBUG + __unused +#endif + vm_prot_t protection_required, + memory_object_fault_info_t mo_fault_info) +{ + shared_region_pager_t pager; + memory_object_control_t mo_control; + upl_t upl; + int upl_flags; + upl_size_t upl_size; + upl_page_info_t *upl_pl; + unsigned int pl_count; + vm_object_t src_top_object, src_page_object, dst_object; + kern_return_t kr, retval; + vm_offset_t src_vaddr, dst_vaddr; + vm_offset_t cur_offset; + vm_offset_t offset_in_page; + kern_return_t error_code; + vm_prot_t prot; + vm_page_t src_page, top_page; + int interruptible; + struct vm_object_fault_info fault_info; + mach_vm_offset_t slide_start_address; + + PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_data_request: %p, %llx, %x, %x\n", mem_obj, offset, length, protection_required)); + + retval = KERN_SUCCESS; + src_top_object = VM_OBJECT_NULL; + src_page_object = VM_OBJECT_NULL; + upl = NULL; + upl_pl = NULL; + fault_info = *((struct vm_object_fault_info *)(uintptr_t)mo_fault_info); + fault_info.stealth = TRUE; + fault_info.io_sync = FALSE; + fault_info.mark_zf_absent = FALSE; + fault_info.batch_pmap_op = FALSE; + interruptible = fault_info.interruptible; + + pager = shared_region_pager_lookup(mem_obj); + assert(pager->is_ready); + assert(pager->ref_count > 1); /* pager is alive and mapped */ + + PAGER_DEBUG(PAGER_PAGEIN, ("shared_region_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager)); + + /* + * Gather in a UPL all the VM pages requested by VM. 
+ */ + mo_control = pager->sc_pgr_hdr.mo_control; + + upl_size = length; + upl_flags = + UPL_RET_ONLY_ABSENT | + UPL_SET_LITE | + UPL_NO_SYNC | + UPL_CLEAN_IN_PLACE | /* triggers UPL_CLEAR_DIRTY */ + UPL_SET_INTERNAL; + pl_count = 0; + kr = memory_object_upl_request(mo_control, + offset, upl_size, + &upl, NULL, NULL, upl_flags, VM_KERN_MEMORY_SECURITY); + if (kr != KERN_SUCCESS) { + retval = kr; + goto done; + } + dst_object = mo_control->moc_object; + assert(dst_object != VM_OBJECT_NULL); + + /* + * We'll map the original data in the kernel address space from the + * backing VM object (itself backed by the shared cache file via + * the vnode pager). + */ + src_top_object = pager->backing_object; + assert(src_top_object != VM_OBJECT_NULL); + vm_object_reference(src_top_object); /* keep the source object alive */ + + slide_start_address = pager->scp_slide_info->slid_address; + + fault_info.lo_offset += pager->backing_offset; + fault_info.hi_offset += pager->backing_offset; + + /* + * Fill in the contents of the pages requested by VM. + */ + upl_pl = UPL_GET_INTERNAL_PAGE_LIST(upl); + pl_count = length / PAGE_SIZE; + for (cur_offset = 0; + retval == KERN_SUCCESS && cur_offset < length; + cur_offset += PAGE_SIZE) { + ppnum_t dst_pnum; + + if (!upl_page_present(upl_pl, (int)(cur_offset / PAGE_SIZE))) { + /* this page is not in the UPL: skip it */ + continue; + } + + /* + * Map the source (dyld shared cache) page in the kernel's + * virtual address space. + * We already hold a reference on the src_top_object. 
+ */ + retry_src_fault: + vm_object_lock(src_top_object); + vm_object_paging_begin(src_top_object); + error_code = 0; + prot = VM_PROT_READ; + src_page = VM_PAGE_NULL; + kr = vm_fault_page(src_top_object, + pager->backing_offset + offset + cur_offset, + VM_PROT_READ, + FALSE, + FALSE, /* src_page not looked up */ + &prot, + &src_page, + &top_page, + NULL, + &error_code, + FALSE, + FALSE, + &fault_info); + switch (kr) { + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_RETRY: + goto retry_src_fault; + case VM_FAULT_MEMORY_SHORTAGE: + if (vm_page_wait(interruptible)) { + goto retry_src_fault; + } + /* fall thru */ + case VM_FAULT_INTERRUPTED: + retval = MACH_SEND_INTERRUPTED; + goto done; + case VM_FAULT_SUCCESS_NO_VM_PAGE: + /* success but no VM page: fail */ + vm_object_paging_end(src_top_object); + vm_object_unlock(src_top_object); + /*FALLTHROUGH*/ + case VM_FAULT_MEMORY_ERROR: + /* the page is not there ! */ + if (error_code) { + retval = error_code; + } else { + retval = KERN_MEMORY_ERROR; + } + goto done; + default: + panic("shared_region_pager_data_request: " + "vm_fault_page() unexpected error 0x%x\n", + kr); + } + assert(src_page != VM_PAGE_NULL); + assert(src_page->vmp_busy); + + if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) { + vm_page_lockspin_queues(); + if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) { + vm_page_speculate(src_page, FALSE); + } + vm_page_unlock_queues(); + } + + /* + * Establish pointers to the source + * and destination physical pages. 
+ */ + dst_pnum = (ppnum_t) + upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE)); + assert(dst_pnum != 0); +#if __x86_64__ + src_vaddr = (vm_map_offset_t) + PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) + << PAGE_SHIFT); + dst_vaddr = (vm_map_offset_t) + PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT); + +#elif __arm__ || __arm64__ + src_vaddr = (vm_map_offset_t) + phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page) + << PAGE_SHIFT); + dst_vaddr = (vm_map_offset_t) + phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT); +#else +#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..." + src_vaddr = 0; + dst_vaddr = 0; +#endif + src_page_object = VM_PAGE_OBJECT(src_page); + + /* + * Validate the original page... + */ + if (src_page_object->code_signed) { + vm_page_validate_cs_mapped( + src_page, + (const void *) src_vaddr); + } + /* + * ... and transfer the results to the destination page. + */ + UPL_SET_CS_VALIDATED(upl_pl, cur_offset / PAGE_SIZE, + src_page->vmp_cs_validated); + UPL_SET_CS_TAINTED(upl_pl, cur_offset / PAGE_SIZE, + src_page->vmp_cs_tainted); + UPL_SET_CS_NX(upl_pl, cur_offset / PAGE_SIZE, + src_page->vmp_cs_nx); + + /* + * The page provider might access a mapped file, so let's + * release the object lock for the source page to avoid a + * potential deadlock. + * The source page is kept busy and we have a + * "paging_in_progress" reference on its object, so it's safe + * to unlock the object here. + */ + assert(src_page->vmp_busy); + assert(src_page_object->paging_in_progress > 0); + vm_object_unlock(src_page_object); + + /* + * Process the original contents of the source page + * into the destination page. 
+ */ + for (offset_in_page = 0; + offset_in_page < PAGE_SIZE; + offset_in_page += PAGE_SIZE_FOR_SR_SLIDE) { + vm_object_offset_t chunk_offset; + vm_object_offset_t offset_in_backing_object; + vm_object_offset_t offset_in_sliding_range; + + chunk_offset = offset + cur_offset + offset_in_page; + + bcopy((const char *)(src_vaddr + + offset_in_page), + (char *)(dst_vaddr + offset_in_page), + PAGE_SIZE_FOR_SR_SLIDE); + + offset_in_backing_object = (chunk_offset + + pager->backing_offset); + if ((offset_in_backing_object < pager->scp_slide_info->start) || + (offset_in_backing_object >= pager->scp_slide_info->end)) { + /* chunk is outside of sliding range: done */ + shared_region_pager_copied++; + continue; + } + + offset_in_sliding_range = + (offset_in_backing_object - + pager->scp_slide_info->start); + kr = vm_shared_region_slide_page( + pager->scp_slide_info, + dst_vaddr + offset_in_page, + (mach_vm_offset_t) (offset_in_sliding_range + + slide_start_address), + (uint32_t) (offset_in_sliding_range / + PAGE_SIZE_FOR_SR_SLIDE)); + if (shared_region_pager_data_request_debug) { + printf("shared_region_data_request" + "(%p,0x%llx+0x%llx+0x%04llx): 0x%llx " + "in sliding range [0x%llx:0x%llx]: " + "SLIDE offset 0x%llx=" + "(0x%llx+0x%llx+0x%llx+0x%04llx)" + "[0x%016llx 0x%016llx] " + "code_signed=%d " + "cs_validated=%d " + "cs_tainted=%d " + "cs_nx=%d " + "kr=0x%x\n", + pager, + offset, + (uint64_t) cur_offset, + (uint64_t) offset_in_page, + chunk_offset, + pager->scp_slide_info->start, + pager->scp_slide_info->end, + (pager->backing_offset + + offset + + cur_offset + + offset_in_page), + pager->backing_offset, + offset, + (uint64_t) cur_offset, + (uint64_t) offset_in_page, + *(uint64_t *)(dst_vaddr+offset_in_page), + *(uint64_t *)(dst_vaddr+offset_in_page+8), + src_page_object->code_signed, + src_page->vmp_cs_validated, + src_page->vmp_cs_tainted, + src_page->vmp_cs_nx, + kr); + } + if (kr != KERN_SUCCESS) { + shared_region_pager_slid_error++; + break; + } + 
shared_region_pager_slid++; + } + + assert(VM_PAGE_OBJECT(src_page) == src_page_object); + assert(src_page->vmp_busy); + assert(src_page_object->paging_in_progress > 0); + vm_object_lock(src_page_object); + + /* + * Cleanup the result of vm_fault_page() of the source page. + */ + PAGE_WAKEUP_DONE(src_page); + src_page = VM_PAGE_NULL; + vm_object_paging_end(src_page_object); + vm_object_unlock(src_page_object); + + if (top_page != VM_PAGE_NULL) { + assert(VM_PAGE_OBJECT(top_page) == src_top_object); + vm_object_lock(src_top_object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(src_top_object); + vm_object_unlock(src_top_object); + } + } + +done: + if (upl != NULL) { + /* clean up the UPL */ + + /* + * The pages are currently dirty because we've just been + * writing on them, but as far as we're concerned, they're + * clean since they contain their "original" contents as + * provided by us, the pager. + * Tell the UPL to mark them "clean". + */ + upl_clear_dirty(upl, TRUE); + + /* abort or commit the UPL */ + if (retval != KERN_SUCCESS) { + upl_abort(upl, 0); + } else { + boolean_t empty; + upl_commit_range(upl, 0, upl->size, + UPL_COMMIT_CS_VALIDATED | UPL_COMMIT_WRITTEN_BY_KERNEL, + upl_pl, pl_count, &empty); + } + + /* and deallocate the UPL */ + upl_deallocate(upl); + upl = NULL; + } + if (src_top_object != VM_OBJECT_NULL) { + vm_object_deallocate(src_top_object); + } + return retval; +} + +/* + * shared_region_pager_reference() + * + * Get a reference on this memory object. + * For external usage only. Assumes that the initial reference count is not 0, + * i.e one should not "revive" a dead pager this way. 
+ */ +void +shared_region_pager_reference( + memory_object_t mem_obj) +{ + shared_region_pager_t pager; + + pager = shared_region_pager_lookup(mem_obj); + + lck_mtx_lock(&shared_region_pager_lock); + assert(pager->ref_count > 0); + pager->ref_count++; + lck_mtx_unlock(&shared_region_pager_lock); +} + + +/* + * shared_region_pager_dequeue: + * + * Removes a pager from the list of pagers. + * + * The caller must hold "shared_region_pager_lock". + */ +void +shared_region_pager_dequeue( + shared_region_pager_t pager) +{ + assert(!pager->is_mapped); + + queue_remove(&shared_region_pager_queue, + pager, + shared_region_pager_t, + pager_queue); + pager->pager_queue.next = NULL; + pager->pager_queue.prev = NULL; + + shared_region_pager_count--; +} + +/* + * shared_region_pager_terminate_internal: + * + * Trigger the asynchronous termination of the memory object associated + * with this pager. + * When the memory object is terminated, there will be one more call + * to memory_object_deallocate() (i.e. shared_region_pager_deallocate()) + * to finish the clean up. + * + * "shared_region_pager_lock" should not be held by the caller. + * We don't need the lock because the pager has already been removed from + * the pagers' list and is now ours exclusively. + */ +void +shared_region_pager_terminate_internal( + shared_region_pager_t pager) +{ + assert(pager->is_ready); + assert(!pager->is_mapped); + + if (pager->backing_object != VM_OBJECT_NULL) { + vm_object_deallocate(pager->backing_object); + pager->backing_object = VM_OBJECT_NULL; + } + /* trigger the destruction of the memory object */ + memory_object_destroy(pager->sc_pgr_hdr.mo_control, 0); +} + +/* + * shared_region_pager_deallocate_internal() + * + * Release a reference on this pager and free it when the last + * reference goes away. + * Can be called with shared_region_pager_lock held or not but always returns + * with it unlocked. 
+ */ +void +shared_region_pager_deallocate_internal( + shared_region_pager_t pager, + boolean_t locked) +{ + boolean_t needs_trimming; + int count_unmapped; + + if (! locked) { + lck_mtx_lock(&shared_region_pager_lock); + } + + count_unmapped = (shared_region_pager_count - + shared_region_pager_count_mapped); + if (count_unmapped > shared_region_pager_cache_limit) { + /* we have too many unmapped pagers: trim some */ + needs_trimming = TRUE; + } else { + needs_trimming = FALSE; + } + + /* drop a reference on this pager */ + pager->ref_count--; + + if (pager->ref_count == 1) { + /* + * Only the "named" reference is left, which means that + * no one is really holding on to this pager anymore. + * Terminate it. + */ + shared_region_pager_dequeue(pager); + /* the pager is all ours: no need for the lock now */ + lck_mtx_unlock(&shared_region_pager_lock); + shared_region_pager_terminate_internal(pager); + } else if (pager->ref_count == 0) { + /* + * Dropped the existence reference; the memory object has + * been terminated. Do some final cleanup and release the + * pager structure. + */ + lck_mtx_unlock(&shared_region_pager_lock); + if (pager->sc_pgr_hdr.mo_control != MEMORY_OBJECT_CONTROL_NULL) { + memory_object_control_deallocate(pager->sc_pgr_hdr.mo_control); + pager->sc_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; + } + kfree(pager, sizeof (*pager)); + pager = SHARED_REGION_PAGER_NULL; + } else { + /* there are still plenty of references: keep going... */ + lck_mtx_unlock(&shared_region_pager_lock); + } + + if (needs_trimming) { + shared_region_pager_trim(); + } + /* caution: lock is not held on return... */ +} + +/* + * shared_region_pager_deallocate() + * + * Release a reference on this pager and free it when the last + * reference goes away. 
+ */ +void +shared_region_pager_deallocate( + memory_object_t mem_obj) +{ + shared_region_pager_t pager; + + PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_deallocate: %p\n", mem_obj)); + pager = shared_region_pager_lookup(mem_obj); + shared_region_pager_deallocate_internal(pager, FALSE); +} + +/* + * + */ +kern_return_t +shared_region_pager_terminate( +#if !DEBUG + __unused +#endif + memory_object_t mem_obj) +{ + PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_terminate: %p\n", mem_obj)); + + return KERN_SUCCESS; +} + +/* + * + */ +kern_return_t +shared_region_pager_synchronize( + __unused memory_object_t mem_obj, + __unused memory_object_offset_t offset, + __unused memory_object_size_t length, + __unused vm_sync_t sync_flags) +{ + panic("shared_region_pager_synchronize: memory_object_synchronize no longer supported\n"); + return KERN_FAILURE; +} + +/* + * shared_region_pager_map() + * + * This allows VM to let us, the EMM, know that this memory object + * is currently mapped one or more times. This is called by VM each time + * the memory object gets mapped and we take one extra reference on the + * memory object to account for all its mappings. + */ +kern_return_t +shared_region_pager_map( + memory_object_t mem_obj, + __unused vm_prot_t prot) +{ + shared_region_pager_t pager; + + PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_map: %p\n", mem_obj)); + + pager = shared_region_pager_lookup(mem_obj); + + lck_mtx_lock(&shared_region_pager_lock); + assert(pager->is_ready); + assert(pager->ref_count > 0); /* pager is alive */ + if (pager->is_mapped == FALSE) { + /* + * First mapping of this pager: take an extra reference + * that will remain until all the mappings of this pager + * are removed. 
+ */ + pager->is_mapped = TRUE; + pager->ref_count++; + shared_region_pager_count_mapped++; + } + lck_mtx_unlock(&shared_region_pager_lock); + + return KERN_SUCCESS; +} + +/* + * shared_region_pager_last_unmap() + * + * This is called by VM when this memory object is no longer mapped anywhere. + */ +kern_return_t +shared_region_pager_last_unmap( + memory_object_t mem_obj) +{ + shared_region_pager_t pager; + int count_unmapped; + + PAGER_DEBUG(PAGER_ALL, + ("shared_region_pager_last_unmap: %p\n", mem_obj)); + + pager = shared_region_pager_lookup(mem_obj); + + lck_mtx_lock(&shared_region_pager_lock); + if (pager->is_mapped) { + /* + * All the mappings are gone, so let go of the one extra + * reference that represents all the mappings of this pager. + */ + shared_region_pager_count_mapped--; + count_unmapped = (shared_region_pager_count - + shared_region_pager_count_mapped); + if (count_unmapped > shared_region_pager_count_unmapped_max) { + shared_region_pager_count_unmapped_max = count_unmapped; + } + pager->is_mapped = FALSE; + shared_region_pager_deallocate_internal(pager, TRUE); + /* caution: deallocate_internal() released the lock ! 
*/ + } else { + lck_mtx_unlock(&shared_region_pager_lock); + } + + return KERN_SUCCESS; +} + + +/* + * + */ +shared_region_pager_t +shared_region_pager_lookup( + memory_object_t mem_obj) +{ + shared_region_pager_t pager; + + assert(mem_obj->mo_pager_ops == &shared_region_pager_ops); + pager = (shared_region_pager_t)(uintptr_t) mem_obj; + assert(pager->ref_count > 0); + return pager; +} + +shared_region_pager_t +shared_region_pager_create( + vm_object_t backing_object, + vm_object_offset_t backing_offset, + struct vm_shared_region_slide_info *slide_info) +{ + shared_region_pager_t pager; + memory_object_control_t control; + kern_return_t kr; + + pager = (shared_region_pager_t) kalloc(sizeof (*pager)); + if (pager == SHARED_REGION_PAGER_NULL) { + return SHARED_REGION_PAGER_NULL; + } + + /* + * The vm_map call takes both named entry ports and raw memory + * objects in the same parameter. We need to make sure that + * vm_map does not see this object as a named entry port. So, + * we reserve the first word in the object for a fake ip_kotype + * setting - that will tell vm_map to use it as a memory object. 
+ */ + pager->sc_pgr_hdr.mo_ikot = IKOT_MEMORY_OBJECT; + pager->sc_pgr_hdr.mo_pager_ops = &shared_region_pager_ops; + pager->sc_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL; + + pager->is_ready = FALSE;/* not ready until it has a "name" */ + pager->ref_count = 1; /* existence reference (for the cache) */ + pager->ref_count++; /* for the caller */ + pager->is_mapped = FALSE; + pager->backing_object = backing_object; + pager->backing_offset = backing_offset; + pager->scp_slide_info = slide_info; + + vm_object_reference(backing_object); + + lck_mtx_lock(&shared_region_pager_lock); + /* enter new pager at the head of our list of pagers */ + queue_enter_first(&shared_region_pager_queue, + pager, + shared_region_pager_t, + pager_queue); + shared_region_pager_count++; + if (shared_region_pager_count > shared_region_pager_count_max) { + shared_region_pager_count_max = shared_region_pager_count; + } + lck_mtx_unlock(&shared_region_pager_lock); + + kr = memory_object_create_named((memory_object_t) pager, + 0, + &control); + assert(kr == KERN_SUCCESS); + + lck_mtx_lock(&shared_region_pager_lock); + /* the new pager is now ready to be used */ + pager->is_ready = TRUE; + lck_mtx_unlock(&shared_region_pager_lock); + + /* wakeup anyone waiting for this pager to be ready */ + thread_wakeup(&pager->is_ready); + + return pager; +} + +/* + * shared_region_pager_setup() + * + * Provide the caller with a memory object backed by the provided + * "backing_object" VM object. 
+ */ +memory_object_t +shared_region_pager_setup( + vm_object_t backing_object, + vm_object_offset_t backing_offset, + struct vm_shared_region_slide_info *slide_info) +{ + shared_region_pager_t pager; + + /* create new pager */ + pager = shared_region_pager_create( + backing_object, + backing_offset, + slide_info); + if (pager == SHARED_REGION_PAGER_NULL) { + /* could not create a new pager */ + return MEMORY_OBJECT_NULL; + } + + lck_mtx_lock(&shared_region_pager_lock); + while (!pager->is_ready) { + lck_mtx_sleep(&shared_region_pager_lock, + LCK_SLEEP_DEFAULT, + &pager->is_ready, + THREAD_UNINT); + } + lck_mtx_unlock(&shared_region_pager_lock); + + return (memory_object_t) pager; +} + +void +shared_region_pager_trim(void) +{ + shared_region_pager_t pager, prev_pager; + queue_head_t trim_queue; + int num_trim; + int count_unmapped; + + lck_mtx_lock(&shared_region_pager_lock); + + /* + * We have too many pagers, try and trim some unused ones, + * starting with the oldest pager at the end of the queue. + */ + queue_init(&trim_queue); + num_trim = 0; + + for (pager = (shared_region_pager_t) + queue_last(&shared_region_pager_queue); + !queue_end(&shared_region_pager_queue, + (queue_entry_t) pager); + pager = prev_pager) { + /* get prev elt before we dequeue */ + prev_pager = (shared_region_pager_t) + queue_prev(&pager->pager_queue); + + if (pager->ref_count == 2 && + pager->is_ready && + !pager->is_mapped) { + /* this pager can be trimmed */ + num_trim++; + /* remove this pager from the main list ... */ + shared_region_pager_dequeue(pager); + /* ... 
and add it to our trim queue */ + queue_enter_first(&trim_queue, + pager, + shared_region_pager_t, + pager_queue); + + count_unmapped = (shared_region_pager_count - + shared_region_pager_count_mapped); + if (count_unmapped <= shared_region_pager_cache_limit) { + /* we have enough pagers to trim */ + break; + } + } + } + if (num_trim > shared_region_pager_num_trim_max) { + shared_region_pager_num_trim_max = num_trim; + } + shared_region_pager_num_trim_total += num_trim; + + lck_mtx_unlock(&shared_region_pager_lock); + + /* terminate the trimmed pagers */ + while (!queue_empty(&trim_queue)) { + queue_remove_first(&trim_queue, + pager, + shared_region_pager_t, + pager_queue); + pager->pager_queue.next = NULL; + pager->pager_queue.prev = NULL; + assert(pager->ref_count == 2); + /* + * We can't call deallocate_internal() because the pager + * has already been dequeued, but we still need to remove + * a reference. + */ + pager->ref_count--; + shared_region_pager_terminate_internal(pager); + } +} diff --git a/osfmk/vm/vm_swapfile_pager.c b/osfmk/vm/vm_swapfile_pager.c index f24307d74..489297724 100644 --- a/osfmk/vm/vm_swapfile_pager.c +++ b/osfmk/vm/vm_swapfile_pager.c @@ -466,7 +466,7 @@ swapfile_pager_data_request( kr = vm_map_remove(kernel_map, kernel_mapping, kernel_mapping + PAGE_SIZE_64, - VM_MAP_NO_FLAGS); + VM_MAP_REMOVE_NO_FLAGS); assert(kr == KERN_SUCCESS); kernel_mapping = 0; dst_vaddr = 0; diff --git a/osfmk/vm/vm_user.c b/osfmk/vm/vm_user.c index a1e2c51c3..93e1374e6 100644 --- a/osfmk/vm/vm_user.c +++ b/osfmk/vm/vm_user.c @@ -102,6 +102,7 @@ #include #include +#include #include #include @@ -120,6 +121,8 @@ #include +#include + vm_size_t upl_offset_to_pagelist = 0; #if VM_CPM @@ -318,12 +321,12 @@ mach_vm_deallocate( if (size == (mach_vm_offset_t) 0) return(KERN_SUCCESS); - return(vm_map_remove(map, + return vm_map_remove(map, vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)), vm_map_round_page(start+size, VM_MAP_PAGE_MASK(map)), - VM_MAP_NO_FLAGS)); + 
VM_MAP_REMOVE_NO_FLAGS); } /* @@ -344,12 +347,12 @@ vm_deallocate( if (size == (vm_offset_t) 0) return(KERN_SUCCESS); - return(vm_map_remove(map, + return vm_map_remove(map, vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)), vm_map_round_page(start+size, VM_MAP_PAGE_MASK(map)), - VM_MAP_NO_FLAGS)); + VM_MAP_REMOVE_NO_FLAGS); } /* @@ -594,7 +597,8 @@ vm_read( if (map == VM_MAP_NULL) return(KERN_INVALID_ARGUMENT); - if (size > (unsigned)(mach_msg_type_number_t) -1) { + mach_msg_type_number_t dsize; + if (os_convert_overflow(size, &dsize)) { /* * The kernel could handle a 64-bit "size" value, but * it could not return the size of the data in "*data_size" @@ -612,7 +616,7 @@ vm_read( if (KERN_SUCCESS == error) { *data = (pointer_t) ipc_address; - *data_size = (mach_msg_type_number_t) size; + *data_size = dsize; assert(*data_size == size); } return(error); @@ -938,7 +942,7 @@ kern_return_t mach_vm_map_external( vm_map_t target_map, mach_vm_offset_t *address, - mach_vm_size_t initial_size, + mach_vm_size_t initial_size, mach_vm_offset_t mask, int flags, ipc_port_t port, @@ -951,8 +955,11 @@ mach_vm_map_external( vm_tag_t tag; VM_GET_FLAGS_ALIAS(flags, tag); - return (mach_vm_map_kernel(target_map, address, initial_size, mask, flags, tag, port, - offset, copy, cur_protection, max_protection, inheritance)); + return (mach_vm_map_kernel(target_map, address, initial_size, mask, + flags, VM_MAP_KERNEL_FLAGS_NONE, tag, + port, offset, copy, + cur_protection, max_protection, + inheritance)); } kern_return_t @@ -962,6 +969,7 @@ mach_vm_map_kernel( mach_vm_size_t initial_size, mach_vm_offset_t mask, int flags, + vm_map_kernel_flags_t vmk_flags, vm_tag_t tag, ipc_port_t port, vm_object_offset_t offset, @@ -984,7 +992,7 @@ mach_vm_map_kernel( initial_size, mask, flags, - VM_MAP_KERNEL_FLAGS_NONE, + vmk_flags, tag, port, offset, @@ -1022,8 +1030,11 @@ vm_map_64_external( vm_tag_t tag; VM_GET_FLAGS_ALIAS(flags, tag); - return (vm_map_64_kernel(target_map, address, size, mask, flags, 
tag, port, offset, - copy, cur_protection, max_protection, inheritance)); + return (vm_map_64_kernel(target_map, address, size, mask, + flags, VM_MAP_KERNEL_FLAGS_NONE, + tag, port, offset, copy, + cur_protection, max_protection, + inheritance)); } kern_return_t @@ -1033,6 +1044,7 @@ vm_map_64_kernel( vm_size_t size, vm_offset_t mask, int flags, + vm_map_kernel_flags_t vmk_flags, vm_tag_t tag, ipc_port_t port, vm_object_offset_t offset, @@ -1050,9 +1062,10 @@ vm_map_64_kernel( map_size = (mach_vm_size_t)size; map_mask = (mach_vm_offset_t)mask; - kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask, flags, tag, - port, offset, copy, - cur_protection, max_protection, inheritance); + kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask, + flags, vmk_flags, tag, + port, offset, copy, + cur_protection, max_protection, inheritance); *address = CAST_DOWN(vm_offset_t, map_addr); return kr; } @@ -1075,7 +1088,10 @@ vm_map_external( vm_tag_t tag; VM_GET_FLAGS_ALIAS(flags, tag); - return (vm_map_kernel(target_map, address, size, mask, flags, tag, port, offset, copy, cur_protection, max_protection, inheritance)); + return (vm_map_kernel(target_map, address, size, mask, + flags, VM_MAP_KERNEL_FLAGS_NONE, tag, + port, offset, copy, + cur_protection, max_protection, inheritance)); } kern_return_t @@ -1085,6 +1101,7 @@ vm_map_kernel( vm_size_t size, vm_offset_t mask, int flags, + vm_map_kernel_flags_t vmk_flags, vm_tag_t tag, ipc_port_t port, vm_offset_t offset, @@ -1104,9 +1121,10 @@ vm_map_kernel( map_mask = (mach_vm_offset_t)mask; obj_offset = (vm_object_offset_t)offset; - kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask, flags, tag, - port, obj_offset, copy, - cur_protection, max_protection, inheritance); + kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask, + flags, vmk_flags, tag, + port, obj_offset, copy, + cur_protection, max_protection, inheritance); *address = CAST_DOWN(vm_offset_t, map_addr); return kr; } @@ 
-2208,8 +2226,6 @@ mach_make_memory_entry_64( parent_handle); } -extern int pacified_purgeable_iokit; - kern_return_t mach_make_memory_entry_internal( vm_map_t target_map, @@ -2231,7 +2247,7 @@ mach_make_memory_entry_internal( boolean_t iskernel; vm_object_offset_t obj_off; vm_prot_t prot; - struct vm_object_fault_info fault_info; + struct vm_object_fault_info fault_info = {}; vm_object_t object; vm_object_t shadow_object; @@ -2259,6 +2275,9 @@ mach_make_memory_entry_internal( boolean_t force_shadow = FALSE; boolean_t use_data_addr; boolean_t use_4K_compat; +#if VM_NAMED_ENTRY_LIST + int alias = -1; +#endif /* VM_NAMED_ENTRY_LIST */ if ((permission & MAP_MEM_FLAGS_MASK) & ~MAP_MEM_FLAGS_ALL) { /* @@ -2267,7 +2286,7 @@ mach_make_memory_entry_internal( return KERN_INVALID_VALUE; } - if (parent_handle != IP_NULL && + if (IP_VALID(parent_handle) && ip_kotype(parent_handle) == IKOT_NAMED_ENTRY) { parent_entry = (vm_named_entry_t) parent_handle->ip_kobject; } else { @@ -2356,6 +2375,8 @@ mach_make_memory_entry_internal( assert(object != VM_OBJECT_NULL); if (permission & MAP_MEM_PURGABLE) { + task_t owner; + if (! 
(permission & VM_PROT_WRITE)) { /* if we can't write, we can't purge */ vm_object_deallocate(object); @@ -2366,27 +2387,34 @@ mach_make_memory_entry_internal( if (permission & MAP_MEM_PURGABLE_KERNEL_ONLY) { object->purgeable_only_by_kernel = TRUE; } - assert(object->vo_purgeable_owner == NULL); + assert(object->vo_owner == NULL); assert(object->resident_page_count == 0); assert(object->wired_page_count == 0); vm_object_lock(object); - if (pacified_purgeable_iokit) { - if (permission & MAP_MEM_LEDGER_TAG_NETWORK) { - vm_purgeable_nonvolatile_enqueue(object, - kernel_task); - } else { - vm_purgeable_nonvolatile_enqueue(object, - current_task()); - } - } else { - if (object->purgeable_only_by_kernel) { - vm_purgeable_nonvolatile_enqueue(object, - kernel_task); - } else { - vm_purgeable_nonvolatile_enqueue(object, - current_task()); - } + owner = current_task(); +#if __arm64__ + if (owner->task_legacy_footprint) { + /* + * For ios11, we failed to account for + * this memory. Keep doing that for + * legacy apps (built before ios12), + * for backwards compatibility's sake... + */ + owner = kernel_task; } +#endif /* __arm64__ */ + vm_purgeable_nonvolatile_enqueue(object, owner); + vm_object_unlock(object); + } + + if (permission & MAP_MEM_LEDGER_TAG_NETWORK) { + /* make this object owned by the calling task */ + vm_object_lock(object); + vm_object_ownership_change( + object, + VM_OBJECT_LEDGER_TAG_NETWORK, + current_task(), /* new owner */ + FALSE); /* task_objq locked? */ vm_object_unlock(object); } @@ -2740,6 +2768,10 @@ mach_make_memory_entry_internal( } } +#if VM_NAMED_ENTRY_LIST + alias = VME_ALIAS(map_entry); +#endif /* VM_NAMED_ENTRY_LIST */ + /* * We found the VM map entry, lock the VM object again. 
*/ @@ -2869,7 +2901,8 @@ mach_make_memory_entry_internal( object->vo_size > vm_map_round_page(total_size, VM_MAP_PAGE_MASK(target_map))))) - && !object->true_share)) { + && !object->true_share + && object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)) { /* * We have to unlock the VM object before * trying to upgrade the VM map lock, to @@ -3089,6 +3122,9 @@ mach_make_memory_entry_internal( user_entry->protection = protections; SET_MAP_MEM(GET_MAP_MEM(permission), user_entry->protection); user_entry->size = map_size; +#if VM_NAMED_ENTRY_LIST + user_entry->named_entry_alias = alias; +#endif /* VM_NAMED_ENTRY_LIST */ /* user_object pager and internal fields are not used */ /* when the object field is filled in. */ @@ -3173,7 +3209,7 @@ mach_make_memory_entry_internal( if(parent_entry->is_sub_map) { user_entry->backing.map = parent_entry->backing.map; vm_map_lock(user_entry->backing.map); - user_entry->backing.map->ref_count++; + user_entry->backing.map->map_refcnt++; vm_map_unlock(user_entry->backing.map); } else { object = parent_entry->backing.object; @@ -3277,10 +3313,9 @@ task_wire( if (map == VM_MAP_NULL) return(KERN_INVALID_ARGUMENT); - if (must_wire) - map->wiring_required = TRUE; - else - map->wiring_required = FALSE; + vm_map_lock(map); + map->wiring_required = (must_wire == TRUE); + vm_map_unlock(map); return(KERN_SUCCESS); } @@ -3299,6 +3334,27 @@ vm_map_exec_lockdown( return(KERN_SUCCESS); } +#if VM_NAMED_ENTRY_LIST +queue_head_t vm_named_entry_list; +int vm_named_entry_count = 0; +lck_mtx_t vm_named_entry_list_lock_data; +lck_mtx_ext_t vm_named_entry_list_lock_data_ext; +#endif /* VM_NAMED_ENTRY_LIST */ + +void vm_named_entry_init(void); +void +vm_named_entry_init(void) +{ +#if VM_NAMED_ENTRY_LIST + queue_init(&vm_named_entry_list); + vm_named_entry_count = 0; + lck_mtx_init_ext(&vm_named_entry_list_lock_data, + &vm_named_entry_list_lock_data_ext, + &vm_object_lck_grp, + &vm_object_lck_attr); +#endif /* VM_NAMED_ENTRY_LIST */ +} + __private_extern__ 
kern_return_t mach_memory_entry_allocate( vm_named_entry_t *user_entry_p, @@ -3311,6 +3367,7 @@ mach_memory_entry_allocate( user_entry = (vm_named_entry_t) kalloc(sizeof *user_entry); if (user_entry == NULL) return KERN_FAILURE; + bzero(user_entry, sizeof (*user_entry)); named_entry_lock_init(user_entry); @@ -3325,10 +3382,6 @@ mach_memory_entry_allocate( user_handle->ip_sorights++; ip_reference(user_handle); - user_handle->ip_destination = IP_NULL; - user_handle->ip_receiver_name = MACH_PORT_NULL; - user_handle->ip_receiver = ipc_space_kernel; - /* make a send right */ user_handle->ip_mscount++; user_handle->ip_srights++; @@ -3353,6 +3406,21 @@ mach_memory_entry_allocate( *user_entry_p = user_entry; *user_handle_p = user_handle; +#if VM_NAMED_ENTRY_LIST + /* keep a loose (no reference) pointer to the Mach port, for debugging only */ + user_entry->named_entry_port = user_handle; + /* backtrace at allocation time, for debugging only */ + OSBacktrace(&user_entry->named_entry_bt[0], + NAMED_ENTRY_BT_DEPTH); + + /* add this new named entry to the global list */ + lck_mtx_lock_spin(&vm_named_entry_list_lock_data); + queue_enter(&vm_named_entry_list, user_entry, + vm_named_entry_t, named_entry_list); + vm_named_entry_count++; + lck_mtx_unlock(&vm_named_entry_list_lock_data); +#endif /* VM_NAMED_ENTRY_LIST */ + return KERN_SUCCESS; } @@ -3454,7 +3522,7 @@ memory_entry_purgeable_control_internal( vm_named_entry_t mem_entry; vm_object_t object; - if (entry_port == IP_NULL || + if (!IP_VALID(entry_port) || ip_kotype(entry_port) != IKOT_NAMED_ENTRY) { return KERN_INVALID_ARGUMENT; } @@ -3503,6 +3571,69 @@ memory_entry_purgeable_control_internal( return kr; } +kern_return_t +mach_memory_entry_access_tracking( + ipc_port_t entry_port, + int *access_tracking, + uint32_t *access_tracking_reads, + uint32_t *access_tracking_writes) +{ + return memory_entry_access_tracking_internal(entry_port, + access_tracking, + access_tracking_reads, + access_tracking_writes); +} + +kern_return_t 
+memory_entry_access_tracking_internal( + ipc_port_t entry_port, + int *access_tracking, + uint32_t *access_tracking_reads, + uint32_t *access_tracking_writes) +{ + vm_named_entry_t mem_entry; + vm_object_t object; + kern_return_t kr; + + if (!IP_VALID(entry_port) || + ip_kotype(entry_port) != IKOT_NAMED_ENTRY) { + return KERN_INVALID_ARGUMENT; + } + + mem_entry = (vm_named_entry_t) entry_port->ip_kobject; + + named_entry_lock(mem_entry); + + if (mem_entry->is_sub_map || + mem_entry->is_copy) { + named_entry_unlock(mem_entry); + return KERN_INVALID_ARGUMENT; + } + + object = mem_entry->backing.object; + if (object == VM_OBJECT_NULL) { + named_entry_unlock(mem_entry); + return KERN_INVALID_ARGUMENT; + } + +#if VM_OBJECT_ACCESS_TRACKING + vm_object_access_tracking(object, + access_tracking, + access_tracking_reads, + access_tracking_writes); + kr = KERN_SUCCESS; +#else /* VM_OBJECT_ACCESS_TRACKING */ + (void) access_tracking; + (void) access_tracking_reads; + (void) access_tracking_writes; + kr = KERN_NOT_SUPPORTED; +#endif /* VM_OBJECT_ACCESS_TRACKING */ + + named_entry_unlock(mem_entry); + + return kr; +} + kern_return_t mach_memory_entry_get_page_counts( ipc_port_t entry_port, @@ -3515,7 +3646,7 @@ mach_memory_entry_get_page_counts( vm_object_offset_t offset; vm_object_size_t size; - if (entry_port == IP_NULL || + if (!IP_VALID(entry_port) || ip_kotype(entry_port) != IKOT_NAMED_ENTRY) { return KERN_INVALID_ARGUMENT; } @@ -3603,6 +3734,15 @@ mach_destroy_memory_entry( named_entry_unlock(named_entry); named_entry_lock_destroy(named_entry); +#if VM_NAMED_ENTRY_LIST + lck_mtx_lock_spin(&vm_named_entry_list_lock_data); + queue_remove(&vm_named_entry_list, named_entry, + vm_named_entry_t, named_entry_list); + assert(vm_named_entry_count > 0); + vm_named_entry_count--; + lck_mtx_unlock(&vm_named_entry_list_lock_data); +#endif /* VM_NAMED_ENTRY_LIST */ + kfree((void *) port->ip_kobject, sizeof (struct vm_named_entry)); } else @@ -3624,7 +3764,7 @@ 
mach_memory_entry_page_op( vm_object_t object; kern_return_t kr; - if (entry_port == IP_NULL || + if (!IP_VALID(entry_port) || ip_kotype(entry_port) != IKOT_NAMED_ENTRY) { return KERN_INVALID_ARGUMENT; } @@ -3677,7 +3817,7 @@ mach_memory_entry_range_op( vm_object_t object; kern_return_t kr; - if (entry_port == IP_NULL || + if (!IP_VALID(entry_port) || ip_kotype(entry_port) != IKOT_NAMED_ENTRY) { return KERN_INVALID_ARGUMENT; } @@ -4173,7 +4313,10 @@ vm_map( vm_tag_t tag; VM_GET_FLAGS_ALIAS(flags, tag); - return (vm_map_kernel(target_map, address, size, mask, flags, tag, port, offset, copy, cur_protection, max_protection, inheritance)); + return vm_map_kernel(target_map, address, size, mask, + flags, VM_MAP_KERNEL_FLAGS_NONE, tag, + port, offset, copy, + cur_protection, max_protection, inheritance); } #endif /* __x86_64__ */ diff --git a/osfmk/voucher/ipc_pthread_priority.c b/osfmk/voucher/ipc_pthread_priority.c index baf70d7ba..c9cd2e807 100644 --- a/osfmk/voucher/ipc_pthread_priority.c +++ b/osfmk/voucher/ipc_pthread_priority.c @@ -42,14 +42,13 @@ #include #include #include +#include ipc_voucher_attr_control_t ipc_pthread_priority_voucher_attr_control; /* communication channel from PTHPRIORITY to voucher system */ #define IPC_PTHREAD_PRIORITY_VALUE_TO_HANDLE(x) ((mach_voucher_attr_value_handle_t)(x)) #define HANDLE_TO_IPC_PTHREAD_PRIORITY_VALUE(x) ((ipc_pthread_priority_value_t)(x)) -extern unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t for_propagation); - kern_return_t ipc_pthread_priority_release_value( ipc_voucher_attr_manager_t __assert_only manager, @@ -200,8 +199,8 @@ ipc_pthread_priority_get_value( } /* Callout to pthread kext to get the canonicalized value */ - canonicalize_priority_value = (ipc_pthread_priority_value_t) pthread_priority_canonicalize( - (unsigned long)ipc_pthread_priority_value, true); + canonicalize_priority_value = (ipc_pthread_priority_value_t) + _pthread_priority_normalize_for_ipc((unsigned 
long)ipc_pthread_priority_value); *out_value = IPC_PTHREAD_PRIORITY_VALUE_TO_HANDLE(canonicalize_priority_value); *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST; diff --git a/osfmk/x86_64/copyio.c b/osfmk/x86_64/copyio.c index dd4b93670..aae293b6c 100644 --- a/osfmk/x86_64/copyio.c +++ b/osfmk/x86_64/copyio.c @@ -44,7 +44,8 @@ #include - +#undef copyin +#undef copyout static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int); static int copyio_phys(addr64_t, addr64_t, vm_size_t, int); @@ -80,6 +81,8 @@ extern int _bcopy(const void *, void *, vm_size_t); extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *); extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len); +/* On by default, optionally disabled by boot-arg */ +extern boolean_t copyio_zalloc_check; /* * Types of copies: @@ -166,6 +169,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, int debug_type = 0xeff70010; debug_type += (copy_type << 2); #endif + vm_size_t kernel_buf_size = 0; if (__improbable(nbytes > copysize_limit_panic)) panic("%s(%p, %p, %lu) - transfer too large", __func__, @@ -177,13 +181,19 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr, if (__improbable(nbytes == 0)) goto out; - pmap = thread->map->pmap; - boolean_t nopagezero = pmap->pagezero_accessible; + pmap = thread->map->pmap; + boolean_t nopagezero = thread->map->pmap->pagezero_accessible; - if (__improbable((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS) && ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS))) { - panic("Invalid copy parameter, copy type: %d, kernel address: %p", copy_type, kernel_addr); + if ((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS)) { + if (__improbable((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) + panic("Invalid copy parameter, copy type: %d, kernel address: %p", copy_type, kernel_addr); + if (__probable(copyio_zalloc_check)) { + kernel_buf_size = zone_element_size(kernel_addr, NULL); + 
if (__improbable(kernel_buf_size && kernel_buf_size < nbytes)) + panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes); + } } - + /* Sanity and security check for addresses to/from a user */ if (__improbable(((pmap != kernel_pmap) && (use_kernel_map == 0)) && @@ -371,7 +381,7 @@ copyinmsg(const user_addr_t user_addr, char *kernel_addr, mach_msg_size_t nbytes } int -copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes) +copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes) { return copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0); } diff --git a/osfmk/x86_64/cswitch.s b/osfmk/x86_64/cswitch.s index c0574bf8c..cb72f459e 100644 --- a/osfmk/x86_64/cswitch.s +++ b/osfmk/x86_64/cswitch.s @@ -118,7 +118,7 @@ Entry(Switch_context) Entry(Thread_continue) - movq %rax, %rdi /* load thread argument */ + movq %rax, %rdi /* this is the old thread from Switch_context */ xorq %rbp,%rbp /* zero frame pointer */ call *%rbx /* call real continuation */ diff --git a/osfmk/x86_64/idt64.s b/osfmk/x86_64/idt64.s index 54a43ec98..b1a1ada4c 100644 --- a/osfmk/x86_64/idt64.s +++ b/osfmk/x86_64/idt64.s @@ -84,10 +84,16 @@ .section __HIB, __desc .globl EXT(idt64_hndl_table0) EXT(idt64_hndl_table0): - .quad EXT(ks_dispatch) - .quad EXT(ks_64bit_return) - .quad 0 /* Populated with CPU shadow displacement*/ - .quad EXT(ks_return) +/* 0x00 */ .quad EXT(ks_dispatch) +/* 0x08 */ .quad EXT(ks_64bit_return) +/* 0x10 */ .quad 0 /* Populated with CPU shadow displacement*/ +/* 0x18 */ .quad EXT(ks_return) +#define TBL0_OFF_DISP_USER_WITH_POPRAX 0x20 +/* 0x20 */ .quad EXT(ks_dispatch_user_with_pop_rax) +#define TBL0_OFF_DISP_KERN_WITH_POPRAX 0x28 +/* 0x28 */ .quad EXT(ks_dispatch_kernel_with_pop_rax) +#define TBL0_OFF_PTR_KERNEL_STACK_MASK 0x30 +/* 0x30 */ .quad 0 /* &kernel_stack_mask */ EXT(idt64_hndl_table1): .quad EXT(hndl_allintrs) @@ -217,19 +223,19 @@ Entry(idt64_unix_scall) pushq %rax /* save system call 
number */ pushq $(HNDL_UNIX_SCALL) pushq $(UNIX_INT) - jmp L_dispatch + jmp L_u64bit_entry_check Entry(idt64_mach_scall) pushq %rax /* save system call number */ pushq $(HNDL_MACH_SCALL) pushq $(MACH_INT) - jmp L_dispatch + jmp L_u64bit_entry_check Entry(idt64_mdep_scall) pushq %rax /* save system call number */ pushq $(HNDL_MDEP_SCALL) pushq $(MACHDEP_INT) - jmp L_dispatch + jmp L_u64bit_entry_check /* * For GP/NP/SS faults, we use the IST1 stack. @@ -283,29 +289,95 @@ Entry(idt64_mc) * Machine checks, doublefaults and similar use IST1 */ Entry(idt64_nmi) - /* Synthesize common interrupt stack frame */ - pushq $0 - pushq $(HNDL_ALLINTRS) - pushq $(T_NMI) - /* Spill prior to RDMSR */ push %rax push %rcx push %rdx + testb $3, ISF64_CS(%rsp) + jz 1f + + /* From user-space: copy interrupt state to user PCB */ + swapgs + + leaq EXT(idt64_hndl_table0)(%rip), %rax + mov 16(%rax), %rax /* Offset of per-CPU shadow */ + mov %gs:CPU_TASK_CR3(%rax), %rax + mov %rax, %cr3 /* note that SMAP is enabled in L_common_dispatch (on Broadwell+) */ + + mov %gs:CPU_UBER_ISF, %rcx /* PCB stack addr */ + add $(ISF64_SIZE), %rcx /* adjust to base of ISF */ + + leaq TBL0_OFF_DISP_USER_WITH_POPRAX+EXT(idt64_hndl_table0)(%rip), %rax /* ks_dispatch_user_with_pop_rax */ + jmp 4f /* Copy state to PCB */ + +1: + /* + * From kernel-space: + * Determine whether the kernel or user GS is set. + * Sets the high 32 bits of the return CS to 1 to ensure that we'll swapgs back correctly at IRET. + */ mov $(MSR_IA32_GS_BASE), %ecx - rdmsr /* Check contents of GSBASE MSR */ - test $0x80000000, %edx /* MSB set? 
Already swapped to kernel's */ - jnz 44f - swapgs /* Either direct from user or within trampolines */ -44: - pop %rdx - pop %rcx + rdmsr /* read kernel gsbase */ + test $0x80000000, %edx /* test MSB of address */ + jnz 2f + swapgs /* so swap */ + movl $1, ISF64_CS+4(%rsp) /* and set flag in CS slot */ +2: leaq EXT(idt64_hndl_table0)(%rip), %rax mov 16(%rax), %rax /* Offset of per-CPU shadow */ - mov %gs:CPU_KERNEL_CR3(%rax), %rax + mov %cr3, %rdx + mov %gs:CPU_TASK_CR3(%rax), %rax mov %rax, %cr3 /* Unconditionally switch to primary kernel pagetables */ - leaq EXT(idt64_hndl_table0)(%rip), %rax - jmp *(%rax) + + /* + * Determine whether we're on the kernel or interrupt stack + * when the NMI hit. + */ + mov ISF64_RSP(%rsp), %rcx + mov %gs:CPU_KERNEL_STACK, %rax + xor %rcx, %rax + movq TBL0_OFF_PTR_KERNEL_STACK_MASK+EXT(idt64_hndl_table0)(%rip), %rdx + mov (%rdx), %rdx /* Load kernel_stack_mask */ + and %rdx, %rax + test %rax, %rax /* are we on the kernel stack? */ + jz 3f /* yes */ + + mov %gs:CPU_INT_STACK_TOP, %rax + cmp %rcx, %rax /* are we on the interrupt stack? */ + jb 5f /* no */ + leaq -INTSTACK_SIZE(%rax), %rax + cmp %rcx, %rax + jb 3f /* yes */ +5: + mov %gs:CPU_KERNEL_STACK, %rcx +3: + /* 16-byte-align kernel/interrupt stack for state push */ + and $0xFFFFFFFFFFFFFFF0, %rcx + + leaq TBL0_OFF_DISP_KERN_WITH_POPRAX+EXT(idt64_hndl_table0)(%rip), %rax /* ks_dispatch_kernel_with_pop_rax */ +4: + /* + * Copy state from NMI stack (RSP) to the save area (RCX) which is + * the PCB for user or kernel/interrupt stack from kernel. 
+ * ISF64_ERR(RSP) saved RAX + * ISF64_TRAPFN(RSP) saved RCX + * ISF64_TRAPNO(RSP) saved RDX + */ + xchg %rsp, %rcx /* set for pushes */ + push ISF64_SS(%rcx) + push ISF64_RSP(%rcx) + push ISF64_RFLAGS(%rcx) + push ISF64_CS(%rcx) + push ISF64_RIP(%rcx) + /* Synthesize common interrupt stack frame */ + push $(0) /* error code 0 */ + push $(HNDL_ALLINTRS) /* trapfn allintrs */ + push $(T_NMI) /* trapno T_NMI */ + push ISF64_ERR(%rcx) /* saved %rax is popped in ks_dispatch_{kernel|user}_with_pop_rax */ + mov ISF64_TRAPNO(%rcx), %rdx + mov ISF64_TRAPFN(%rcx), %rcx + + jmp *(%rax) /* ks_dispatch_{kernel|user}_with_pop_rax */ Entry(idt64_double_fault) pushq $(HNDL_DOUBLE_FAULT) @@ -375,7 +447,7 @@ L_sysenter_continue: pushq $(HNDL_SYSENTER) pushq $(T_SYSENTER) orl $(EFL_IF), ISF64_RFLAGS(%rsp) - jmp L_dispatch + jmp L_u64bit_entry_check /* * Common dispatch point. @@ -394,15 +466,15 @@ L_dispatch: swapgs leaq EXT(idt64_hndl_table0)(%rip), %rax mov 16(%rax), %rax - +L_dispatch_kgsb: mov %gs:CPU_TASK_CR3(%rax), %rax mov %rax, %cr3 #if DEBUG mov %rax, %gs:CPU_ENTRY_CR3 #endif 1: - /* The text/data relationship here must be preserved in the doublemap, and the contents must be remapped */ leaq EXT(idt64_hndl_table0)(%rip), %rax + /* The text/data relationship here must be preserved in the doublemap, and the contents must be remapped */ /* Indirect branch to non-doublemapped trampolines */ jmp *(%rax) /* User return: register restoration and address space switch sequence */ @@ -461,6 +533,28 @@ L_sysret: pop %r11 pop %rsp sysretq /* return from system call */ + +L_u64bit_entry_check: + /* + * Check we're not a confused 64-bit user. + */ + pushq %rax + swapgs + leaq EXT(idt64_hndl_table0)(%rip), %rax + mov 16(%rax), %rax + + cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP(%rax) + jne L_64bit_entry_reject + jmp L_dispatch_kgsb + +L_64bit_entry_reject: + /* + * Here for a 64-bit user attempting an invalid kernel entry. 
+ */ + movq $(HNDL_ALLTRAPS), 8+ISF64_TRAPFN(%rsp) + movq $(T_INVALID_OPCODE), 8+ISF64_TRAPNO(%rsp) + jmp L_dispatch_kgsb + /* End of double-mapped TEXT */ .text @@ -489,8 +583,12 @@ Entry(ks_dispatch) mov %gs:CPU_UBER_TMP, %rax jmp EXT(ks_dispatch_user) +Entry(ks_dispatch_user_with_pop_rax) + pop %rax + jmp EXT(ks_dispatch_user) + Entry (ks_return) - jmp . + jmp . Entry(ks_dispatch_user) cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP @@ -503,6 +601,10 @@ L_dispatch_U64: mov %gs:CPU_KERNEL_STACK, %rsp jmp L_dispatch_64bit +Entry(ks_dispatch_kernel_with_pop_rax) + pop %rax + jmp EXT(ks_dispatch_kernel) + Entry(ks_dispatch_kernel) subq $(ISS64_OFFSET), %rsp mov %r15, R64_R15(%rsp) @@ -517,8 +619,8 @@ L_dispatch_64bit: /* * Save segment regs - for completeness since theyre not used. */ - movl %fs, R64_FS(%r15) - movl %gs, R64_GS(%r15) + mov %fs, R64_FS(%r15) + mov %gs, R64_GS(%r15) /* Save general-purpose registers */ mov %rax, R64_RAX(%r15) @@ -557,22 +659,6 @@ L_dispatch_64bit: jmp L_common_dispatch -L_64bit_entry_reject: - /* - * Here for a 64-bit user attempting an invalid kernel entry. - */ - movq $(HNDL_ALLTRAPS), ISF64_TRAPFN(%rsp) - movq $(T_INVALID_OPCODE), ISF64_TRAPNO(%rsp) - jmp L_dispatch_U64 - -Entry(ks_32bit_entry_check) - /* - * Check we're not a confused 64-bit user. 
- */ - cmpl $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP - jne L_64bit_entry_reject - /* fall through to 32-bit handler: */ - L_dispatch_U32: /* 32-bit user task */ subq $(ISS64_OFFSET), %rsp mov %rsp, %r15 @@ -582,10 +668,10 @@ L_dispatch_U32: /* 32-bit user task */ /* * Save segment regs */ - movl %ds, R32_DS(%r15) - movl %es, R32_ES(%r15) - movl %fs, R32_FS(%r15) - movl %gs, R32_GS(%r15) + mov %ds, R32_DS(%r15) + mov %es, R32_ES(%r15) + mov %fs, R32_FS(%r15) + mov %gs, R32_GS(%r15) /* * Save general 32-bit registers @@ -1113,73 +1199,6 @@ L_32bit_fault_set_seg: jmp L_dispatch_U32_after_fault -Entry(ks_idt64_nmi_kernel) - /* From user-space: copy interrupt state to user PCB */ - swapgs - mov %gs:CPU_UBER_ISF, %rcx /* PCB stack addr */ - add $(ISF64_SIZE), %rcx /* adjust to base of ISF */ - swapgs /* swap back for L_dispatch */ - jmp 4f /* Copy state to PCB */ - -1: - /* - * From kernel-space: - * Determine whether the kernel or user GS is set. - * Set the kernel and ensure that we'll swap back correctly at IRET. - */ - mov $(MSR_IA32_GS_BASE), %ecx - rdmsr /* read kernel gsbase */ - test $0x80000000, %edx /* test MSB of address */ - jne 2f - swapgs /* so swap */ - movl $1, ISF64_CS+4(%rsp) /* and set flag in CS slot */ -2: - /* - * Determine whether we're on the kernel or interrupt stack - * when the NMI hit. - */ - mov ISF64_RSP(%rsp), %rcx - mov %gs:CPU_KERNEL_STACK, %rax - xor %rcx, %rax - and EXT(kernel_stack_mask)(%rip), %rax - test %rax, %rax /* are we on the kernel stack? */ - je 3f /* yes */ - - mov %gs:CPU_INT_STACK_TOP, %rax - dec %rax /* intr stack top is byte above max */ - xor %rcx, %rax - and EXT(kernel_stack_mask)(%rip), %rax - test %rax, %rax /* are we on the interrupt stack? 
*/ - je 3f /* yes */ - - mov %gs:CPU_KERNEL_STACK, %rcx -3: - /* 16-byte-align kernel/interrupt stack for state push */ - and $0xFFFFFFFFFFFFFFF0, %rcx - -4: - /* - * Copy state from NMI stack (RSP) to the save area (RCX) which is - * the PCB for user or kernel/interrupt stack from kernel. - * ISF64_ERR(RSP) saved RAX - * ISF64_TRAPFN(RSP) saved RCX - * ISF64_TRAPNO(RSP) saved RDX - */ - xchg %rsp, %rcx /* set for pushes */ - push ISF64_SS(%rcx) - push ISF64_RSP(%rcx) - push ISF64_RFLAGS(%rcx) - push ISF64_CS(%rcx) - push ISF64_RIP(%rcx) - push $(0) /* error code 0 */ - push $(HNDL_ALLINTRS) /* trapfn allintrs */ - push $(T_NMI) /* trapno T_NMI */ - mov ISF64_ERR(%rcx), %rax - mov ISF64_TRAPNO(%rcx), %rdx - mov ISF64_TRAPFN(%rcx), %rcx - jmp L_dispatch - - /* All 'exceptions' enter hndl_alltraps, with: * r15 x86_saved_state_t address * rsp kernel stack if user-space, otherwise interrupt or kernel stack diff --git a/osfmk/x86_64/kpc_x86.c b/osfmk/x86_64/kpc_x86.c index a34b77fb3..f24bbfa31 100644 --- a/osfmk/x86_64/kpc_x86.c +++ b/osfmk/x86_64/kpc_x86.c @@ -583,7 +583,14 @@ kpc_set_period_arch( struct kpc_config_remote *mp_config ) void kpc_arch_init(void) { - /* No-op */ + i386_cpu_info_t *info = cpuid_info(); + uint8_t version_id = info->cpuid_arch_perf_leaf.version; + /* + * kpc only supports Intel PMU versions 2 and above. 
+ */ + if (version_id < 2) { + kpc_supported = false; + } } uint32_t diff --git a/osfmk/x86_64/machine_routines_asm.s b/osfmk/x86_64/machine_routines_asm.s index af962f2f4..c0a1bdc90 100644 --- a/osfmk/x86_64/machine_routines_asm.s +++ b/osfmk/x86_64/machine_routines_asm.s @@ -175,18 +175,40 @@ ENTRY(_rtc_tsc_to_nanoseconds) shrdq $32,%rdx,%rax /* %rdx:%rax >>= 32 */ ret - + +/* + * typedef void (*thread_continue_t)(void *param, wait_result_t) + * + * void call_continuation( thread_continue_t continuation, + * void *param, + * wait_result_t wresult, + * bool enable interrupts) + */ Entry(call_continuation) - movq %rdi,%rcx /* get continuation */ - movq %rsi,%rdi /* continuation param */ - movq %rdx,%rsi /* wait result */ + + movq %rdi, %r12 /* continuation */ + movq %rsi, %r13 /* continuation param */ + movq %rdx, %r14 /* wait result */ + movq %gs:CPU_KERNEL_STACK,%rsp /* set the stack */ xorq %rbp,%rbp /* zero frame pointer */ + + test %ecx, %ecx + jz 1f + mov $1, %edi + call _ml_set_interrupts_enabled +1: + + movq %r12,%rcx /* continuation */ + movq %r13,%rdi /* continuation param */ + movq %r14,%rsi /* wait result */ + call *%rcx /* call continuation */ movq %gs:CPU_ACTIVE_THREAD,%rdi call EXT(thread_terminate) + Entry(x86_init_wrapper) xor %rbp, %rbp movq %rsi, %rsp diff --git a/osfmk/x86_64/monotonic_x86_64.c b/osfmk/x86_64/monotonic_x86_64.c index 12c5c4b76..720560148 100644 --- a/osfmk/x86_64/monotonic_x86_64.c +++ b/osfmk/x86_64/monotonic_x86_64.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include /* static_assert, assert */ #include @@ -89,7 +90,7 @@ mt_core_snap(unsigned int ctr) return __builtin_ia32_rdpmc(PMC2_RD); default: panic("monotonic: invalid core counter read: %u", ctr); - __builtin_trap(); + __builtin_unreachable(); } } @@ -112,7 +113,7 @@ mt_core_set_snap(unsigned int ctr, uint64_t count) break; default: panic("monotonic: invalid core counter write: %u", ctr); - __builtin_trap(); + __builtin_unreachable(); } } @@ -131,7 
+132,8 @@ mt_core_set_snap(unsigned int ctr, uint64_t count) * Fixed counters are enabled in all rings, so hard-code this register state to * enable in all rings and deliver PMIs. */ -#define FIXED_CTR_CTRL_INIT (0x888 | 0x333) +#define FIXED_CTR_CTRL_INIT (0x888) +#define FIXED_CTR_CTRL_ENABLE (0x333) /* * GLOBAL_CTRL controls which counters are enabled -- the high 32-bits control @@ -184,7 +186,7 @@ core_up(cpu_data_t *cpu) for (int i = 0; i < MT_CORE_NFIXED; i++) { mt_core_set_snap(i, mtc->mtc_snaps[i]); } - wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT); + wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE); wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN); } @@ -208,7 +210,6 @@ mt_pmi_x86_64(x86_saved_state_t *state) { uint64_t status; struct mt_cpu *mtc; - bool fixed_ovf = false; assert(ml_get_interrupts_enabled() == FALSE); mtc = mt_cur_cpu(); @@ -216,18 +217,28 @@ mt_pmi_x86_64(x86_saved_state_t *state) (void)atomic_fetch_add_explicit(&mt_pmis, 1, memory_order_relaxed); - for (int i = 0; i < MT_CORE_NFIXED; i++) { + for (unsigned int i = 0; i < MT_CORE_NFIXED; i++) { if (status & CTR_FIX_POS(i)) { - fixed_ovf = true; - uint64_t prior; - - prior = CTR_MAX - mtc->mtc_snaps[i]; + uint64_t prior = CTR_MAX - mtc->mtc_snaps[i]; assert(prior <= CTR_MAX); prior += 1; /* wrapped */ - mtc->mtc_counts[i] += prior; - mtc->mtc_snaps[i] = 0; - mt_mtc_update_count(mtc, i); + uint64_t delta = mt_mtc_update_count(mtc, i); + mtc->mtc_counts[i] += delta; + + if (mt_microstackshots && mt_microstackshot_ctr == i) { + x86_saved_state64_t *state64 = saved_state64(state); + bool user_mode = (state64->isf.cs & 0x3) ? 
true : false; + KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 1), + mt_microstackshot_ctr, user_mode); + mt_microstackshot_pmi_handler(user_mode, mt_microstackshot_ctx); + } else if (mt_debug) { + KDBG(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 2), + mt_microstackshot_ctr, i); + } + + mtc->mtc_snaps[i] = mt_core_reset_values[i]; + mt_core_set_snap(i, mt_core_reset_values[i]); } } @@ -239,34 +250,61 @@ mt_pmi_x86_64(x86_saved_state_t *state) return 0; } -void -mt_init(void) +static void +mt_microstackshot_start_remote(__unused void *arg) +{ + struct mt_cpu *mtc = mt_cur_cpu(); + + wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT); + + for (int i = 0; i < MT_CORE_NFIXED; i++) { + uint64_t delta = mt_mtc_update_count(mtc, i); + mtc->mtc_counts[i] += delta; + mt_core_set_snap(i, mt_core_reset_values[i]); + mtc->mtc_snaps[i] = mt_core_reset_values[i]; + } + + wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE); +} + +int +mt_microstackshot_start_arch(uint64_t period) { - uint32_t cpuinfo[4]; + if (!mt_core_supported) { + return ENOTSUP; + } - do_cpuid(0xA, cpuinfo); + mt_core_reset_values[mt_microstackshot_ctr] = CTR_MAX - period; + mp_cpus_call(CPUMASK_ALL, ASYNC, mt_microstackshot_start_remote, + NULL); + return 0; +} - if ((cpuinfo[0] & 0xff) >= 2) { +void +mt_early_init(void) +{ + i386_cpu_info_t *info = cpuid_info(); + if (info->cpuid_arch_perf_leaf.version >= 2) { lapic_set_pmi_func((i386_intr_func_t)mt_pmi_x86_64); mt_core_supported = true; } } static int -core_init(void) +core_init(__unused mt_device_t dev) { return ENOTSUP; } #pragma mark common hooks -const struct monotonic_dev monotonic_devs[] = { +struct mt_device mt_devices[] = { [0] = { - .mtd_name = "monotonic/core", + .mtd_name = "core", .mtd_init = core_init } }; static_assert( - (sizeof(monotonic_devs) / sizeof(monotonic_devs[0])) == MT_NDEVS, - "MT_NDEVS macro should be same as the length of monotonic_devs"); + (sizeof(mt_devices) / sizeof(mt_devices[0])) == MT_NDEVS, + "MT_NDEVS 
macro should be same as the length of mt_devices"); diff --git a/osfmk/x86_64/pmap.c b/osfmk/x86_64/pmap.c index 852b7618d..8be1ce0de 100644 --- a/osfmk/x86_64/pmap.c +++ b/osfmk/x86_64/pmap.c @@ -1402,6 +1402,7 @@ pmap_create_options( } #if MACH_ASSERT + p->pmap_stats_assert = TRUE; p->pmap_pid = 0; strlcpy(p->pmap_procname, "", sizeof (p->pmap_procname)); #endif /* MACH_ASSERT */ @@ -1512,6 +1513,34 @@ struct { int purgeable_nonvolatile_compressed_under; ledger_amount_t purgeable_nonvolatile_compressed_under_total; ledger_amount_t purgeable_nonvolatile_compressed_under_max; + + int network_volatile_over; + ledger_amount_t network_volatile_over_total; + ledger_amount_t network_volatile_over_max; + int network_volatile_under; + ledger_amount_t network_volatile_under_total; + ledger_amount_t network_volatile_under_max; + + int network_nonvolatile_over; + ledger_amount_t network_nonvolatile_over_total; + ledger_amount_t network_nonvolatile_over_max; + int network_nonvolatile_under; + ledger_amount_t network_nonvolatile_under_total; + ledger_amount_t network_nonvolatile_under_max; + + int network_volatile_compressed_over; + ledger_amount_t network_volatile_compressed_over_total; + ledger_amount_t network_volatile_compressed_over_max; + int network_volatile_compressed_under; + ledger_amount_t network_volatile_compressed_under_total; + ledger_amount_t network_volatile_compressed_under_max; + + int network_nonvolatile_compressed_over; + ledger_amount_t network_nonvolatile_compressed_over_total; + ledger_amount_t network_nonvolatile_compressed_over_max; + int network_nonvolatile_compressed_under; + ledger_amount_t network_nonvolatile_compressed_under_total; + ledger_amount_t network_nonvolatile_compressed_under_max; } pmap_ledgers_drift; static void pmap_check_ledgers(pmap_t pmap); #else /* MACH_ASSERT */ @@ -1633,7 +1662,15 @@ pmap_protect( /* * Set the physical protection on the * specified range of this map as requested. - * Will not increase permissions. 
+ * + * VERY IMPORTANT: Will *NOT* increase permissions. + * pmap_protect_options() should protect the range against any access types + * that are not in "prot" but it should never grant extra access. + * For example, if "prot" is READ|EXECUTE, that means "remove write + * access" but it does *not* mean "add read and execute" access. + * VM relies on getting soft-faults to enforce extra checks (code + * signing, for example), for example. + * New access permissions are granted via pmap_enter() only. */ void pmap_protect_options( @@ -1698,26 +1735,26 @@ pmap_protect_options( continue; if (is_ept) { - if (prot & VM_PROT_READ) - pmap_update_pte(spte, 0, PTE_READ(is_ept)); - else + if (! (prot & VM_PROT_READ)) { pmap_update_pte(spte, PTE_READ(is_ept), 0); + } } - if (prot & VM_PROT_WRITE) - pmap_update_pte(spte, 0, PTE_WRITE(is_ept)); - else + if (! (prot & VM_PROT_WRITE)) { pmap_update_pte(spte, PTE_WRITE(is_ept), 0); + } +#if DEVELOPMENT || DEBUG + else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && + map == kernel_pmap) { + pmap_update_pte(spte, 0, PTE_WRITE(is_ept)); + } +#endif /* DEVELOPMENT || DEBUG */ if (set_NX) { - if (!is_ept) + if (!is_ept) { pmap_update_pte(spte, 0, INTEL_PTE_NX); - else + } else { pmap_update_pte(spte, INTEL_EPT_EX, 0); - } else { - if (!is_ept) - pmap_update_pte(spte, INTEL_PTE_NX, 0); - else - pmap_update_pte(spte, 0, INTEL_EPT_EX); + } } num_found++; } @@ -2434,9 +2471,11 @@ pmap_switch(pmap_t tpmap) { spl_t s; + PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap)); s = splhigh(); /* Make sure interruptions are disabled */ set_dirbase(tpmap, current_thread(), cpu_number()); splx(s); + PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END); } @@ -2498,7 +2537,7 @@ pmap_flush( { unsigned int my_cpu; unsigned int cpu; - unsigned int cpu_bit; + cpumask_t cpu_bit; cpumask_t cpus_to_respond = 0; cpumask_t cpus_to_signal = 0; cpumask_t cpus_signaled = 0; @@ -2629,7 +2668,7 @@ void 
pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc) { unsigned int cpu; - unsigned int cpu_bit; + cpumask_t cpu_bit; cpumask_t cpus_to_signal = 0; unsigned int my_cpu = cpu_number(); pmap_paddr_t pmap_cr3 = pmap->pm_cr3; @@ -2954,6 +2993,8 @@ pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset #if MACH_ASSERT extern int pmap_ledgers_panic; +extern int pmap_ledgers_panic_leeway; + static void pmap_check_ledgers( pmap_t pmap) @@ -2985,248 +3026,57 @@ pmap_check_ledgers( pmap_ledgers_drift.num_pmaps_checked++; - ledger_get_balance(pmap->ledger, - task_ledgers.phys_footprint, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"phys_footprint\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.phys_footprint_over++; - pmap_ledgers_drift.phys_footprint_over_total += bal; - if (bal > pmap_ledgers_drift.phys_footprint_over_max) { - pmap_ledgers_drift.phys_footprint_over_max = bal; - } - } else { - pmap_ledgers_drift.phys_footprint_under++; - pmap_ledgers_drift.phys_footprint_under_total += bal; - if (bal < pmap_ledgers_drift.phys_footprint_under_max) { - pmap_ledgers_drift.phys_footprint_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.internal, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"internal\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.internal_over++; - pmap_ledgers_drift.internal_over_total += bal; - if (bal > pmap_ledgers_drift.internal_over_max) { - pmap_ledgers_drift.internal_over_max = bal; - } - } else { - pmap_ledgers_drift.internal_under++; - pmap_ledgers_drift.internal_under_total += bal; - if (bal < pmap_ledgers_drift.internal_under_max) { - pmap_ledgers_drift.internal_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.internal_compressed, - &bal); - if (bal != 0) { - do_panic 
= TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"internal_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.internal_compressed_over++; - pmap_ledgers_drift.internal_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.internal_compressed_over_max) { - pmap_ledgers_drift.internal_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.internal_compressed_under++; - pmap_ledgers_drift.internal_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.internal_compressed_under_max) { - pmap_ledgers_drift.internal_compressed_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.iokit_mapped, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"iokit_mapped\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.iokit_mapped_over++; - pmap_ledgers_drift.iokit_mapped_over_total += bal; - if (bal > pmap_ledgers_drift.iokit_mapped_over_max) { - pmap_ledgers_drift.iokit_mapped_over_max = bal; - } - } else { - pmap_ledgers_drift.iokit_mapped_under++; - pmap_ledgers_drift.iokit_mapped_under_total += bal; - if (bal < pmap_ledgers_drift.iokit_mapped_under_max) { - pmap_ledgers_drift.iokit_mapped_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.alternate_accounting, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"alternate_accounting\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.alternate_accounting_over++; - pmap_ledgers_drift.alternate_accounting_over_total += bal; - if (bal > pmap_ledgers_drift.alternate_accounting_over_max) { - pmap_ledgers_drift.alternate_accounting_over_max = bal; - } - } else { - pmap_ledgers_drift.alternate_accounting_under++; - pmap_ledgers_drift.alternate_accounting_under_total += bal; - if (bal < pmap_ledgers_drift.alternate_accounting_under_max) { - pmap_ledgers_drift.alternate_accounting_under_max = bal; - } - } 
- } - ledger_get_balance(pmap->ledger, - task_ledgers.alternate_accounting_compressed, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"alternate_accounting_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.alternate_accounting_compressed_over++; - pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.alternate_accounting_compressed_over_max) { - pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.alternate_accounting_compressed_under++; - pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) { - pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.page_table, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"page_table\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.page_table_over++; - pmap_ledgers_drift.page_table_over_total += bal; - if (bal > pmap_ledgers_drift.page_table_over_max) { - pmap_ledgers_drift.page_table_over_max = bal; - } - } else { - pmap_ledgers_drift.page_table_under++; - pmap_ledgers_drift.page_table_under_total += bal; - if (bal < pmap_ledgers_drift.page_table_under_max) { - pmap_ledgers_drift.page_table_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_volatile, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"purgeable_volatile\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_volatile_over++; - pmap_ledgers_drift.purgeable_volatile_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) { - pmap_ledgers_drift.purgeable_volatile_over_max = bal; - } - } else { - 
pmap_ledgers_drift.purgeable_volatile_under++; - pmap_ledgers_drift.purgeable_volatile_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) { - pmap_ledgers_drift.purgeable_volatile_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_nonvolatile, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"purgeable_nonvolatile\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_nonvolatile_over++; - pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) { - pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal; - } - } else { - pmap_ledgers_drift.purgeable_nonvolatile_under++; - pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) { - pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_volatile_compressed, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " - "\"purgeable_volatile_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_volatile_compressed_over++; - pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) { - pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.purgeable_volatile_compressed_under++; - pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) { - pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal; - } - } - } - ledger_get_balance(pmap->ledger, - task_ledgers.purgeable_nonvolatile_compressed, - &bal); - if (bal != 0) { - do_panic = TRUE; - printf("LEDGER BALANCE proc %d (%s) " -
"\"purgeable_nonvolatile_compressed\" = %lld\n", - pid, procname, bal); - if (bal > 0) { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++; - pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal; - if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal; - } - } else { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++; - pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal; - if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) { - pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal; - } - } - } +#define LEDGER_CHECK_BALANCE(__LEDGER) \ +MACRO_BEGIN \ + int panic_on_negative = TRUE; \ + ledger_get_balance(pmap->ledger, \ + task_ledgers.__LEDGER, \ + &bal); \ + ledger_get_panic_on_negative(pmap->ledger, \ + task_ledgers.__LEDGER, \ + &panic_on_negative); \ + if (bal != 0) { \ + if (panic_on_negative || \ + (pmap_ledgers_panic && \ + pmap_ledgers_panic_leeway > 0 && \ + (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \ + bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \ + do_panic = TRUE; \ + } \ + printf("LEDGER BALANCE proc %d (%s) " \ + "\"%s\" = %lld\n", \ + pid, procname, #__LEDGER, bal); \ + if (bal > 0) { \ + pmap_ledgers_drift.__LEDGER##_over++; \ + pmap_ledgers_drift.__LEDGER##_over_total += bal; \ + if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \ + pmap_ledgers_drift.__LEDGER##_over_max = bal; \ + } \ + } else if (bal < 0) { \ + pmap_ledgers_drift.__LEDGER##_under++; \ + pmap_ledgers_drift.__LEDGER##_under_total += bal; \ + if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \ + pmap_ledgers_drift.__LEDGER##_under_max = bal; \ + } \ + } \ + } \ +MACRO_END + + LEDGER_CHECK_BALANCE(phys_footprint); + LEDGER_CHECK_BALANCE(internal); + LEDGER_CHECK_BALANCE(internal_compressed); + LEDGER_CHECK_BALANCE(iokit_mapped); + LEDGER_CHECK_BALANCE(alternate_accounting); +
LEDGER_CHECK_BALANCE(alternate_accounting_compressed); + LEDGER_CHECK_BALANCE(page_table); + LEDGER_CHECK_BALANCE(purgeable_volatile); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile); + LEDGER_CHECK_BALANCE(purgeable_volatile_compressed); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed); + LEDGER_CHECK_BALANCE(network_volatile); + LEDGER_CHECK_BALANCE(network_nonvolatile); + LEDGER_CHECK_BALANCE(network_volatile_compressed); + LEDGER_CHECK_BALANCE(network_nonvolatile_compressed); if (do_panic) { if (pmap_ledgers_panic) { @@ -3254,7 +3104,8 @@ pmap_check_ledgers( pmap->stats.external != 0 || pmap->stats.reusable != 0 || pmap->stats.compressed != 0) { - if (pmap_stats_assert) { + if (pmap_stats_assert && + pmap->pmap_stats_assert) { panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld", pmap, pid, procname, pmap->stats.resident_count, @@ -3289,6 +3140,32 @@ pmap_set_process( pmap->pmap_pid = pid; strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname)); + if (pmap_ledgers_panic_leeway) { + /* + * XXX FBDP + * Some processes somehow trigger some issues that make + * the pmap stats and ledgers go off track, causing + * some assertion failures and ledger panics. + * Turn off the sanity checks if we allow some ledger leeway + * because of that. We'll still do a final check in + * pmap_check_ledgers() for discrepancies larger than the + * allowed leeway after the address space has been fully + * cleaned up. 
+ */ + pmap->pmap_stats_assert = FALSE; + ledger_disable_panic_on_negative(pmap->ledger, + task_ledgers.phys_footprint); + ledger_disable_panic_on_negative(pmap->ledger, + task_ledgers.internal); + ledger_disable_panic_on_negative(pmap->ledger, + task_ledgers.internal_compressed); + ledger_disable_panic_on_negative(pmap->ledger, + task_ledgers.iokit_mapped); + ledger_disable_panic_on_negative(pmap->ledger, + task_ledgers.alternate_accounting); + ledger_disable_panic_on_negative(pmap->ledger, + task_ledgers.alternate_accounting_compressed); + } } #endif /* MACH_ASSERT */ @@ -3326,3 +3203,4 @@ void pmap_verify_noncacheable(uintptr_t vaddr) { return; panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep); } + diff --git a/osfmk/x86_64/pmap_pcid.c b/osfmk/x86_64/pmap_pcid.c index 3cf4a0e49..506d684ce 100644 --- a/osfmk/x86_64/pmap_pcid.c +++ b/osfmk/x86_64/pmap_pcid.c @@ -33,7 +33,6 @@ #include #include #include -#include /* * PCID (Process context identifier) aka tagged TLB support. 
diff --git a/pexpert/arm/pe_identify_machine.c b/pexpert/arm/pe_identify_machine.c index 63eb8929f..4e734fbe6 100644 --- a/pexpert/arm/pe_identify_machine.c +++ b/pexpert/arm/pe_identify_machine.c @@ -97,6 +97,11 @@ pe_identify_machine(boot_args * bootArgs) pclk = hclk / 2; tclk = 100000; /* timer is at 100khz */ + } else if (!strcmp(gPESoCDeviceType, "bcm2837-io")) { + mclk = 1200000000; + hclk = mclk / 4; + pclk = hclk / 2; + tclk = 1000000; } else use_dt = 1; @@ -297,10 +302,18 @@ static struct tbd_ops t8010_funcs = {NULL, NULL, NULL}; static struct tbd_ops t8011_funcs = {NULL, NULL, NULL}; #endif /* defined(ARM_BOARD_CLASS_T8011) */ +#if defined(ARM_BOARD_CLASS_T8015) +static struct tbd_ops t8015_funcs = {NULL, NULL, NULL}; +#endif /* defined(ARM_BOARD_CLASS_T8015) */ + + +#if defined(ARM_BOARD_CLASS_BCM2837) +static struct tbd_ops bcm2837_funcs = {NULL, NULL, NULL}; +#endif /* defined(ARM_BOARD_CLASS_BCM2837) */ vm_offset_t gPicBase; vm_offset_t gTimerBase; @@ -320,7 +333,7 @@ typedef enum static panic_trace_t bootarg_panic_trace; // The command buffer contains the converted commands from the device tree for commanding cpu_halt, enable_trace, etc. 
-#define DEBUG_COMMAND_BUFFER_SIZE 100 +#define DEBUG_COMMAND_BUFFER_SIZE 256 typedef struct command_buffer_element{ uintptr_t address; uint16_t destination_cpu_selector; @@ -659,6 +672,16 @@ pe_arm_init_timer(void *args) if (!strcmp(gPESoCDeviceType, "t8011-io")) { tbd_funcs = &t8011_funcs; } else +#endif +#if defined(ARM_BOARD_CLASS_T8015) + if (!strcmp(gPESoCDeviceType, "t8015-io")) { + tbd_funcs = &t8015_funcs; + } else +#endif +#if defined(ARM_BOARD_CLASS_BCM2837) + if (!strcmp(gPESoCDeviceType, "bcm2837-io")) { + tbd_funcs = &bcm2837_funcs; + } else #endif return 0; diff --git a/pexpert/arm/pe_init.c b/pexpert/arm/pe_init.c index 70d5e54f8..6e65a0ab9 100644 --- a/pexpert/arm/pe_init.c +++ b/pexpert/arm/pe_init.c @@ -17,6 +17,13 @@ #include #include +#if defined __arm__ +#include +#elif defined __arm64__ +#include +#endif + + /* extern references */ extern void pe_identify_machine(boot_args *bootArgs); @@ -24,7 +31,7 @@ extern void pe_identify_machine(boot_args *bootArgs); static void pe_prepare_images(void); /* private globals */ -PE_state_t PE_state; +SECURITY_READ_ONLY_LATE(PE_state_t) PE_state; #define FW_VERS_LEN 128 char firmware_version[FW_VERS_LEN]; @@ -60,9 +67,16 @@ static boolean_t panic_console_available = FALSE; extern uint32_t crc32(uint32_t crc, const void *buf, size_t size); +void PE_slide_devicetree(vm_offset_t); + static void check_for_panic_log(void) { +#ifdef PLATFORM_PANIC_LOG_PADDR + gPanicBase = ml_io_map_wcomb(PLATFORM_PANIC_LOG_PADDR, PLATFORM_PANIC_LOG_SIZE); + panic_text_len = PLATFORM_PANIC_LOG_SIZE - sizeof(struct embedded_panic_header); + gPanicSize = PLATFORM_PANIC_LOG_SIZE; +#else DTEntry entry, chosen; unsigned int size; uintptr_t *reg_prop; @@ -93,6 +107,7 @@ check_for_panic_log(void) /* Deduct the size of the panic header from the panic region size */ panic_text_len = panic_region_length[0] - sizeof(struct embedded_panic_header); gPanicSize = panic_region_length[0]; +#endif panic_info = (struct embedded_panic_header 
*)gPanicBase; /* Check if a shared memory console is running in the panic buffer */ @@ -279,6 +294,14 @@ PE_init_iokit(void) StartIOKit(PE_state.deviceTreeHead, PE_state.bootArgs, (void *) 0, (void *) 0); } +void +PE_slide_devicetree(vm_offset_t slide) +{ + assert(PE_state.initialized); + PE_state.deviceTreeHead += slide; + DTInit(PE_state.deviceTreeHead); +} + void PE_init_platform(boolean_t vm_initialized, void *args) { @@ -471,6 +494,16 @@ PE_i_can_has_debugger(uint32_t *debug_flags) return (debug_enabled); } +/* + * This routine returns TRUE if the device is configured + * with panic debugging enabled. + */ +boolean_t +PE_panic_debugging_enabled() +{ + return panicDebugging; +} + void PE_save_buffer_to_vram(unsigned char *buf, unsigned int *size) { diff --git a/pexpert/arm/pe_serial.c b/pexpert/arm/pe_serial.c index 6b9000117..18c746773 100644 --- a/pexpert/arm/pe_serial.c +++ b/pexpert/arm/pe_serial.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +43,6 @@ static int uart_initted = 0; /* 1 if init'ed */ static vm_offset_t uart_base; - /*****************************************************************************/ #ifdef S3CUART @@ -645,8 +645,7 @@ static void dockchannel_uart_init(void) rDOCKCHANNELS_DEV_DRAIN_CFG(DOCKCHANNEL_UART_CHANNEL) = max_dockchannel_drain_period; // Drain timer doesn't get loaded with value from drain period register if fifo - // is already full. Drop a character from the fifo. See chapter 8 of the Cayman - // DockChannels specification for more details. + // is already full. Drop a character from the fifo. 
rDOCKCHANNELS_DOCK_RDATA1(DOCKCHANNEL_UART_CHANNEL); } @@ -662,8 +661,91 @@ static struct pe_serial_functions dockchannel_uart_serial_functions = #endif /* DOCKCHANNEL_UART */ -/*****************************************************************************/ +/****************************************************************************/ +#ifdef PI3_UART +vm_offset_t pi3_gpio_base_vaddr; +vm_offset_t pi3_aux_base_vaddr; +static int pi3_uart_tr0(void) +{ + return (int) BCM2837_GET32(BCM2837_AUX_MU_LSR_REG_V) & 0x20; +} + +static void pi3_uart_td0(int c) +{ + BCM2837_PUT32(BCM2837_AUX_MU_IO_REG_V, (uint32_t) c); +} + +static int pi3_uart_rr0(void) +{ + return (int) BCM2837_GET32(BCM2837_AUX_MU_LSR_REG_V) & 0x01; +} + +static int pi3_uart_rd0(void) +{ + return (int) BCM2837_GET32(BCM2837_AUX_MU_IO_REG_V) & 0xff; +} + +static void pi3_uart_init(void) +{ + // Scratch variable + uint32_t i; + + // Reset mini uart registers + BCM2837_PUT32(BCM2837_AUX_ENABLES_V, 1); + BCM2837_PUT32(BCM2837_AUX_MU_CNTL_REG_V, 0); + BCM2837_PUT32(BCM2837_AUX_MU_LCR_REG_V, 3); + BCM2837_PUT32(BCM2837_AUX_MU_MCR_REG_V, 0); + BCM2837_PUT32(BCM2837_AUX_MU_IER_REG_V, 0); + BCM2837_PUT32(BCM2837_AUX_MU_IIR_REG_V, 0xC6); + BCM2837_PUT32(BCM2837_AUX_MU_BAUD_REG_V, 270); + + i = BCM2837_GET32(BCM2837_FSEL_REG(14)); /* read the current function-select bits, not the register address */ + // Configure GPIOs 14 & 15 for alternate function 5 + i &= ~(BCM2837_FSEL_MASK(14)); + i |= (BCM2837_FSEL_ALT5 << BCM2837_FSEL_OFFS(14)); + i &= ~(BCM2837_FSEL_MASK(15)); + i |= (BCM2837_FSEL_ALT5 << BCM2837_FSEL_OFFS(15)); + + BCM2837_PUT32(BCM2837_FSEL_REG(14), i); + + BCM2837_PUT32(BCM2837_GPPUD_V, 0); + + // Barrier before AP spinning for 150 cycles + __builtin_arm_isb(ISB_SY); + + for(i = 0; i < 150; i++) { + asm volatile("add x0, x0, xzr"); + } + + __builtin_arm_isb(ISB_SY); + BCM2837_PUT32(BCM2837_GPPUDCLK0_V,(1 << 14) | (1 << 15)); + + __builtin_arm_isb(ISB_SY); + + for(i = 0; i < 150; i++) { + asm volatile("add x0, x0, 
xzr"); + } + + __builtin_arm_isb(ISB_SY); + + 
BCM2837_PUT32(BCM2837_GPPUDCLK0_V, 0); + + BCM2837_PUT32(BCM2837_AUX_MU_CNTL_REG_V, 3); +} + +static struct pe_serial_functions pi3_uart_serial_functions = +{ + .uart_init = pi3_uart_init, + .uart_set_baud_rate = NULL, + .tr0 = pi3_uart_tr0, + .td0 = pi3_uart_td0, + .rr0 = pi3_uart_rr0, + .rd0 = pi3_uart_rd0 +}; + +#endif /* PI3_UART */ +/*****************************************************************************/ int serial_init(void) { @@ -682,12 +764,16 @@ serial_init(void) #ifdef DOCKCHANNEL_UART uint32_t no_dockchannel_uart; #endif +#ifdef PI3_UART + uint32_t is_pi3; +#endif - if (uart_initted) { + if (uart_initted && gPESF) { gPESF->uart_init(); kprintf("reinit serial\n"); return 1; } + dccmode = 0; if (PE_parse_boot_argn("dcc", &dccmode, sizeof (dccmode))) { gPESF = &dcc_serial_functions; @@ -704,6 +790,19 @@ serial_init(void) } #endif /* SHMCON */ +#ifdef PI3_UART +#pragma unused(prop_value) + is_pi3 = 0; + if (PE_parse_boot_argn("-pi3", &is_pi3, sizeof(is_pi3))) { // FIXME: remove the not operator after boot args are set up. + pi3_gpio_base_vaddr = ml_io_map((vm_offset_t)BCM2837_GPIO_BASE, BCM2837_GPIO_SIZE); + pi3_aux_base_vaddr = ml_io_map((vm_offset_t)BCM2837_AUX_BASE, BCM2837_AUX_SIZE); + gPESF = &pi3_uart_serial_functions; + gPESF->uart_init(); + uart_initted = 1; + return 1; + } +#endif /* PI3_UART */ + soc_base = pe_arm_get_soc_base_phys(); if (soc_base == 0) diff --git a/pexpert/gen/pe_gen.c b/pexpert/gen/pe_gen.c index 5e130b161..5515f47b2 100644 --- a/pexpert/gen/pe_gen.c +++ b/pexpert/gen/pe_gen.c @@ -48,6 +48,8 @@ static uint32_t gPEKernelConfigurationBitmask; int32_t gPESerialBaud = -1; +int debug_cpu_performance_degradation_factor = 1; + void pe_init_debug(void) { boolean_t boot_arg_value; @@ -86,6 +88,21 @@ void pe_init_debug(void) #endif gPEKernelConfigurationBitmask |= (boot_arg_value ? 
kPEICanHasDiagnosticAPI : 0); + + int factor = 1; + boolean_t have_bootarg = PE_parse_boot_argn("cpu-factor", &factor, sizeof (factor)); + if (have_bootarg) { + debug_cpu_performance_degradation_factor = factor; + } else { + DTEntry root; + if (DTLookupEntry(NULL, "/", &root) == kSuccess) { + void *prop = NULL; + uint32_t size = 0; + if (DTGetProperty(root, "target-is-fpga", &prop, &size) == kSuccess) { + debug_cpu_performance_degradation_factor = 10; + } + } + } } void PE_enter_debugger(const char *cause) diff --git a/pexpert/i386/pe_serial.c b/pexpert/i386/pe_serial.c index 94818a614..e35457dde 100644 --- a/pexpert/i386/pe_serial.c +++ b/pexpert/i386/pe_serial.c @@ -54,6 +54,8 @@ static boolean_t lpss_uart_supported = 0; /* 1 if LPSS UART is supported on plat static unsigned int lpss_uart_enabled = 0; /* 1 if it is LPSS UART is in D0 state */ static void lpss_uart_re_init (void); +static boolean_t pcie_uart_enabled = 0; /* 1 if PCIe UART is supported on platform */ + #define DEFAULT_UART_BAUD_RATE 115200 static unsigned uart_baud_rate = DEFAULT_UART_BAUD_RATE; @@ -433,6 +435,131 @@ static struct pe_serial_functions mmio_uart_serial_functions = { .rd0 = mmio_uart_rd0 }; +// ============================================================================= +// PCIE_MMIO UART +// ============================================================================= + +#define PCIE_MMIO_UART_BASE 0xFE410000 + +#define PCIE_MMIO_WRITE(r, v) ml_phys_write_byte(pcie_mmio_uart_base + PCIE_MMIO_UART_##r, v) +#define PCIE_MMIO_READ(r) ml_phys_read_byte(pcie_mmio_uart_base + PCIE_MMIO_UART_##r) + +enum { + PCIE_MMIO_UART_RBR = 0x0, /* receive buffer Register (R) */ + PCIE_MMIO_UART_THR = 0x0, /* transmit holding register (W) */ + PCIE_MMIO_UART_IER = 0x1, /* interrupt enable register */ + PCIE_MMIO_UART_FCR = 0x2, /* fifo control register (W) */ + PCIE_MMIO_UART_LCR = 0x4, /* line control register */ + PCIE_MMIO_UART_MCR = 0x4, /* modem control register */ + PCIE_MMIO_UART_LSR = 0x5, /* 
line status register */ + PCIE_MMIO_UART_DLL = 0x8, /* DLAB = 1, divisor latch (LSB) */ + PCIE_MMIO_UART_DLM = 0x9, /* DLAB = 1, divisor latch (MSB) */ + PCIE_MMIO_UART_SCR = 0x30, /* scratch register */ +}; + +static vm_offset_t pcie_mmio_uart_base = 0; + +static int +pcie_mmio_uart_present( void ) +{ + + PCIE_MMIO_WRITE( SCR, 0x5a ); + if (PCIE_MMIO_READ(SCR) != 0x5a) return 0; + PCIE_MMIO_WRITE( SCR, 0xa5 ); + if (PCIE_MMIO_READ(SCR) != 0xa5) return 0; + + return 1; +} + +static int +pcie_mmio_uart_probe( void ) +{ + unsigned new_pcie_mmio_uart_base = 0; + + // if specified, pcie_mmio_uart overrides all probing + if (PE_parse_boot_argn("pcie_mmio_uart", &new_pcie_mmio_uart_base, sizeof (new_pcie_mmio_uart_base))) + { + // pcie_mmio_uart=0 will disable pcie_mmio_uart support + if (new_pcie_mmio_uart_base == 0) { + return 0; + } + pcie_mmio_uart_base = new_pcie_mmio_uart_base; + return 1; + } + + pcie_mmio_uart_base = PCIE_MMIO_UART_BASE; + if (pcie_mmio_uart_present()) { + return 1; + } + + // no pcie_mmio uart found + return 0; +} + +static void +pcie_mmio_uart_set_baud_rate( __unused int unit, __unused uint32_t baud_rate ) +{ + const unsigned char lcr = PCIE_MMIO_READ( LCR ); + unsigned long div; + + if (baud_rate == 0) baud_rate = 9600; + div = LEGACY_UART_CLOCK / 16 / baud_rate; + + PCIE_MMIO_WRITE( LCR, lcr | UART_LCR_DLAB ); + PCIE_MMIO_WRITE( DLM, (unsigned char)(div >> 8) ); + PCIE_MMIO_WRITE( DLL, (unsigned char) div ); + PCIE_MMIO_WRITE( LCR, lcr & ~UART_LCR_DLAB); +} + +static int +pcie_mmio_uart_tr0( void ) +{ + return (PCIE_MMIO_READ(LSR) & UART_LSR_THRE); +} + +static void +pcie_mmio_uart_td0( int c ) +{ + PCIE_MMIO_WRITE( THR, c ); +} + +static void +pcie_mmio_uart_init( void ) +{ + uart_initted = 1; +} + +static int +pcie_mmio_uart_rr0( void ) +{ + unsigned char lsr; + + lsr = PCIE_MMIO_READ( LSR ); + + if ( lsr & (UART_LSR_FE | UART_LSR_PE | UART_LSR_OE) ) + { + PCIE_MMIO_READ( RBR ); /* discard */ + return 0; + } + + return (lsr & UART_LSR_DR); 
+} + +static int +pcie_mmio_uart_rd0( void ) +{ + return PCIE_MMIO_READ( RBR ); +} + +static struct pe_serial_functions pcie_mmio_uart_serial_functions = { + .uart_init = pcie_mmio_uart_init, + .uart_set_baud_rate = pcie_mmio_uart_set_baud_rate, + .tr0 = pcie_mmio_uart_tr0, + .td0 = pcie_mmio_uart_td0, + .rr0 = pcie_mmio_uart_rr0, + .rd0 = pcie_mmio_uart_rd0 +}; + // ============================================================================= // Generic serial support below // ============================================================================= @@ -465,6 +592,13 @@ serial_init( void ) legacy_uart_enabled = 1; return 1; } + else if ( pcie_mmio_uart_probe() ) + { + gPESF = &pcie_mmio_uart_serial_functions; + gPESF->uart_init(); + pcie_uart_enabled = 1; + return 1; + } else { return 0; @@ -475,7 +609,7 @@ serial_init( void ) static void uart_putc(char c) { - if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled)) { + if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled || pcie_uart_enabled)) { while (!gPESF->tr0()); /* Wait until THR is empty. */ gPESF->td0(c); } @@ -484,7 +618,7 @@ uart_putc(char c) static int uart_getc(void) { - if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled)) { + if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled || pcie_uart_enabled)) { if (!gPESF->rr0()) return -1; return gPESF->rd0(); diff --git a/pexpert/pexpert/arm64/AMCC.h b/pexpert/pexpert/arm64/AMCC.h index 2e7f3d8be..4dccef945 100644 --- a/pexpert/pexpert/arm64/AMCC.h +++ b/pexpert/pexpert/arm64/AMCC.h @@ -5,17 +5,23 @@ #ifndef _PEXPERT_ARM_AMCC_H #define _PEXPERT_ARM_AMCC_H +#include + /* * AMCC registers for KTRR/RoRegion related lockdown in early kernel bootstrap. * amcc_base must be retrieved from device tree before using. 
*/ -//#if defined(KERNEL_INTEGRITY_KTRR) +#if defined(KERNEL_INTEGRITY_KTRR) +#define AMCC_PGSHIFT 14 +#define AMCC_PGSIZE (1 << AMCC_PGSHIFT) +#define AMCC_PGMASK (AMCC_PGSIZE - 1) + #define rMCCGEN (*(volatile uint32_t *) (amcc_base + 0x780)) #define rRORGNBASEADDR (*(volatile uint32_t *) (amcc_base + 0x7e4)) #define rRORGNENDADDR (*(volatile uint32_t *) (amcc_base + 0x7e8)) #define rRORGNLOCK (*(volatile uint32_t *) (amcc_base + 0x7ec)) -//#endif +#endif #endif /* _PEXPERT_ARM_AMCC_H */ diff --git a/pexpert/pexpert/arm64/BCM2837.h b/pexpert/pexpert/arm64/BCM2837.h new file mode 100644 index 000000000..5de092054 --- /dev/null +++ b/pexpert/pexpert/arm64/BCM2837.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016 Apple Inc. All rights reserved. + */ + +#ifndef _PEXPERT_ARM_BCM2837_H +#define _PEXPERT_ARM_BCM2837_H + +#ifdef BCM2837 +#include "arm64_common.h" +#endif + +#define NO_MONITOR 1 +#define NO_ECORE 1 + +#ifndef ASSEMBLER + +#define PI3_UART + +#define PI3_BREAK asm volatile("brk #0"); + +#define BCM2837_GPIO_BASE 0x3F200000 +#define BCM2837_GPIO_SIZE 0xA0 +#define BCM2837_GPFSEL0 0x3F200000 +#define BCM2837_GPSET0 0x3F20001C +#define BCM2837_GPCLR0 0x3F200028 +#define BCM2837_GPPUD 0x3F200094 +#define BCM2837_GPPUDCLK0 0x3F200098 + +#define BCM2837_AUX_BASE 0x3F215000 +#define BCM2837_AUX_SIZE 0x70 +#define BCM2837_AUX_ENABLES 0x3F215004 +#define BCM2837_AUX_MU_IO_REG 0x3F215040 +#define BCM2837_AUX_MU_IER_REG 0x3F215044 +#define BCM2837_AUX_MU_IIR_REG 0x3F215048 +#define BCM2837_AUX_MU_LCR_REG 0x3F21504C +#define BCM2837_AUX_MU_MCR_REG 0x3F215050 +#define BCM2837_AUX_MU_LSR_REG 0x3F215054 +#define BCM2837_AUX_MU_MSR_REG 0x3F215058 +#define BCM2837_AUX_MU_SCRATCH 0x3F21505C +#define BCM2837_AUX_MU_CNTL_REG 0x3F215060 +#define BCM2837_AUX_MU_STAT_REG 0x3F215064 +#define BCM2837_AUX_MU_BAUD_REG 0x3F215068 + +#define BCM2837_GPFSEL0_V (pi3_gpio_base_vaddr + 0x0) +#define BCM2837_GPSET0_V (pi3_gpio_base_vaddr + 0x1C) +#define BCM2837_GPCLR0_V (pi3_gpio_base_vaddr + 
0x28) +#define BCM2837_GPPUD_V (pi3_gpio_base_vaddr + 0x94) +#define BCM2837_GPPUDCLK0_V (pi3_gpio_base_vaddr + 0x98) + +#define BCM2837_FSEL_INPUT 0x0 +#define BCM2837_FSEL_OUTPUT 0x1 +#define BCM2837_FSEL_ALT0 0x4 +#define BCM2837_FSEL_ALT1 0x5 +#define BCM2837_FSEL_ALT2 0x6 +#define BCM2837_FSEL_ALT3 0x7 +#define BCM2837_FSEL_ALT4 0x3 +#define BCM2837_FSEL_ALT5 0x2 + +#define BCM2837_FSEL_NFUNCS 54 +#define BCM2837_FSEL_REG(func) (BCM2837_GPFSEL0_V + (4 * ((func) / 10))) +#define BCM2837_FSEL_OFFS(func) (((func) % 10) * 3) +#define BCM2837_FSEL_MASK(func) (0x7 << BCM2837_FSEL_OFFS(func)) + +#define BCM2837_AUX_ENABLES_V (pi3_aux_base_vaddr + 0x4) +#define BCM2837_AUX_MU_IO_REG_V (pi3_aux_base_vaddr + 0x40) +#define BCM2837_AUX_MU_IER_REG_V (pi3_aux_base_vaddr + 0x44) +#define BCM2837_AUX_MU_IIR_REG_V (pi3_aux_base_vaddr + 0x48) +#define BCM2837_AUX_MU_LCR_REG_V (pi3_aux_base_vaddr + 0x4C) +#define BCM2837_AUX_MU_MCR_REG_V (pi3_aux_base_vaddr + 0x50) +#define BCM2837_AUX_MU_LSR_REG_V (pi3_aux_base_vaddr + 0x54) +#define BCM2837_AUX_MU_MSR_REG_V (pi3_aux_base_vaddr + 0x58) +#define BCM2837_AUX_MU_SCRATCH_V (pi3_aux_base_vaddr + 0x5C) +#define BCM2837_AUX_MU_CNTL_REG_V (pi3_aux_base_vaddr + 0x60) +#define BCM2837_AUX_MU_STAT_REG_V (pi3_aux_base_vaddr + 0x64) +#define BCM2837_AUX_MU_BAUD_REG_V (pi3_aux_base_vaddr + 0x68) +#define BCM2837_PUT32(addr, value) do { *((volatile uint32_t *) addr) = value; } while(0) +#define BCM2837_GET32(addr) *((volatile uint32_t *) addr) + +#define PLATFORM_PANIC_LOG_PADDR 0x3c0fc000 +#define PLATFORM_PANIC_LOG_SIZE 16384 // 16kb +#endif /* ! ASSEMBLER */ + +#endif /* ! 
_PEXPERT_ARM_BCM2837_H */ diff --git a/pexpert/pexpert/arm64/Makefile b/pexpert/pexpert/arm64/Makefile index 6bdb8fc40..49f2b889e 100644 --- a/pexpert/pexpert/arm64/Makefile +++ b/pexpert/pexpert/arm64/Makefile @@ -8,6 +8,7 @@ include $(MakeInc_def) DATAFILES = \ AIC.h \ + AMCC.h \ arm64_common.h \ board_config.h \ boot.h \ @@ -19,7 +20,8 @@ DATAFILES = \ cyclone.h \ typhoon.h \ twister.h \ - hurricane.h + hurricane.h \ + BCM2837.h INSTALL_MD_LIST = ${DATAFILES} diff --git a/pexpert/pexpert/arm64/arm64_common.h b/pexpert/pexpert/arm64/arm64_common.h index 7b24690b9..ac3c6d320 100644 --- a/pexpert/pexpert/arm64/arm64_common.h +++ b/pexpert/pexpert/arm64/arm64_common.h @@ -19,19 +19,23 @@ #define ARM64_REG_HID1 S3_0_c15_c1_0 #define ARM64_REG_HID1_disCmpBrFusion (1<<14) +#define ARM64_REG_HID1_rccForceAllIexL3ClksOn (1<<23) #define ARM64_REG_HID1_rccDisStallInactiveIexCtl (1<<24) #define ARM64_REG_HID1_disLspFlushWithContextSwitch (1<<25) #define ARM64_REG_HID1_disAESFuseAcrossGrp (1<<44) +#define ARM64_REG_HID1_enaBrKillLimit (1ULL << 60) #define ARM64_REG_HID2 S3_0_c15_c2_0 #define ARM64_REG_HID2_disMMUmtlbPrefetch (1<<13) #define ARM64_REG_HID3 S3_0_c15_c3_0 -#define ARM64_REG_HID3_DisDcZvaCmdOnly (1<<25) +#define ARM64_REG_HID3_DisDcZvaCmdOnly (1<<25) #define ARM64_REG_HID3_DisXmonSnpEvictTriggerL2StarvationMode (1<<54) +#define ARM64_REG_HID3_DisColorOpt (1<<2) #define ARM64_REG_EHID3 S3_0_c15_c3_1 -#define ARM64_REG_EHID3_DisDcZvaCmdOnly (1<<25) +#define ARM64_REG_EHID3_DisColorOpt (1<<2) +#define ARM64_REG_EHID3_DisDcZvaCmdOnly (1<<25) #define ARM64_REG_HID4 S3_0_c15_c4_0 #define ARM64_REG_EHID4 S3_0_c15_c4_1 @@ -45,6 +49,7 @@ #define ARM64_REG_HID5_DisHwpLd (1<<44) #define ARM64_REG_HID5_DisHwpSt (1<<45) #define ARM64_REG_HID5_DisFullLineWr (1ULL << 57) +#define ARM64_REG_HID5_EnableDnFIFORdStall (1ULL << 54) #define ARM64_REG_HID5_CrdEdbSnpRsvd_mask (3ULL << 14) #define ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE (2ULL << 14) @@ -70,11 +75,15 @@ #define 
ARM64_REG_HID10 S3_0_c15_c10_0 #define ARM64_REG_HID10_DisHwpGups (1ULL << 0) +#define ARM64_REG_EHID10 S3_0_c15_c10_1 +#define ARM64_REG_EHID10_rccDisPwrSavePrfClkOff (1ULL << 19) + #if defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER) #define ARM64_REG_HID11 S3_0_c15_c13_0 #else #define ARM64_REG_HID11 S3_0_c15_c11_0 #endif +#define ARM64_REG_HID11_DisX64NTLnchOpt (1ULL << 1) #define ARM64_REG_HID11_DisFillC1BubOpt (1<<7) #define ARM64_REG_HID11_DisFastDrainOpt (1ULL << 23) @@ -87,9 +96,11 @@ #define ARM64_REG_CYC_CFG_deepSleep (1ULL<<24) #else #define ARM64_REG_ACC_OVRD S3_5_c15_c6_0 +#if defined(APPLEMONSOON) +#define ARM64_REG_ACC_EBLK_OVRD S3_5_c15_c6_1 // EBLK_OVRD on Zephyr +#endif #define ARM64_REG_ACC_OVRD_enDeepSleep (1ULL << 34) - - +#define ARM64_REG_ACC_OVRD_disPioOnWfiCpu (1ULL << 32) #define ARM64_REG_ACC_OVRD_dsblClkDtr (1ULL << 29) #define ARM64_REG_ACC_OVRD_cpmWakeUp_mask (3ULL << 27) #define ARM64_REG_ACC_OVRD_cpmWakeUp_force (3ULL << 27) @@ -107,7 +118,12 @@ #define ARM64_REG_CYC_OVRD S3_5_c15_c5_0 #define ARM64_REG_CYC_OVRD_ok2pwrdn_force_up (2<<24) #define ARM64_REG_CYC_OVRD_ok2pwrdn_force_down (3<<24) +#define ARM64_REG_CYC_OVRD_disWfiRetn (1<<0) +#if defined(APPLEMONSOON) +#define ARM64_REG_CYC_OVRD_dsblSnoopTime_mask (3ULL << 30) +#define ARM64_REG_CYC_OVRD_dsblSnoopPTime (1ULL << 31) /// Don't fetch the timebase from the P-block +#endif /* APPLEMONSOON */ #define ARM64_REG_LSU_ERR_STS S3_3_c15_c0_0 #define ARM64_REG_LSU_ERR_STS_L1DTlbMultiHitEN (1ULL<<54) @@ -166,6 +182,10 @@ * 0=>not a p-core, non-zero=>p-core */ .macro ARM64_IS_PCORE +#if defined(APPLEMONSOON) || HAS_CLUSTER + mrs $0, MPIDR_EL1 + and $0, $0, #(MPIDR_PNE) +#endif .endmacro /* @@ -176,6 +196,14 @@ * arg3: SPR to use for p-core or non-AMP architecture */ .macro ARM64_READ_EP_SPR +#if defined(APPLEMONSOON) || HAS_CLUSTER + cbnz $0, 1f +// e-core + mrs $1, $2 + b 2f +// p-core +1: +#endif mrs $1, $3 2: .endmacro @@ -188,6 +216,14 @@ * arg3: SPR to use 
for p-core or non-AMP architecture */ .macro ARM64_WRITE_EP_SPR +#if defined(APPLEMONSOON) || HAS_CLUSTER + cbnz $0, 1f +// e-core + msr $2, $1 + b 2f +// p-core +1: +#endif msr $3, $1 2: .endmacro diff --git a/pexpert/pexpert/arm64/board_config.h b/pexpert/pexpert/arm64/board_config.h index 0aaefb898..c7c434d2a 100644 --- a/pexpert/pexpert/arm64/board_config.h +++ b/pexpert/pexpert/arm64/board_config.h @@ -5,6 +5,8 @@ #ifndef _PEXPERT_ARM_BOARD_CONFIG_H #define _PEXPERT_ARM_BOARD_CONFIG_H +#include + #ifdef ARM64_BOARD_CONFIG_S5L8960X #define APPLE_ARM64_ARCH_FAMILY 1 #define APPLECYCLONE @@ -15,6 +17,7 @@ #define ARM_BOARD_CLASS_S5L8960X #define KERNEL_INTEGRITY_WT 1 #define PEXPERT_NO_3X_IMAGES 1 +#define CORE_NCTRS 8 #endif /* ARM64_BOARD_CONFIG_S5L8960X */ #ifdef ARM64_BOARD_CONFIG_T7000 @@ -26,6 +29,7 @@ #define ARM_BOARD_WFE_TIMEOUT_NS 1000 #define ARM_BOARD_CLASS_T7000 #define KERNEL_INTEGRITY_WT 1 +#define CORE_NCTRS 8 #endif /* ARM64_BOARD_CONFIG_T7000 */ #ifdef ARM64_BOARD_CONFIG_T7001 @@ -38,6 +42,7 @@ #define ARM_BOARD_CLASS_T7000 #define KERNEL_INTEGRITY_WT 1 #define CPU_COUNT 3 +#define CORE_NCTRS 8 #endif /* ARM64_BOARD_CONFIG_T7001 */ #ifdef ARM64_BOARD_CONFIG_S8000 @@ -55,6 +60,7 @@ #define ARM_BOARD_WFE_TIMEOUT_NS 1000 #define ARM_BOARD_CLASS_S8000 #define KERNEL_INTEGRITY_WT 1 +#define CORE_NCTRS 8 #endif /* ARM64_BOARD_CONFIG_S8000 */ #ifdef ARM64_BOARD_CONFIG_S8001 @@ -72,6 +78,7 @@ #define ARM_BOARD_WFE_TIMEOUT_NS 1000 #define ARM_BOARD_CLASS_S8000 #define KERNEL_INTEGRITY_WT 1 +#define CORE_NCTRS 8 #endif /* ARM64_BOARD_CONFIG_S8001 */ #ifdef ARM64_BOARD_CONFIG_T8010 @@ -84,26 +91,80 @@ #define APPLE_ARM64_ARCH_FAMILY 1 #define APPLEHURRICANE #define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_KTRR #include #define __ARM_L2CACHE_SIZE_LOG__ 22 #define ARM_BOARD_WFE_TIMEOUT_NS 1000 #define ARM_BOARD_CLASS_T8010 +#define CORE_NCTRS 10 +#if DEVELOPMENT || DEBUG +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 0 +#endif #endif /* ARM64_BOARD_CONFIG_T8010 
*/ #ifdef ARM64_BOARD_CONFIG_T8011 #define APPLE_ARM64_ARCH_FAMILY 1 #define APPLEHURRICANE #define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_KTRR #include #define __ARM_L2CACHE_SIZE_LOG__ 23 #define ARM_BOARD_WFE_TIMEOUT_NS 1000 #define ARM_BOARD_CLASS_T8011 #define CPU_COUNT 3 +#define CORE_NCTRS 10 +#if DEVELOPMENT || DEBUG +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 0 +#endif #endif /* ARM64_BOARD_CONFIG_T8011 */ +#ifdef ARM64_BOARD_CONFIG_T8015 +/* + * The LLC size for monsoon is 8MB, but the L2E exposed to mistral is + * only 1MB. We use the larger cache size here. The expectation is + * that this may cause flushes from mistral to be less efficient + * (cycles will be wasted on unnecessary way/set operations), but it + * will be technically correct... the best kind of correct. + * + * And is an explicit flush from L2E to LLC something we'll ever want + * to do? + */ +#define APPLE_ARM64_ARCH_FAMILY 1 +#define APPLEMONSOON +#define ARM_ARCH_TIMER +#define KERNEL_INTEGRITY_KTRR +#include +#define __ARM_L2CACHE_SIZE_LOG__ 23 +#define ARM_BOARD_WFE_TIMEOUT_NS 1000 +#define ARM_BOARD_CLASS_T8015 +#define CPU_COUNT 6 +#define BROKEN_FRIGGING_SLEEP 1 /* Spurious wake: See rdar://problem/29762505 */ +#define HAS_UNCORE_CTRS 1 +#define UNCORE_VERSION 1 +#define UNCORE_PER_CLUSTER 0 +#define UNCORE_NCTRS 8 +#define CORE_NCTRS 10 +#if DEVELOPMENT || DEBUG +#define PMAP_CS 1 +#define PMAP_CS_ENABLE 0 +#endif +#endif /* ARM64_BOARD_CONFIG_T8015 */ + +#ifdef ARM64_BOARD_CONFIG_BCM2837 +#define BCM2837 +#define BCM2837_BRINGUP +#define ARM_ARCH_TIMER +#include +#define __ARM_L2CACHE_SIZE_LOG__ 19 +#define ARM_BOARD_CLASS_BCM2837 +#define CPU_COUNT 4 +#endif /* ARM64_BOARD_CONFIG_BCM2837 */ + #endif /* ! 
_PEXPERT_ARM_BOARD_CONFIG_H */ diff --git a/pexpert/pexpert/arm64/boot.h b/pexpert/pexpert/arm64/boot.h index c665c4721..653b8252d 100644 --- a/pexpert/pexpert/arm64/boot.h +++ b/pexpert/pexpert/arm64/boot.h @@ -33,7 +33,7 @@ struct Boot_Video { #define kBootVideoDepthRotateShift (8) #define kBootVideoDepthScaleShift (16) -#define kBootFlagsDarkBoot (1 << 0) +#define kBootFlagsDarkBoot (1ULL << 0) typedef struct Boot_Video Boot_Video; diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index eac7336cf..df4dd8db7 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -92,6 +92,8 @@ uint32_t PE_i_can_has_debugger( uint32_t *); #if defined(__arm__) || defined(__arm64__) +boolean_t PE_panic_debugging_enabled(void); + void PE_mark_hwaccess(uint64_t thread); #endif /* defined(__arm__) || defined(__arm64__) */ @@ -173,6 +175,8 @@ struct clock_frequency_info_t { unsigned long long fix_frequency_hz; }; +extern int debug_cpu_performance_degradation_factor; + typedef struct clock_frequency_info_t clock_frequency_info_t; extern clock_frequency_info_t gPEClockFrequencyInfo; @@ -396,6 +400,8 @@ extern void PE_arm_debug_enable_trace(void); extern uint8_t PE_smc_stashed_x86_power_state; extern uint8_t PE_smc_stashed_x86_efi_boot_state; extern uint8_t PE_smc_stashed_x86_system_state; +extern uint8_t PE_smc_stashed_x86_shutdown_cause; +extern uint64_t PE_smc_stashed_x86_prev_power_transitions; extern uint32_t PE_pcie_stashed_link_state; #endif diff --git a/san/Kasan_kasan.exports b/san/Kasan_kasan.exports index 864ea9424..4372d6af7 100644 --- a/san/Kasan_kasan.exports +++ b/san/Kasan_kasan.exports @@ -99,6 +99,7 @@ ___asan_version_mismatch_check_v8 ___asan_version_mismatch_check_apple_802 ___asan_version_mismatch_check_apple_900 ___asan_version_mismatch_check_apple_902 +___asan_version_mismatch_check_apple_1000 ___asan_init ___asan_memcpy ___asan_memmove diff --git a/san/Makefile b/san/Makefile index 75a98ec4c..e8c092167 100644 --- a/san/Makefile 
+++ b/san/Makefile @@ -31,7 +31,7 @@ EXPORT_MI_DIR = san COMP_SUBDIRS = conf .DELETE_ON_ERROR: -$(OBJROOT)/san/kasan-blacklist-%: $(SOURCE)/kasan-blacklist $(SOURCE)/kasan-blacklist-% +$(OBJROOT)/san/kasan-blacklist-%: $(SOURCE)/kasan-blacklist $(SOURCE)/ubsan-blacklist $(SOURCE)/kasan-blacklist-% @echo "$(ColorH)GENERATING$(Color0) $(ColorLF)$(notdir $@)$(Color0)" $(_v)sed -e 's,^src:\./,src:'"$(SRCROOT)/," $^ > $@ $(_v)$(SOURCE)/tools/validate_blacklist.sh "$@" diff --git a/san/conf/Makefile.arm b/san/conf/Makefile.arm new file mode 100644 index 000000000..e69de29bb diff --git a/san/conf/Makefile.arm64 b/san/conf/Makefile.arm64 new file mode 100644 index 000000000..e69de29bb diff --git a/san/conf/Makefile.template b/san/conf/Makefile.template index 5edceed17..8c60bc15b 100644 --- a/san/conf/Makefile.template +++ b/san/conf/Makefile.template @@ -47,11 +47,17 @@ COMP_SUBDIRS = # Rebuild if per-file overrides change ${OBJS}: $(firstword $(MAKEFILE_LIST)) -ifneq ($(KASAN),1) -# nothing to build for non-KASAN +# set file list manually OBJS = -COBJS = -SOBJS = + +ifeq ($(KASAN),1) +OBJS += kasan.o kasan-fakestack.o kasan-memintrinsics.o kasan_dynamic_blacklist.o +OBJS += kasan-$(CURRENT_ARCH_CONFIG_LC).o +OBJS += kasan-test.o kasan-test-$(CURRENT_ARCH_CONFIG_LC).o +endif + +ifeq ($(UBSAN),1) +OBJS += ubsan.o ubsan_log.o endif # Rebuild if global compile flags change diff --git a/san/conf/files b/san/conf/files index 30036fb3e..0c312a11f 100644 --- a/san/conf/files +++ b/san/conf/files @@ -3,3 +3,5 @@ san/kasan-fakestack.c standard san/kasan-test.c standard san/kasan-memintrinsics.c standard san/kasan_dynamic_blacklist.c standard +san/ubsan.c standard +san/ubsan_log.c standard diff --git a/san/conf/files.arm b/san/conf/files.arm new file mode 100644 index 000000000..e69de29bb diff --git a/san/conf/files.arm64 b/san/conf/files.arm64 new file mode 100644 index 000000000..4303b854d --- /dev/null +++ b/san/conf/files.arm64 @@ -0,0 +1,3 @@ +# KASAN +san/kasan-arm64.c 
standard +san/kasan-test-arm64.s standard diff --git a/san/kasan-arm64.c b/san/kasan-arm64.c index 77ee449a8..056e531c4 100644 --- a/san/kasan-arm64.c +++ b/san/kasan-arm64.c @@ -51,7 +51,9 @@ extern uint64_t *cpu_tte; extern unsigned long gVirtBase, gPhysBase; -#define phystokv(a) ((vm_address_t)(a) - gPhysBase + gVirtBase) + +typedef uint64_t pmap_paddr_t; +extern vm_map_address_t phystokv(pmap_paddr_t pa); vm_offset_t physmap_vbase; vm_offset_t physmap_vtop; @@ -111,10 +113,12 @@ align_to_page(vm_offset_t *addrp, vm_offset_t *sizep) static void kasan_map_shadow_internal(vm_offset_t address, vm_size_t size, bool is_zero, bool back_page) { + size = (size + 0x7UL) & ~0x7UL; vm_offset_t shadow_base = vm_map_trunc_page(SHADOW_FOR_ADDRESS(address), ARM_PGMASK); vm_offset_t shadow_top = vm_map_round_page(SHADOW_FOR_ADDRESS(address + size), ARM_PGMASK); assert(shadow_base >= KASAN_SHADOW_MIN && shadow_top <= KASAN_SHADOW_MAX); + assert((size & 0x7) == 0); for (; shadow_base < shadow_top; shadow_base += ARM_PGBYTES) { uint64_t *base = cpu_tte; diff --git a/san/kasan-blacklist b/san/kasan-blacklist index cbef48e41..48ce86d46 100644 --- a/san/kasan-blacklist +++ b/san/kasan-blacklist @@ -3,6 +3,17 @@ # the AddressSanitizer runtime itself, or because the code executes before # the runtime has been initialized. 
+[.*] + +# Blanket ignore non-sanitized functions +fun:__nosan_* + +# Try really hard to avoid panicing while debugging +src:./osfmk/kdp/* +src:./osfmk/kern/debug.c + +[address] + # Exclude linker sets type:struct linker_set_entry type:linker_set_entry @@ -14,13 +25,6 @@ src:./san/kasan-x86_64.c src:./san/kasan-memintrinsics.c src:./san/kasan_dynamic_blacklist.c -# Blanket ignore non-sanitized functions -fun:__nosan_* - -# Try really hard to avoid panicing while debugging -src:./osfmk/kdp/* -src:./osfmk/kern/debug.c - # Exclude dtrace function that does weird stack manipulations fun:fbt_perfCallback @@ -30,4 +34,5 @@ fun:_ZL18IOTrackingLeakScanPv # Exclude KASAN dependencies # XXX: could this be relaxed since fakestack is reentrant? src:./osfmk/kern/zalloc.c +src:./osfmk/kern/zcache.c diff --git a/san/kasan-blacklist-arm64 b/san/kasan-blacklist-arm64 index 9ef0c3aab..6f1fe4f8b 100644 --- a/san/kasan-blacklist-arm64 +++ b/san/kasan-blacklist-arm64 @@ -1,5 +1,7 @@ # ARM64 specific blacklist +[address] + # Exclude KASan runtime src:./san/kasan-arm64.c diff --git a/san/kasan-blacklist-x86_64 b/san/kasan-blacklist-x86_64 index bd1704d30..517bce143 100644 --- a/san/kasan-blacklist-x86_64 +++ b/san/kasan-blacklist-x86_64 @@ -1,5 +1,7 @@ # x86_64 specific blacklist +[address] + # Early boot AUTOGEN src:./bsd/kern/kdebug.c src:./bsd/kern/kern_csr.c @@ -11,6 +13,7 @@ src:./osfmk/i386/acpi.c src:./osfmk/i386/cpu.c src:./osfmk/i386/i386_init.c src:./osfmk/i386/locks_i386.c +src:./osfmk/i386/locks_i386_opt.c src:./osfmk/i386/machine_routines.c src:./osfmk/i386/mp.c src:./osfmk/i386/mtrr.c @@ -19,7 +22,7 @@ src:./osfmk/i386/panic_hooks.c src:./osfmk/i386/rtclock.c src:./osfmk/i386/vmx/vmx_cpu.c src:./osfmk/kern/locks.c -src:./osfmk/prng/random.c +src:./osfmk/prng/prng_random.c src:./osfmk/x86_64/loose_ends.c src:./pexpert/gen/bootargs.c src:./pexpert/gen/device_tree.c diff --git a/san/kasan-fakestack.c b/san/kasan-fakestack.c index b023ded1c..0680f0858 100644 --- 
a/san/kasan-fakestack.c +++ b/san/kasan-fakestack.c @@ -47,8 +47,9 @@ int fakestack_enabled = 0; #define FAKESTACK_HEADER_SZ 64 #define FAKESTACK_NUM_SZCLASS 7 -#define FAKESTACK_FREED 0 /* forced by clang */ +#define FAKESTACK_UNUSED 0 /* waiting to be collected at next gc - forced by clang */ #define FAKESTACK_ALLOCATED 1 +#define FAKESTACK_FREED 2 #if FAKESTACK @@ -120,29 +121,38 @@ ptr_is_on_stack(uptr ptr) } /* free all unused fakestack entries */ -static void NOINLINE +void kasan_fakestack_gc(thread_t thread) { struct fakestack_header *cur, *tmp; LIST_HEAD(, fakestack_header) tofree = LIST_HEAD_INITIALIZER(tofree); - /* move all the freed elements off the main list */ + boolean_t flags; + if (!thread_enter_fakestack(&flags)) { + panic("expected success entering fakestack\n"); + } + + /* move the unused objects off the per-thread list... */ struct fakestack_header_list *head = &kasan_get_thread_data(thread)->fakestack_head; LIST_FOREACH_SAFE(cur, head, list, tmp) { - if (cur->flag == FAKESTACK_FREED) { + if (cur->flag == FAKESTACK_UNUSED) { LIST_REMOVE(cur, list); LIST_INSERT_HEAD(&tofree, cur, list); + cur->flag = FAKESTACK_FREED; } } + kasan_unlock(flags); + /* ... then actually free them */ LIST_FOREACH_SAFE(cur, &tofree, list, tmp) { + LIST_REMOVE(cur, list); + zone_t zone = fakestack_zones[cur->sz_class]; size_t sz = (fakestack_min << cur->sz_class) + FAKESTACK_HEADER_SZ; - LIST_REMOVE(cur, list); void *ptr = (void *)cur; - kasan_free_internal(&ptr, &sz, KASAN_HEAP_FAKESTACK, &zone, cur->realsz, 1, FAKESTACK_QUARANTINE); + kasan_free_internal(&ptr, &sz, KASAN_HEAP_FAKESTACK, &zone, cur->realsz, 0, FAKESTACK_QUARANTINE); if (ptr) { zfree(zone, ptr); } @@ -179,8 +189,6 @@ kasan_fakestack_alloc(int sz_class, size_t realsz) return 0; } - kasan_fakestack_gc(current_thread()); /* XXX: optimal? 
*/ - ret = (uptr)zget(zone); if (ret) { @@ -241,7 +249,7 @@ kasan_fakestack_free(int sz_class, uptr dst, size_t realsz) } void NOINLINE -kasan_unpoison_fakestack(thread_t thread) +kasan_fakestack_drop(thread_t thread) { boolean_t flags; if (!thread_enter_fakestack(&flags)) { @@ -252,11 +260,10 @@ kasan_unpoison_fakestack(thread_t thread) struct fakestack_header *cur; LIST_FOREACH(cur, head, list) { if (cur->flag == FAKESTACK_ALLOCATED) { - cur->flag = FAKESTACK_FREED; + cur->flag = FAKESTACK_UNUSED; } } - kasan_fakestack_gc(thread); kasan_unlock(flags); } diff --git a/san/kasan-test.c b/san/kasan-test.c index 6dc379c1a..672a6645e 100644 --- a/san/kasan-test.c +++ b/san/kasan-test.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -434,13 +435,13 @@ static int test_strncat(struct kasan_test *t) } /* we ignore the top *two* frames in backtrace - so add an extra one */ -static int __attribute__((noinline)) +static int OS_NOINLINE test_blacklist_helper(void) { return kasan_is_blacklisted(TYPE_TEST); } -static int __attribute__((noinline)) +static int OS_NOINLINE test_blacklist(struct kasan_test *t) { TEST_START(t); @@ -449,7 +450,7 @@ test_blacklist(struct kasan_test *t) return 0; } -static int __attribute__((noinline)) +static int OS_NOINLINE test_blacklist_str(struct kasan_test *t) { TEST_START(t); @@ -482,6 +483,50 @@ static int test_strnlen(struct kasan_test *t) } #endif +static void OS_NOINLINE +force_fakestack(char *x) +{ + __asm__ __volatile__("" :: "r" (x) : "memory"); +} + +OS_NOINLINE +static int +test_fakestack_helper(struct kasan_test *t, char *x) +{ + TEST_START(t); + + x[0] = 0x55; + + /* ensure that 'x' is on the fakestack */ + uintptr_t base = dtrace_get_kernel_stack(current_thread()); + uintptr_t p = (uintptr_t)x; + if (p >= base && p < base + kernel_stack_size) { + return 1; + } + + __asan_handle_no_return(); + + /* x better still be accessible */ + TEST_NOFAULT(t); + if (x[0] != 0x55) { + TEST_DONE(t, 1); + } + + 
TEST_DONE(t, 0); + return 0; +} + +static int +test_fakestack(struct kasan_test *t) +{ + char x[8]; + if (!fakestack_enabled) { + return 1; + } + force_fakestack(x); + return test_fakestack_helper(t, x); +} + int *uaf_ptr; static int * NOINLINE stack_uaf_helper(void) @@ -524,6 +569,7 @@ static struct kasan_test xnu_tests[] = { DECLARE_TEST(test_strncat, "strncat"), DECLARE_TEST(test_blacklist, "blacklist"), DECLARE_TEST(test_blacklist_str, "blacklist_str"), + DECLARE_TEST(test_fakestack, "fakestack"), // DECLARE_TEST(test_strnlen, "strnlen"), }; static int num_xnutests = sizeof(xnu_tests)/sizeof(xnu_tests[0]); @@ -557,11 +603,6 @@ kasan_run_test(struct kasan_test *test_list, int testno, int fail) status = TEST_FAIL_NOFAULT; } } else { - /* Triggering a KASan violation will return here by longjmp, bypassing - * stack unpoisoning, so do it here explicitly. We just hope that - * fakestack free will happen later... */ - kasan_unpoison_curstack(true); - if (t->result) { /* faulted, but at the wrong place */ printf("KASan: test.%02d FAIL %d (%s)\n", testno, t->result, t->name); diff --git a/san/kasan-x86_64.c b/san/kasan-x86_64.c index e2cb6d3bd..4b685e67f 100644 --- a/san/kasan-x86_64.c +++ b/san/kasan-x86_64.c @@ -194,9 +194,12 @@ kasan_map_shadow_superpage_zero(vm_offset_t address, vm_size_t size) void kasan_map_shadow(vm_offset_t address, vm_size_t size, bool is_zero) { + size = (size + 0x7UL) & ~0x7UL; vm_offset_t shadow_base = vm_map_trunc_page(SHADOW_FOR_ADDRESS(address), PAGE_MASK); vm_offset_t shadow_top = vm_map_round_page(SHADOW_FOR_ADDRESS(address + size), PAGE_MASK); + assert((size & 0x7) == 0); + for (; shadow_base < shadow_top; shadow_base += I386_PGBYTES) { split_addr_t addr = split_address(shadow_base); diff --git a/san/kasan.c b/san/kasan.c index 01faa3801..a34d479aa 100644 --- a/san/kasan.c +++ b/san/kasan.c @@ -299,7 +299,7 @@ kasan_check_range(const void *x, size_t sz, access_t access) /* * Return true if [base, base+sz) is unpoisoned or has given 
shadow value. */ -static bool +bool kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow) { sz -= 8 - (base % 8); @@ -371,7 +371,7 @@ kasan_shadow_crashlog(uptr p, char *buf, size_t len) uptr shadow = (uptr)SHADOW_FOR_ADDRESS(p); uptr shadow_p = shadow; - uptr shadow_page = vm_map_round_page(shadow_p, PAGE_MASK); + uptr shadow_page = vm_map_round_page(shadow_p, HW_PAGE_MASK); /* rewind to start of context block */ shadow &= ~((uptr)0xf); @@ -381,7 +381,7 @@ kasan_shadow_crashlog(uptr p, char *buf, size_t len) " Shadow 0 1 2 3 4 5 6 7 8 9 a b c d e f\n"); for (i = 0; i < 1 + before + after; i++, shadow += 16) { - if ((vm_map_round_page(shadow, PAGE_MASK) != shadow_page) && !kasan_is_shadow_mapped(shadow)) { + if ((vm_map_round_page(shadow, HW_PAGE_MASK) != shadow_page) && !kasan_is_shadow_mapped(shadow)) { /* avoid unmapped shadow when crossing page boundaries */ continue; } @@ -518,7 +518,12 @@ void NOINLINE __asan_handle_no_return(void) { kasan_unpoison_curstack(false); - kasan_unpoison_fakestack(current_thread()); + + /* + * No need to free any fakestack objects because they must stay alive until + * we drop the real stack, at which point we can drop the entire fakestack + * anyway. 
+ */ } bool NOINLINE @@ -1258,17 +1263,17 @@ kasan_traverse_mappings(pmap_traverse_callback cb, void *ctx) { uintptr_t shadow_base = (uintptr_t)SHADOW_FOR_ADDRESS(VM_MIN_KERNEL_AND_KEXT_ADDRESS); uintptr_t shadow_top = (uintptr_t)SHADOW_FOR_ADDRESS(VM_MAX_KERNEL_ADDRESS); - shadow_base = vm_map_trunc_page(shadow_base, PAGE_MASK); - shadow_top = vm_map_round_page(shadow_top, PAGE_MASK); + shadow_base = vm_map_trunc_page(shadow_base, HW_PAGE_MASK); + shadow_top = vm_map_round_page(shadow_top, HW_PAGE_MASK); uintptr_t start = 0, end = 0; - for (uintptr_t addr = shadow_base; addr < shadow_top; addr += PAGE_SIZE) { + for (uintptr_t addr = shadow_base; addr < shadow_top; addr += HW_PAGE_SIZE) { if (kasan_is_shadow_mapped(addr)) { if (start == 0) { start = addr; } - end = addr + PAGE_SIZE; + end = addr + HW_PAGE_SIZE; } else if (start && end) { cb(start, end, ctx); start = end = 0; @@ -1307,6 +1312,7 @@ UNUSED_ABI(__asan_version_mismatch_check_v8, void); UNUSED_ABI(__asan_version_mismatch_check_apple_802, void); UNUSED_ABI(__asan_version_mismatch_check_apple_900, void); UNUSED_ABI(__asan_version_mismatch_check_apple_902, void); +UNUSED_ABI(__asan_version_mismatch_check_apple_1000, void); void UNSUPPORTED_API(__asan_init_v5, void); void UNSUPPORTED_API(__asan_register_globals, uptr a, uptr b); diff --git a/san/kasan.h b/san/kasan.h index 102a31468..4682692a8 100644 --- a/san/kasan.h +++ b/san/kasan.h @@ -46,6 +46,7 @@ typedef uintptr_t uptr; #if KASAN +#define KASAN_DEBUG 0 #define KASAN_KALLOC 1 #define KASAN_ZALLOC 1 #define KASAN_DYNAMIC_BLACKLIST 1 @@ -101,8 +102,10 @@ void kasan_notify_address(vm_offset_t address, vm_size_t size); void kasan_notify_address_nopoison(vm_offset_t address, vm_size_t size); void kasan_unpoison_stack(vm_offset_t stack, vm_size_t size); void kasan_unpoison_curstack(bool whole_stack); -void kasan_unpoison_fakestack(thread_t thread); +bool kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow); +void kasan_fakestack_drop(thread_t 
thread); /* mark all fakestack entries for thread as unused */ +void kasan_fakestack_gc(thread_t thread); /* free and poison all unused fakestack objects for thread */ void kasan_fakestack_suspend(void); void kasan_fakestack_resume(void); @@ -126,6 +129,7 @@ extern unsigned shadow_stolen_idx; extern vm_offset_t shadow_pnext, shadow_ptop; #endif #endif + /* * Allocator hooks */ diff --git a/san/kasan_dynamic_blacklist.c b/san/kasan_dynamic_blacklist.c index f4ad0fa05..983b83576 100644 --- a/san/kasan_dynamic_blacklist.c +++ b/san/kasan_dynamic_blacklist.c @@ -350,7 +350,7 @@ addr_to_func(uintptr_t addr, const kernel_mach_header_t *mh) return cur_name; } -bool __attribute__((noinline)) +bool OS_NOINLINE kasan_is_blacklisted(access_t type) { uint32_t nframes = 0; diff --git a/san/kasan_internal.h b/san/kasan_internal.h index c696abe92..f593fbbba 100644 --- a/san/kasan_internal.h +++ b/san/kasan_internal.h @@ -40,7 +40,6 @@ typedef uintptr_t uptr; /* * KASAN features and config */ -#define KASAN_DEBUG 0 #define FAKESTACK 1 /* KASAN_KALLOC defined in kasan.h */ /* KASAN_ZALLOC defined in kasan.h */ @@ -57,9 +56,13 @@ typedef uintptr_t uptr; /* Works out at about 25% of 512 MiB and 15% of 3GiB system */ # define STOLEN_MEM_PERCENT 13UL # define STOLEN_MEM_BYTES MiB(62) +# define HW_PAGE_SIZE (ARM_PGBYTES) +# define HW_PAGE_MASK (ARM_PGMASK) #else # define STOLEN_MEM_PERCENT 25UL # define STOLEN_MEM_BYTES 0 +# define HW_PAGE_SIZE (PAGE_SIZE) +# define HW_PAGE_MASK (PAGE_MASK) #endif /* boot-args */ @@ -81,7 +84,7 @@ typedef uintptr_t uptr; #define SHADOW_FOR_ADDRESS(x) (uint8_t *)(((x) >> 3) + KASAN_SHIFT) #if KASAN_DEBUG -# define NOINLINE __attribute__ ((noinline)) +# define NOINLINE OS_NOINLINE #else # define NOINLINE #endif @@ -191,7 +194,7 @@ struct asan_global { #endif typedef int jmp_buf[_JBLEN]; -void _longjmp(jmp_buf env, int val); -int _setjmp(jmp_buf env); +void _longjmp(jmp_buf env, int val) OS_NORETURN; +int _setjmp(jmp_buf env) 
__attribute__((returns_twice)); #endif /* _KASAN_INTERNAL_H_ */ diff --git a/san/ubsan-blacklist b/san/ubsan-blacklist new file mode 100644 index 000000000..2e48edff5 --- /dev/null +++ b/san/ubsan-blacklist @@ -0,0 +1,9 @@ +[.*] +src:./san/ubsan* + +[alignment] + +src:./libsa/bootstrap.cpp +src:./bsd/net/necp_client.c +src:./pexpert/arm/pe_identify_machine.c + diff --git a/san/ubsan.c b/san/ubsan.c new file mode 100644 index 000000000..0364a411f --- /dev/null +++ b/san/ubsan.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include "ubsan.h" + +static const bool ubsan_print = false; +static const uint32_t line_acquired = 0x80000000UL; + +static size_t +format_loc(struct san_src_loc *loc, char *dst, size_t sz) +{ + return snprintf(dst, sz, " loc: %s:%d:%d\n", + loc->filename, + loc->line & ~line_acquired, + loc->col + ); +} + +/* + * return true for the first visit to this loc, false every subsequent time + */ +static bool +ubsan_loc_acquire(struct san_src_loc *loc) +{ + uint32_t line = loc->line; + if (line & line_acquired) { + return false; + } + uint32_t acq = line | line_acquired; + return atomic_compare_exchange_strong((_Atomic uint32_t *)&loc->line, &line, acq); +} + +static const char *const +overflow_str[] = { + NULL, + "add", + "sub", + "mul", + "divrem", + "negate", + NULL +}; + +static size_t +format_overflow(struct ubsan_violation *v, char *buf, size_t sz) +{ + struct san_type_desc *ty = v->overflow->ty; + return snprintf(buf, sz, + "%s overflow, op = %s, ty = %s, width = %d, lhs = 0x%llx, rhs = 0x%llx\n", + ty->issigned ? 
"signed" : "unsigned", + overflow_str[v->ubsan_type], + ty->name, + 1 << ty->width, + v->lhs, + v->rhs + ); +} + +static size_t +format_shift(struct ubsan_violation *v, char *buf, size_t sz) +{ + size_t n = 0; + struct san_type_desc *l = v->shift->lhs_t; + struct san_type_desc *r = v->shift->rhs_t; + + n += snprintf(buf+n, sz-n, "bad shift\n"); + n += snprintf(buf+n, sz-n, " lhs: 0x%llx, ty = %s, signed = %d, width = %d\n", v->lhs, l->name, l->issigned, 1 << l->width); + n += snprintf(buf+n, sz-n, " rhs: 0x%llx, ty = %s, signed = %d, width = %d\n", v->rhs, r->name, r->issigned, 1 << r->width); + + return n; +} + +static const char *const +align_kinds[] = { + "load", + "store", + "", + "member access", + "", +}; + +static size_t +format_alignment(struct ubsan_violation *v, char *buf, size_t sz) +{ + size_t n = 0; + struct san_type_desc *ty = v->align->ty; + + n += snprintf(buf+n, sz-n, "mis-aligned %s of 0x%llx\n", align_kinds[v->align->kind], v->lhs); + n += snprintf(buf+n, sz-n, " expected %d-byte alignment, type = %s\n", + 1 << v->align->align, ty->name); + return n; +} + +static size_t +format_oob(struct ubsan_violation *v, char *buf, size_t sz) +{ + size_t n = 0; + struct san_type_desc *aty = v->oob->array_ty; + struct san_type_desc *ity = v->oob->index_ty; + uintptr_t idx = v->lhs; + + n += snprintf(buf+n, sz-n, "OOB array access\n"); + n += snprintf(buf+n, sz-n, " idx %ld\n", idx); + n += snprintf(buf+n, sz-n, " aty: ty = %s, signed = %d, width = %d\n", aty->name, aty->issigned, 1 << aty->width); + n += snprintf(buf+n, sz-n, " ity: ty = %s, signed = %d, width = %d\n", ity->name, ity->issigned, 1 << ity->width); + + return n; +} + +size_t +ubsan_format(struct ubsan_violation *v, char *buf, size_t sz) +{ + size_t n = 0; + + switch (v->ubsan_type) { + case UBSAN_OVERFLOW_add ... 
UBSAN_OVERFLOW_negate: + n += format_overflow(v, buf+n, sz-n); + break; + case UBSAN_UNREACHABLE: + n += snprintf(buf+n, sz-n, "unreachable\n"); + break; + case UBSAN_SHIFT: + n += format_shift(v, buf+n, sz-n); + break; + case UBSAN_ALIGN: + n += format_alignment(v, buf+n, sz-n); + break; + case UBSAN_POINTER_OVERFLOW: + n += snprintf(buf+n, sz-n, "pointer overflow, before = 0x%llx, after = 0x%llx\n", v->lhs, v->rhs); + break; + case UBSAN_OOB: + n += format_oob(v, buf+n, sz-n); + break; + default: + panic("unknown violation"); + } + + n += format_loc(v->loc, buf+n, sz-n); + + return n; +} + +static void +ubsan_handle(struct ubsan_violation *v, bool fatal) +{ + const size_t sz = 256; + static char buf[sz]; + size_t n = 0; + buf[0] = '\0'; + + if (!ubsan_loc_acquire(v->loc)) { + /* violation site already reported */ + return; + } + + ubsan_log_append(v); + + if (ubsan_print || fatal) { + n += ubsan_format(v, buf+n, sz-n); + } + + if (ubsan_print) { + printf("UBSan: %s", buf); + } + + if (fatal) { + panic("UBSan: %s", buf); + } +} + +void +__ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *desc) +{ + struct ubsan_violation v = { UBSAN_UNREACHABLE, 0, 0, .unreachable = desc, &desc->loc }; + ubsan_handle(&v, true); +} + +void +__ubsan_handle_shift_out_of_bounds(struct ubsan_shift_desc *desc, uint64_t lhs, uint64_t rhs) +{ + struct ubsan_violation v = { UBSAN_SHIFT, lhs, rhs, .shift = desc, &desc->loc }; + ubsan_handle(&v, false); +} + +void +__ubsan_handle_shift_out_of_bounds_abort(struct ubsan_shift_desc *desc, uint64_t lhs, uint64_t rhs) +{ + struct ubsan_violation v = { UBSAN_SHIFT, lhs, rhs, .shift = desc, &desc->loc }; + ubsan_handle(&v, true); +} + +#define DEFINE_OVERFLOW(op) \ + void __ubsan_handle_##op##_overflow(struct ubsan_overflow_desc *desc, uint64_t lhs, uint64_t rhs) { \ + struct ubsan_violation v = { UBSAN_OVERFLOW_##op, lhs, rhs, .overflow = desc, &desc->loc }; \ + ubsan_handle(&v, false); \ + } \ + void 
__ubsan_handle_##op##_overflow_abort(struct ubsan_overflow_desc *desc, uint64_t lhs, uint64_t rhs) { \ + struct ubsan_violation v = { UBSAN_OVERFLOW_##op, lhs, rhs, .overflow = desc, &desc->loc }; \ + ubsan_handle(&v, true); \ + } + +DEFINE_OVERFLOW(add) +DEFINE_OVERFLOW(sub) +DEFINE_OVERFLOW(mul) +DEFINE_OVERFLOW(divrem) +DEFINE_OVERFLOW(negate) + +void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *desc, uint64_t val) +{ + struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc }; + ubsan_handle(&v, false); +} + +void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *desc, uint64_t val) +{ + struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc }; + ubsan_handle(&v, true); +} + +void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *desc, uint64_t before, uint64_t after) +{ + struct ubsan_violation v = { UBSAN_POINTER_OVERFLOW, before, after, .ptroverflow = desc, &desc->loc }; + ubsan_handle(&v, false); +} + +void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *desc, uint64_t before, uint64_t after) +{ + struct ubsan_violation v = { UBSAN_POINTER_OVERFLOW, before, after, .ptroverflow = desc, &desc->loc }; + ubsan_handle(&v, true); +} + +void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *desc, uint64_t idx) +{ + struct ubsan_violation v = { UBSAN_OOB, idx, 0, .oob = desc, &desc->loc }; + ubsan_handle(&v, false); +} + +void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *desc, uint64_t idx) +{ + struct ubsan_violation v = { UBSAN_OOB, idx, 0, .oob = desc, &desc->loc }; + ubsan_handle(&v, true); +} diff --git a/san/ubsan.h b/san/ubsan.h new file mode 100644 index 000000000..e78dacefc --- /dev/null +++ b/san/ubsan.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _UBSAN_H_ +#define _UBSAN_H_ + +#include +#include + +struct san_type_desc { + uint16_t type; // 0: integer, 1: float + union { + struct { + uint16_t issigned : 1; + uint16_t width : 15; + }; /* int descriptor */ + struct { + uint16_t float_desc; + }; /* float descriptor */ + }; + const char name[]; +}; + +struct san_src_loc { + const char *filename; + uint32_t line; + uint32_t col; +}; + +struct ubsan_overflow_desc { + struct san_src_loc loc; + struct san_type_desc *ty; +}; + +struct ubsan_unreachable_desc { + struct san_src_loc loc; +}; + +struct ubsan_shift_desc { + struct san_src_loc loc; + struct san_type_desc *lhs_t; + struct san_type_desc *rhs_t; +}; + +struct ubsan_align_desc { + struct san_src_loc loc; + struct san_type_desc *ty; + uint8_t align; + uint8_t kind; +}; + +struct ubsan_ptroverflow_desc { + struct san_src_loc loc; +}; + +struct ubsan_oob_desc { + struct san_src_loc loc; + struct san_type_desc *array_ty; + struct san_type_desc *index_ty; +}; + +enum { + UBSAN_OVERFLOW_add = 1, + UBSAN_OVERFLOW_sub, + UBSAN_OVERFLOW_mul, + UBSAN_OVERFLOW_divrem, + UBSAN_OVERFLOW_negate, + UBSAN_UNREACHABLE, + UBSAN_SHIFT, + UBSAN_ALIGN, + UBSAN_POINTER_OVERFLOW, + UBSAN_OOB, + UBSAN_VIOLATION_MAX, +}; + +struct ubsan_violation { + uint8_t ubsan_type; + uint64_t lhs; + uint64_t rhs; + union { + struct ubsan_overflow_desc *overflow; + struct ubsan_unreachable_desc *unreachable; + struct ubsan_shift_desc *shift; + struct ubsan_align_desc *align; + struct ubsan_ptroverflow_desc *ptroverflow; + struct ubsan_oob_desc *oob; + }; + struct san_src_loc *loc; +}; + +void ubsan_log_append(struct ubsan_violation *); +size_t ubsan_format(struct ubsan_violation *, char *buf, size_t sz); + +/* + * UBSan ABI + */ + +void __ubsan_handle_add_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void 
__ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_add_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_divrem_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_negate_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *); +void __ubsan_handle_shift_out_of_bounds(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_shift_out_of_bounds_abort(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *, uint64_t val); +void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *, uint64_t val); +void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs); +void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx); +void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx); + +#endif /* _UBSAN_H_ */ diff --git a/san/ubsan_log.c b/san/ubsan_log.c new file mode 100644 index 000000000..dc06cd722 --- /dev/null +++ b/san/ubsan_log.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include "ubsan.h" + +/* + * To dump the violation log: + * $ sysctl kern.ubsan.log + * + * To reset: + * $ sysctl kern.ubsan.logentries=0 + */ + +static const size_t ubsan_log_size = 2048; +struct ubsan_violation ubsan_log[ubsan_log_size]; + +_Atomic size_t ubsan_log_head = 0; /* first valid entry */ +_Atomic size_t ubsan_log_tail = 0; /* next free slot (reader) */ +_Atomic size_t ubsan_log_next = 0; /* next free slot (writer) */ + +static const bool ubsan_logging = true; + +static inline size_t +next_entry(size_t x) +{ + return (x + 1) % ubsan_log_size; +} + +void +ubsan_log_append(struct ubsan_violation *e) +{ + if (!ubsan_logging) { + return; + } + + /* reserve a slot */ + size_t i = atomic_load(&ubsan_log_next); + size_t n; + do { + n = next_entry(i); + if (n == ubsan_log_tail) { + return; /* full */ + } + } while (!atomic_compare_exchange_weak(&ubsan_log_next, &i, n)); + + ubsan_log[i] = *e; + + /* make the entry available */ + size_t prev; + do { + prev = i; + } while (!atomic_compare_exchange_weak(&ubsan_log_head, &prev, n)); +} + +static int +sysctl_ubsan_log_dump SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + const size_t sz = ubsan_log_size * 256; + size_t start = atomic_load(&ubsan_log_tail); + size_t end = atomic_load(&ubsan_log_head); + + char *buf; + size_t n = 0; + int err; + + if (start == end) { + return 0; /* log is empty */ + } + + buf = kalloc(sz); + if (!buf) { + return 0; + } + buf[0] = '\0'; + + for (size_t i = start; i != end; i = next_entry(i)) { + n += ubsan_format(&ubsan_log[i], buf+n, sz-n); + } + + err = SYSCTL_OUT(req, buf, n); + + kfree(buf, sz); + return err; +} + +static int +sysctl_ubsan_log_entries SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int ch, err, val; + + int nentries; + if (ubsan_log_head >= ubsan_log_tail) { + nentries = ubsan_log_head - ubsan_log_tail; + } else { + nentries = ubsan_log_size - 
(ubsan_log_tail - ubsan_log_head + 1); + } + + err = sysctl_io_number(req, nentries, sizeof(nentries), &val, &ch); + if (err == 0 && ch) { + if (val != 0) { + err = EINVAL; + } else { + ubsan_log_tail = ubsan_log_head; + } + } + + return err; +} + +SYSCTL_DECL(ubsan); +SYSCTL_NODE(_kern, OID_AUTO, ubsan, CTLFLAG_RW | CTLFLAG_LOCKED, 0, ""); + +SYSCTL_COMPAT_UINT(_kern_ubsan, OID_AUTO, logsize, CTLFLAG_RD, NULL, (unsigned)ubsan_log_size, ""); + +SYSCTL_PROC(_kern_ubsan, OID_AUTO, logentries, + CTLTYPE_INT | CTLFLAG_RW, + 0, 0, sysctl_ubsan_log_entries, "I", ""); + +SYSCTL_PROC(_kern_ubsan, OID_AUTO, log, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED, + 0, 0, sysctl_ubsan_log_dump, "A", ""); diff --git a/security/mac_base.c b/security/mac_base.c index 2bd03b8eb..0cc9f0b0d 100644 --- a/security/mac_base.c +++ b/security/mac_base.c @@ -2007,25 +2007,6 @@ int mac_iokit_check_hid_control(kauth_cred_t cred __unused) return 0; } - -int mac_iokit_check_nvram_delete(kauth_cred_t cred __unused, const char *name __unused); -int mac_iokit_check_nvram_delete(kauth_cred_t cred __unused, const char *name __unused) -{ - return 0; -} - -int mac_iokit_check_nvram_get(kauth_cred_t cred __unused, const char *name __unused); -int mac_iokit_check_nvram_get(kauth_cred_t cred __unused, const char *name __unused) -{ - return 0; -} - -int mac_iokit_check_nvram_set(kauth_cred_t cred __unused, const char *name __unused, io_object_t value __unused); -int mac_iokit_check_nvram_set(kauth_cred_t cred __unused, const char *name __unused, io_object_t value __unused) -{ - return 0; -} - int mac_vnode_check_trigger_resolve(vfs_context_t ctx __unused, struct vnode *dvp __unused, struct componentname *cnp __unused); int mac_vnode_check_trigger_resolve(vfs_context_t ctx __unused, struct vnode *dvp __unused, struct componentname *cnp __unused) { diff --git a/security/mac_framework.h b/security/mac_framework.h index 81a4839c8..d735e0124 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h 
@@ -238,9 +238,6 @@ int mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry int mac_iokit_check_filter_properties(kauth_cred_t cred, io_object_t registry_entry); int mac_iokit_check_get_property(kauth_cred_t cred, io_object_t registry_entry, const char *name); int mac_iokit_check_hid_control(kauth_cred_t cred); -int mac_iokit_check_nvram_delete(kauth_cred_t cred, const char *name); -int mac_iokit_check_nvram_get(kauth_cred_t cred, const char *name); -int mac_iokit_check_nvram_set(kauth_cred_t cred, const char *name, io_object_t value); void mac_ipq_label_associate(struct mbuf *fragment, struct ipq *ipq); int mac_ipq_label_compare(struct mbuf *fragment, struct ipq *ipq); void mac_ipq_label_destroy(struct ipq *ipq); diff --git a/security/mac_iokit.c b/security/mac_iokit.c index d9dff9460..fd41b7538 100644 --- a/security/mac_iokit.c +++ b/security/mac_iokit.c @@ -119,31 +119,3 @@ mac_iokit_check_hid_control(kauth_cred_t cred) return (error); } -int -mac_iokit_check_nvram_delete(kauth_cred_t cred, const char *name) -{ - int error; - - MAC_CHECK(iokit_check_nvram_delete, cred, name); - return (error); -} - -int -mac_iokit_check_nvram_get(kauth_cred_t cred, const char *name) -{ - int error; - - MAC_CHECK(iokit_check_nvram_get, cred, name); - return (error); -} - -int -mac_iokit_check_nvram_set(kauth_cred_t cred, const char *name, io_object_t value) -{ - int error; - - MAC_CHECK(iokit_check_nvram_set, cred, name, value); - return (error); -} - - diff --git a/security/mac_mach.c b/security/mac_mach.c index 5669e0daf..4ae380665 100644 --- a/security/mac_mach.c +++ b/security/mac_mach.c @@ -149,6 +149,21 @@ mac_thread_userret(struct thread *td) MAC_PERFORM(thread_userret, td); } +void +mac_proc_notify_exec_complete(struct proc *proc) +{ + thread_t thread = current_thread(); + + /* + * Since this MAC hook was designed to support upcalls, make sure the hook + * is called with kernel importance propagation enabled so any daemons + * can get any 
appropriate importance donations. + */ + thread_enable_send_importance(thread, TRUE); + MAC_PERFORM(proc_notify_exec_complete, proc); + thread_enable_send_importance(thread, FALSE); +} + /**** Exception Policy * * Note that the functions below do not fully follow the usual convention for mac policy functions diff --git a/security/mac_mach_internal.h b/security/mac_mach_internal.h index 4849bfabd..df3bae67b 100644 --- a/security/mac_mach_internal.h +++ b/security/mac_mach_internal.h @@ -99,6 +99,8 @@ int mac_exc_update_task_crash_label(struct task *task, struct label *newlabel); int mac_exc_action_check_exception_send(struct task *victim_task, struct exception_action *action); +void mac_proc_notify_exec_complete(struct proc *proc); + struct label *mac_exc_create_label_for_proc(struct proc *proc); struct label *mac_exc_create_label_for_current_proc(void); diff --git a/security/mac_policy.h b/security/mac_policy.h index 5cae62529..a36ebe953 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -4549,6 +4549,19 @@ typedef int mpo_proc_check_run_cs_invalid_t( struct proc *p ); +/** + @brief Notification a process is finished with exec and will jump to userspace + @param p Object process + + Notifies all MAC policies that a process has completed an exec and is about to + jump to userspace to continue execution. This may result in process termination + via signals. Hook is designed to hold no/minimal locks so it can be used for any + necessary upcalls. 
+ */ +typedef void mpo_proc_notify_exec_complete_t( + struct proc *p +); + /** @brief Perform MAC-related events when a thread returns to user space @param thread Mach (not BSD) thread that is returning @@ -5390,6 +5403,7 @@ typedef int mpo_vnode_check_setutimes_t( @brief Access control check after determining the code directory hash @param vp vnode vnode to combine into proc @param label label associated with the vnode + @param cpu_type cpu type of the signature being checked @param cs_blob the code signature to check @param cs_flags update code signing flags if needed @param signer_type output parameter for the code signature's signer type @@ -5403,6 +5417,7 @@ typedef int mpo_vnode_check_setutimes_t( typedef int mpo_vnode_check_signature_t( struct vnode *vp, struct label *label, + cpu_type_t cpu_type, struct cs_blob *cs_blob, unsigned int *cs_flags, unsigned int *signer_type, @@ -6262,56 +6277,6 @@ typedef int mpo_kext_check_query_t( kauth_cred_t cred ); -/** - @brief Access control check for getting NVRAM variables. - @param cred Subject credential - @param name NVRAM variable to get - - Determine whether the subject identifier by the credential can get the - value of the named NVRAM variable. - - @return Return 0 if access is granted, otherwise an appropriate value for - errno should be returned. Suggested failure: EPERM for lack of privilege. -*/ -typedef int mpo_iokit_check_nvram_get_t( - kauth_cred_t cred, - const char *name -); - -/** - @brief Access control check for setting NVRAM variables. - @param cred Subject credential - @param name NVRAM variable to set - @param value The new value for the NVRAM variable - - Determine whether the subject identifier by the credential can set the - value of the named NVRAM variable. - - @return Return 0 if access is granted, otherwise an appropriate value for - errno should be returned. Suggested failure: EPERM for lack of privilege. 
-*/ -typedef int mpo_iokit_check_nvram_set_t( - kauth_cred_t cred, - const char *name, - io_object_t value -); - -/** - @brief Access control check for deleting NVRAM variables. - @param cred Subject credential - @param name NVRAM variable to delete - - Determine whether the subject identifier by the credential can delete the - named NVRAM variable. - - @return Return 0 if access is granted, otherwise an appropriate value for - errno should be returned. Suggested failure: EPERM for lack of privilege. -*/ -typedef int mpo_iokit_check_nvram_delete_t( - kauth_cred_t cred, - const char *name -); - /* * Placeholder for future events that may need mac hooks. */ @@ -6323,7 +6288,7 @@ typedef void mpo_reserved_hook_t(void); * Please note that this should be kept in sync with the check assumptions * policy in bsd/kern/policy_check.c (policy_ops struct). */ -#define MAC_POLICY_OPS_VERSION 53 /* inc when new reserved slots are taken */ +#define MAC_POLICY_OPS_VERSION 55 /* inc when new reserved slots are taken */ struct mac_policy_ops { mpo_audit_check_postselect_t *mpo_audit_check_postselect; mpo_audit_check_preselect_t *mpo_audit_check_preselect; @@ -6462,9 +6427,9 @@ struct mac_policy_ops { mpo_proc_check_inherit_ipc_ports_t *mpo_proc_check_inherit_ipc_ports; mpo_vnode_check_rename_t *mpo_vnode_check_rename; mpo_kext_check_query_t *mpo_kext_check_query; - mpo_iokit_check_nvram_get_t *mpo_iokit_check_nvram_get; - mpo_iokit_check_nvram_set_t *mpo_iokit_check_nvram_set; - mpo_iokit_check_nvram_delete_t *mpo_iokit_check_nvram_delete; + mpo_proc_notify_exec_complete_t *mpo_proc_notify_exec_complete; + mpo_reserved_hook_t *mpo_reserved5; + mpo_reserved_hook_t *mpo_reserved6; mpo_proc_check_expose_task_t *mpo_proc_check_expose_task; mpo_proc_check_set_host_special_port_t *mpo_proc_check_set_host_special_port; mpo_proc_check_set_host_exception_port_t *mpo_proc_check_set_host_exception_port; diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 9c9b36c66..b18fc092c 100644 --- 
a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -1111,6 +1111,7 @@ mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob, char *vn_path = NULL; vm_size_t vn_pathlen = MAXPATHLEN; + cpu_type_t cpu_type = (imgp == NULL) ? CPU_TYPE_ANY : imgp->ip_origcputype; #if SECURITY_MAC_CHECK_ENFORCE @@ -1119,7 +1120,7 @@ mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob, return 0; #endif - MAC_CHECK(vnode_check_signature, vp, vp->v_label, cs_blob, + MAC_CHECK(vnode_check_signature, vp, vp->v_label, cpu_type, cs_blob, cs_flags, signer_type, flags, &fatal_failure_desc, &fatal_failure_desc_len); if (fatal_failure_desc_len) { diff --git a/tools/tests/darwintests/Makefile b/tests/Makefile similarity index 79% rename from tools/tests/darwintests/Makefile rename to tests/Makefile index 24560ab6c..019b19433 100644 --- a/tools/tests/darwintests/Makefile +++ b/tests/Makefile @@ -14,7 +14,7 @@ OTHER_LTE_INCLUDE_FILES += \ /usr/local/lib/libdarwintest_utils.dylib, \ /usr/lib/libapple_crypto.dylib, -DEVELOPER_DIR ?= /Applications/Xcode.app/Contents/Developer/ +DEVELOPER_DIR ?= $(shell xcode-select -p) # the xnu build system will only ever call us with the default target .DEFAULT_GOAL := install @@ -24,8 +24,9 @@ include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common OTHER_CFLAGS = -Weverything -Wno-gnu-union-cast -Wno-missing-field-initializers -Wno-partial-availability OTHER_CFLAGS += -Wno-missing-noreturn -Wno-vla -Wno-reserved-id-macro -Wno-documentation-unknown-command OTHER_CFLAGS += -Wno-padded -Wno-used-but-marked-unused -Wno-covered-switch-default -Wno-nullability-extension +OTHER_CFLAGS += -Wno-gnu-empty-initializer -Wno-unused-macros OTHER_CFLAGS += --std=gnu11 -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders -OTHER_CFLAGS += -DT_NAMESPACE_PREFIX=xnu +OTHER_CFLAGS += -UT_NAMESPACE_PREFIX -DT_NAMESPACE_PREFIX=xnu OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks CODESIGN:=$(shell xcrun 
-sdk "$(TARGETSDK)" -find codesign) @@ -54,7 +55,7 @@ backtracing: OTHER_LDFLAGS += -framework CoreSymbolication data_protection: OTHER_LDFLAGS += -ldarwintest_utils -framework IOKit kdebug: INVALID_ARCHS = i386 -kdebug: OTHER_LDFLAGS = -framework ktrace +kdebug: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils -framework kperf EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c @@ -69,9 +70,16 @@ perf_compressor: OTHER_CFLAGS += $(CONFIG_FREEZE_DEFINE) perf_compressor: OTHER_LDFLAGS += -ldarwintest_utils perf_compressor: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist +memorystatus_freeze_test: OTHER_CFLAGS += $(CONFIG_FREEZE_DEFINE) +memorystatus_freeze_test: OTHER_LDFLAGS += -ldarwintest_utils + +stackshot: OTHER_CFLAGS += -Wno-objc-messaging-id stackshot: OTHER_LDFLAGS += -lkdd -framework Foundation stackshot: INVALID_ARCHS = i386 +telemetry: OTHER_LDFLAGS = -framework ktrace +telemetry: INVALID_ARCHS = i386 + memorystatus_zone_test: INVALID_ARCHS = i386 memorystatus_zone_test: OTHER_CFLAGS += -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders memorystatus_zone_test: OTHER_LDFLAGS += -framework ktrace @@ -96,6 +104,7 @@ mach_get_times: OTHER_LDFLAGS += -ldarwintest_utils monotonic_core: OTHER_LDFLAGS += -framework ktrace monotonic_core: INVALID_ARCHS = i386 +perf_exit: perf_exit_proc perf_exit: OTHER_LDFLAGS = -framework ktrace perf_exit: INVALID_ARCHS = i386 perf_exit: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist @@ -108,6 +117,9 @@ os_thread_self_restrict: CODE_SIGN_ENTITLEMENTS=os_thread_self_restrict-entitlem task_inspect: CODE_SIGN_ENTITLEMENTS = task_inspect.entitlements task_inspect: OTHER_CFLAGS += -DENTITLED=1 +turnstile_multihop: OTHER_CFLAGS += -Wno-unused-macros +turnstile_multihop: OTHER_CFLAGS += -I $(OBJROOT)/ + CUSTOM_TARGETS += perf_exit_proc perf_exit_proc: @@ -130,13 +142,26 @@ endif all: $(DSTROOT)/usr/local/bin/kcdata -$(DSTROOT)/usr/local/bin/kcdata: 
$(SRCROOT)/../../lldbmacros/kcdata.py +$(DSTROOT)/usr/local/bin/kcdata: $(SRCROOT)/../tools/lldbmacros/kcdata.py mkdir -p $(dir $@) cp $< $@ chmod a+x $@ xnu_quick_test: OTHER_CFLAGS += xnu_quick_test_helpers.c +xnu_quick_test_entitled: CODE_SIGN_ENTITLEMENTS = xnu_quick_test.entitlements + +CUSTOM_TARGETS += vm_set_max_addr_helper + +vm_set_max_addr_helper: vm_set_max_addr_helper.c + $(CC) $(OTHER_CFLAGS) $(CFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) vm_set_max_addr_helper.c -o $(SYMROOT)/vm_set_max_addr_helper; \ + echo $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; \ + env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; + +install-vm_set_max_addr_helper: vm_set_max_addr_helper + mkdir -p $(INSTALLDIR) + cp $(SYMROOT)/vm_set_max_addr_helper $(INSTALLDIR)/ + ifeq ($(PLATFORM),iPhoneOS) OTHER_TEST_TARGETS += jumbo_va_spaces_28530648_unentitled jumbo_va_spaces_28530648: CODE_SIGN_ENTITLEMENTS = jumbo_va_spaces_28530648.entitlements @@ -153,6 +178,8 @@ task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist proc_info: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist proc_info: OTHER_LDFLAGS += -ldarwintest_utils +proc_info_list_kthreads: CODE_SIGN_ENTITLEMENTS = ./proc_info_list_kthreads.entitlements + disk_mount_conditioner: disk_mount_conditioner* disk_mount_conditioner: CODE_SIGN_ENTITLEMENTS=./disk_mount_conditioner-entitlements.plist disk_mount_conditioner: OTHER_LDFLAGS += -ldarwintest_utils @@ -179,6 +206,8 @@ task_info: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist socket_bind_35243417: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist socket_bind_35685803: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist +net_tuntests: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist + ifneq (osx,$(TARGET_NAME)) EXCLUDED_SOURCES += no32exec_35914211.c no32exec_35914211_helper.c endif @@ -186,6 +215,21 @@ endif no32exec_35914211_helper: INVALID_ARCHS = x86_64 
no32exec_35914211: INVALID_ARCHS = i386 +MIG:=SDKROOT=$(SDKROOT) $(shell xcrun -sdk "$(TARGETSDK)" -find mig) + +CUSTOM_TARGETS += excserver + +excserver: + $(MIG) $(CFLAGS) \ + -sheader $(OBJROOT)/excserver.h \ + -server $(OBJROOT)/excserver.c \ + -header /dev/null -user /dev/null \ + excserver.defs +install-excserver: ; + +exc_resource_threads: excserver +exc_resource_threads: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT) + ifneq ($(PLATFORM),BridgeOS) EXCLUDED_SOURCES += remote_time.c else diff --git a/tools/tests/darwintests/atm_diagnostic_flag.c b/tests/atm_diagnostic_flag.c similarity index 100% rename from tools/tests/darwintests/atm_diagnostic_flag.c rename to tests/atm_diagnostic_flag.c diff --git a/tools/tests/darwintests/avx.c b/tests/avx.c similarity index 100% rename from tools/tests/darwintests/avx.c rename to tests/avx.c diff --git a/tools/tests/darwintests/backtracing.c b/tests/backtracing.c similarity index 100% rename from tools/tests/darwintests/backtracing.c rename to tests/backtracing.c diff --git a/tools/tests/darwintests/contextswitch.c b/tests/contextswitch.c similarity index 98% rename from tools/tests/darwintests/contextswitch.c rename to tests/contextswitch.c index b059be9a3..3969ead2e 100644 --- a/tools/tests/darwintests/contextswitch.c +++ b/tests/contextswitch.c @@ -190,7 +190,7 @@ void record_perfcontrol_stats(const char *sysctlname, const char *units, const c T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler")); /* Disable the test on MacOS for now */ -T_DECL(perf_csw, "context switch performance", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO), T_META_ASROOT(YES)) +T_DECL(perf_csw, "context switch performance", T_META_TAG_PERF, T_META_CHECK_LEAKS(false), T_META_ASROOT(true)) { #if !CONFIG_EMBEDDED diff --git a/tools/tests/darwintests/cpucount.c b/tests/cpucount.c similarity index 99% rename from tools/tests/darwintests/cpucount.c rename to tests/cpucount.c index bd0548a96..47159c1c9 100644 --- a/tools/tests/darwintests/cpucount.c +++ 
b/tests/cpucount.c @@ -203,7 +203,7 @@ spin_fn(__unused void *arg) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wgnu-flexible-array-initializer" T_DECL(count_cpus, "Tests we can schedule threads on all hw.ncpus cores according to _os_cpu_number", - T_META_CHECK_LEAKS(NO)) + T_META_CHECK_LEAKS(false), T_META_ENABLED(false)) #pragma clang diagnostic pop { setvbuf(stdout, NULL, _IONBF, 0); diff --git a/tools/tests/darwintests/data_protection.c b/tests/data_protection.c similarity index 100% rename from tools/tests/darwintests/data_protection.c rename to tests/data_protection.c diff --git a/tools/tests/darwintests/disk_mount_conditioner-entitlements.plist b/tests/disk_mount_conditioner-entitlements.plist similarity index 100% rename from tools/tests/darwintests/disk_mount_conditioner-entitlements.plist rename to tests/disk_mount_conditioner-entitlements.plist diff --git a/tools/tests/darwintests/disk_mount_conditioner.c b/tests/disk_mount_conditioner.c similarity index 69% rename from tools/tests/darwintests/disk_mount_conditioner.c rename to tests/disk_mount_conditioner.c index 5847149e0..fc3db9f89 100644 --- a/tools/tests/darwintests/disk_mount_conditioner.c +++ b/tests/disk_mount_conditioner.c @@ -70,6 +70,11 @@ T_DECL(fsctl_set, info.read_throughput_mbps = 40; info.write_throughput_mbps = 40; info.is_ssd = 0; + info.ioqueue_depth = 8; + info.maxreadcnt = 8; + info.maxwritecnt = 8; + info.segreadcnt = 8; + info.segwritecnt = 8; expected_info = info; err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0); @@ -84,6 +89,128 @@ T_DECL(fsctl_set, T_ASSERT_EQ_INT(0, err, "fsctl.get is the info configured by fsctl.set"); } +static void +verify_mount_fallback_values(const char *mount_path, disk_conditioner_info *info) +{ + int err; + disk_conditioner_info newinfo = {0}; + + err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, info, 0); + T_WITH_ERRNO; + T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)"); + + err = fsctl(mount_path, 
DISK_CONDITIONER_IOC_GET, &newinfo, 0); + T_WITH_ERRNO; + T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET) after SET"); + + // without querying the drive for the expected values, the best we can do is + // assert that they are not zero (impossible) or less than UINT32_MAX (unlikely) + T_ASSERT_GT(newinfo.ioqueue_depth, 0u, "ioqueue_depth is the value from the mount"); + T_ASSERT_GT(newinfo.maxreadcnt, 0u, "maxreadcnt is value from the mount"); + T_ASSERT_GT(newinfo.maxwritecnt, 0u, "maxwritecnt is value from the mount"); + T_ASSERT_GT(newinfo.segreadcnt, 0u, "segreadcnt is value from the mount"); + T_ASSERT_GT(newinfo.segwritecnt, 0u, "segwritecnt is value from the mount"); + T_ASSERT_LT(newinfo.ioqueue_depth, UINT32_MAX, "ioqueue_depth is the value from the mount"); + T_ASSERT_LT(newinfo.maxreadcnt, UINT32_MAX, "maxreadcnt is value from the mount"); + T_ASSERT_LT(newinfo.maxwritecnt, UINT32_MAX, "maxwritecnt is value from the mount"); + T_ASSERT_LT(newinfo.segreadcnt, UINT32_MAX, "segreadcnt is value from the mount"); + T_ASSERT_LT(newinfo.segwritecnt, UINT32_MAX, "segwritecnt is value from the mount"); +} + +T_DECL(fsctl_set_zero, + "fsctl.set zero values should fall back to original mount settings") +{ + char *mount_path; + disk_conditioner_info info = {0}; + + T_SETUPBEGIN; + mount_path = mktempmount(); + + info.enabled = 1; + /* everything else is 0 */ + + T_SETUPEND; + + verify_mount_fallback_values(mount_path, &info); +} + +T_DECL(fsctl_set_out_of_bounds, + "fsctl.set out-of-bounds values should fall back to original mount settings") +{ + char *mount_path; + disk_conditioner_info info; + + T_SETUPBEGIN; + mount_path = mktempmount(); + + memset(&info, UINT32_MAX, sizeof(info)); + info.enabled = 1; + info.access_time_usec = 0; + info.read_throughput_mbps = 0; + info.write_throughput_mbps = 0; + /* everything else is UINT32_MAX */ + + T_SETUPEND; + + verify_mount_fallback_values(mount_path, &info); +} + +T_DECL(fsctl_restore_mount_fields, + "fsctl.set 
should restore fields on mount_t that it temporarily overrides") +{ + int err; + char *mount_path; + disk_conditioner_info info; + disk_conditioner_info mount_fields; + + T_SETUPBEGIN; + mount_path = mktempmount(); + T_SETUPEND; + + /* first set out-of-bounds values to retrieve the original mount_t fields */ + memset(&info, UINT32_MAX, sizeof(info)); + info.enabled = 1; + info.access_time_usec = 0; + info.read_throughput_mbps = 0; + info.write_throughput_mbps = 0; + /* everything else is UINT32_MAX */ + err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0); + T_WITH_ERRNO; + T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)"); + + err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &mount_fields, 0); + T_WITH_ERRNO; + T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET)"); + + /* now turn off the disk conditioner which should restore fields on the mount_t */ + memset(&info, 1, sizeof(info)); + info.enabled = 0; + err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0); + T_WITH_ERRNO; + T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)"); + + /* and finally set out-of-bounds values again to retrieve the new mount_t fields which should not have changed */ + memset(&info, UINT32_MAX, sizeof(info)); + info.enabled = 0; + info.access_time_usec = 0; + info.read_throughput_mbps = 0; + info.write_throughput_mbps = 0; + /* everything else is UINT32_MAX */ + err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0); + T_WITH_ERRNO; + T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)"); + + err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0); + T_WITH_ERRNO; + T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET)"); + + T_ASSERT_EQ(info.maxreadcnt, mount_fields.maxreadcnt, "mount_t maxreadcnt restored"); + T_ASSERT_EQ(info.maxwritecnt, mount_fields.maxwritecnt, "mount_t maxwritecnt restored"); + T_ASSERT_EQ(info.segreadcnt, mount_fields.segreadcnt, "mount_t segreadcnt restored"); + T_ASSERT_EQ(info.segwritecnt, 
mount_fields.segwritecnt, "mount_t segwritecnt restored"); + T_ASSERT_EQ(info.ioqueue_depth, mount_fields.ioqueue_depth, "mount_t ioqueue_depth restored"); +} + T_DECL(fsctl_get_nonroot, "fsctl.get should not require root", T_META_ASROOT(false)) @@ -150,7 +277,7 @@ T_DECL(fsctl_delays, int fd; int err; uint64_t elapsed_nsec, expected_nsec; - disk_conditioner_info info; + disk_conditioner_info info = {0}; char buf[READSIZE]; T_SETUPBEGIN; diff --git a/tools/tests/darwintests/drop_priv.c b/tests/drop_priv.c similarity index 100% rename from tools/tests/darwintests/drop_priv.c rename to tests/drop_priv.c diff --git a/tests/exc_resource_threads.c b/tests/exc_resource_threads.c new file mode 100644 index 000000000..4b247c6bc --- /dev/null +++ b/tests/exc_resource_threads.c @@ -0,0 +1,175 @@ +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +static dispatch_semaphore_t sync_sema; + +kern_return_t +catch_mach_exception_raise(mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count) +{ +#pragma unused(exception_port, thread, task, code, code_count) + pid_t pid; + pid_for_task(task, &pid); + T_ASSERT_EQ(exception, EXC_CORPSE_NOTIFY, "exception type"); + T_ASSERT_POSIX_ZERO(kill(pid, SIGKILL), "kill"); + dispatch_semaphore_signal(sync_sema); + return KERN_SUCCESS; +} + +kern_return_t +catch_mach_exception_raise_state(mach_port_t exception_port, + exception_type_t exception, + const mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + const thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + 
T_FAIL("Unsupported catch_mach_exception_raise_state"); + return KERN_NOT_SUPPORTED; +} + +kern_return_t +catch_mach_exception_raise_state_identity(mach_port_t exception_port, + mach_port_t thread, + mach_port_t task, + exception_type_t exception, + mach_exception_data_t code, + mach_msg_type_number_t code_count, + int * flavor, + thread_state_t old_state, + mach_msg_type_number_t old_state_count, + thread_state_t new_state, + mach_msg_type_number_t * new_state_count) +{ +#pragma unused(exception_port, thread, task, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count) + T_FAIL("Unsupported catch_mach_exception_raise_state_identity"); + return KERN_NOT_SUPPORTED; +} + + +/* + * setup exception handling port for EXC_CORPSE_NOTIFY. + * runs mach_msg_server once for receiving exception messages from kernel. + */ +static void * +exc_handler(void * arg) +{ +#pragma unused(arg) + kern_return_t kret; + mach_port_t exception_port; + + kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exception_port); + if (kret != KERN_SUCCESS) + T_FAIL("mach_port_allocate: %s (%d)", mach_error_string(kret), kret); + + kret = mach_port_insert_right(mach_task_self(), exception_port, exception_port, MACH_MSG_TYPE_MAKE_SEND); + if (kret != KERN_SUCCESS) + T_FAIL("mach_port_insert_right: %s (%d)", mach_error_string(kret), kret); + + kret = task_set_exception_ports(mach_task_self(), EXC_MASK_CRASH | EXC_MASK_CORPSE_NOTIFY, exception_port, + (exception_behavior_t)(EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES), 0); + if (kret != KERN_SUCCESS) + T_FAIL("task_set_exception_ports: %s (%d)", mach_error_string(kret), kret); + + dispatch_semaphore_signal(sync_sema); + + kret = mach_msg_server(mach_exc_server, MACH_MSG_SIZE_RELIABLE, exception_port, 0); + if (kret != KERN_SUCCESS) + T_FAIL("mach_msg_server: %s (%d)", mach_error_string(kret), kret); + + return NULL; +} + +static void* +dummy_thread(void *arg) { +#pragma unused(arg) + while (1) { + 
sleep(60); + } +} + +#define THREAD_LIMIT 2 + +T_HELPER_DECL(exc_resource_helper, "exc_resource helper") +{ + pthread_t tid; + for (int i = 0; i < THREAD_LIMIT; i++) { + T_QUIET; + T_EXPECT_POSIX_SUCCESS(pthread_create(&tid, NULL, dummy_thread, NULL), "pthread_create"); + } + while (1) { + sleep(60); + } +} + +static void +check_exc_resource_threads_enabled() +{ + int err; + int enabled; + size_t enabled_size = sizeof(enabled); + err = sysctlbyname("kern.exc_resource_threads_enabled", &enabled, &enabled_size, NULL, 0); + + if (err || !enabled) + T_SKIP("EXC_RESOURCE RESOURCE_TYPE_THREADS not enabled on this system"); + +} + +T_DECL(exc_resource_threads, "Ensures that a process with a thread_limit set will receive an exc_resource when it crosses its thread limit", + T_META_ASROOT(true), + T_META_CHECK_LEAKS(false)) +{ + pthread_t handle_thread; + + check_exc_resource_threads_enabled(); + + sync_sema = dispatch_semaphore_create(0); + + T_ASSERT_POSIX_ZERO(pthread_create(&handle_thread, NULL, exc_handler, NULL), "pthread_create"); + dispatch_semaphore_wait(sync_sema, DISPATCH_TIME_FOREVER); + + pid_t helper_pid; + char path[PATH_MAX]; + uint32_t path_size = sizeof(path); + + T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath"); + + char *args[] = { path, "-n", "exc_resource_helper", NULL }; + + posix_spawnattr_t attr; + T_ASSERT_POSIX_ZERO(posix_spawnattr_init(&attr), "posix_spawnattr_init"); + + T_EXPECT_POSIX_ZERO(posix_spawnattr_set_threadlimit_ext(&attr, THREAD_LIMIT), "posix_spawnattr_set_threadlimit_ext"); + + T_EXPECT_POSIX_ZERO(posix_spawn(&helper_pid, args[0], NULL, &attr, args, NULL), "posix_spawn"); + + T_ASSERT_POSIX_ZERO(posix_spawnattr_destroy(&attr), "posix_spawnattr_destroy"); + + dispatch_semaphore_wait(sync_sema, DISPATCH_TIME_FOREVER); +} diff --git a/tests/excserver.defs b/tests/excserver.defs new file mode 100644 index 000000000..e528df455 --- /dev/null +++ b/tests/excserver.defs @@ -0,0 +1 @@ +#include diff --git 
a/tools/tests/darwintests/freebsd_waitpid_nohang.c b/tests/freebsd_waitpid_nohang.c similarity index 100% rename from tools/tests/darwintests/freebsd_waitpid_nohang.c rename to tests/freebsd_waitpid_nohang.c diff --git a/tools/tests/darwintests/gettimeofday.c b/tests/gettimeofday.c similarity index 100% rename from tools/tests/darwintests/gettimeofday.c rename to tests/gettimeofday.c diff --git a/tools/tests/darwintests/gettimeofday_29192647.c b/tests/gettimeofday_29192647.c similarity index 97% rename from tools/tests/darwintests/gettimeofday_29192647.c rename to tests/gettimeofday_29192647.c index bd7b66159..f580c2ffb 100644 --- a/tools/tests/darwintests/gettimeofday_29192647.c +++ b/tests/gettimeofday_29192647.c @@ -8,6 +8,8 @@ #include #include +T_GLOBAL_META(T_META_TAG_PERF); + T_DECL(gettimeofday_tl, "gettimeofday performance in tight loop") { { struct timeval time; diff --git a/tools/tests/darwintests/host_notifications.c b/tests/host_notifications.c similarity index 100% rename from tools/tests/darwintests/host_notifications.c rename to tests/host_notifications.c diff --git a/tools/tests/darwintests/host_statistics_rate_limiting.c b/tests/host_statistics_rate_limiting.c similarity index 100% rename from tools/tests/darwintests/host_statistics_rate_limiting.c rename to tests/host_statistics_rate_limiting.c diff --git a/tools/tests/darwintests/ioperf.c b/tests/ioperf.c similarity index 99% rename from tools/tests/darwintests/ioperf.c rename to tests/ioperf.c index c2586ac53..1eb2e8cf2 100644 --- a/tools/tests/darwintests/ioperf.c +++ b/tests/ioperf.c @@ -196,7 +196,7 @@ perform_io(dt_stat_time_t stat) close(test_file_fd); } -T_GLOBAL_META(T_META_NAMESPACE("xnu.io")); +T_GLOBAL_META(T_META_NAMESPACE("xnu.io"), T_META_TAG_PERF); /* Disable the test on MacOS for now */ T_DECL(read_perf, "Sequential Uncached Read Performance", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO), T_META_ASROOT(YES), T_META_LTEPHASE(LTE_POSTINIT)) diff --git 
a/tools/tests/darwintests/jumbo_va_spaces_28530648.c b/tests/jumbo_va_spaces_28530648.c similarity index 100% rename from tools/tests/darwintests/jumbo_va_spaces_28530648.c rename to tests/jumbo_va_spaces_28530648.c diff --git a/tools/tests/darwintests/jumbo_va_spaces_28530648.entitlements b/tests/jumbo_va_spaces_28530648.entitlements similarity index 100% rename from tools/tests/darwintests/jumbo_va_spaces_28530648.entitlements rename to tests/jumbo_va_spaces_28530648.entitlements diff --git a/tests/kdebug.c b/tests/kdebug.c new file mode 100644 index 000000000..6be5164d1 --- /dev/null +++ b/tests/kdebug.c @@ -0,0 +1,1101 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.ktrace"), + T_META_ASROOT(true)); + +#define KDBG_TEST_MACROS 1 +#define KDBG_TEST_OLD_TIMES 2 + +static void +assert_kdebug_test(unsigned int flavor) +{ + size_t size = flavor; + int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDTEST }; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, sizeof(mib) / sizeof(mib[0]), NULL, + &size, NULL, 0), "KERN_KDTEST sysctl"); +} + +#pragma mark kdebug syscalls + +#define TRACE_DEBUGID (0xfedfed00U) + +T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){}); + + __block int events_seen = 0; + ktrace_events_single(s, TRACE_DEBUGID, ^void(struct trace_point *tp) { + events_seen++; + T_PASS("saw traced event"); + + T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of traced event is correct"); + T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of traced event is correct"); + T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of traced event is correct"); + T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of traced event is correct"); + + 
ktrace_end(s, 1); + }); + + ktrace_set_completion_handler(s, ^{ + T_EXPECT_GE(events_seen, 1, NULL); + ktrace_session_destroy(s); + T_END; + }); + + ktrace_filter_pid(s, getpid()); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 1, 2, 3, 4), NULL); + ktrace_end(s, 0); + + dispatch_main(); +} + +#define SIGNPOST_SINGLE_CODE (0x10U) +#define SIGNPOST_PAIRED_CODE (0x20U) + +T_DECL(kdebug_signpost_syscall, + "test that kdebug_signpost(2) emits correct events") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + __block int single_seen = 0; + __block int paired_seen = 0; + + /* make sure to get enough events for the KDBUFWAIT to trigger */ + // ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){}); + ktrace_events_single(s, + APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_SINGLE_CODE), + ^(struct trace_point *tp) { + single_seen++; + T_PASS("single signpost is traced"); + + T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of single signpost is correct"); + T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of single signpost is correct"); + T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of single signpost is correct"); + T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of single signpost is correct"); + }); + + ktrace_events_single_paired(s, + APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_PAIRED_CODE), + ^(struct trace_point *start, struct trace_point *end) { + paired_seen++; + T_PASS("paired signposts are traced"); + + T_EXPECT_EQ(start->arg1, 5UL, "argument 1 of start signpost is correct"); + T_EXPECT_EQ(start->arg2, 6UL, "argument 2 of start signpost is correct"); + T_EXPECT_EQ(start->arg3, 7UL, "argument 3 of start signpost is correct"); + T_EXPECT_EQ(start->arg4, 8UL, "argument 4 of start signpost is correct"); + + T_EXPECT_EQ(end->arg1, 9UL, "argument 1 of end signpost is correct"); + T_EXPECT_EQ(end->arg2, 10UL, "argument 2 of end signpost is correct"); 
+ T_EXPECT_EQ(end->arg3, 11UL, "argument 3 of end signpost is correct"); + T_EXPECT_EQ(end->arg4, 12UL, "argument 4 of end signpost is correct"); + + T_EXPECT_EQ(single_seen, 1, + "signposts are traced in the correct order"); + + ktrace_end(s, 1); + }); + + ktrace_set_completion_handler(s, ^(void) { + T_QUIET; T_EXPECT_NE(single_seen, 0, + "did not see single tracepoint before timeout"); + T_QUIET; T_EXPECT_NE(paired_seen, 0, + "did not see single tracepoint before timeout"); + ktrace_session_destroy(s); + T_END; + }); + + ktrace_filter_pid(s, getpid()); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), + "started tracing"); + + T_EXPECT_POSIX_SUCCESS(kdebug_signpost(SIGNPOST_SINGLE_CODE, 1, 2, 3, 4), + "emitted single signpost"); + T_EXPECT_POSIX_SUCCESS( + kdebug_signpost_start(SIGNPOST_PAIRED_CODE, 5, 6, 7, 8), + "emitted start signpost"); + T_EXPECT_POSIX_SUCCESS( + kdebug_signpost_end(SIGNPOST_PAIRED_CODE, 9, 10, 11, 12), + "emitted end signpost"); + ktrace_end(s, 0); + + dispatch_main(); +} + +#pragma mark kdebug behaviors + +#define WRAPPING_EVENTS_COUNT (150000) +#define TRACE_ITERATIONS (5000) +#define WRAPPING_EVENTS_THRESHOLD (100) + +T_DECL(wrapping, + "ensure that wrapping traces lost events and no events prior to the wrap", + T_META_CHECK_LEAKS(false)) +{ + int mib[4]; + kbufinfo_t buf_info; + int wait_wrapping_secs = (WRAPPING_EVENTS_COUNT / TRACE_ITERATIONS) + 5; + int current_secs = wait_wrapping_secs; + + /* use sysctls manually to bypass libktrace assumptions */ + + mib[0] = CTL_KERN; mib[1] = KERN_KDEBUG; mib[2] = KERN_KDSETUP; mib[3] = 0; + size_t needed = 0; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, NULL, &needed, NULL, 0), + "KERN_KDSETUP"); + + mib[2] = KERN_KDSETBUF; mib[3] = WRAPPING_EVENTS_COUNT; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDSETBUF"); + + mib[2] = KERN_KDENABLE; mib[3] = 1; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDENABLE"); + + /* wrapping is on by default */ 
+ + /* wait until wrapped */ + T_LOG("waiting for trace to wrap"); + mib[2] = KERN_KDGETBUF; + needed = sizeof(buf_info); + do { + sleep(1); + for (int i = 0; i < TRACE_ITERATIONS; i++) { + T_QUIET; + T_ASSERT_POSIX_SUCCESS(kdebug_trace(0xfefe0000, 0, 0, 0, 0), NULL); + } + T_QUIET; + T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, &buf_info, &needed, NULL, 0), + NULL); + } while (!(buf_info.flags & KDBG_WRAPPED) && --current_secs > 0); + + T_ASSERT_TRUE(buf_info.flags & KDBG_WRAPPED, + "trace wrapped (after %d seconds within %d second timeout)", + wait_wrapping_secs - current_secs, wait_wrapping_secs); + + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_ASSERT_NOTNULL(s, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(s), NULL); + + __block int events = 0; + + ktrace_events_all(s, ^(struct trace_point *tp) { + if (events == 0) { + T_EXPECT_EQ(tp->debugid, (unsigned int)TRACE_LOST_EVENTS, + "first event's debugid 0x%08x (%s) should be TRACE_LOST_EVENTS", + tp->debugid, + ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK)); + } else { + T_QUIET; + T_EXPECT_NE(tp->debugid, (unsigned int)TRACE_LOST_EVENTS, + "event debugid 0x%08x (%s) should not be TRACE_LOST_EVENTS", + tp->debugid, + ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK)); + } + + events++; + if (events > WRAPPING_EVENTS_THRESHOLD) { + ktrace_end(s, 1); + } + }); + + ktrace_set_completion_handler(s, ^{ + ktrace_session_destroy(s); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), + "started tracing"); + + dispatch_main(); +} + +T_DECL(reject_old_events, + "ensure that kdebug rejects events from before tracing began", + T_META_CHECK_LEAKS(false)) +{ + __block uint64_t event_horizon_ts; + + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + __block int events = 0; + ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0), + KDBG_EVENTID(DBG_BSD + 1, 0, 0), ^(struct 
trace_point *tp) { + events++; + T_EXPECT_GT(tp->timestamp, event_horizon_ts, + "events in trace should be from after tracing began"); + }); + + ktrace_set_completion_handler(s, ^{ + T_EXPECT_EQ(events, 2, "should see only two events"); + ktrace_session_destroy(s); + T_END; + }); + + event_horizon_ts = mach_absolute_time(); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + /* first, try an old event at the beginning of trace */ + assert_kdebug_test(KDBG_TEST_OLD_TIMES); + /* after a good event has been traced, old events should be rejected */ + assert_kdebug_test(KDBG_TEST_OLD_TIMES); + ktrace_end(s, 0); + + dispatch_main(); +} + +#define ORDERING_TIMEOUT_SEC 5 + +T_DECL(ascending_time_order, + "ensure that kdebug events are in ascending order based on time", + T_META_CHECK_LEAKS(false)) +{ + __block uint64_t prev_ts = 0; + __block uint32_t prev_debugid = 0; + __block unsigned int prev_cpu = 0; + __block bool in_order = true; + + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + ktrace_events_all(s, ^(struct trace_point *tp) { + if (tp->timestamp < prev_ts) { + in_order = false; + T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)", + prev_ts, prev_debugid, prev_cpu); + T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)", + tp->timestamp, tp->debugid, tp->cpuid); + ktrace_end(s, 1); + } + }); + + ktrace_set_completion_handler(s, ^{ + ktrace_session_destroy(s); + T_EXPECT_TRUE(in_order, "event timestamps were in-order"); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), + "started tracing"); + + /* try to inject old timestamps into trace */ + assert_kdebug_test(KDBG_TEST_OLD_TIMES); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, ORDERING_TIMEOUT_SEC * NSEC_PER_SEC), + dispatch_get_main_queue(), ^{ + T_LOG("ending test after timeout"); + ktrace_end(s, 1); + }); + + dispatch_main(); +} + +#pragma mark dyld tracing + +__attribute__((aligned(8))) + static 
const char map_uuid[16] = "map UUID"; + +__attribute__((aligned(8))) + static const char unmap_uuid[16] = "unmap UUID"; + +__attribute__((aligned(8))) + static const char sc_uuid[16] = "shared UUID"; + + static fsid_t map_fsid = { .val = { 42, 43 } }; +static fsid_t unmap_fsid = { .val = { 44, 45 } }; +static fsid_t sc_fsid = { .val = { 46, 47 } }; + +static fsobj_id_t map_fsobjid = { .fid_objno = 42, .fid_generation = 43 }; +static fsobj_id_t unmap_fsobjid = { .fid_objno = 44, .fid_generation = 45 }; +static fsobj_id_t sc_fsobjid = { .fid_objno = 46, .fid_generation = 47 }; + +#define MAP_LOAD_ADDR 0xabadcafe +#define UNMAP_LOAD_ADDR 0xfeedface +#define SC_LOAD_ADDR 0xfedfaced + +__unused +static void +expect_dyld_image_info(struct trace_point *tp, const uint64_t *exp_uuid, + uint64_t exp_load_addr, fsid_t *exp_fsid, fsobj_id_t *exp_fsobjid, + int order) +{ +#if defined(__LP64__) || defined(__arm64__) + if (order == 0) { + uint64_t uuid[2]; + uint64_t load_addr; + fsid_t fsid; + + uuid[0] = (uint64_t)tp->arg1; + uuid[1] = (uint64_t)tp->arg2; + load_addr = (uint64_t)tp->arg3; + fsid.val[0] = (int32_t)(tp->arg4 & UINT32_MAX); + fsid.val[1] = (int32_t)((uint64_t)tp->arg4 >> 32); + + T_QUIET; T_EXPECT_EQ(uuid[0], exp_uuid[0], NULL); + T_QUIET; T_EXPECT_EQ(uuid[1], exp_uuid[1], NULL); + T_QUIET; T_EXPECT_EQ(load_addr, exp_load_addr, NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL); + } else if (order == 1) { + fsobj_id_t fsobjid; + + fsobjid.fid_objno = (uint32_t)(tp->arg1 & UINT32_MAX); + fsobjid.fid_generation = (uint32_t)((uint64_t)tp->arg1 >> 32); + + T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL); + T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation, + exp_fsobjid->fid_generation, NULL); + } else { + T_ASSERT_FAIL("unrecognized order of events %d", order); + } +#else /* defined(__LP64__) */ + if (order == 0) { + uint32_t uuid[4]; + + uuid[0] = (uint32_t)tp->arg1; + 
uuid[1] = (uint32_t)tp->arg2; + uuid[2] = (uint32_t)tp->arg3; + uuid[3] = (uint32_t)tp->arg4; + + T_QUIET; T_EXPECT_EQ(uuid[0], (uint32_t)exp_uuid[0], NULL); + T_QUIET; T_EXPECT_EQ(uuid[1], (uint32_t)(exp_uuid[0] >> 32), NULL); + T_QUIET; T_EXPECT_EQ(uuid[2], (uint32_t)exp_uuid[1], NULL); + T_QUIET; T_EXPECT_EQ(uuid[3], (uint32_t)(exp_uuid[1] >> 32), NULL); + } else if (order == 1) { + uint32_t load_addr; + fsid_t fsid; + fsobj_id_t fsobjid; + + load_addr = (uint32_t)tp->arg1; + fsid.val[0] = (int32_t)tp->arg2; + fsid.val[1] = (int32_t)tp->arg3; + fsobjid.fid_objno = (uint32_t)tp->arg4; + + T_QUIET; T_EXPECT_EQ(load_addr, (uint32_t)exp_load_addr, NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL); + T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL); + T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL); + } else if (order == 2) { + fsobj_id_t fsobjid; + + fsobjid.fid_generation = tp->arg1; + + T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation, + exp_fsobjid->fid_generation, NULL); + } else { + T_ASSERT_FAIL("unrecognized order of events %d", order); + } +#endif /* defined(__LP64__) */ +} + +#if defined(__LP64__) || defined(__arm64__) +#define DYLD_CODE_OFFSET (0) +#define DYLD_EVENTS (2) +#else +#define DYLD_CODE_OFFSET (2) +#define DYLD_EVENTS (3) +#endif + +static void +expect_dyld_events(ktrace_session_t s, const char *name, uint32_t base_code, + const char *exp_uuid, uint64_t exp_load_addr, fsid_t *exp_fsid, + fsobj_id_t *exp_fsobjid, uint8_t *saw_events) +{ + for (int i = 0; i < DYLD_EVENTS; i++) { + ktrace_events_single(s, KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, + base_code + DYLD_CODE_OFFSET + (unsigned int)i), + ^(struct trace_point *tp) { + T_LOG("checking %s event %c", name, 'A' + i); + expect_dyld_image_info(tp, (const void *)exp_uuid, exp_load_addr, + exp_fsid, exp_fsobjid, i); + *saw_events |= (1U << i); + }); + } +} + +T_DECL(dyld_events, "test that dyld registering libraries emits events") +{ + 
dyld_kernel_image_info_t info; + + /* + * Use pointers instead of __block variables in order to use these variables + * in the completion block below _and_ pass pointers to them to the + * expect_dyld_events function. + */ + uint8_t saw_events[3] = { 0 }; + uint8_t *saw_mapping = &(saw_events[0]); + uint8_t *saw_unmapping = &(saw_events[1]); + uint8_t *saw_shared_cache = &(saw_events[2]); + + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + T_QUIET; + T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), + "filtered to current process"); + + expect_dyld_events(s, "mapping", DBG_DYLD_UUID_MAP_A, map_uuid, + MAP_LOAD_ADDR, &map_fsid, &map_fsobjid, saw_mapping); + expect_dyld_events(s, "unmapping", DBG_DYLD_UUID_UNMAP_A, unmap_uuid, + UNMAP_LOAD_ADDR, &unmap_fsid, &unmap_fsobjid, saw_unmapping); + expect_dyld_events(s, "shared cache", DBG_DYLD_UUID_SHARED_CACHE_A, + sc_uuid, SC_LOAD_ADDR, &sc_fsid, &sc_fsobjid, saw_shared_cache); + + ktrace_set_completion_handler(s, ^{ + ktrace_session_destroy(s); + + T_EXPECT_EQ(__builtin_popcount(*saw_mapping), DYLD_EVENTS, NULL); + T_EXPECT_EQ(__builtin_popcount(*saw_unmapping), DYLD_EVENTS, NULL); + T_EXPECT_EQ(__builtin_popcount(*saw_shared_cache), DYLD_EVENTS, NULL); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + + info.load_addr = MAP_LOAD_ADDR; + memcpy(info.uuid, map_uuid, sizeof(info.uuid)); + info.fsid = map_fsid; + info.fsobjid = map_fsobjid; + T_EXPECT_MACH_SUCCESS(task_register_dyld_image_infos(mach_task_self(), + &info, 1), "registered dyld image info"); + + info.load_addr = UNMAP_LOAD_ADDR; + memcpy(info.uuid, unmap_uuid, sizeof(info.uuid)); + info.fsid = unmap_fsid; + info.fsobjid = unmap_fsobjid; + T_EXPECT_MACH_SUCCESS(task_unregister_dyld_image_infos(mach_task_self(), + &info, 1), "unregistered dyld image info"); + + info.load_addr = SC_LOAD_ADDR; + memcpy(info.uuid, sc_uuid, sizeof(info.uuid)); + info.fsid 
= sc_fsid; + info.fsobjid = sc_fsobjid; + T_EXPECT_MACH_SUCCESS(task_register_dyld_shared_cache_image_info( + mach_task_self(), info, FALSE, FALSE), + "registered dyld shared cache image info"); + + ktrace_end(s, 0); + + dispatch_main(); +} + +#pragma mark kdebug kernel macros + +#define EXP_KERNEL_EVENTS 5U + +static const uint32_t dev_evts[EXP_KERNEL_EVENTS] = { + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 0), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 1), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 2), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 3), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 4), +}; + +static const uint32_t rel_evts[EXP_KERNEL_EVENTS] = { + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 5), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 6), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 7), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 8), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 9), +}; + +static const uint32_t filt_evts[EXP_KERNEL_EVENTS] = { + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 10), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 11), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 12), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 13), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 14), +}; + +static const uint32_t noprocfilt_evts[EXP_KERNEL_EVENTS] = { + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 15), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 16), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 17), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 18), + BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 19), +}; + +static bool +is_development_kernel(void) +{ + static dispatch_once_t is_development_once; + static bool is_development; + + dispatch_once(&is_development_once, ^{ + int dev; + size_t dev_size = sizeof(dev); + + T_QUIET; + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, + &dev_size, NULL, 0), NULL); + is_development = (dev != 0); + }); + + return is_development; +} + +static void +expect_event(struct trace_point *tp, const char *name, unsigned int *events, + const uint32_t *event_ids, size_t event_ids_len) +{ + unsigned int event_idx = *events; + bool event_found = false; + size_t i; + for (i = 0; i < event_ids_len; i++) { 
+ if (event_ids[i] == (tp->debugid & KDBG_EVENTID_MASK)) { + T_LOG("found %s event 0x%x", name, tp->debugid); + event_found = true; + } + } + + if (!event_found) { + return; + } + + *events += 1; + for (i = 0; i < event_idx; i++) { + T_QUIET; T_EXPECT_EQ(((uint64_t *)&tp->arg1)[i], (uint64_t)i + 1, + NULL); + } + for (; i < 4; i++) { + T_QUIET; T_EXPECT_EQ(((uint64_t *)&tp->arg1)[i], (uint64_t)0, NULL); + } +} + +static void +expect_release_event(struct trace_point *tp, unsigned int *events) +{ + expect_event(tp, "release", events, rel_evts, + sizeof(rel_evts) / sizeof(rel_evts[0])); +} + +static void +expect_development_event(struct trace_point *tp, unsigned int *events) +{ + expect_event(tp, "dev", events, dev_evts, sizeof(dev_evts) / sizeof(dev_evts[0])); +} + +static void +expect_filtered_event(struct trace_point *tp, unsigned int *events) +{ + expect_event(tp, "filtered", events, filt_evts, + sizeof(filt_evts) / sizeof(filt_evts[0])); +} + +static void +expect_noprocfilt_event(struct trace_point *tp, unsigned int *events) +{ + expect_event(tp, "noprocfilt", events, noprocfilt_evts, + sizeof(noprocfilt_evts) / sizeof(noprocfilt_evts[0])); +} + +static void +expect_kdbg_test_events(ktrace_session_t s, bool use_all_callback, + void (^cb)(unsigned int dev_seen, unsigned int rel_seen, + unsigned int filt_seen, unsigned int noprocfilt_seen)) +{ + __block unsigned int dev_seen = 0; + __block unsigned int rel_seen = 0; + __block unsigned int filt_seen = 0; + __block unsigned int noprocfilt_seen = 0; + + void (^evtcb)(struct trace_point *tp) = ^(struct trace_point *tp) { + expect_development_event(tp, &dev_seen); + expect_release_event(tp, &rel_seen); + expect_filtered_event(tp, &filt_seen); + expect_noprocfilt_event(tp, &noprocfilt_seen); + }; + + if (use_all_callback) { + ktrace_events_all(s, evtcb); + } else { + ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0), + KDBG_EVENTID(DBG_BSD + 1, 0, 0), evtcb); + } + + ktrace_set_completion_handler(s, ^{ 
+ ktrace_session_destroy(s); + cb(dev_seen, rel_seen, filt_seen, noprocfilt_seen); + T_END; + }); + + T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); + assert_kdebug_test(KDBG_TEST_MACROS); + + ktrace_end(s, 0); +} + +T_DECL(kernel_events, "ensure kernel macros work") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), + "filtered events to current process"); + + expect_kdbg_test_events(s, false, + ^(unsigned int dev_seen, unsigned int rel_seen, + unsigned int filt_seen, unsigned int noprocfilt_seen) { + /* + * Development-only events are only filtered if running on an embedded + * OS. + */ + unsigned int dev_exp; +#if TARGET_OS_EMBEDDED + dev_exp = is_development_kernel() ? EXP_KERNEL_EVENTS : 0U; +#else + dev_exp = EXP_KERNEL_EVENTS; +#endif + + T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, + "release and development events seen"); + T_EXPECT_EQ(dev_seen, dev_exp, "development-only events %sseen", + dev_exp ? "" : "not "); + T_EXPECT_EQ(filt_seen, dev_exp, "filter-only events seen"); + T_EXPECT_EQ(noprocfilt_seen, EXP_KERNEL_EVENTS, + "process filter-agnostic events seen"); + }); + + dispatch_main(); +} + +T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), + "filtered events to current process"); + + expect_kdbg_test_events(s, true, + ^(unsigned int dev_seen, unsigned int rel_seen, + unsigned int filt_seen, unsigned int noprocfilt_seen) { + T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, NULL); +#if defined(__arm__) || defined(__arm64__) + T_EXPECT_EQ(dev_seen, is_development_kernel() ? 
EXP_KERNEL_EVENTS : 0U, + NULL); +#else + T_EXPECT_EQ(dev_seen, EXP_KERNEL_EVENTS, + "development-only events seen"); +#endif /* defined(__arm__) || defined(__arm64__) */ + T_EXPECT_EQ(filt_seen, 0U, "no filter-only events seen"); + T_EXPECT_EQ(noprocfilt_seen, EXP_KERNEL_EVENTS, + "process filter-agnostic events seen"); + }); + + dispatch_main(); +} + +T_DECL(kernel_events_noprocfilt, + "ensure that the no process filter kernel macros work") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + /* + * Only allow launchd events through. + */ + T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, 1), "filtered events to launchd"); + for (size_t i = 0; i < sizeof(noprocfilt_evts) / sizeof(noprocfilt_evts[0]); i++) { + T_QUIET; + T_ASSERT_POSIX_ZERO(ktrace_ignore_process_filter_for_event(s, + noprocfilt_evts[i]), + "ignored process filter for noprocfilt event"); + } + + expect_kdbg_test_events(s, false, + ^(unsigned int dev_seen, unsigned int rel_seen, + unsigned int filt_seen, unsigned int noprocfilt_seen) { + T_EXPECT_EQ(rel_seen, 0U, "release and development events not seen"); + T_EXPECT_EQ(dev_seen, 0U, "development-only events not seen"); + T_EXPECT_EQ(filt_seen, 0U, "filter-only events not seen"); + + T_EXPECT_EQ(noprocfilt_seen, EXP_KERNEL_EVENTS, + "process filter-agnostic events seen"); + }); + + dispatch_main(); +} + +static volatile bool continue_abuse = true; + +#define STRESS_DEBUGID (0xfeedfac0) +#define ABUSE_SECS (10) +#define TIMER_NS (100 * NSEC_PER_USEC) +/* + * Use the quantum as the gap threshold. 
+ */
+#define GAP_THRESHOLD_NS (10 * NSEC_PER_MSEC)
+
+/*
+ * Thread body that emits STRESS_DEBUGID events in a tight loop until
+ * continue_abuse is cleared.  `ctx` is not a real pointer: it carries the
+ * thread's small integer id, which becomes the first event argument; the
+ * second argument is a per-thread running counter.  Always returns NULL.
+ */
+static void *
+kdebug_abuser_thread(void *ctx)
+{
+	/*
+	 * Go through uintptr_t so the pointer-to-integer conversion is done at
+	 * pointer width before narrowing, matching the (void *)(uintptr_t)
+	 * conversion used where the thread is created.
+	 */
+	unsigned int id = (unsigned int)(uintptr_t)ctx;
+	uint64_t i = 0;
+	while (continue_abuse) {
+		kdebug_trace(STRESS_DEBUGID, id, i, 0, 0);
+		i++;
+	}
+
+	return NULL;
+}
+
+T_DECL(stress, "emit events on all but one CPU with a small buffer",
+    T_META_CHECK_LEAKS(false))
+{
+	T_SETUPBEGIN;
+	ktrace_session_t s = ktrace_session_create();
+	T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
+
+	/* Let's not waste any time with pleasantries. */
+	ktrace_set_uuid_map_enabled(s, KTRACE_FEATURE_DISABLED);
+
+	/* Ouch. */
+	ktrace_events_all(s, ^(__unused struct trace_point *tp) {});
+	ktrace_set_vnode_paths_enabled(s, KTRACE_FEATURE_ENABLED);
+	(void)atexit_b(^{ kperf_reset(); });
+	(void)kperf_action_count_set(1);
+	(void)kperf_timer_count_set(1);
+	int kperror = kperf_timer_period_set(0, kperf_ns_to_ticks(TIMER_NS));
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(kperror, "kperf_timer_period_set %llu ns",
+	    TIMER_NS);
+	kperror = kperf_timer_action_set(0, 1);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(kperror, "kperf_timer_action_set");
+	kperror = kperf_action_samplers_set(1, KPERF_SAMPLER_TINFO |
+	    KPERF_SAMPLER_TH_SNAPSHOT | KPERF_SAMPLER_KSTACK |
+	    KPERF_SAMPLER_USTACK | KPERF_SAMPLER_MEMINFO |
+	    KPERF_SAMPLER_TINFO_SCHED | KPERF_SAMPLER_TH_DISPATCH |
+	    KPERF_SAMPLER_TK_SNAPSHOT | KPERF_SAMPLER_SYS_MEM |
+	    KPERF_SAMPLER_TH_INSTRS_CYCLES);
+	T_QUIET; T_ASSERT_POSIX_SUCCESS(kperror, "kperf_action_samplers_set");
+	/* You monster... */
+
+	/* The coup-de-grace.
*/ + ktrace_set_buffer_size(s, 10); + + char filepath_arr[MAXPATHLEN] = ""; + strlcpy(filepath_arr, dt_tmpdir(), sizeof(filepath_arr)); + strlcat(filepath_arr, "/stress.ktrace", sizeof(filepath_arr)); + char *filepath = filepath_arr; + + int ncpus = 0; + size_t ncpus_size = sizeof(ncpus); + int ret = sysctlbyname("hw.logicalcpu_max", &ncpus, &ncpus_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname(\"hw.logicalcpu_max\""); + T_QUIET; T_ASSERT_GT(ncpus, 0, "realistic number of CPUs"); + + pthread_t *threads = calloc((unsigned int)ncpus - 1, sizeof(pthread_t)); + T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(threads, "calloc(%d threads)", + ncpus - 1); + + ktrace_set_completion_handler(s, ^{ + T_SETUPBEGIN; + ktrace_session_destroy(s); + + T_LOG("trace ended, searching for gaps"); + + ktrace_session_t sread = ktrace_session_create(); + T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(sread, "ktrace_session_create"); + + int error = ktrace_set_file(sread, filepath); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "ktrace_set_file %s", filepath); + + ktrace_file_t f = ktrace_file_open(filepath, false); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(f, "ktrace_file_open %s", + filepath); + uint64_t first_timestamp = 0; + error = ktrace_file_earliest_timestamp(f, &first_timestamp); + T_QUIET; T_ASSERT_POSIX_ZERO(error, "ktrace_file_earliest_timestamp"); + + uint64_t last_timestamp = 0; + (void)ktrace_file_latest_timestamp(f, &last_timestamp); + + __block uint64_t prev_timestamp = 0; + __block uint64_t nevents = 0; + ktrace_events_all(sread, ^(struct trace_point *tp) { + nevents++; + uint64_t delta_ns = 0; + T_QUIET; T_EXPECT_GE(tp->timestamp, prev_timestamp, + "timestamps are monotonically increasing"); + int converror = ktrace_convert_timestamp_to_nanoseconds(sread, + tp->timestamp - prev_timestamp, &delta_ns); + T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns"); + if (prev_timestamp && delta_ns > GAP_THRESHOLD_NS) { + if (tp->debugname) { + T_LOG("gap: %gs at %llu 
- %llu on %d: %s (%#08x)", + (double)delta_ns / 1e9, prev_timestamp, + tp->timestamp, tp->cpuid, tp->debugname, tp->debugid); + } else { + T_LOG("gap: %gs at %llu - %llu on %d: %#x", + (double)delta_ns / 1e9, prev_timestamp, + tp->timestamp, tp->cpuid, tp->debugid); + } + + /* + * These gaps are ok -- they appear after CPUs are brought back + * up. + */ +#define INTERRUPT (0x1050000) +#define PERF_CPU_IDLE (0x27001000) +#define INTC_HANDLER (0x5000004) +#define DECR_TRAP (0x1090000) + uint32_t eventid = tp->debugid & KDBG_EVENTID_MASK; + if (eventid != INTERRUPT && eventid != PERF_CPU_IDLE && + eventid != INTC_HANDLER && eventid != DECR_TRAP) { + unsigned int lost_events = TRACE_LOST_EVENTS; + T_QUIET; T_EXPECT_EQ(tp->debugid, lost_events, + "gaps should end with lost events"); + } + } + + prev_timestamp = tp->timestamp; + }); + ktrace_events_single(sread, TRACE_LOST_EVENTS, ^(struct trace_point *tp){ + T_LOG("lost: %llu on %d (%lu)", tp->timestamp, tp->cpuid, tp->arg1); + }); + + __block uint64_t last_write = 0; + ktrace_events_single_paired(sread, TRACE_WRITING_EVENTS, + ^(struct trace_point *start, struct trace_point *end) { + uint64_t delta_ns; + int converror = ktrace_convert_timestamp_to_nanoseconds(sread, + start->timestamp - last_write, &delta_ns); + T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns"); + + uint64_t dur_ns; + converror = ktrace_convert_timestamp_to_nanoseconds(sread, + end->timestamp - start->timestamp, &dur_ns); + T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns"); + + T_LOG("write: %llu (+%gs): %gus on %d: %lu events", start->timestamp, + (double)delta_ns / 1e9, (double)dur_ns / 1e3, end->cpuid, end->arg1); + last_write = end->timestamp; + }); + ktrace_set_completion_handler(sread, ^{ + uint64_t duration_ns = 0; + if (last_timestamp) { + int converror = ktrace_convert_timestamp_to_nanoseconds(sread, + last_timestamp - first_timestamp, &duration_ns); + T_QUIET; T_ASSERT_POSIX_ZERO(converror, + "convert 
timestamp to ns"); + T_LOG("file was %gs long, %llu events: %g events/msec/cpu", + (double)duration_ns / 1e9, nevents, + (double)nevents / ((double)duration_ns / 1e6) / ncpus); + } + (void)unlink(filepath); + ktrace_session_destroy(sread); + T_END; + }); + + int starterror = ktrace_start(sread, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_POSIX_ZERO(starterror, "ktrace_start read session"); + + T_SETUPEND; + }); + +/* Just kidding... for now. */ +#if 0 + kperror = kperf_sample_set(1); + T_ASSERT_POSIX_SUCCESS(kperror, + "started kperf timer sampling every %llu ns", TIMER_NS); +#endif + + for (int i = 0; i < (ncpus - 1); i++) { + int error = pthread_create(&threads[i], NULL, kdebug_abuser_thread, + (void *)(uintptr_t)i); + T_QUIET; T_ASSERT_POSIX_ZERO(error, + "pthread_create abuser thread %d", i); + } + + int error = ktrace_start_writing_file(s, filepath, + ktrace_compression_none, NULL, NULL); + T_ASSERT_POSIX_ZERO(error, "started writing ktrace to %s", filepath); + + T_SETUPEND; + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, ABUSE_SECS * NSEC_PER_SEC), + dispatch_get_main_queue(), ^{ + T_LOG("ending trace"); + ktrace_end(s, 1); + + continue_abuse = false; + for (int i = 0; i < (ncpus - 1); i++) { + int joinerror = pthread_join(threads[i], NULL); + T_QUIET; T_EXPECT_POSIX_ZERO(joinerror, "pthread_join thread %d", + i); + } + }); + + dispatch_main(); +} + +#define ROUND_TRIP_PERIOD UINT64_C(10 * 1000) +#define ROUND_TRIPS_THRESHOLD UINT64_C(25) +#define ROUND_TRIPS_TIMEOUT_SECS (2 * 60) +#define COLLECTION_INTERVAL_MS 100 + +/* + * Test a sustained tracing session, involving multiple round-trips to the + * kernel. + * + * Trace all events, and every `ROUND_TRIP_PERIOD` events, emit an event that's + * unlikely to be emitted elsewhere. Look for this event, too, and make sure we + * see as many of them as we emitted. + * + * After seeing `ROUND_TRIPS_THRESHOLD` of the unlikely events, end tracing. 
+ * In the failure mode, we won't see any of these, so set a timeout of + * `ROUND_TRIPS_TIMEOUT_SECS` to prevent hanging, waiting for events that we'll + * never see. + */ +T_DECL(round_trips, + "test sustained tracing with multiple round-trips through the kernel") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + /* + * Set a small buffer and collection interval to increase the number of + * round-trips. + */ + ktrace_set_buffer_size(s, 50); + ktrace_set_collection_interval(s, COLLECTION_INTERVAL_MS); + + __block uint64_t events = 0; + __block uint64_t emitted = 0; + __block uint64_t seen = 0; + ktrace_events_all(s, ^(__unused struct trace_point *tp) { + events++; + if (events % ROUND_TRIP_PERIOD == 0) { + T_LOG("emitting round-trip event %" PRIu64, emitted); + kdebug_trace(TRACE_DEBUGID, events, 0, 0, 0); + emitted++; + } + }); + + ktrace_events_single(s, TRACE_DEBUGID, ^(__unused struct trace_point *tp) { + T_LOG("saw round-trip event after %" PRIu64 " events", events); + seen++; + if (seen >= ROUND_TRIPS_THRESHOLD) { + T_LOG("ending trace after seeing %" PRIu64 " events, " + "emitting %" PRIu64, seen, emitted); + ktrace_end(s, 1); + } + }); + + ktrace_set_completion_handler(s, ^{ + T_EXPECT_GE(emitted, ROUND_TRIPS_THRESHOLD, + "emitted %" PRIu64 " round-trip events", emitted); + T_EXPECT_GE(seen, ROUND_TRIPS_THRESHOLD, + "saw %" PRIu64 " round-trip events", seen); + ktrace_session_destroy(s); + T_END; + }); + + int error = ktrace_start(s, dispatch_get_main_queue()); + T_ASSERT_POSIX_ZERO(error, "started tracing"); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, + ROUND_TRIPS_TIMEOUT_SECS * NSEC_PER_SEC), dispatch_get_main_queue(), + ^{ + T_LOG("ending trace after %d seconds", ROUND_TRIPS_TIMEOUT_SECS); + ktrace_end(s, 0); + }); + + dispatch_main(); +} + +#define HEARTBEAT_INTERVAL_SECS 2 +#define HEARTBEAT_COUNT 20 + +/* + * Ensure we see events periodically, checking for recent events 
on a + * heart-beat. + */ +T_DECL(event_coverage, "ensure events appear up to the end of tracing") +{ + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session"); + + __block uint64_t current_timestamp = 0; + __block uint64_t events = 0; + ktrace_events_all(s, ^(struct trace_point *tp) { + current_timestamp = tp->timestamp; + events++; + }); + + ktrace_set_buffer_size(s, 20); + ktrace_set_collection_interval(s, COLLECTION_INTERVAL_MS); + + __block uint64_t last_timestamp = 0; + __block uint64_t last_events = 0; + __block unsigned int heartbeats = 0; + + ktrace_set_completion_handler(s, ^{ + ktrace_session_destroy(s); + T_QUIET; T_EXPECT_GT(events, 0ULL, "should have seen some events"); + T_END; + }); + + dispatch_source_t timer = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, + 0, 0, dispatch_get_main_queue()); + dispatch_source_set_timer(timer, dispatch_time(DISPATCH_TIME_NOW, + HEARTBEAT_INTERVAL_SECS * NSEC_PER_SEC), + HEARTBEAT_INTERVAL_SECS * NSEC_PER_SEC, 0); + dispatch_source_set_cancel_handler(timer, ^{ + dispatch_release(timer); + }); + + dispatch_source_set_event_handler(timer, ^{ + heartbeats++; + + T_LOG("heartbeat %u at time %lld, seen %" PRIu64 " events, " + "current event time %lld", heartbeats, mach_absolute_time(), + events, current_timestamp); + + if (current_timestamp > 0) { + T_EXPECT_GT(current_timestamp, last_timestamp, + "event timestamps should be increasing"); + T_QUIET; T_EXPECT_GT(events, last_events, + "number of events should be increasing"); + } + + last_timestamp = current_timestamp; + last_events = events; + + if (heartbeats >= HEARTBEAT_COUNT) { + T_LOG("ending trace after %u heartbeats", HEARTBEAT_COUNT); + ktrace_end(s, 0); + } + }); + + int error = ktrace_start(s, dispatch_get_main_queue()); + T_ASSERT_POSIX_ZERO(error, "started tracing"); + + dispatch_activate(timer); + + dispatch_main(); +} diff --git a/tests/kernel_mtx_perf.c b/tests/kernel_mtx_perf.c new file mode 100644 
index 000000000..396104fd2 --- /dev/null +++ b/tests/kernel_mtx_perf.c @@ -0,0 +1,306 @@ +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.kernel_mtx_perf_test")); + +#define ITER 100000 +#define TEST_MTX_MAX_STATS 8 + +#define TEST_MTX_LOCK_STATS 0 +#define TEST_MTX_UNLOCK_MTX_STATS 6 + +static void +test_from_kernel_lock_unlock_contended(void) +{ + int i, ret, name_size; + uint64_t avg, run, tot; + size_t size; + char iter[35]; + char *buff, *buff_p, *avg_p, *name, *end_name; + + T_LOG("Testing locking/unlocking mutex from kernel with contention.\n"); + T_LOG("Requesting test with %d iterations\n", ITER); + + size = 1000; + buff = calloc(size, sizeof(char)); + T_QUIET;T_ASSERT_NOTNULL(buff, "Allocating buffer fo sysctl"); + + snprintf(iter, sizeof(iter), "%d", ITER); + ret = sysctlbyname("kern.test_mtx_contended", buff, &size, iter, sizeof(iter)); + T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname kern.test_mtx_contended"); + + T_LOG("%s stats:\n%s\n", __func__, buff); + + /* first line is "STATS INNER LOOP" */ + buff_p = buff; + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + /* + * Sequence of statistic lines like + * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS + * for all TEST_MTX_MAX_STATS statistics + */ + for (i = 0; i < TEST_MTX_MAX_STATS; i++) { + avg_p = strstr(buff_p, "avg "); + + /* contended test records statistics only for lock/unlock for now */ + if (i == TEST_MTX_LOCK_STATS || i == TEST_MTX_UNLOCK_MTX_STATS ) { + T_QUIET;T_ASSERT_NOTNULL(avg_p, "contended %i average not found", i); + sscanf(avg_p, "avg %llu", &avg); + + name = strstr(buff_p, "TEST_MTX_"); + end_name = strstr(buff_p, "_STATS"); + name_size = end_name - name - strlen("TEST_MTX_") + 1; + + char name_string[40]; + char avg_name_string[50]; + char *pre_string = 
"contended "; + snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + pre_string = "avg contended "; + snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + T_PERF(name_string, avg, "ns", avg_name_string); + } + + buff_p = avg_p; + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + } + + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + /* next line is "STATS OUTER LOOP" */ + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + /* contended test records statistics only for lock/unlock for now */ + avg_p = strstr(buff_p, "run time "); + T_QUIET;T_ASSERT_NOTNULL(avg_p, "contended %d loop run time not found", 0); + sscanf(avg_p, "run time %llu", &run); + + avg_p = strstr(buff_p, "total time "); + T_QUIET;T_ASSERT_NOTNULL(avg_p, "uncontended %d loop total time not found", 0); + sscanf(avg_p, "total time %llu", &tot); + + if (run < tot) + avg = run; + else + avg = tot; + + name = strstr(buff_p, "TEST_MTX_"); + end_name = strstr(buff_p, "_STATS"); + name_size = end_name - name - strlen("TEST_MTX_") + 1; + + char name_string[50]; + char avg_name_string[60]; + char *pre_string = "contended loop "; + snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + pre_string = "avg time contended loop "; + snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + T_PERF(name_string, avg/ITER, "ns", avg_name_string); + + free(buff); +} + +static void +test_from_kernel_lock_unlock_uncontended(void) +{ + int i, ret, name_size; + uint64_t avg, run, tot; + size_t size; + char iter[35]; + char *buff, *buff_p, *avg_p, *name, *end_name; + + T_LOG("Testing locking/unlocking mutex from kernel without contention.\n"); + T_LOG("Requesting test with %d iterations\n", ITER); + + size = 2000; + buff = calloc(size, sizeof(char)); + T_QUIET;T_ASSERT_NOTNULL(buff, "Allocating buffer fo sysctl"); + + 
snprintf(iter, sizeof(iter), "%d", ITER); + ret = sysctlbyname("kern.test_mtx_uncontended", buff, &size, iter, sizeof(iter)); + T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname kern.test_mtx_uncontended"); + + T_LOG("%s stats:\n%s\n", __func__, buff); + + /* first line is "STATS INNER LOOP" */ + buff_p = buff; + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + /* + * Sequence of statistic lines like + * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS + * for all TEST_MTX_MAX_STATS statistics + */ + for (i = 0; i < TEST_MTX_MAX_STATS; i++) { + avg_p = strstr(buff_p, "avg "); + T_QUIET;T_ASSERT_NOTNULL(avg_p, "uncontended %i average not found", i); + sscanf(avg_p, "avg %llu", &avg); + + name = strstr(buff_p, "TEST_MTX_"); + end_name = strstr(buff_p, "_STATS"); + name_size = end_name - name - strlen("TEST_MTX_") + 1; + + char name_string[40]; + char avg_name_string[50]; + char *pre_string = "uncontended "; + snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + pre_string = "avg time uncontended "; + snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + T_PERF(name_string, avg, "ns", avg_name_string); + + buff_p = avg_p; + while( *buff_p != '\n' ) buff_p++; + buff_p++; + } + + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + /* next line is "STATS OUTER LOOP" */ + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + /* + * Sequence of statistic lines like + * total time 4040673 ns total run time 3981080 ns TEST_MTX_LOCK_STATS + * for all TEST_MTX_MAX_STATS statistics exept UNLOCK + */ + for (i = 0; i < TEST_MTX_MAX_STATS - 2; i++) { + avg_p = strstr(buff_p, "run time "); + T_QUIET;T_ASSERT_NOTNULL(avg_p, "uncontended %d loop run time not found", i); + sscanf(avg_p, "run time %llu", &run); + + avg_p = strstr(buff_p, "total time "); + T_QUIET;T_ASSERT_NOTNULL(avg_p, "uncontended %d loop total time not found", i); + sscanf(avg_p, 
"total time %llu", &tot); + + if (run < tot) + avg = run; + else + avg = tot; + + name = strstr(buff_p, "TEST_MTX_"); + end_name = strstr(buff_p, "_STATS"); + name_size = end_name - name - strlen("TEST_MTX_") + 1; + + char name_string[50]; + char avg_name_string[60]; + char *pre_string = "uncontended loop "; + snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + pre_string = "avg time uncontended loop "; + snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]); + T_PERF(name_string, avg/ITER, "ns", avg_name_string); + + buff_p = avg_p; + while( *buff_p != '\n' ) buff_p++; + buff_p++; + + } + free(buff); +} + +extern char **environ; +static void +fix_cpu_frequency(void) +{ +#if CONFIG_EMBEDDED + int spawn_ret, pid; + char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-f", "5000", NULL}; + + T_LOG("Setting cpu frequency to %d\n", 5000); + + spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ); + waitpid(pid, &spawn_ret, 0); + +#else /*CONFIG_EMBEDDED*/ + + int spawn_ret, pid; + int ret, nom_freq; + size_t len; + float val; + char scale; + char *buffer, *cpu_freq; + char str_val[10]; + + ret = sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0); + T_QUIET;T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string"); + + buffer = malloc(len+2); + ret = sysctlbyname("machdep.cpu.brand_string", buffer, &len, NULL, 0); + T_QUIET;T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string"); + buffer[len+1] = '\0'; + + cpu_freq = strstr(buffer, "CPU @ "); + if (cpu_freq == NULL) { + T_LOG("Could not fix frequency, %s field not present\n", "CPU @ "); + goto out; + } + + if (strstr(cpu_freq, "Hz") != NULL) { + sscanf(cpu_freq, "CPU @ %f%cHz", &val, &scale); + } else { + if (strstr(cpu_freq, "hz") != NULL) { + sscanf(cpu_freq, "CPU @ %f%chz", &val, &scale); + } else { + T_LOG("Could not fix frequency, %s field 
not present\n", "Hz"); + goto out; + } + } + + switch(scale){ + case 'M': + case 'm': + nom_freq = (int) val; + break; + case 'G': + case 'g': + nom_freq = (int) (val*1000); + break; + default: + T_LOG("Could not fix frequency, scale field is %c\n", scale); + goto out; + } + + snprintf(str_val, 10, "%d", nom_freq); + T_LOG("Setting min and max cpu frequency to %d (%s)\n", nom_freq, str_val); + char *xcpm_args[] = {"/usr/local/bin/xcpm", "limits", str_val, str_val, NULL}; + spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, environ); + waitpid(pid, &spawn_ret, 0); + +out: + free(buffer); + return; +#endif /*CONFIG_EMBEDDED*/ +} + +T_DECL(kernel_mtx_perf_test, + "Kernel mutex performance test", + T_META_ASROOT(YES), T_META_CHECK_LEAKS(NO)) +{ + fix_cpu_frequency(); + + test_from_kernel_lock_unlock_uncontended(); + test_from_kernel_lock_unlock_contended(); +} + diff --git a/tests/kernel_uuid_match.c b/tests/kernel_uuid_match.c new file mode 100644 index 000000000..f5f32d45b --- /dev/null +++ b/tests/kernel_uuid_match.c @@ -0,0 +1,192 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_LEN 1024 + +#if TARGET_OS_MAC && !TARGET_OS_EMBEDDED + //running on macOS + #define KERNEL_SEARCH_DIR "/System/Library/Kernels/*" +#else + //running on a different OS (e.g. iOS, watchOS, etc.) 
+ #define KERNEL_SEARCH_DIR "/*" +#endif + +#define SWAP32(v) v = OSSwapInt32(v) + + +/* opens and maps the file at [path] in memory, + * sets the length in [len] and returns a pointer + * to the beginning of the memory region or NULL + * if unable to open and map the file + */ +static void *open_file(char *path, size_t *len) { + int fd; + if ((fd = open(path, O_RDONLY)) < 0) { + return NULL; + } + *len = (size_t)lseek(fd, (off_t)0, SEEK_END); + void *p = mmap(NULL, *len, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + if (p == MAP_FAILED) { + return NULL; + } + return p; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wsign-conversion" +static void __swap_mach_header(struct mach_header *header) { + SWAP32(header->magic); + SWAP32(header->cputype); + SWAP32(header->cpusubtype); + SWAP32(header->filetype); + SWAP32(header->ncmds); + SWAP32(header->sizeofcmds); + SWAP32(header->flags); +} + +static void __swap_mach_header_64(struct mach_header_64 *header) { + SWAP32(header->magic); + SWAP32(header->cputype); + SWAP32(header->cpusubtype); + SWAP32(header->filetype); + SWAP32(header->ncmds); + SWAP32(header->sizeofcmds); + SWAP32(header->flags); +} +#pragma clang diagnostic pop + +/* parses the uuid from the file at [path] and sets the uuid in [uuid] + * returns true if successfully parses the file, returns false otherwise + * (e.g. 
the file is not a Mach-O binary) + */ +static bool parse_binary_uuid(char *path, uuid_t uuid) { + size_t len = 0; + bool should_swap = false; + unsigned int ncmds = 0; + struct load_command *lc = NULL; + bool ret = false; + + struct mach_header *h = open_file(path, &len); + if (!h) { + return false; + } + if (h->magic == MH_MAGIC || h->magic == MH_CIGAM) { + //32-bit header + struct mach_header *header = h; + if (header->magic == MH_CIGAM) { + __swap_mach_header(header); + should_swap = true; + } + ncmds = header->ncmds; + //the first load command is after the header + lc = (struct load_command *)(header + 1); + } else if (h->magic == MH_MAGIC_64 || h->magic == MH_CIGAM_64) { + //64-bit header + struct mach_header_64 *header = (struct mach_header_64 *)h; + if (header->magic == MH_CIGAM_64) { + __swap_mach_header_64(header); + should_swap = true; + } + ncmds = header->ncmds; + lc = (struct load_command *)(header + 1); + } else { + //this is not a Mach-O binary, or it is a FAT binary + munmap(h, len); + return false; + } + for (unsigned int i = 0; i < ncmds; i++) { + uint32_t cmd = lc->cmd; + uint32_t cmdsize = lc->cmdsize; + if (should_swap) { + SWAP32(cmd); + SWAP32(cmdsize); + } + if (cmd == LC_UUID) { + struct uuid_command *uuid_cmd = + (struct uuid_command *)lc; + uuid_copy(uuid, uuid_cmd->uuid); + uuid_string_t tuuid_str; + uuid_unparse(uuid, tuuid_str); + T_LOG("Trying test UUID %s", tuuid_str); + ret = true; + break; + } + lc = (struct load_command *)((uintptr_t)lc + cmdsize); + } + munmap(h, len); + return ret; +} + +/* uses the sysctl command line tool to get the uuid + * of the currently running kernel + */ +static void get_system_kernel_uuid(uuid_t kuuid) { + char kuuid_line[MAX_LEN]; + memset(kuuid_line, 0, sizeof(kuuid_line)); + size_t len = sizeof(kuuid_line); + int ret = sysctlbyname("kern.uuid", kuuid_line, &len, NULL, 0); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.uuid"); + + T_ASSERT_TRUE(uuid_parse(kuuid_line, kuuid) == 0, + "Parse running kernel 
uuid"); +} + +/* compares [kuuid] to the uuid in each of the kernel binaries on OS's + * other than macOS (there can be multiple kernel binaries if the mastering + * process doesn't remove all of the irrelevant binaries) + */ +static void find_and_compare_test_uuids(char *search_path, uuid_t kuuid) { + glob_t g; + int ret = glob(search_path, 0, NULL, &g); + T_WITH_ERRNO; T_ASSERT_EQ(ret, 0, "glob %s", search_path); + + bool pass = false; + for (int i = 0; i < g.gl_matchc; i++) { + char *path = g.gl_pathv[i]; + + //check that [path] is the path for a file (not a directory, device, etc.) + struct stat s; + int ret = stat(path, &s); + T_ASSERT_POSIX_SUCCESS(ret, "stat %s", path); + if ((s.st_mode & S_IFREG) == 0) { + continue; + } + + T_LOG("Reading file at path: %s", path); + uuid_t tuuid; + if (parse_binary_uuid(path, tuuid) && + uuid_compare(kuuid, tuuid) == 0) { + pass = true; + break; + } + } + globfree(&g); + T_EXPECT_TRUE(pass, "The sources match"); +} + +T_DECL(uuid_match, "Compare the running kernel UUID to kernel binaries.") +{ + uuid_t kuuid; + uuid_clear(kuuid); + get_system_kernel_uuid(kuuid); + uuid_string_t kuuid_str; + uuid_unparse(kuuid, kuuid_str); + T_LOG("Got running kernel UUID %s", kuuid_str); + find_and_compare_test_uuids(KERNEL_SEARCH_DIR, kuuid); +} diff --git a/tools/tests/darwintests/kevent_continuous_time.c b/tests/kevent_continuous_time.c similarity index 78% rename from tools/tests/darwintests/kevent_continuous_time.c rename to tests/kevent_continuous_time.c index 93015cd5a..607cce682 100644 --- a/tools/tests/darwintests/kevent_continuous_time.c +++ b/tests/kevent_continuous_time.c @@ -87,11 +87,17 @@ T_DECL(kevent_continuous_time_periodic_tick, "kevent(EVFILT_TIMER with NOTE_MACH int kq; T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL); - struct kevent64_s change = {0}; - EV_SET64(&change, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0); - T_LOG("EV_SET(&change, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | 
NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0);"); + struct kevent64_s kev = { + .ident = 1, + .filter = EVFILT_TIMER, + .flags = EV_ADD | EV_RECEIPT, + .fflags = NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, + .data = 4, + }; + T_LOG("EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0);"); - T_ASSERT_POSIX_ZERO(kevent64(kq, &change, 1, NULL, 0, 0, NULL), NULL); + T_ASSERT_EQ(kevent64(kq, &kev, 1, &kev, 1, 0, NULL), 1, NULL); + T_ASSERT_EQ(0ll, kev.data, "No error returned"); uint64_t abs_then = mach_absolute_time(); uint64_t cnt_then = mach_continuous_time();; @@ -99,10 +105,9 @@ T_DECL(kevent_continuous_time_periodic_tick, "kevent(EVFILT_TIMER with NOTE_MACH trigger_sleep(1); int sleep_secs = wait_for_sleep(); - struct kevent64_s event = {0}; - T_WITH_ERRNO; T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event"); - T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata); - T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error"); + T_WITH_ERRNO; T_ASSERT_EQ(kevent64(kq, NULL, 0, &kev, 1, 0, NULL), 1, "kevent() should have returned one event"); + T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", kev.ident, kev.filter, kev.flags, kev.fflags, kev.data, kev.udata); + T_ASSERT_EQ(kev.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", kev.flags & EV_ERROR ? 
strerror((int)kev.data) : "no error"); uint64_t abs_now = mach_absolute_time(); uint64_t cnt_now = mach_continuous_time();; @@ -123,9 +128,14 @@ T_DECL(kevent_continuous_time_periodic_tick, "kevent(EVFILT_TIMER with NOTE_MACH sleep(1); - EV_SET64(&change, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0); - T_LOG("EV_SET(&change, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0);"); - T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL); + kev = (struct kevent64_s){ + .ident = 1, + .filter = EVFILT_TIMER, + .flags = EV_DELETE | EV_RECEIPT, + }; + T_LOG("EV_SET(&kev, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0);"); + T_ASSERT_EQ(kevent64(kq, &kev, 1, &kev, 1, 0, NULL), 1, NULL); + T_ASSERT_EQ(0ll, kev.data, "No error returned"); T_ASSERT_POSIX_ZERO(close(kq), NULL); } @@ -138,19 +148,25 @@ T_DECL(kevent_continuous_time_absolute, "kevent(EVFILT_TIMER with NOTE_MACH_CONT struct timeval tv; gettimeofday(&tv, NULL); - uint64_t nowus = (uint64_t)tv.tv_sec * USEC_PER_SEC + (uint64_t)tv.tv_usec; - uint64_t fire_at = (3*USEC_PER_SEC) + nowus; + int64_t nowus = (int64_t)tv.tv_sec * USEC_PER_SEC + (int64_t)tv.tv_usec; + int64_t fire_at = (3*USEC_PER_SEC) + nowus; uint64_t cnt_now = mach_continuous_time(); uint64_t cnt_then = cnt_now + ms_to_tick(3000); T_LOG("currently is %llu, firing at %llu", nowus, fire_at); - struct kevent64_s change = {0}; - EV_SET64(&change, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0, 0, 0); - T_LOG("EV_SET(&change, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0);"); + struct kevent64_s kev = { + .ident = 2, + .filter = EVFILT_TIMER, + .flags = EV_ADD | EV_RECEIPT, + .fflags = NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, + .data = fire_at, + }; + T_LOG("EV_SET(&kev, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0);"); - T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL); + T_ASSERT_EQ(kevent64(kq, 
&kev, 1, &kev, 1, 0, NULL), 1, NULL); + T_ASSERT_EQ(0ll, kev.data, "No error returned"); T_LOG("testing NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE between sleep"); @@ -158,7 +174,7 @@ T_DECL(kevent_continuous_time_absolute, "kevent(EVFILT_TIMER with NOTE_MACH_CONT struct timespec timeout = { .tv_sec = 10, - .tv_nsec = 0 + .tv_nsec = 0, }; struct kevent64_s event = {0}; T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, &timeout), 1, "kevent() should have returned one event"); @@ -192,12 +208,18 @@ T_DECL(kevent_continuous_time_pops, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUO T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL); // test that periodic ticks accumulate while asleep - struct kevent64_s change = {0}; - EV_SET64(&change, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0, 0, 0); // tick every 100 ms - T_LOG("EV_SET(&change, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0);"); + struct kevent64_s kev = { + .ident = 3, + .filter = EVFILT_TIMER, + .flags = EV_ADD | EV_RECEIPT, + .fflags = NOTE_MACH_CONTINUOUS_TIME, + .data = 100, + }; + T_LOG("EV_SET(&kev, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0);"); // wait for first pop, then sleep - T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL); + T_ASSERT_EQ(kevent64(kq, &kev, 1, &kev, 1, 0, NULL), 1, NULL); + T_ASSERT_EQ(0ll, kev.data, "No error returned"); struct kevent64_s event = {0}; T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event"); diff --git a/tools/tests/darwintests/kevent_pty.c b/tests/kevent_pty.c similarity index 78% rename from tools/tests/darwintests/kevent_pty.c rename to tests/kevent_pty.c index a64c48dfa..2fad75e6c 100644 --- a/tools/tests/darwintests/kevent_pty.c +++ b/tests/kevent_pty.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -42,7 +43,7 @@ child_tty_client(void) snprintf(buf, sizeof(buf), "%ds", getpid()); bytes_wr = write(child_ready[1], buf, strlen(buf)); if 
(bytes_wr < 0) { - exit(1); + err(1, "failed to write on child ready pipe"); } dispatch_main(); @@ -164,11 +165,11 @@ writer_thread(void *arg) { int fd = (int)arg; char c[4096]; + memset(c, 'a', sizeof(c)); T_SETUPBEGIN; T_QUIET; T_ASSERT_GT(fd, 0, "writer thread received valid fd"); - memset(c, 'a', sizeof(c)); T_SETUPEND; while (writing) { @@ -188,14 +189,14 @@ writer_thread(void *arg) #define ATTACH_ITERATIONS 10000 -static int master, slave; +static int attach_master, attach_slave; static pthread_t reader, writer; static void join_threads(void) { - close(slave); - close(master); + close(attach_slave); + close(attach_master); writing = false; pthread_join(reader, NULL); pthread_join(writer, NULL); @@ -238,17 +239,18 @@ T_DECL(attach_while_tty_wakeups, dispatch_group_t grp = dispatch_group_create(); T_SETUPBEGIN; - T_ASSERT_POSIX_SUCCESS(openpty(&master, &slave, NULL, NULL, NULL), NULL); + T_ASSERT_POSIX_SUCCESS(openpty(&attach_master, &attach_slave, NULL, NULL, + NULL), NULL); T_ASSERT_POSIX_ZERO(pthread_create(&reader, NULL, reader_thread, - (void *)(uintptr_t)master), NULL); + (void *)(uintptr_t)attach_master), NULL); T_ASSERT_POSIX_ZERO(pthread_create(&writer, NULL, writer_thread, - (void *)(uintptr_t)slave), NULL); + (void *)(uintptr_t)attach_slave), NULL); T_ATEND(join_threads); T_SETUPEND; - redispatch(grp, DISPATCH_SOURCE_TYPE_READ, master); - redispatch(grp, DISPATCH_SOURCE_TYPE_WRITE, slave); + redispatch(grp, DISPATCH_SOURCE_TYPE_READ, attach_master); + redispatch(grp, DISPATCH_SOURCE_TYPE_WRITE, attach_slave); dispatch_group_notify(grp, dispatch_get_main_queue(), ^{ T_LOG("both reader and writer sources cleaned up"); @@ -257,3 +259,41 @@ T_DECL(attach_while_tty_wakeups, dispatch_main(); } + +T_DECL(master_read_data_set, + "check that the data is set on read sources of master fds") +{ + int master = -1, slave = -1; + + T_SETUPBEGIN; + T_ASSERT_POSIX_SUCCESS(openpty(&master, &slave, NULL, NULL, NULL), NULL); + T_QUIET; T_ASSERT_GE(master, 0, "master fd 
is valid"); + T_QUIET; T_ASSERT_GE(slave, 0, "slave fd is valid"); + + dispatch_source_t src = dispatch_source_create(DISPATCH_SOURCE_TYPE_READ, + (uintptr_t)master, 0, dispatch_get_main_queue()); + + dispatch_source_set_event_handler(src, ^{ + unsigned long len = dispatch_source_get_data(src); + T_EXPECT_GT(len, (unsigned long)0, + "the amount of data to read was set for the master source"); + dispatch_cancel(src); + }); + + dispatch_source_set_cancel_handler(src, ^{ + dispatch_release(src); + T_END; + }); + + dispatch_activate(src); + T_SETUPEND; + + // Let's not fill up the TTY's buffer, otherwise write(2) will block. + char buf[512] = ""; + + int ret = 0; + while ((ret = write(slave, buf, sizeof(buf)) == -1 && errno == EAGAIN)); + T_ASSERT_POSIX_SUCCESS(ret, "slave wrote data"); + + dispatch_main(); +} diff --git a/tests/kevent_qos.c b/tests/kevent_qos.c new file mode 100644 index 000000000..df021e3ac --- /dev/null +++ b/tests/kevent_qos.c @@ -0,0 +1,1767 @@ +/* + * kevent_qos: Tests Synchronous IPC QOS override. 
+ */ + +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META(T_META_NAMESPACE("xnu.kevent_qos")); + +#define ARRAYLEN(arr) (sizeof(arr) / sizeof(arr[0])) + +#define INTERMITTENT_TIMEOUT_SEC (3) +#define RECV_TIMEOUT_SECS (4) +#define SEND_TIMEOUT_SECS (6) +#define HELPER_TIMEOUT_SECS (15) + +#define ENV_VAR_QOS (3) +static const char *qos_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_BO", "XNU_TEST_QOS_QO", "XNU_TEST_QOS_AO"}; +static const char *qos_name_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_NAME_BO", "XNU_TEST_QOS_NAME_QO", "XNU_TEST_QOS_NAME_AO"}; + +#define ENV_VAR_FUNCTION (1) +static const char *wl_function_name = "XNU_TEST_WL_FUNCTION"; + +static qos_class_t g_expected_qos[ENV_VAR_QOS]; +static const char *g_expected_qos_name[ENV_VAR_QOS]; + +#define ENV_QOS_BEFORE_OVERRIDE (0) +#define ENV_QOS_QUEUE_OVERRIDE (1) +#define ENV_QOS_AFTER_OVERRIDE (2) + +struct test_msg { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + mach_msg_option_t opts; + mach_msg_priority_t qos; +}; + +#pragma mark pthread callbacks + +static void +thread_create_at_qos(qos_class_t qos, void * (*function)(void *)); +static void +send(mach_port_t send_port, mach_port_t reply_port, mach_port_t msg_port, mach_msg_priority_t qos, mach_msg_option_t options); +static void +enable_kevent(uint64_t *workloop_id, unsigned long long port); +static void +populate_kevent(struct kevent_qos_s *kev, unsigned long long port); + +static void +worker_cb(pthread_priority_t __unused priority) +{ + T_FAIL("a worker thread was created"); +} + +static void +event_cb(void ** __unused events, int * __unused nevents) +{ + T_FAIL("a kevent routine was called instead of workloop"); +} + +static uint32_t +get_user_promotion_basepri(void) +{ + mach_msg_type_number_t count = THREAD_POLICY_STATE_COUNT; + 
struct thread_policy_state thread_policy; + boolean_t get_default = FALSE; + mach_port_t thread_port = pthread_mach_thread_np(pthread_self()); + + kern_return_t kr = thread_policy_get(thread_port, THREAD_POLICY_STATE, + (thread_policy_t)&thread_policy, &count, &get_default); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_get"); + return thread_policy.thps_user_promotion_basepri; +} + +#define EXPECT_QOS_EQ(qos, ...) do { \ + if ((qos) == QOS_CLASS_USER_INTERACTIVE) { \ + T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED, __VA_ARGS__); \ + T_EXPECT_EQ(47u, get_user_promotion_basepri(), __VA_ARGS__); \ + } else { \ + T_EXPECT_EFFECTIVE_QOS_EQ(qos, __VA_ARGS__); \ + } \ + } while (0) + +#define EXPECT_TEST_MSG(_ke) do { \ + struct kevent_qos_s *ke = _ke; \ + mach_msg_header_t *hdr = (mach_msg_header_t *)ke->ext[0]; \ + T_ASSERT_NOTNULL(hdr, "has a message"); \ + T_ASSERT_EQ(hdr->msgh_size, (uint32_t)sizeof(struct test_msg), "of the right size"); \ + struct test_msg *tmsg = (struct test_msg *)hdr; \ + if (tmsg->opts & MACH_SEND_PROPAGATE_QOS) { \ + T_EXPECT_EQ(tmsg->qos, ((uint32_t)(ke->ext[2] >> 32)), \ + "propagation works"); \ + } \ + } while (0) + +/* + * Basic WL handler callback, it sleeps for n seconds and then checks the + * effective Qos of the servicer thread. + */ +static void +workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist, int *events) +{ + T_LOG("Workloop handler workloop_cb_test_intransit called. 
" + "Will wait for %d seconds to make sure client enqueues the sync msg \n", + 2 * RECV_TIMEOUT_SECS); + + EXPECT_TEST_MSG(*eventslist); + + /* Wait for the client to send the high priority message to override the qos */ + sleep(2 * RECV_TIMEOUT_SECS); + + /* Skip the test if we can't check Qos */ + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + /* The effective Qos should be the one expected after override */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + *events = 0; + T_END; +} + +/* + * WL handler which checks if the servicer thread has correct Qos. + */ +static void +workloop_cb_test_sync_send(uint64_t *workloop_id __unused, void **eventslist, int *events) +{ + T_LOG("Workloop handler workloop_cb_test_sync_send called"); + + EXPECT_TEST_MSG(*eventslist); + + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + /* The effective Qos should be the one expected after override */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + *events = 0; + T_END; +} + +/* + * WL handler which checks the overridden Qos and then enables the knote and checks + * for the Qos again if that dropped the sync ipc override. 
+ */ +static void +workloop_cb_test_sync_send_and_enable(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events) +{ + unsigned override_priority; + unsigned reenable_priority; + + T_LOG("Workloop handler workloop_cb_test_sync_send_and_enable called"); + + EXPECT_TEST_MSG(*eventslist); + + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + /* The effective Qos should be the one expected after override */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + /* Snapshot the current override priority */ + override_priority = get_user_promotion_basepri(); + + /* Enable the knote */ + struct kevent_qos_s *kev = *eventslist; + enable_kevent(workloop_id, kev->ident); + + /* + * Check if the override has been dropped, check for priority instead of qos since + * there will be async qos push. + */ + reenable_priority = get_user_promotion_basepri(); + T_EXPECT_LT(reenable_priority, override_priority, + "thread's current override priority %d should be less than override priority prior to enabling knote %d", + reenable_priority, override_priority); + + *events = 0; + T_END; +} + +/* + * WL handler receives the first message and checks sync ipc override, then enables the knote + * and receives 2nd message and checks it sync ipc override. 
+ */ +static int send_two_sync_handler_called = 0; +static void +workloop_cb_test_send_two_sync(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events) +{ + T_LOG("Workloop handler workloop_cb_test_send_two_sync called for %d time", send_two_sync_handler_called + 1); + + EXPECT_TEST_MSG(*eventslist); + + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + T_LOG("Number of events received is %d\n", *events); + + if (send_two_sync_handler_called == 0) { + /* The effective Qos should be the one expected after override */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + /* Enable the knote to get 2nd message */ + struct kevent_qos_s *kev = *eventslist; + uint64_t port = kev->ident; + populate_kevent(kev, port); + *events = 1; + } else { + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], + "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]); + *events = 0; + T_END; + } + send_two_sync_handler_called++; +} + +/* + * Checks the sync ipc override and then waits for client to destroy the + * special reply port and checks if that removes the sync ipc override. 
+ */ +static boolean_t two_send_and_destroy_test_passed = FALSE; +static int two_send_and_destroy_handler = 0; +static void +workloop_cb_test_two_send_and_destroy(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist __unused, int *events) +{ + T_LOG("Workloop handler workloop_cb_test_two_send_and_destroy called %d times", two_send_and_destroy_handler + 1); + + EXPECT_TEST_MSG(*eventslist); + + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + if (two_send_and_destroy_handler == 0) { + /* Sleep to make sure the mqueue gets full */ + sleep(RECV_TIMEOUT_SECS); + + /* The effective Qos should be the one expected after override */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + sleep(SEND_TIMEOUT_SECS); + + /* Special reply port should have been destroyed, check Qos again */ + EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], + "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]); + + two_send_and_destroy_test_passed = TRUE; + } else { + if (two_send_and_destroy_test_passed) { + T_END; + } + } + + /* Enable the knote to get next message */ + struct kevent_qos_s *kev = *eventslist; + uint64_t port = kev->ident; + populate_kevent(kev, port); + *events = 1; + two_send_and_destroy_handler++; + T_LOG("Handler returning \n"); +} + +static mach_port_type_t +get_reply_port(struct kevent_qos_s *kev) +{ + mach_msg_header_t *hdr; + mach_port_t reply_port; + mach_port_type_t type; + kern_return_t kr; + + hdr = (void*)kev->ext[0]; + T_QUIET; T_ASSERT_NOTNULL(hdr, "msg hdr"); + + reply_port = hdr->msgh_remote_port; + T_QUIET;T_ASSERT_TRUE(MACH_PORT_VALID(reply_port), "reply port valid"); + kr = mach_port_type(mach_task_self(), reply_port, &type); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_type"); + T_QUIET; T_ASSERT_TRUE(type & MACH_PORT_TYPE_SEND_ONCE, "send once 
received"); + + return reply_port; +} + +static void +send_reply(mach_port_t reply_port) +{ + kern_return_t kr; + + struct { + mach_msg_header_t header; + } send_msg = { + .header = { + .msgh_remote_port = reply_port, + .msgh_local_port = MACH_PORT_NULL, + .msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0), + .msgh_id = 0x100, + .msgh_size = sizeof(send_msg), + }, + }; + + kr = mach_msg(&(send_msg.header), + MACH_SEND_MSG, + send_msg.header.msgh_size, + 0, + MACH_PORT_NULL, + 0, + 0); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server mach_msg"); +} + +static void +populate_kevent(struct kevent_qos_s *kev, unsigned long long port) +{ + + memset(kev, 0, sizeof(struct kevent_qos_s)); + kev->ident = port; + kev->filter = EVFILT_MACHPORT; + kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED; + kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | + MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | + MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)); + kev->data = 1; + +} + +static void +enable_kevent(uint64_t *workloop_id, unsigned long long port) +{ + kern_return_t kr; + struct kevent_qos_s kev; + + populate_kevent(&kev, port); + struct kevent_qos_s kev_err[] = {{ 0 }}; + + kr = kevent_id(*workloop_id, &kev, 1, kev_err, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kr, "kevent_id"); +} + +/* + * WL handler which sends a msg to the client from handler. 
+ */
+static void
+workloop_cb_test_sync_send_reply(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+	mach_port_t reply_port;
+
+	T_LOG("Workloop handler workloop_cb_test_sync_send_reply called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	/* The handler expects to be handed exactly one EVFILT_MACHPORT event */
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	/* Look up the reply port carried by the event, then answer on it */
+	reply_port = get_reply_port(*eventslist);
+	send_reply(reply_port);
+
+	/* No event is handed back to the caller */
+	*events = 0;
+}
+
+/*
+ * WL handler which deallocates reply port.
+ */
+static void
+workloop_cb_test_sync_send_deallocate(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+	kern_return_t kr;
+
+	T_LOG("Workloop handler workloop_cb_test_sync_send_deallocate called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	/* Fetch the reply port, then drop the right instead of replying on it */
+	const mach_port_t reply_port = get_reply_port(*eventslist);
+	kr = mach_port_deallocate(mach_task_self(), reply_port);
+	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_deallocate");
+
+	/* No event is handed back to the caller */
+	*events = 0;
+
+	T_LOG("Handler returning \n");
+}
+
+
+/*
+ * WL handler which sends a msg to the client before enabling the event from handler.
+ */
+static void
+workloop_cb_test_sync_send_reply_kevent(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+	struct kevent_qos_s *kev;
+	mach_port_t reply_port;
+
+	T_LOG("Workloop handler workloop_cb_test_sync_send_reply_kevent called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT(((*eventslist)->filter), EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	kev = *eventslist;
+
+	/* Step 1: reply to the sender */
+	reply_port = get_reply_port(kev);
+	send_reply(reply_port);
+
+	/* Step 2: re-enable the knote from within the handler */
+	enable_kevent(workloop_id, kev->ident);
+
+	/* The knote was already re-enabled above, so nothing is handed back */
+	*events = 0;
+
+	T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which sends a msg to the client before enabling the event from pthread.
+ */
+static void
+workloop_cb_test_sync_send_reply_kevent_pthread(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+	struct kevent_qos_s *kev;
+	mach_port_t reply_port;
+
+	T_LOG("Workloop handler workloop_cb_test_sync_send_reply_kevent_pthread called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	kev = *eventslist;
+
+	/* Step 1: reply to the sender */
+	reply_port = get_reply_port(kev);
+	send_reply(reply_port);
+
+	/*
+	 * Step 2: rewrite the delivered event in place and hand it back
+	 * (*events = 1) rather than calling kevent_id() from the handler.
+	 */
+	populate_kevent(kev, kev->ident);
+	*events = 1;
+
+	T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which sends a msg to the client after reenabling the event.
+ */
+static void
+workloop_cb_test_sync_send_kevent_reply(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+	struct kevent_qos_s *kev;
+	mach_port_t reply_port;
+
+	T_LOG("workloop handler workloop_cb_test_sync_send_kevent_reply called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	/* The reply port is looked up before the knote is re-enabled */
+	kev = *eventslist;
+	reply_port = get_reply_port(kev);
+
+	/* Step 1: re-enable the knote */
+	enable_kevent(workloop_id, kev->ident);
+
+	/* Step 2: only then reply to the sender */
+	send_reply(reply_port);
+
+	/* The knote was already re-enabled above, so nothing is handed back */
+	*events = 0;
+
+	T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler that does nothing.
+ */
+static void
+workloop_cb_test_sync_send_do_nothing(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+	T_LOG("Workloop handler workloop_cb_test_sync_send_do_nothing called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	/* Deliberately neither replies nor re-enables: just report no events */
+	*events = 0;
+
+	T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler that returns the event to reenable.
+ */
+static void
+workloop_cb_test_sync_send_do_nothing_kevent_pthread(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+	T_LOG("Workloop handler workloop_cb_test_sync_send_do_nothing_kevent_pthread called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	/*
+	 * No reply is sent. The delivered event is rewritten in place and
+	 * handed back (*events = 1) so that it gets re-enabled.
+	 */
+	struct kevent_qos_s *kev = *eventslist;
+	populate_kevent(kev, kev->ident);
+
+	*events = 1;
+
+	T_LOG("handler returning \n");
+}
+
+/*
+ * WL handler that exits: it validates the delivered event and then terminates
+ * the process with exit(0), without replying or re-enabling the knote.
+ */
+static void
+workloop_cb_test_sync_send_do_nothing_exit(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+	T_LOG("workloop handler workloop_cb_test_sync_send_do_nothing_exit called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	/* events is read here, so the parameter must not be marked __unused */
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	/* call exit */
+	exit(0);
+}
+
+/*
+ * WL handler which:
+ * first sync sends a msg to the client and reenables kevent after
+ * second sync sends a msg and reenables kevent after.
+ */
+static void
+workloop_cb_test_sync_send_reply_kevent_reply_kevent(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+	struct kevent_qos_s *kev;
+	mach_port_t reply_port;
+
+	T_LOG("Workloop handler workloop_cb_test_sync_send_reply_kevent_reply_kevent called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	kev = *eventslist;
+
+	/* Reply first ... */
+	reply_port = get_reply_port(kev);
+	send_reply(reply_port);
+
+	/* ... then hand the rewritten event back so it gets re-enabled */
+	populate_kevent(kev, kev->ident);
+	*events = 1;
+
+	T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which:
+ * first sync reenables kevent and after sends a msg
+ * second sync sends a msg and reenables kevent after.
+ */
+static int workloop_cb_test_sync_send_kevent_reply_reply_kevent_handler_called = 0;
+static void
+workloop_cb_test_sync_send_kevent_reply_reply_kevent(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+	struct kevent_qos_s *kev;
+	mach_port_t reply_port;
+
+	T_LOG("workloop handler workloop_cb_test_sync_send_kevent_reply_reply_kevent called");
+
+	if (geteuid() != 0) {
+		T_SKIP("kevent_qos test requires root privileges to run.");
+	}
+
+	T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+	T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+	kev = *eventslist;
+	reply_port = get_reply_port(kev);
+
+	if (workloop_cb_test_sync_send_kevent_reply_reply_kevent_handler_called == 0) {
+		/* First invocation: enable the knote, then send the reply */
+		workloop_cb_test_sync_send_kevent_reply_reply_kevent_handler_called = 1;
+		enable_kevent(workloop_id, kev->ident);
+		send_reply(reply_port);
+	} else {
+		/* Later invocations: send the reply, then enable the knote */
+		send_reply(reply_port);
+		enable_kevent(workloop_id, kev->ident);
+	}
+
+	/* Either way the knote was re-enabled explicitly; hand nothing back */
+	*events = 0;
+
+	T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which:
+ * 
first sync reenables kevent and after sends a msg + * second sync reenables kevent and after sends a msg + */ +static void +workloop_cb_test_sync_send_kevent_reply_kevent_reply(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events) +{ + T_LOG("workloop handler workloop_cb_test_sync_send_kevent_reply_kevent_reply called"); + + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received"); + T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT"); + + struct kevent_qos_s *kev = *eventslist; + mach_port_t reply_port = get_reply_port(kev); + + /* Enable the knote */ + enable_kevent(workloop_id, kev->ident); + + /* send reply */ + send_reply(reply_port); + + *events = 0; + T_LOG("Handler returning \n"); +} + +/* + * WL handler which: + * first sync ends a msg and reenables kevent after + * second sync reenables kevent and sends a msg after + */ +static int workloop_cb_test_sync_send_reply_kevent_kevent_reply_handler_called = 0; +static void +workloop_cb_test_sync_send_reply_kevent_kevent_reply(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events) +{ + T_LOG("workloop handler workloop_cb_test_sync_send_reply_kevent_kevent_reply called"); + + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received"); + T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT"); + + struct kevent_qos_s *kev = *eventslist; + mach_port_t reply_port = get_reply_port(kev); + + if (workloop_cb_test_sync_send_reply_kevent_kevent_reply_handler_called == 0) { + workloop_cb_test_sync_send_reply_kevent_kevent_reply_handler_called = 1; + + /* send reply */ + send_reply(reply_port); + + populate_kevent(kev, kev->ident); + + *events = 1; + + } else { + + /* Enable the knote */ + enable_kevent(workloop_id, kev->ident); + /* send 
reply */ + send_reply(reply_port); + + *events = 0; + } + + T_LOG("Handler returning \n"); +} +#pragma mark Mach receive + +#define KEVENT_QOS_SERVICE_NAME "com.apple.xnu.test.kevent_qos" + +static mach_port_t +get_server_port(void) +{ + mach_port_t port; + kern_return_t kr = bootstrap_check_in(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server bootstrap_check_in"); + return port; +} + +static void +env_set_qos(char **env, qos_class_t qos[], const char *qos_name[], const char *wl_function) +{ + int i; + char *qos_str, *qos_name_str; + for (i = 0; i < ENV_VAR_QOS; i++) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&qos_str, "%s=%d", qos_env[i] , qos[i]), + NULL); + T_QUIET; T_ASSERT_POSIX_SUCCESS( + asprintf(&qos_name_str, "%s=%s", qos_name_env[i], qos_name[i]), NULL); + env[2 * i] = qos_str; + env[2 * i + 1] = qos_name_str; + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&env[2 * i], "%s=%s", wl_function_name, wl_function), + NULL); + env[2 * i + 1] = NULL; +} + +static void +environ_get_qos(qos_class_t qos[], const char *qos_name[], const char **wl_function) +{ + char *qos_str; + char *qos_end; + int i; + + for (i = 0; i < ENV_VAR_QOS; i++) { + qos_str = getenv(qos_env[i]); + T_QUIET; T_ASSERT_NOTNULL(qos_str, "getenv(%s)", qos_env[i]); + + unsigned long qos_l = strtoul(qos_str, &qos_end, 10); + T_QUIET; T_ASSERT_EQ(*qos_end, '\0', "getenv(%s) = '%s' should be an " + "integer", qos_env[i], qos_str); + + T_QUIET; T_ASSERT_LT(qos_l, (unsigned long)100, "getenv(%s) = '%s' should " + "be less than 100", qos_env[i], qos_str); + + qos[i] = (qos_class_t)qos_l; + qos_name[i] = getenv(qos_name_env[i]); + T_QUIET; T_ASSERT_NOTNULL(qos_name[i], "getenv(%s)", qos_name_env[i]); + } + *wl_function = getenv(wl_function_name); + T_QUIET; T_ASSERT_NOTNULL(*wl_function, "getenv(%s)", wl_function_name); +} + +static mach_voucher_t +create_pthpriority_voucher(mach_msg_priority_t qos) +{ + char 
voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)]; + + mach_voucher_t voucher = MACH_PORT_NULL; + kern_return_t ret; + ipc_pthread_priority_value_t ipc_pthread_priority_value = + (ipc_pthread_priority_value_t)qos; + + mach_voucher_attr_raw_recipe_array_t recipes; + mach_voucher_attr_raw_recipe_size_t recipe_size = 0; + mach_voucher_attr_recipe_t recipe = + (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size]; + + recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY; + recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE; + recipe->previous_voucher = MACH_VOUCHER_NULL; + memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value)); + recipe->content_size = sizeof(ipc_pthread_priority_value_t); + recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size; + + recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0]; + + ret = host_create_mach_voucher(mach_host_self(), + recipes, + recipe_size, + &voucher); + + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher"); + return voucher; +} + +static void +send( + mach_port_t send_port, + mach_port_t reply_port, + mach_port_t msg_port, + mach_msg_priority_t qos, + mach_msg_option_t options) +{ + kern_return_t ret = 0; + + struct test_msg send_msg = { + .header = { + .msgh_remote_port = send_port, + .msgh_local_port = reply_port, + .msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, + reply_port ? 
MACH_MSG_TYPE_MAKE_SEND_ONCE : 0, + MACH_MSG_TYPE_MOVE_SEND, + MACH_MSGH_BITS_COMPLEX), + .msgh_id = 0x100, + .msgh_size = sizeof(send_msg), + }, + .body = { + .msgh_descriptor_count = 1, + }, + .port_descriptor = { + .name = msg_port, + .disposition = MACH_MSG_TYPE_MOVE_RECEIVE, + .type = MACH_MSG_PORT_DESCRIPTOR, + }, + .opts = options, + }; + + if (msg_port == MACH_PORT_NULL) { + send_msg.body.msgh_descriptor_count = 0; + } + + if ((options & MACH_SEND_PROPAGATE_QOS) == 0) { + send_msg.header.msgh_voucher_port = create_pthpriority_voucher(qos); + send_msg.qos = qos; + } else { + qos_class_t qc; + int relpri; + pthread_get_qos_class_np(pthread_self(), &qc, &relpri); + send_msg.qos = (uint32_t)_pthread_qos_class_encode(qc, relpri, 0); + } + + ret = mach_msg(&(send_msg.header), + MACH_SEND_MSG | + MACH_SEND_TIMEOUT | + MACH_SEND_OVERRIDE| + ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options), + send_msg.header.msgh_size, + 0, + MACH_PORT_NULL, + 10000, + 0); + + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg"); +} + +static kern_return_t +receive( + mach_port_t rcv_port, + mach_port_t notify_port) +{ + kern_return_t ret = 0; + + struct test_msg rcv_msg = { + .header = { + .msgh_remote_port = MACH_PORT_NULL, + .msgh_local_port = rcv_port, + .msgh_size = sizeof(rcv_msg), + }, + }; + + T_LOG("Client: Starting sync receive\n"); + + ret = mach_msg(&(rcv_msg.header), + MACH_RCV_MSG | + MACH_RCV_TIMEOUT | + MACH_RCV_SYNC_WAIT, + 0, + rcv_msg.header.msgh_size, + rcv_port, + SEND_TIMEOUT_SECS * 1000, + notify_port); + + return ret; +} + +T_HELPER_DECL(qos_get_special_reply_port, + "Test get_special_reply_port and it's corner cases.") +{ + mach_port_t special_reply_port; + mach_port_t new_special_reply_port; + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + new_special_reply_port = thread_get_special_reply_port(); + T_QUIET; 
T_ASSERT_TRUE(MACH_PORT_VALID(new_special_reply_port), "get_thread_special_reply_port"); + + mach_port_destroy(mach_task_self(), special_reply_port); + mach_port_destroy(mach_task_self(), new_special_reply_port); + + new_special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(new_special_reply_port), "get_thread_special_reply_port"); + + T_END; +} + +static void * +qos_client_send_to_intransit(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t msg_port; + mach_port_t special_reply_port; + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + /* Create a rcv right to send in a msg */ + kr = mach_port_allocate(mach_task_self(), + MACH_PORT_RIGHT_RECEIVE, + &msg_port); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_allocate"); + + kr = mach_port_insert_right(mach_task_self(), + msg_port, + msg_port, + MACH_MSG_TYPE_MAKE_SEND); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_insert_right"); + + /* Send an empty msg on the port to fire the WL thread */ + send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0); + + /* Sleep 3 seconds for the server to start */ + sleep(3); + + /* Send the message with msg port as in-transit port, this msg will not be dequeued */ + send(qos_send_port, MACH_PORT_NULL, msg_port, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0); + + /* Send 5 messages to msg port to make sure the port is full */ + for (int i = 0; i < 5; i++) { + send(msg_port, MACH_PORT_NULL, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0); + } + + T_LOG("Sent 5 
msgs, now trying to send sync ipc messgae, which will block with a timeout\n"); + /* Send the message to the in-transit port, it should block and override the rcv's workloop */ + send(msg_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0); + T_LOG("Client done sending messages, now waiting for server to end the test"); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +T_HELPER_DECL(qos_client_send_to_intransit_with_thr_pri, + "Send synchronous messages from a pri thread to an intransit port") +{ + thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_send_to_intransit); + sleep(HELPER_TIMEOUT_SECS); +} + +static void +thread_create_at_qos(qos_class_t qos, void * (*function)(void *)) +{ + qos_class_t qos_thread; + pthread_t thread; + pthread_attr_t attr; + int ret; + + ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL); + if (ret != 0) { + T_LOG("set priority failed\n"); + } + + pthread_attr_init(&attr); + pthread_attr_set_qos_class_np(&attr, qos, 0); + pthread_create(&thread, &attr, function, NULL); + + T_LOG("pthread created\n"); + pthread_get_qos_class_np(thread, &qos_thread, NULL); + T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL); +} + +static void * +qos_send_and_sync_rcv(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + + T_LOG("Client: from created thread\n"); + T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + "pthread QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + /* enqueue two messages to make sure that mqueue is not empty */ + 
send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0), 0); + + send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0), 0); + + sleep(SEND_TIMEOUT_SECS); + + /* sync wait on msg port */ + receive(special_reply_port, qos_send_port); + + T_LOG("Client done doing sync rcv, now waiting for server to end the test"); + sleep(SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +T_HELPER_DECL(qos_client_send_sync_and_sync_rcv, + "Send messages and syncronously wait for rcv") +{ + thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_send_and_sync_rcv); + sleep(HELPER_TIMEOUT_SECS); +} + +static void * +qos_client_send_sync_msg_and_test_link(void *arg) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + boolean_t in_effect = FALSE; + kern_return_t kr; + unsigned long expected_result = (unsigned long) arg; + + kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + /* start monitoring sync ipc link */ + kr = mach_sync_ipc_link_monitoring_start(&special_reply_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_start"); + + /* Send the message to msg port */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0); + + /* + * wait for the reply + * some tests do not send a msg back so the receive + * might fail + */ + receive(special_reply_port, qos_send_port); + + /* stop monitoring link */ + kr = mach_sync_ipc_link_monitoring_stop(special_reply_port, &in_effect); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_stop"); + + if (!in_effect) + T_LOG("Link was broken"); + else + T_LOG("Link correct"); + + if (expected_result == 1) + 
T_ASSERT_TRUE(in_effect, "special reply port link after rcv"); + else + T_ASSERT_FALSE(in_effect, "special reply port link after rcv"); + T_END; +} + +static void * +qos_client_send_2sync_msg_and_test_link(void *arg) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + boolean_t in_effect = FALSE; + kern_return_t kr; + unsigned long expected_result = (unsigned long) arg; + + kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + /* start monitoring sync ipc link */ + kr = mach_sync_ipc_link_monitoring_start(&special_reply_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_start"); + + /* Send the first message to msg port */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0); + + /* wait for the reply */ + kr = receive(special_reply_port, qos_send_port); + T_QUIET;T_ASSERT_MACH_SUCCESS(kr, "receive"); + + /* Send the second message to msg port */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0); + + /* wait for the reply */ + kr = receive(special_reply_port, qos_send_port); + T_QUIET;T_ASSERT_MACH_SUCCESS(kr, "receive"); + + /* stop monitoring link */ + kr = mach_sync_ipc_link_monitoring_stop(special_reply_port, &in_effect); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_stop"); + + if (!in_effect) + T_LOG("Link was broken"); + else + T_LOG("Link correct"); + + if (expected_result == 1) + T_ASSERT_TRUE(in_effect, "special reply port link after rcv"); + else + T_ASSERT_FALSE(in_effect, "special reply port link after rcv"); + T_END; +} +T_HELPER_DECL(qos_client_send_sync_msg_with_link_check_correct_server, + "Send sync message, wait for reply and check sync ipc link") +{ + pthread_t thread; + pthread_attr_t attr; + unsigned 
long expected_result = 1; + + pthread_attr_init(&attr); + pthread_create(&thread, &attr, qos_client_send_sync_msg_and_test_link, (void *)expected_result); + + sleep(HELPER_TIMEOUT_SECS); +} + +T_HELPER_DECL(qos_client_send_sync_msg_with_link_check_incorrect_server, + "Send sync message, wait for reply and check sync ipc link") +{ + pthread_t thread; + pthread_attr_t attr; + unsigned long expected_result = 0; + + pthread_attr_init(&attr); + pthread_create(&thread, &attr, qos_client_send_sync_msg_and_test_link, (void *)expected_result); + + sleep(HELPER_TIMEOUT_SECS); +} + +T_HELPER_DECL(qos_client_send_2sync_msg_with_link_check_correct_server, + "Send sync message, wait for reply and check sync ipc link") +{ + pthread_t thread; + pthread_attr_t attr; + unsigned long expected_result = 1; + + pthread_attr_init(&attr); + pthread_create(&thread, &attr, qos_client_send_2sync_msg_and_test_link, (void *)expected_result); + + sleep(HELPER_TIMEOUT_SECS); +} + +T_HELPER_DECL(qos_client_send_2sync_msg_with_link_check_incorrect_server, + "Send sync message, wait for reply and check sync ipc link") +{ + pthread_t thread; + pthread_attr_t attr; + unsigned long expected_result = 0; + + pthread_attr_init(&attr); + pthread_create(&thread, &attr, qos_client_send_2sync_msg_and_test_link, (void *)expected_result); + + sleep(HELPER_TIMEOUT_SECS); +} + +static void * +qos_client_send_sync_msg(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + /* Send the message to msg port */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0); + + /* wait 
for the reply */ + receive(special_reply_port, qos_send_port); + + T_LOG("Client done sending messages, now waiting for server to end the test"); + sleep(2 * SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +T_HELPER_DECL(qos_client_send_sync_msg_with_pri, + "Send sync message and wait for reply") +{ + thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_send_sync_msg); + sleep(HELPER_TIMEOUT_SECS); +} + +static void * +qos_client_send_two_sync_msg_high_qos(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + /* Send the message to msg port */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0); + + /* wait for the reply */ + receive(special_reply_port, qos_send_port); + + T_LOG("Client done sending messages, now waiting for server to end the test"); + sleep(SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +static void * +qos_client_send_two_sync_msg_low_qos(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + /* Send the message to msg port */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0); + + 
/* wait for the reply */ + receive(special_reply_port, qos_send_port); + + T_LOG("Client done sending messages, now waiting for server to end the test"); + sleep(SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +T_HELPER_DECL(qos_client_send_two_sync_msg_with_thr_pri, + "Send messages sync msgs from 2 threads at given thread pri") +{ + thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_send_two_sync_msg_high_qos); + sleep(INTERMITTENT_TIMEOUT_SEC); + thread_create_at_qos(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], qos_client_send_two_sync_msg_low_qos); + sleep(HELPER_TIMEOUT_SECS); +} + +static mach_port_t other_thread_reply_port = MACH_PORT_NULL; +static void * +qos_client_destroy_other_threads_port(void *arg __unused) +{ + T_LOG("Waiting 6 seconds before destroying other thread's reply port"); + sleep(SEND_TIMEOUT_SECS); + + T_LOG("Destroying other thread's special reply port "); + mach_port_destroy(mach_task_self(), other_thread_reply_port); + + T_LOG("Other thread done destroying "); + sleep(3 * SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +static void * +qos_client_create_sepcial_reply_and_spawn_thread(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + KEVENT_QOS_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + other_thread_reply_port = special_reply_port; + + /* Send an async message */ + send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0); + + /* Send the sync ipc message */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + 
(uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0); + + /* Create a new thread to send the sync message on our special reply port */ + thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_destroy_other_threads_port); + + /* Client starting to receive messgae */ + receive(special_reply_port, qos_send_port); + + sleep(3 * SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +T_HELPER_DECL(qos_client_send_two_msg_and_destroy, + "Send a message with another threads special reply port while that thread destroys the port") +{ + thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_create_sepcial_reply_and_spawn_thread); + sleep(HELPER_TIMEOUT_SECS); +} + +static mach_port_t send_complex_connection_port = MACH_PORT_NULL; + +static void * +qos_client_send_complex_msg_to_service_port(void *arg __unused) +{ + mach_port_t svc_port, tsr_port, conn_port; + kern_return_t kr; + + kr = bootstrap_look_up(bootstrap_port, KEVENT_QOS_SERVICE_NAME, &svc_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + tsr_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(tsr_port), "get_thread_special_reply_port"); + + conn_port = send_complex_connection_port; + + T_LOG("Sending to the service port with a sync IPC"); + send(svc_port, tsr_port, conn_port, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), + MACH_SEND_PROPAGATE_QOS); + + receive(tsr_port, svc_port); + + sleep(3 * SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +static void * +qos_client_send_to_connection_then_service_port(void *arg __unused) +{ + mach_port_t tsr_port, conn_port; + mach_port_options_t opts = { + .flags = MPO_INSERT_SEND_RIGHT, + }; + kern_return_t kr; + + kr = mach_port_construct(mach_task_self(), &opts, 0ull, &conn_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct"); + 
send_complex_connection_port = conn_port; + + tsr_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(tsr_port), "get_thread_special_reply_port"); + + T_LOG("Sending to the connection port with a sync IPC"); + send(conn_port, tsr_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), + MACH_SEND_PROPAGATE_QOS); + + thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], + qos_client_send_complex_msg_to_service_port); + + receive(tsr_port, conn_port); + + sleep(3 * SEND_TIMEOUT_SECS); + + T_ASSERT_FAIL("client timed out"); + return NULL; +} + +T_HELPER_DECL(qos_client_send_complex_msg_with_pri, + "Send a message with several ports causing links") +{ + thread_create_at_qos(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], + qos_client_send_to_connection_then_service_port); + sleep(HELPER_TIMEOUT_SECS); +} + +static void +run_client_server(const char *server_name, const char *client_name, qos_class_t qos[], + const char *qos_name[], const char *wl_function) +{ + char *env[2 * ENV_VAR_QOS + ENV_VAR_FUNCTION + 1]; + env_set_qos(env, qos, qos_name, wl_function); + + for (int i = 0; i < ENV_VAR_QOS; i++) { + g_expected_qos[i] = qos[i]; + g_expected_qos_name[i] = qos_name[i]; + } + + dt_helper_t helpers[] = { + dt_launchd_helper_env("com.apple.xnu.test.kevent_qos.plist", + server_name, env), + dt_fork_helper(client_name) + }; + dt_run_helpers(helpers, 2, HELPER_TIMEOUT_SECS); +} + +#pragma mark Mach receive - kevent_qos + +static void +expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[], const char *wl_function) +{ + int r; + + /* Qos expected by workloop thread */ + for (int i = 0; i < ENV_VAR_QOS; i++) { + g_expected_qos[i] = qos[i]; + g_expected_qos_name[i] = qos_name[i]; + } + + if (strcmp(wl_function, "workloop_cb_test_intransit") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + 
(pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_and_enable") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_send_two_sync") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_send_two_sync, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_two_send_and_destroy") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_two_send_and_destroy, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_deallocate") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_deallocate, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply_kevent") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent, 0, 0), NULL); + } else if (strcmp(wl_function, 
"workloop_cb_test_sync_send_reply_kevent_pthread") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent_pthread, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_kevent_reply") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_kevent_reply, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_do_nothing") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_do_nothing, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_do_nothing_kevent_pthread") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_do_nothing_kevent_pthread, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_do_nothing_exit") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_do_nothing_exit, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply_kevent_reply_kevent") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent_reply_kevent, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_kevent_reply_reply_kevent") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_kevent_reply_reply_kevent, 0, 0), NULL); + } else if 
(strcmp(wl_function, "workloop_cb_test_sync_send_kevent_reply_kevent_reply") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_kevent_reply_kevent_reply, 0, 0), NULL); + } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply_kevent_kevent_reply") == 0) { + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent_kevent_reply, 0, 0), NULL); + } else { + T_ASSERT_FAIL("no workloop function specified \n"); + } + + struct kevent_qos_s kev[] = {{ + .ident = port, + .filter = EVFILT_MACHPORT, + .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED, + .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | + MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | + MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)), + .data = 1, + .qos = (int32_t)_pthread_qos_class_encode(qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0) + }}; + + struct kevent_qos_s kev_err[] = {{ 0 }}; + + /* Setup workloop for mach msg rcv */ + r = kevent_id(25, kev, 1, kev_err, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id"); + T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id"); + sleep(HELPER_TIMEOUT_SECS); +} + +T_HELPER_DECL(server_kevent_id, + "Reply with the QoS that a dispatch source event handler ran with") +{ + qos_class_t qos[ENV_VAR_QOS]; + const char *qos_name[ENV_VAR_QOS]; + const char *wl_function; + environ_get_qos(qos, qos_name, &wl_function); + + expect_kevent_id_recv(get_server_port(), qos, qos_name, wl_function); + sleep(HELPER_TIMEOUT_SECS); + T_ASSERT_FAIL("should receive a message within %d seconds", + RECV_TIMEOUT_SECS); +} + +#define TEST_QOS(server_name, client_name, name, wl_function_name, qos_bo, qos_bo_name, qos_qo, qos_qo_name, qos_ao, qos_ao_name) \ 
+ T_DECL(server_kevent_id_##name, \ + "Event delivery at " qos_ao_name " QoS using a kevent_id", \ + T_META_ASROOT(YES)) \ + { \ + qos_class_t qos_array[ENV_VAR_QOS] = {qos_bo, qos_qo, qos_ao}; \ + const char *qos_name_array[ENV_VAR_QOS] = {qos_bo_name, qos_qo_name, qos_ao_name}; \ + run_client_server(server_name, client_name, qos_array, qos_name_array, wl_function_name); \ + } +/* + * Test 1: Test special reply port SPI + * + * Create thread special reply port and check any subsequent calls to + * the same should return MACH_PORT_NULL, unless the reply port is destroyed. + */ +TEST_QOS("server_kevent_id", "qos_get_special_reply_port", special_reply_port, "workloop_cb_test_intransit", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +/* + * Test 2: Test sync ipc send to an in-transit port + * + * Send a sync ipc message (at IN qos) to an in-transit port enqueued in a port + * attached to a workloop. Test that the servicer of the workloop gets + * sync ipc override. + */ +TEST_QOS("server_kevent_id", "qos_client_send_to_intransit_with_thr_pri", transit_IN, "workloop_cb_test_intransit", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INITIATED, "user initiated") + +/* + * Test 3: Test sync ipc send to an in-transit port + * + * Send a sync ipc message (at UI qos) to an in-transit port enqueued in a port + * attached to a workloop. Test that the servicer of the workloop gets + * sync ipc override. 
+ */ +TEST_QOS("server_kevent_id", "qos_client_send_to_intransit_with_thr_pri", transit_UI, "workloop_cb_test_intransit", + QOS_CLASS_USER_INITIATED, "user initiated", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") + +/* + * Test 4: Test starting a sync rcv overrides the servicer + * + * Send an async message to a port and then start waiting on + * the port in mach msg rcv (at IN qos) with sync wait and test if the + * servicer of the workloop gets sync ipc override. + */ +TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_IN, "workloop_cb_test_intransit", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INITIATED, "user initiated") + +/* + * Test 5: Test starting a sync rcv overrides the servicer + * + * Send an async message to a port and then start waiting on + * the port in mach msg rcv (at UI qos) with sync wait and test if the + * servicer of the workloop gets sync ipc override. + */ +TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_UI, "workloop_cb_test_intransit", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion") + +/* + * Test 6: test sending sync ipc message (at IN qos) to port will override the servicer + * + * Send a message with sync ipc override to a port and check if the servicer + * of the workloop on other side gets sync ipc override. + */ +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_pri", send_sync_IN, "workloop_cb_test_sync_send", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INITIATED, "user initiated") + +/* + * Test 7: test sending sync ipc message (at UI qos) to port will override the servicer + * + * Send a message with sync ipc override to a port and check if the servicer + * of the workloop on other side gets sync ipc override. 
+ */ +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_pri", send_sync_UI, "workloop_cb_test_sync_send", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") + +/* + * Test 8: test enabling a knote in workloop handler will drop the sync ipc override of delivered message + * + * Send a sync ipc message to port and check the servicer of the workloop + * on other side gets sync ipc override and once the handler enables the knote, + * that sync ipc override is dropped. + */ +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_pri", send_sync_UI_and_enable, "workloop_cb_test_sync_send_and_enable", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") + +/* + * Test 9: test returning to begin processing drops sync ipc override of delivered message + * + * Send a sync ipc message and check if enabling the knote clears the override of + * the delivered message, but should still have the override of an enqueued message. + */ +TEST_QOS("server_kevent_id", "qos_client_send_two_sync_msg_with_thr_pri", send_two_sync_UI, "workloop_cb_test_send_two_sync", + QOS_CLASS_BACKGROUND, "background", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") + +/* + * Test 10: test destroying the special reply port drops the override + * + * Send an async messages and a sync ipc message, the workloop handler + * should get a sync ipc override, now test if destroying the special + * reply port drops the sync ipc override on the servicer. 
+ */ +TEST_QOS("server_kevent_id", "qos_client_send_two_msg_and_destroy", send_two_UI_and_destroy, "workloop_cb_test_two_send_and_destroy", + QOS_CLASS_BACKGROUND, "background", + QOS_CLASS_MAINTENANCE, "maintenance", + QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") +/* + * Test 11: test sending two ports with chaining + * + * Send a sync IPC to a connection port, which itself is embedded in a message + * sent as a sync IPC to a service port. + */ +TEST_QOS("server_kevent_id", "qos_client_send_complex_msg_with_pri", send_complex_sync_UI_and_enable, "workloop_cb_test_sync_send_and_enable", + QOS_CLASS_USER_INITIATED, "user initiated", + QOS_CLASS_USER_INITIATED, "user initiated", + QOS_CLASS_USER_INTERACTIVE, "user initiated with 47 basepri promotion") + +/* + * Test 12 - 19 + * + * Test single sync ipc link with server that breaks/preserves the link in different ways. + */ +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_s, "workloop_cb_test_sync_send_reply", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_d, "workloop_cb_test_sync_send_deallocate", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_sk, "workloop_cb_test_sync_send_reply_kevent", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_skp, "workloop_cb_test_sync_send_reply_kevent_pthread", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", 
"qos_client_send_sync_msg_with_link_check_incorrect_server", send_sync_link_incorrect_server_ks, "workloop_cb_test_sync_send_kevent_reply", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_n, "workloop_cb_test_sync_send_do_nothing", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_incorrect_server", send_sync_link_incorrect_server_kp, "workloop_cb_test_sync_send_do_nothing_kevent_pthread", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_e, "workloop_cb_test_sync_send_do_nothing_exit", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") +/* + * Test 20 - 23 + * + * Test sequential sync ipc link with server that breaks/preserves the link. 
+ */ +TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_correct_server", send_2sync_link_correct_server_sksk, "workloop_cb_test_sync_send_reply_kevent_reply_kevent", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_incorrect_server", send_2sync_link_incorrect_server_kssk, "workloop_cb_test_sync_send_kevent_reply_reply_kevent", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_incorrect_server", send_2sync_link_incorrect_server_ksks, "workloop_cb_test_sync_send_kevent_reply_kevent_reply", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") + +TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_incorrect_server", send_2sync_link_incorrect_server_skks, "workloop_cb_test_sync_send_reply_kevent_kevent_reply", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default", + QOS_CLASS_DEFAULT, "default") diff --git a/tools/tests/darwintests/kpc.c b/tests/kpc.c similarity index 100% rename from tools/tests/darwintests/kpc.c rename to tests/kpc.c diff --git a/tools/tests/darwintests/kperf.c b/tests/kperf.c similarity index 100% rename from tools/tests/darwintests/kperf.c rename to tests/kperf.c diff --git a/tools/tests/darwintests/kperf_backtracing.c b/tests/kperf_backtracing.c similarity index 100% rename from tools/tests/darwintests/kperf_backtracing.c rename to tests/kperf_backtracing.c diff --git a/tools/tests/darwintests/kperf_helpers.c b/tests/kperf_helpers.c similarity index 100% rename from tools/tests/darwintests/kperf_helpers.c rename to tests/kperf_helpers.c diff --git a/tools/tests/darwintests/kperf_helpers.h b/tests/kperf_helpers.h similarity index 100% rename from tools/tests/darwintests/kperf_helpers.h rename to tests/kperf_helpers.h 
diff --git a/tools/tests/darwintests/kqueue_add_and_trigger.c b/tests/kqueue_add_and_trigger.c similarity index 100% rename from tools/tests/darwintests/kqueue_add_and_trigger.c rename to tests/kqueue_add_and_trigger.c diff --git a/tools/tests/darwintests/kqueue_close.c b/tests/kqueue_close.c similarity index 100% rename from tools/tests/darwintests/kqueue_close.c rename to tests/kqueue_close.c diff --git a/tools/tests/darwintests/kqueue_fifo_18776047.c b/tests/kqueue_fifo_18776047.c similarity index 100% rename from tools/tests/darwintests/kqueue_fifo_18776047.c rename to tests/kqueue_fifo_18776047.c diff --git a/tools/tests/darwintests/kqueue_file_tests.c b/tests/kqueue_file_tests.c similarity index 100% rename from tools/tests/darwintests/kqueue_file_tests.c rename to tests/kqueue_file_tests.c diff --git a/tools/tests/darwintests/kqueue_timer_tests.c b/tests/kqueue_timer_tests.c similarity index 100% rename from tools/tests/darwintests/kqueue_timer_tests.c rename to tests/kqueue_timer_tests.c diff --git a/tools/tests/darwintests/launchd_plists/com.apple.xnu.test.kevent_qos.plist b/tests/launchd_plists/com.apple.xnu.test.kevent_qos.plist similarity index 100% rename from tools/tests/darwintests/launchd_plists/com.apple.xnu.test.kevent_qos.plist rename to tests/launchd_plists/com.apple.xnu.test.kevent_qos.plist diff --git a/tests/launchd_plists/com.apple.xnu.test.turnstile_multihop.plist b/tests/launchd_plists/com.apple.xnu.test.turnstile_multihop.plist new file mode 100644 index 000000000..e4d42415f --- /dev/null +++ b/tests/launchd_plists/com.apple.xnu.test.turnstile_multihop.plist @@ -0,0 +1,24 @@ + + + + + Label + com.apple.xnu.test.turnstile_multihop + MachServices + + com.apple.xnu.test.turnstile_multihop + + + ThrottleInterval + 1 + UserName + root + ProcessType + Adaptive + EnvironmentVariables + + MallocNanoZone + 1 + + + diff --git a/tests/ltable_exhaustion_test.c b/tests/ltable_exhaustion_test.c new file mode 100644 index 000000000..9bfeba8ea --- 
/dev/null +++ b/tests/ltable_exhaustion_test.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include +#include + +#define ITER 100 + +T_DECL(ltable_exhaustion_test, + "check if allocating not used ltable entries can panic the system", + T_META_ASROOT(true)) +{ + int n_ltable_entries,n_ltable_entries_after; + size_t len = sizeof(int); + int i; + mach_port_name_t portset; + + /* + * Get how many ltable entries are allocated right now. + */ + T_EXPECT_POSIX_SUCCESS(sysctlbyname("kern.n_ltable_entries", &n_ltable_entries, &len, NULL, 0), "kern.n_ltable_entries"); + + for (i = 0; i < ITER; i++) { + mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_PORT_SET, &portset); + } + + /* + * Get how many ltable entries are allocated after the loop. Other processes in the system might have allocated entries, + * so don't expect the same value. + */ + T_EXPECT_POSIX_SUCCESS(sysctlbyname("kern.n_ltable_entries", &n_ltable_entries_after, &len, NULL, 0), "kern.n_ltable_entries"); + + T_EXPECT_LE(n_ltable_entries_after, n_ltable_entries+ITER, "ltable before %d after %d iter %d", n_ltable_entries, n_ltable_entries_after, ITER); +} diff --git a/tools/tests/darwintests/mach_boottime_usec.c b/tests/mach_boottime_usec.c similarity index 100% rename from tools/tests/darwintests/mach_boottime_usec.c rename to tests/mach_boottime_usec.c diff --git a/tools/tests/darwintests/mach_continuous_time.c b/tests/mach_continuous_time.c similarity index 100% rename from tools/tests/darwintests/mach_continuous_time.c rename to tests/mach_continuous_time.c diff --git a/tools/tests/darwintests/mach_get_times.c b/tests/mach_get_times.c similarity index 100% rename from tools/tests/darwintests/mach_get_times.c rename to tests/mach_get_times.c diff --git a/tools/tests/darwintests/mach_port_deallocate_21692215.c b/tests/mach_port_deallocate_21692215.c similarity index 100% rename from tools/tests/darwintests/mach_port_deallocate_21692215.c rename to tests/mach_port_deallocate_21692215.c diff 
--git a/tests/mach_port_insert_right.c b/tests/mach_port_insert_right.c new file mode 100644 index 000000000..f4228927b --- /dev/null +++ b/tests/mach_port_insert_right.c @@ -0,0 +1,32 @@ +#include +#include +#include +#include + +T_DECL(mach_port_insert_right,"insert send right for an existing right", T_META_CHECK_LEAKS(false)) +{ + mach_port_t port = MACH_PORT_NULL; + mach_port_t port2 = MACH_PORT_NULL; + kern_return_t retval; + + mach_port_t task = mach_task_self(); + + retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port); + T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port); + + mach_port_name_t name = 123; + + retval = mach_port_insert_right(task, name, port, MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_ERROR(retval, KERN_FAILURE, "insert a send right for port=[%d] with name=[%d]", port, name); + + name = port + 1; + retval = mach_port_insert_right(task, name, port, MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_ERROR(retval, KERN_FAILURE, "insert a send right for port=[%d] with name=[%d]", port, name); + + retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port2); + T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port2); + + name = port; + retval = mach_port_insert_right(task, name, port2, MACH_MSG_TYPE_MAKE_SEND); + T_ASSERT_MACH_ERROR(retval, KERN_RIGHT_EXISTS, "insert a send right for port=[%d] with name=[%d]", port2, name); +} diff --git a/tools/tests/darwintests/mach_port_mod_refs.c b/tests/mach_port_mod_refs.c similarity index 100% rename from tools/tests/darwintests/mach_port_mod_refs.c rename to tests/mach_port_mod_refs.c diff --git a/tools/tests/darwintests/mach_timebase_info.c b/tests/mach_timebase_info.c similarity index 100% rename from tools/tests/darwintests/mach_timebase_info.c rename to tests/mach_timebase_info.c diff --git a/tests/memorystatus_freeze_test.c b/tests/memorystatus_freeze_test.c new file mode 100644 index 000000000..d41c66465 --- /dev/null +++ b/tests/memorystatus_freeze_test.c @@ -0,0 +1,270 @@ 
+#include +#include +#include +#include +#include + +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm"), + T_META_CHECK_LEAKS(false) +); + +#define MEM_SIZE_MB 10 +#define NUM_ITERATIONS 5 + +#define CREATE_LIST(X) \ + X(SUCCESS) \ + X(TOO_FEW_ARGUMENTS) \ + X(SYSCTL_VM_PAGESIZE_FAILED) \ + X(VM_PAGESIZE_IS_ZERO) \ + X(SYSCTL_VM_FREEZE_ENABLED_FAILED) \ + X(FREEZER_DISABLED) \ + X(DISPATCH_SOURCE_CREATE_FAILED) \ + X(INITIAL_SIGNAL_TO_PARENT_FAILED) \ + X(SIGNAL_TO_PARENT_FAILED) \ + X(MEMORYSTATUS_CONTROL_FAILED) \ + X(IS_FREEZABLE_NOT_AS_EXPECTED) \ + X(MEMSTAT_PRIORITY_CHANGE_FAILED) \ + X(EXIT_CODE_MAX) + +#define EXIT_CODES_ENUM(VAR) VAR, +enum exit_codes_num { + CREATE_LIST(EXIT_CODES_ENUM) +}; + +#define EXIT_CODES_STRING(VAR) #VAR, +static const char *exit_codes_str[] = { + CREATE_LIST(EXIT_CODES_STRING) +}; + + +static pid_t pid = -1; +static int freeze_count = 0; + +void move_to_idle_band(void); +void run_freezer_test(int size_mb); +void freeze_helper_process(void); + + +void move_to_idle_band(void) { + + memorystatus_priority_properties_t props; + /* + * Freezing a process also moves it to an elevated jetsam band in order to protect it from idle exits. + * So we move the child process to the idle band to mirror the typical 'idle app being frozen' scenario. + */ + props.priority = JETSAM_PRIORITY_IDLE; + props.user_data = 0; + + /* + * This requires us to run as root (in the absence of entitlement). + * Hence the T_META_ASROOT(true) in the T_HELPER_DECL. 
+ */ + if (memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, getpid(), 0, &props, sizeof(props))) { + exit(MEMSTAT_PRIORITY_CHANGE_FAILED); + } +} + +void freeze_helper_process(void) { + int ret; + + T_LOG("Freezing child pid %d", pid); + ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid)); + sleep(1); + + if (freeze_count % 2 == 0) { + /* + * The child process toggles its freezable state on each iteration. + * So a failure for every alternate freeze is expected. + */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed"); + T_LOG("Freeze succeeded. Thawing child pid %d", pid); + ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid)); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed"); + } else { + T_QUIET; T_ASSERT_TRUE(ret != KERN_SUCCESS, "Freeze should have failed"); + T_LOG("Freeze failed as expected"); + } + + freeze_count++; + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process"); +} + +void run_freezer_test(int size_mb) { + int ret; + char sz_str[50]; + char **launch_tool_args; + char testpath[PATH_MAX]; + uint32_t testpath_buf_size; + dispatch_source_t ds_freeze, ds_proc; + +#ifndef CONFIG_FREEZE + T_SKIP("Task freeze not supported."); +#endif + + signal(SIGUSR1, SIG_IGN); + ds_freeze = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_NOTNULL(ds_freeze, "dispatch_source_create (ds_freeze)"); + + dispatch_source_set_event_handler(ds_freeze, ^{ + if (freeze_count < NUM_ITERATIONS) { + freeze_helper_process(); + } else { + kill(pid, SIGKILL); + dispatch_source_cancel(ds_freeze); + } + }); + dispatch_activate(ds_freeze); + + testpath_buf_size = sizeof(testpath); + ret = _NSGetExecutablePath(testpath, &testpath_buf_size); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath"); + T_LOG("Executable path: %s", testpath); + + sprintf(sz_str, 
"%d", size_mb); + launch_tool_args = (char *[]){ + testpath, + "-n", + "allocate_pages", + "--", + sz_str, + NULL + }; + + /* Spawn the child process. Suspend after launch until the exit proc handler has been set up. */ + ret = dt_launch_tool(&pid, launch_tool_args, true, NULL, NULL); + if (ret != 0) { + T_LOG("dt_launch tool returned %d with error code %d", ret, errno); + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool"); + + ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue()); + T_QUIET; T_ASSERT_NOTNULL(ds_proc, "dispatch_source_create (ds_proc)"); + + dispatch_source_set_event_handler(ds_proc, ^{ + int status = 0, code = 0; + pid_t rc = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_EQ(rc, pid, "waitpid"); + code = WEXITSTATUS(status); + + if (code == 0) { + T_END; + } else if (code > 0 && code < EXIT_CODE_MAX) { + T_ASSERT_FAIL("Child exited with %s", exit_codes_str[code]); + } else { + T_ASSERT_FAIL("Child exited with unknown exit code %d", code); + } + }); + dispatch_activate(ds_proc); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process"); + dispatch_main(); +} + +T_HELPER_DECL(allocate_pages, + "allocates pages to freeze", + T_META_ASROOT(true)) { + int i, j, temp, ret, size_mb, vmpgsize; + size_t len; + char val; + __block int num_pages, num_iter = 0; + __block char **buf; + dispatch_source_t ds_signal; + + len = sizeof(vmpgsize); + ret = sysctlbyname("vm.pagesize", &vmpgsize, &len, NULL, 0); + if (ret != 0) { + exit(SYSCTL_VM_PAGESIZE_FAILED); + } + if (vmpgsize == 0) { + exit(VM_PAGESIZE_IS_ZERO); + } + + if (argc < 1) { + exit(TOO_FEW_ARGUMENTS); + } + + len = sizeof(temp); + ret = sysctlbyname("vm.freeze_enabled", &temp, &len, NULL, 0); + if (ret != 0) { + exit(SYSCTL_VM_FREEZE_ENABLED_FAILED); + } + if (temp == 0) { + exit(FREEZER_DISABLED); + } + + size_mb = atoi(argv[0]); + num_pages = size_mb * 1024 * 1024 / vmpgsize; + buf = 
(char**)malloc(sizeof(char*) * (size_t)num_pages); + + /* Gives us the compression ratio we see in the typical case (~2.7) */ + for (j = 0; j < num_pages; j++) { + buf[j] = (char*)malloc((size_t)vmpgsize * sizeof(char)); + val = 0; + for (i = 0; i < vmpgsize; i += 16) { + memset(&buf[j][i], val, 16); + if (i < 3400 * (vmpgsize / 4096)) { + val++; + } + } + } + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), dispatch_get_main_queue(), ^{ + /* Signal to the parent that we're done allocating and it's ok to freeze us */ + printf("Sending initial signal to parent to begin freezing\n"); + if (kill(getppid(), SIGUSR1) != 0) { + exit(INITIAL_SIGNAL_TO_PARENT_FAILED); + } + }); + + signal(SIGUSR1, SIG_IGN); + ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); + if (ds_signal == NULL) { + exit(DISPATCH_SOURCE_CREATE_FAILED); + } + + dispatch_source_set_event_handler(ds_signal, ^{ + int current_state, new_state; + volatile int tmp; + + /* Make sure all the pages are accessed before trying to freeze again */ + for (int x = 0; x < num_pages; x++) { + tmp = buf[x][0]; + } + + current_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0); + + /* Toggle freezable state */ + new_state = (current_state) ? 0: 1; + printf("Changing state from %s to %s\n", (current_state) ? "freezable": "unfreezable", (new_state) ? 
"freezable": "unfreezable"); + if (memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), (uint32_t)new_state, NULL, 0) != KERN_SUCCESS) { + exit(MEMORYSTATUS_CONTROL_FAILED); + } + + /* Verify that the state has been set correctly */ + current_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0); + if (new_state != current_state) { + exit(IS_FREEZABLE_NOT_AS_EXPECTED); + } + num_iter++; + + if (kill(getppid(), SIGUSR1) != 0) { + exit(SIGNAL_TO_PARENT_FAILED); + } + }); + dispatch_activate(ds_signal); + move_to_idle_band(); + + dispatch_main(); +} + +T_DECL(freeze, "VM freezer test") { + run_freezer_test(MEM_SIZE_MB); +} diff --git a/tools/tests/darwintests/memorystatus_vm_map_fork.c b/tests/memorystatus_vm_map_fork.c similarity index 96% rename from tools/tests/darwintests/memorystatus_vm_map_fork.c rename to tests/memorystatus_vm_map_fork.c index dc92e5c11..e321bea17 100644 --- a/tools/tests/darwintests/memorystatus_vm_map_fork.c +++ b/tests/memorystatus_vm_map_fork.c @@ -30,8 +30,8 @@ extern char **environ; * kernel routine used to generate a corpse task. * * A corpse is allowed to be taken if a task's memory resource limit that - * is exceeded is less than 1/2 of the system wide task limit. - * If the amount exceeds 1/2 the sytem wide limit, then the corpse is disallowed. + * is exceeded is less than 1/4 of the system wide task limit. + * If the amount exceeds 1/4 the sytem wide limit, then the corpse is disallowed. * * If the device under test is already under pressure, the test * could fail due to jetsam cutting in and killing the parent, child or @@ -330,21 +330,21 @@ memorystatus_vm_map_fork_parent(int test_variant) if (test_variant == TEST_ALLOWED) { /* - * Tell the child to allocate less than 1/2 the system wide limit. + * Tell the child to allocate less than 1/4 the system wide limit. 
*/ - if (max_task_pmem / 2 - LIMIT_DELTA_MB <= 0) { + if (max_task_pmem / 4 - LIMIT_DELTA_MB <= 0) { active_limit_mb = LIMIT_DELTA_MB; } else { - active_limit_mb = max_task_pmem / 2 - LIMIT_DELTA_MB; + active_limit_mb = max_task_pmem / 4 - LIMIT_DELTA_MB; } expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED; } else { /* TEST_NOT_ALLOWED */ /* - * Tell the child to allocate more than 1/2 the system wide limit. + * Tell the child to allocate more than 1/4 the system wide limit. */ - active_limit_mb = (max_task_pmem / 2) + LIMIT_DELTA_MB; + active_limit_mb = (max_task_pmem / 4) + LIMIT_DELTA_MB; if (max_task_pmem == 0) { expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED; } else { diff --git a/tools/tests/darwintests/memorystatus_zone_test.c b/tests/memorystatus_zone_test.c similarity index 98% rename from tools/tests/darwintests/memorystatus_zone_test.c rename to tests/memorystatus_zone_test.c index f652725bb..007970ec9 100644 --- a/tools/tests/darwintests/memorystatus_zone_test.c +++ b/tests/memorystatus_zone_test.c @@ -274,6 +274,11 @@ static void cleanup_and_end_test(void) /* Kill all the child processes that were spawned */ for (i = 0; i < num_children; i++) { kill(child_pids[i], SIGKILL); + /* + * Sleep between kills to avoid hogging the VM map entries zone lock (on the task_terminate path). + * Without this we were seeing hw_lock_bit timeouts in BATS. 
+ */ + sleep(1); } for (i = 0; i < num_children; i++) { int status = 0; diff --git a/tools/tests/darwintests/mktimer_kobject.c b/tests/mktimer_kobject.c similarity index 100% rename from tools/tests/darwintests/mktimer_kobject.c rename to tests/mktimer_kobject.c diff --git a/tools/tests/darwintests/monotonic_core.c b/tests/monotonic_core.c similarity index 97% rename from tools/tests/darwintests/monotonic_core.c rename to tests/monotonic_core.c index 66bcc3185..3feaeba94 100644 --- a/tools/tests/darwintests/monotonic_core.c +++ b/tests/monotonic_core.c @@ -184,25 +184,26 @@ perf_sysctl_deltas(const char *sysctl_name, const char *stat_name) } T_DECL(perf_core_fixed_cpu, "test the performance of fixed CPU counter access", - T_META_ASROOT(true)) + T_META_ASROOT(true), T_META_TAG_PERF) { perf_sysctl_deltas("kern.monotonic.fixed_cpu_perf", "fixed_cpu_counters"); } T_DECL(perf_core_fixed_thread, "test the performance of fixed thread counter access", - T_META_ASROOT(true)) + T_META_ASROOT(true), T_META_TAG_PERF) { perf_sysctl_deltas("kern.monotonic.fixed_thread_perf", "fixed_thread_counters"); } T_DECL(perf_core_fixed_task, "test the performance of fixed task counter access", - T_META_ASROOT(true)) + T_META_ASROOT(true), T_META_TAG_PERF) { perf_sysctl_deltas("kern.monotonic.fixed_task_perf", "fixed_task_counters"); } -T_DECL(perf_core_fixed_thread_self, "test the performance of thread self counts") +T_DECL(perf_core_fixed_thread_self, "test the performance of thread self counts", + T_META_TAG_PERF) { extern int thread_selfcounts(int type, void *buf, size_t nbytes); uint64_t counts[2][2]; diff --git a/tools/tests/darwintests/net_tun_pr_35136664.c b/tests/net_tun_pr_35136664.c similarity index 100% rename from tools/tests/darwintests/net_tun_pr_35136664.c rename to tests/net_tun_pr_35136664.c diff --git a/tools/tests/darwintests/net_tuntests.c b/tests/net_tuntests.c similarity index 100% rename from tools/tests/darwintests/net_tuntests.c rename to tests/net_tuntests.c diff 
--git a/tools/tests/darwintests/netbsd_utimensat.c b/tests/netbsd_utimensat.c similarity index 100% rename from tools/tests/darwintests/netbsd_utimensat.c rename to tests/netbsd_utimensat.c diff --git a/tools/tests/darwintests/network_entitlements.plist b/tests/network_entitlements.plist similarity index 81% rename from tools/tests/darwintests/network_entitlements.plist rename to tests/network_entitlements.plist index c326c8341..83c92cad6 100644 --- a/tools/tests/darwintests/network_entitlements.plist +++ b/tests/network_entitlements.plist @@ -6,5 +6,7 @@ com.apple.security.network.server + com.apple.private.skywalk.register-kernel-pipe + diff --git a/tools/tests/darwintests/no32exec_35914211.c b/tests/no32exec_35914211.c similarity index 100% rename from tools/tests/darwintests/no32exec_35914211.c rename to tests/no32exec_35914211.c diff --git a/tools/tests/darwintests/no32exec_35914211_helper.c b/tests/no32exec_35914211_helper.c similarity index 100% rename from tools/tests/darwintests/no32exec_35914211_helper.c rename to tests/no32exec_35914211_helper.c diff --git a/tools/tests/darwintests/ntp_adjtime_29192647.c b/tests/ntp_adjtime_29192647.c similarity index 100% rename from tools/tests/darwintests/ntp_adjtime_29192647.c rename to tests/ntp_adjtime_29192647.c diff --git a/tools/tests/darwintests/perf_compressor.c b/tests/perf_compressor.c similarity index 93% rename from tools/tests/darwintests/perf_compressor.c rename to tests/perf_compressor.c index 1d3b23d2c..1a8a57f3f 100644 --- a/tools/tests/darwintests/perf_compressor.c +++ b/tests/perf_compressor.c @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef T_NAMESPACE #undef T_NAMESPACE @@ -11,7 +12,8 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.vm.perf"), - T_META_CHECK_LEAKS(false) + T_META_CHECK_LEAKS(false), + T_META_TAG_PERF ); enum { @@ -106,12 +108,6 @@ void freeze_helper_process(void) { int64_t compressed_before, compressed_after, input_before, input_after; size_t length; - /* - * Wait a bit after 
the pages have been allocated/accessed before trying to freeze. - * The sleeps are not needed, they just separate the operations into three logical chunks: - * touch a few pages, freeze them, thaw them (and repeat). - */ - usleep(100); length = sizeof(compressed_before); T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_compressed_bytes", &compressed_before, &length, NULL, 0), "failed to query vm.compressor_compressed_bytes"); @@ -130,18 +126,14 @@ void freeze_helper_process(void) { T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_input_bytes", &input_after, &length, NULL, 0), "failed to query vm.compressor_input_bytes"); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed on pid %d", pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed"); dt_stat_add(r, (double)(input_after - input_before)/(double)(compressed_after - compressed_before)); - /* Wait a bit after freezing before trying to thaw */ - usleep(100); ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid)); - T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed on pid %d", pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed"); - /* Wait a bit after thawing before pages can be re-accessed */ - usleep(100); - T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process [%d]", pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process"); } void run_compressor_test(int size_mb, int page_type) { @@ -159,6 +151,8 @@ void run_compressor_test(int size_mb, int page_type) { r = dt_stat_create("(input bytes / compressed bytes)", "compression_ratio"); s = dt_stat_time_create("compressor_latency"); + // This sets the A/B failure threshold at 50% of baseline for compressor_latency + dt_stat_set_variable(s, kPCFailureThresholdPctVar, 50.0); signal(SIGUSR1, SIG_IGN); ds_freeze = 
dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue()); @@ -220,7 +214,7 @@ void run_compressor_test(int size_mb, int page_type) { }); dispatch_activate(ds_proc); - T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process [%d]", pid); + T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process"); dispatch_main(); } @@ -302,6 +296,9 @@ T_HELPER_DECL(allocate_pages, "allocates pages to compress") { } // Numbers for 10MB and above are fairly reproducible. Anything smaller shows a lot of variation. + +// Keeping just the 100MB version for iOSMark +#ifndef DT_IOSMARK T_DECL(compr_10MB_zero, "Compressor latencies") { run_compressor_test(10, ALL_ZEROS); } @@ -329,6 +326,7 @@ T_DECL(compr_100MB_mostly_zero, "Compressor latencies") { T_DECL(compr_100MB_random, "Compressor latencies") { run_compressor_test(100, RANDOM); } +#endif T_DECL(compr_100MB_typical, "Compressor latencies") { run_compressor_test(100, TYPICAL); diff --git a/tests/perf_exit.c b/tests/perf_exit.c new file mode 100644 index 000000000..1dba37cd0 --- /dev/null +++ b/tests/perf_exit.c @@ -0,0 +1,190 @@ +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif +#include +#include + +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.perf"), + T_META_ASROOT(true), + T_META_LTEPHASE(LTE_SINGLEUSER), + T_META_TAG_PERF +); +#if TARGET_OS_WATCH +#define TEST_TIMEOUT 3600 * (NSEC_PER_SEC) +#else +#define TEST_TIMEOUT 1800 * (NSEC_PER_SEC) +#endif +// From bsd/sys/proc_internal.h +#define PID_MAX 99999 + +#define EXIT_BINARY "perf_exit_proc" +#define EXIT_BINARY_PATH "./" EXIT_BINARY + +#define NEXT_CASE_EVENTID (0xfedcbb00) + +struct test_case { + int wired_mem; + int threads; +}; + +static struct test_case test_cases[] = { + {0, 0}, + {0, 10}, + {1000000, 0}, +#if !TARGET_OS_WATCH + {10000000, 0} +#endif +}; + +#define TEST_CASES_COUNT (sizeof(test_cases) / 
sizeof(struct test_case)) + +static _Atomic int producer_i, consumer_i; + +static ktrace_session_t session; + +static dispatch_queue_t spawn_queue, processing_queue; + +static uint64_t *begin_ts; +static dt_stat_time_t s; +static _Atomic bool tracing_on = false; + +void run_exit_test(int proc_wired_mem, int nthreads); + +static void cleanup(void) { + free(begin_ts); + dispatch_release(spawn_queue); + dispatch_release(processing_queue); + if (tracing_on) { + ktrace_end(session, 1); + } +} + +static dt_stat_time_t +create_stat(int proc_wired_mem, int nthreads) +{ + dt_stat_time_t dst = dt_stat_time_create("time"); + T_ASSERT_NOTNULL(dst, "created time statistic"); + + dt_stat_set_variable((dt_stat_t)dst, "proc_threads", nthreads); + dt_stat_set_variable((dt_stat_t)dst, "proc_wired_mem", proc_wired_mem); + + return dst; +} + +T_DECL(exit, "exit(2) time from syscall start to end", T_META_TIMEOUT(TEST_TIMEOUT)) { + s = create_stat(test_cases[consumer_i].wired_mem, test_cases[consumer_i].threads); + + begin_ts = calloc((size_t)PID_MAX + 1, sizeof(uint64_t)); /* zero-filled: 0 means "no start event seen"; +1 keeps pid == PID_MAX in bounds */ + T_ASSERT_NOTNULL(begin_ts, "created pid array"); + + session = ktrace_session_create(); + T_ASSERT_NOTNULL(session, "created a trace session"); + + spawn_queue = dispatch_queue_create("com.apple.perf_exit.spawn_queue", NULL); + processing_queue = dispatch_queue_create("com.apple.perf_exit.processing_queue", NULL); + + T_ATEND(cleanup); /* registered only once the queues exist: cleanup() releases them unconditionally */ + + ktrace_set_completion_handler(session, ^{ + T_ASSERT_EQ(consumer_i, TEST_CASES_COUNT, "ran all the test cases"); + dispatch_sync(spawn_queue, ^(void) { + tracing_on = false; + }); + ktrace_session_destroy(session); + T_END; + }); + + ktrace_set_signal_handler(session); + ktrace_set_execnames_enabled(session, KTRACE_FEATURE_ENABLED); + + // We are only interested in the processes we launched and ourselves + ktrace_filter_process(session, EXIT_BINARY); + ktrace_filter_process(session, "perf_exit"); + + ktrace_events_single(session, NEXT_CASE_EVENTID, ^(__unused ktrace_event_t e) { +
consumer_i++; + dt_stat_finalize(s); + if (consumer_i >= TEST_CASES_COUNT) { + ktrace_end(session, 1); + } + else { + s = create_stat(test_cases[consumer_i].wired_mem, test_cases[consumer_i].threads); + } + }); + + ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_EXCP_SC, 1) | DBG_FUNC_START), ^(ktrace_event_t e) { + T_QUIET; T_ASSERT_LE(e->pid, PID_MAX, "pid %d is valid in start tracepoint", e->pid); + begin_ts[e->pid] = e->timestamp; + }); + + ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END), ^(ktrace_event_t e) { + T_ASSERT_LE(e->pid, PID_MAX, "pid %d is valid in end tracepoint", e->pid); + + if (begin_ts[e->pid] == 0) { + return; + } + + T_QUIET; T_ASSERT_LE(begin_ts[e->pid], e->timestamp, "timestamps are monotonically increasing"); + dt_stat_mach_time_add(s, e->timestamp - begin_ts[e->pid]); + + + if (dt_stat_stable(s) && producer_i == consumer_i) { + dispatch_sync(spawn_queue, ^(void) { + producer_i++; + T_ASSERT_POSIX_ZERO(kdebug_trace(NEXT_CASE_EVENTID, producer_i, 0, 0, 0), "kdebug_trace returns 0"); + }); + } + }); + + int ret = ktrace_start(session, processing_queue); + T_ASSERT_POSIX_ZERO(ret, "starting trace"); + tracing_on = true; + + // Spawn processes continuously until the test is over + + __block void (^spawn_process)(void) = Block_copy(^(void) { + char nthreads_buf[32], mem_buf[32]; + + if (producer_i >= TEST_CASES_COUNT || !tracing_on) { + return; + } + + snprintf(nthreads_buf, 32, "%d", test_cases[producer_i].threads); + snprintf(mem_buf, 32, "%d", test_cases[producer_i].wired_mem); + + char *args[] = {EXIT_BINARY_PATH, nthreads_buf, mem_buf, NULL}; + int status; + + pid_t pid; + int bret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); + T_ASSERT_POSIX_ZERO(bret, "spawned process with pid %d (threads=%s mem=%s)", pid, nthreads_buf, mem_buf); + + bret = waitpid(pid, &status, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(bret, "waited for process %d\n", pid); + + if (!WIFEXITED(status) || WEXITSTATUS(status) 
!= 0) + T_ASSERT_FAIL("child process failed to run"); + + // Avoid saturating the CPU with new processes + usleep(1000); + + dispatch_async(spawn_queue, spawn_process); + }); + + dispatch_async(spawn_queue, spawn_process); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT), dispatch_get_main_queue(), ^{ + ktrace_end(session, 0); + }); + + dispatch_main(); +} + diff --git a/tools/tests/darwintests/perf_exit_proc.c b/tests/perf_exit_proc.c similarity index 74% rename from tools/tests/darwintests/perf_exit_proc.c rename to tests/perf_exit_proc.c index fa157cdd6..b8bb88a7f 100644 --- a/tools/tests/darwintests/perf_exit_proc.c +++ b/tests/perf_exit_proc.c @@ -60,34 +60,16 @@ static int allocate_and_wire_memory(mach_vm_size_t size) { return 0; } -static int set_thread_priority(int priority) { - struct sched_param param; - int policy; - - int err = pthread_getschedparam(pthread_self(), &policy, ¶m); - if (err) return err; - - param.sched_priority = priority; - - err = pthread_setschedparam(pthread_self(), policy, ¶m); - if (err) return err; - - return 0; -} - int main(int argc, char *argv[]) { - int priority = 47, nthreads = 0; + int nthreads = 0; int err; mach_vm_size_t wired_mem = 0; if (argc > 1) { - priority = (int)strtoul(argv[1], NULL, 10); + nthreads = (int)strtoul(argv[1], NULL, 10); } if (argc > 2) { - nthreads = (int)strtoul(argv[2], NULL, 10); - } - if (argc > 3) { - wired_mem = (mach_vm_size_t)strtoul(argv[3], NULL, 10); + wired_mem = (mach_vm_size_t)strtoul(argv[2], NULL, 10); } err = allocate_and_wire_memory(wired_mem); @@ -95,11 +77,6 @@ int main(int argc, char *argv[]) { return err; } - err = set_thread_priority(priority); - if (err) { - return err; - } - err = run_additional_threads(nthreads); if (err) { return err; diff --git a/tools/tests/darwintests/perf_kdebug.c b/tests/perf_kdebug.c similarity index 99% rename from tools/tests/darwintests/perf_kdebug.c rename to tests/perf_kdebug.c index f0f058fbd..0b8240ec4 100644 --- 
a/tools/tests/darwintests/perf_kdebug.c +++ b/tests/perf_kdebug.c @@ -12,7 +12,8 @@ T_GLOBAL_META( T_META_NAMESPACE("xnu.perf.kdebug"), T_META_ASROOT(true), - T_META_CHECK_LEAKS(false) + T_META_CHECK_LEAKS(false), + T_META_TAG_PERF ); // diff --git a/tools/tests/darwintests/perf_spawn_fork.c b/tests/perf_spawn_fork.c similarity index 93% rename from tools/tests/darwintests/perf_spawn_fork.c rename to tests/perf_spawn_fork.c index 13a85fff7..fad33b2ae 100644 --- a/tools/tests/darwintests/perf_spawn_fork.c +++ b/tests/perf_spawn_fork.c @@ -8,8 +8,9 @@ #include T_GLOBAL_META( - T_META_NAMESPACE("xnu.perf.fork"), - T_META_CHECK_LEAKS(false) + T_META_NAMESPACE("xnu.perf"), + T_META_CHECK_LEAKS(false), + T_META_TAG_PERF ); #define SPAWN_MEASURE_LOOP(s) \ @@ -38,7 +39,7 @@ T_DECL(posix_spawn_platform_binary_latency, "posix_spawn platform binary latency } { - dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on-cpu time"); + dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on_cpu_time"); SPAWN_MEASURE_LOOP(s); dt_stat_finalize(s); } @@ -68,7 +69,7 @@ T_DECL(fork, "fork latency") { dt_stat_finalize(s); } { - dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on-cpu time"); + dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on_cpu_time"); FORK_MEASURE_LOOP(s); dt_stat_finalize(s); } diff --git a/tests/perf_vmfault.c b/tests/perf_vmfault.c new file mode 100644 index 000000000..e3e81f1bd --- /dev/null +++ b/tests/perf_vmfault.c @@ -0,0 +1,243 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.vm.perf"), + T_META_CHECK_LEAKS(false), + T_META_TAG_PERF +); + +#ifdef DT_IOSMARK +#define MEMSIZE (1UL<<29) /* 512 MB */ +#else +#define MEMSIZE (1UL<<27) /* 128 MB */ +#endif + +enum { + SOFT_FAULT, + ZERO_FILL, + NUM_TESTS +}; + +static int test_type; +static int num_threads; +static int ready_thread_count; +static size_t pgsize; +static size_t 
num_pages; +static char *memblock; +static char *memblock_share; +static dt_stat_time_t t; +static pthread_cond_t start_cvar; +static pthread_cond_t threads_ready_cvar; +static pthread_mutex_t ready_thread_count_lock; + +static void map_mem_regions(void); +static void unmap_mem_regions(void); +static void fault_pages(int thread_id); +static void execute_threads(void); +static void *thread_setup(void *arg); +static void run_test(int test, int threads, int cpus); +static int get_ncpu(void); + +static void map_mem_regions(void) +{ + char *ptr; + volatile char val; + vm_prot_t curprot, maxprot; + + memblock = (char *)mmap(NULL, MEMSIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + T_QUIET; T_ASSERT_NE((void *)memblock, MAP_FAILED, "mmap"); + + if (test_type == SOFT_FAULT) { + + /* Fault in all the pages of the original region. */ + for(ptr = memblock; ptr < memblock + MEMSIZE; ptr += pgsize) { + val = *ptr; + } + /* Remap the region so that subsequent accesses result in read soft faults. 
*/ + T_QUIET; T_ASSERT_MACH_SUCCESS(vm_remap(mach_task_self(), (vm_address_t *)&memblock_share, + MEMSIZE, 0, VM_FLAGS_ANYWHERE, mach_task_self(), (vm_address_t)memblock, FALSE, + &curprot, &maxprot, VM_INHERIT_DEFAULT), "vm_remap"); + } +} + +static void unmap_mem_regions(void) +{ + if (test_type == SOFT_FAULT) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(munmap(memblock_share, MEMSIZE), "munmap"); /* munmap() reports failure as -1/errno, not a kern_return_t */ + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(munmap(memblock, MEMSIZE), "munmap"); +} + +static void fault_pages(int thread_id) +{ + size_t region_len, region_start, region_end; + char *ptr, *block; + volatile char val; + + region_len = num_pages / (size_t)num_threads; + region_start = region_len * (size_t)thread_id; + + if((size_t)thread_id < num_pages % (size_t)num_threads) { + region_start += (size_t)thread_id; + region_len++; + } + else { + region_start += num_pages % (size_t)num_threads; + } + + region_start *= pgsize; + region_len *= pgsize; + region_end = region_start + region_len; + + block = (test_type == SOFT_FAULT)?
memblock_share: memblock; + for(ptr = block + region_start; ptr < block + region_end; ptr += pgsize) { + val = *ptr; + } +} + +static void execute_threads(void) +{ + int thread_index, thread_retval; + int *thread_indices; + void *thread_retval_ptr = &thread_retval; + pthread_t* threads; + + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_ready_cvar, NULL), "pthread_cond_init"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&start_cvar, NULL), "pthread_cond_init"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&ready_thread_count_lock, NULL), "pthread_mutex_init"); + ready_thread_count = 0; + + threads = (pthread_t *)malloc(sizeof(*threads) * (size_t)num_threads); + thread_indices = (int *)malloc(sizeof(*thread_indices) * (size_t)num_threads); + for(thread_index = 0; thread_index < num_threads; thread_index++) { + thread_indices[thread_index] = thread_index; + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&threads[thread_index], NULL, + thread_setup, (void *)&thread_indices[thread_index]), "pthread_create"); + } + + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock"); + while(ready_thread_count != num_threads) { /* loop, not if: pthread_cond_wait may return spuriously, so re-check the predicate */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_ready_cvar, &ready_thread_count_lock), + "pthread_cond_wait"); + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock"); + + T_STAT_MEASURE(t) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_broadcast(&start_cvar), "pthread_cond_broadcast"); + for(thread_index = 0; thread_index < num_threads; thread_index++) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_join(threads[thread_index], &thread_retval_ptr), + "pthread_join"); + } + }; + + free(threads); + free(thread_indices); +} + +static void *thread_setup(void *arg) +{ + int my_index = *((int *)arg); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock"); + ready_thread_count++; +
if(ready_thread_count == num_threads) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_signal(&threads_ready_cvar), "pthread_cond_signal"); + } + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&start_cvar, &ready_thread_count_lock), "pthread_cond_wait"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock"); + + fault_pages(my_index); + return NULL; +} + +static void run_test(int test, int threads, int cpus) +{ + size_t sysctl_size = sizeof(pgsize); + int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pagesize failed"); + + test_type = test; + num_threads = threads; + num_pages = MEMSIZE / pgsize; + + T_QUIET; T_ASSERT_LT(test_type, NUM_TESTS, "invalid test type"); + T_QUIET; T_ASSERT_GT(num_threads, 0, "num_threads <= 0"); + T_QUIET; T_ASSERT_GT((int)num_pages/ num_threads, 0, "num_pages/num_threads <= 0"); + + T_LOG("No. of cpus: %d", cpus); + T_LOG("No. of threads: %d", num_threads); + T_LOG("No. of pages: %ld", num_pages); + T_LOG("Pagesize: %ld", pgsize); + + t = dt_stat_time_create("Runtime"); + // This sets the A/B failure threshold at 50% of baseline for Runtime + dt_stat_set_variable(t, kPCFailureThresholdPctVar, 50.0); + while (!dt_stat_stable(t)) { + map_mem_regions(); + execute_threads(); + unmap_mem_regions(); + } + + dt_stat_finalize(t); + T_END; +} + +static int get_ncpu(void) +{ + int ncpu; + size_t length = sizeof(ncpu); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0), + "failed to query hw.ncpu"); + return ncpu; +} + +T_DECL(read_soft_fault, + "Read soft faults (single thread)") +{ + run_test(SOFT_FAULT, 1, get_ncpu()); +} + +T_DECL(read_soft_fault_multithreaded, + "Read soft faults (multi-threaded)") +{ + char *e; + int nthreads; + + /* iOSMark passes in the no. of threads via an env. 
variable */ + if ((e = getenv("DT_STAT_NTHREADS"))) { + nthreads = (int)strtol(e, NULL, 0); + } else { + nthreads = get_ncpu(); + } + run_test(SOFT_FAULT, nthreads, get_ncpu()); +} + +T_DECL(zero_fill_fault, + "Zero fill faults (single thread)") +{ + run_test(ZERO_FILL, 1, get_ncpu()); +} + +T_DECL(zero_fill_fault_multithreaded, + "Zero fill faults (multi-threaded)") +{ + char *e; + int nthreads; + + /* iOSMark passes in the no. of threads via an env. variable */ + if ((e = getenv("DT_STAT_NTHREADS"))) { + nthreads = (int)strtol(e, NULL, 0); + } else { + nthreads = get_ncpu(); + } + run_test(ZERO_FILL, nthreads, get_ncpu()); +} diff --git a/tests/phys_footprint_interval_max.c b/tests/phys_footprint_interval_max.c new file mode 100644 index 000000000..846b59151 --- /dev/null +++ b/tests/phys_footprint_interval_max.c @@ -0,0 +1,94 @@ + +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALLOC_SIZE_LARGE (5*1024*1024) /* parenthesized so the macro expands safely inside casts and larger expressions */ +#define ALLOC_SIZE_SMALL (2*1024*1024) + +int proc_rlimit_control(pid_t pid, int flavor, void *arg); + +T_DECL(phys_footprint_interval_max, + "Validate physical footprint interval tracking") +{ + int ret; + struct rusage_info_v4 ru; + mach_vm_address_t addr = (mach_vm_address_t)NULL; + + ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage"); + T_ASSERT_EQ(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint, + "Max footprint and interval footprint are equal prior to dirtying memory"); + + ret = mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)ALLOC_SIZE_LARGE, VM_FLAGS_ANYWHERE); + T_QUIET; + T_ASSERT_MACH_SUCCESS(ret, "mach_vm_allocate(ALLOC_SIZE_LARGE)"); + + memset((void *)addr, 0xab, ALLOC_SIZE_LARGE); + + ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage"); + T_ASSERT_EQ(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint, + "Max footprint and interval footprint are equal after dirtying large memory region"); + + mach_vm_deallocate(mach_task_self(), addr, (mach_vm_size_t)ALLOC_SIZE_LARGE); + + ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage"); + T_ASSERT_EQ(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint, + "Max footprint and interval footprint are still equal after freeing large memory region"); + + ret = proc_reset_footprint_interval(getpid()); + T_ASSERT_POSIX_SUCCESS(ret, "proc_reset_footprint_interval()"); + + ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage"); +
T_ASSERT_GT(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint, + "Max footprint is greater than interval footprint after resetting interval"); + + ret = mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)ALLOC_SIZE_SMALL, VM_FLAGS_ANYWHERE); + T_QUIET; + T_ASSERT_MACH_SUCCESS(ret, "mach_vm_allocate(ALLOC_SIZE_SMALL)"); + memset((void *)addr, 0xab, ALLOC_SIZE_SMALL); + + ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage"); + T_ASSERT_GT(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint, + "Max footprint is still greater than interval footprint after dirtying small memory region"); +} diff --git a/tools/tests/darwintests/poll.c b/tests/poll.c similarity index 100% rename from tools/tests/darwintests/poll.c rename to tests/poll.c diff --git a/tools/tests/darwintests/poll_select_kevent_paired_fds.c b/tests/poll_select_kevent_paired_fds.c similarity index 100% rename from tools/tests/darwintests/poll_select_kevent_paired_fds.c rename to tests/poll_select_kevent_paired_fds.c diff --git a/tests/port_descriptions.c b/tests/port_descriptions.c new file mode 100644 index 000000000..a42ab29be --- /dev/null +++ b/tests/port_descriptions.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2018 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include +#include + +static void +expect_special_port_description(const char *(*fn)(mach_port_t), + mach_port_t port, const char *namestr) +{ + const char *desc = fn(port); + T_EXPECT_NOTNULL(desc, "%s is %s", namestr, desc); + if (desc) { + T_QUIET; T_EXPECT_GT(strlen(desc), strlen(""), + "%s's description string is not empty", namestr); + } +} + +T_DECL(host_special_port_descriptions, + "verify that host special ports can be described") +{ +#define TEST_HSP(portdef) \ + expect_special_port_description(mach_host_special_port_description, \ + portdef, #portdef) + + TEST_HSP(HOST_PORT); + TEST_HSP(HOST_PRIV_PORT); + TEST_HSP(HOST_IO_MASTER_PORT); + TEST_HSP(HOST_DYNAMIC_PAGER_PORT); + TEST_HSP(HOST_AUDIT_CONTROL_PORT); + TEST_HSP(HOST_USER_NOTIFICATION_PORT); + TEST_HSP(HOST_AUTOMOUNTD_PORT); + TEST_HSP(HOST_LOCKD_PORT); + TEST_HSP(HOST_KTRACE_BACKGROUND_PORT); + TEST_HSP(HOST_SEATBELT_PORT); + TEST_HSP(HOST_KEXTD_PORT); + TEST_HSP(HOST_LAUNCHCTL_PORT); + TEST_HSP(HOST_UNFREED_PORT); + TEST_HSP(HOST_AMFID_PORT); + TEST_HSP(HOST_GSSD_PORT); + TEST_HSP(HOST_TELEMETRY_PORT); + TEST_HSP(HOST_ATM_NOTIFICATION_PORT); + TEST_HSP(HOST_COALITION_PORT); + TEST_HSP(HOST_SYSDIAGNOSE_PORT); + TEST_HSP(HOST_XPC_EXCEPTION_PORT); + TEST_HSP(HOST_CONTAINERD_PORT); + TEST_HSP(HOST_NODE_PORT); + TEST_HSP(HOST_RESOURCE_NOTIFY_PORT); + 
TEST_HSP(HOST_CLOSURED_PORT); + TEST_HSP(HOST_SYSPOLICYD_PORT); + +#undef TEST_HSP + + T_EXPECT_EQ(HOST_SYSPOLICYD_PORT, HOST_MAX_SPECIAL_PORT, + "checked all of the ports"); + + const char *invalid_hsp = + mach_host_special_port_description(HOST_MAX_SPECIAL_PORT + 1); + T_EXPECT_NULL(invalid_hsp, + "invalid host special port description should be NULL"); +} + +T_DECL(task_special_port_descriptions, + "verify that task special ports can be described") +{ +#define TEST_TSP(portdef) \ + expect_special_port_description(mach_task_special_port_description, \ + portdef, #portdef) + + TEST_TSP(TASK_KERNEL_PORT); + TEST_TSP(TASK_HOST_PORT); + TEST_TSP(TASK_NAME_PORT); + TEST_TSP(TASK_BOOTSTRAP_PORT); + TEST_TSP(TASK_SEATBELT_PORT); + TEST_TSP(TASK_ACCESS_PORT); + TEST_TSP(TASK_DEBUG_CONTROL_PORT); + TEST_TSP(TASK_RESOURCE_NOTIFY_PORT); + +#undef TEST_TSP + + T_EXPECT_EQ(TASK_RESOURCE_NOTIFY_PORT, TASK_MAX_SPECIAL_PORT, + "checked all of the ports"); + + const char *invalid_tsp = + mach_task_special_port_description(TASK_MAX_SPECIAL_PORT + 1); + T_EXPECT_NULL(invalid_tsp, + "invalid task special port description should be NULL"); +} + +static void +expect_special_port_id(int (*fn)(const char *id), int port, const char *portid) +{ + int observed_port = fn(portid); + T_WITH_ERRNO; + T_EXPECT_EQ(observed_port, port, "%s is %d", portid, observed_port); +} + +T_DECL(host_special_port_mapping, + "verify that host special port names can be mapped to numbers") +{ +#define TEST_HSP(portdef) \ + expect_special_port_id(mach_host_special_port_for_id, \ + portdef, #portdef) + + TEST_HSP(HOST_PORT); + TEST_HSP(HOST_PRIV_PORT); + TEST_HSP(HOST_IO_MASTER_PORT); + TEST_HSP(HOST_DYNAMIC_PAGER_PORT); + TEST_HSP(HOST_AUDIT_CONTROL_PORT); + TEST_HSP(HOST_USER_NOTIFICATION_PORT); + TEST_HSP(HOST_AUTOMOUNTD_PORT); + TEST_HSP(HOST_LOCKD_PORT); + TEST_HSP(HOST_KTRACE_BACKGROUND_PORT); + TEST_HSP(HOST_SEATBELT_PORT); + TEST_HSP(HOST_KEXTD_PORT); + TEST_HSP(HOST_LAUNCHCTL_PORT); +
TEST_HSP(HOST_UNFREED_PORT); + TEST_HSP(HOST_AMFID_PORT); + TEST_HSP(HOST_GSSD_PORT); + TEST_HSP(HOST_TELEMETRY_PORT); + TEST_HSP(HOST_ATM_NOTIFICATION_PORT); + TEST_HSP(HOST_COALITION_PORT); + TEST_HSP(HOST_SYSDIAGNOSE_PORT); + TEST_HSP(HOST_XPC_EXCEPTION_PORT); + TEST_HSP(HOST_CONTAINERD_PORT); + TEST_HSP(HOST_NODE_PORT); + TEST_HSP(HOST_RESOURCE_NOTIFY_PORT); + TEST_HSP(HOST_CLOSURED_PORT); + TEST_HSP(HOST_SYSPOLICYD_PORT); + +#undef TEST_HSP + + int invalid_tsp = mach_host_special_port_for_id("BOGUS_SPECIAL_PORT_NAME"); + T_EXPECT_EQ(invalid_tsp, -1, + "invalid host special port IDs should return -1"); +} + +T_DECL(task_special_port_mapping, + "verify that task special port names can be mapped to numbers") +{ +#define TEST_TSP(portdef) \ + expect_special_port_id(mach_task_special_port_for_id, \ + portdef, #portdef) + + TEST_TSP(TASK_KERNEL_PORT); + TEST_TSP(TASK_HOST_PORT); + TEST_TSP(TASK_NAME_PORT); + TEST_TSP(TASK_BOOTSTRAP_PORT); + TEST_TSP(TASK_SEATBELT_PORT); + TEST_TSP(TASK_ACCESS_PORT); + TEST_TSP(TASK_DEBUG_CONTROL_PORT); + TEST_TSP(TASK_RESOURCE_NOTIFY_PORT); + +#undef TEST_TSP + + int invalid_tsp = mach_task_special_port_for_id("BOGUS_SPECIAL_PORT_NAME"); + T_EXPECT_EQ(invalid_tsp, -1, + "invalid task special port IDs should return -1"); +} diff --git a/tools/tests/darwintests/private_entitlement.plist b/tests/private_entitlement.plist similarity index 100% rename from tools/tests/darwintests/private_entitlement.plist rename to tests/private_entitlement.plist diff --git a/tests/proc_core_name_24152432.c b/tests/proc_core_name_24152432.c new file mode 100644 index 000000000..11317c629 --- /dev/null +++ b/tests/proc_core_name_24152432.c @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUFFLEN 2048 +#define EVILLEN 19 +#define TIMEOUT 420 /* Timeout in seconds to wait for coredumps to appear */ + +static const char 
corefile_ctl[] = "kern.corefile"; +static const char coredump_ctl[] = "kern.coredump"; +/* The directory where coredumps will be */ +static const char dump_dir[] = "/cores"; +/* The default coredump location if the kern.coredump ctl is invalid */ +static const char default_dump_fmt[] = "/cores/core.%d"; +/* The coredump location when we set kern.coredump ctl to something valid */ +static const char valid_dump_fmt[] = "/cores/test-core.%d"; +static const char ls_path[] = "/bin/ls"; + +/* /cores/core.%(null), then BORK immediately after. */ +static char evil[] = "/cores/core.%\0BORK"; +/* A valid coredump location to test. */ +static char valid_dump_loc[] = "/cores/test-core.%P"; + +static const struct rlimit lim_infty = { + RLIM_INFINITY, + RLIM_INFINITY +}; + +static volatile int stop_looking = 0; + +static const struct timespec timeout = { + TIMEOUT, + 0 +}; + +#if TARGET_OS_OSX +static int fork_and_wait_for_segfault(void); + +static void sigalrm_handler(int sig) +{ + (void)sig; + stop_looking = 1; + return; +} + +static void list_coredump_files() +{ + int ret; + char buf[BUFFLEN] = { 0 }; + + T_LOG("Contents of %s:", dump_dir); + snprintf(buf, BUFFLEN, "%s %s", ls_path, dump_dir); + ret = system(buf); + T_ASSERT_POSIX_SUCCESS(ret, "Listing contents of cores directory"); + return; +} + +static int fork_and_wait_for_segfault() { + int pid, ret; + pid = fork(); + if (pid == 0) { + unsigned int *ptr = NULL; /* Cause a segfault so that we get a coredump */ + *ptr = 0xdeadd00d; + T_FAIL("Expected segmentation fault on write to NULL pointer"); + } + T_ASSERT_TRUE(pid != -1, "Checking fork success in parent"); + + ret = wait(NULL); + T_ASSERT_TRUE(ret != -1, "Waited for child to segfault and dump core"); + return pid; +} + +static int setup_coredump_kevent(struct kevent *kev, int dir) +{ + int ret; + int kqfd; + + EV_SET(kev, dir, EVFILT_VNODE, EV_ADD, NOTE_WRITE, 0, NULL); + kqfd = kqueue(); + T_ASSERT_POSIX_SUCCESS(kqfd, "kqueue: get kqueue for coredump monitoring"); + 
+ ret = kevent(kqfd, kev, 1, NULL, 0, NULL); + T_ASSERT_POSIX_SUCCESS(ret, "kevent: setup directory monitoring for coredump"); + return kqfd; +} + +static void look_for_coredump(const char *format, int pid, int kqfd, struct kevent *kev) +{ + int ret = 0; + int i = 0; + char buf[BUFFLEN]; + memset(buf, 0, BUFFLEN); + /* + * Something else might touch this directory. If we get notified and don't see + * anything, try a few more times before failing. + */ + alarm(TIMEOUT); + while (!stop_looking) { + /* Wait for kevent to tell us the coredump folder was modified */ + ret = kevent(kqfd, NULL, 0, kev, 1, &timeout); + T_ASSERT_POSIX_SUCCESS(ret, "kevent: Waiting for coredump to appear"); + + snprintf(buf, BUFFLEN, format, pid); + ret = remove(buf); + + if (ret != -1) + break; + + T_LOG("Couldn't find coredump file (try #%d).", i+1); + i++; + } + alarm(0); + + if (ret == -1) { + /* Couldn't find the coredump -- list contents of /cores */ + list_coredump_files(); + } + T_ASSERT_POSIX_SUCCESS(ret, "Removing coredump file (should be at %s)", buf); +} + +static void sysctl_enable_coredumps(void) +{ + int ret; + int enable_core_dump = 1; + size_t oldlen = BUFFLEN; + char buf[BUFFLEN]; + memset(buf, 0, BUFFLEN); + + ret = sysctlbyname(coredump_ctl, buf, &oldlen, &enable_core_dump, sizeof(int)); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl: enable core dumps"); + + ret = setrlimit(RLIMIT_CORE, &lim_infty); + T_ASSERT_POSIX_SUCCESS(ret, "setrlimit: remove limit on maximum coredump size"); +} +#endif + +T_DECL( + proc_core_name_24152432, + "Tests behavior of core dump when kern.corefile ends in %, e.g., /cores/core.%", + T_META_ASROOT(true), + T_META_IGNORECRASHES("proc_core_name_24152432.*")) +{ +#if TARGET_OS_OSX + DIR *dirp; + int ret, pid, dir; + char buf[BUFFLEN]; + memset(buf, 0, BUFFLEN); + size_t oldlen = BUFFLEN; + struct kevent kev; + sig_t sig; + int kqfd; + + sig = signal(SIGALRM, sigalrm_handler); + T_WITH_ERRNO; T_EXPECT_NE(sig, SIG_ERR, "signal: set sigalrm handler"); + + 
dirp = opendir(dump_dir); + T_ASSERT_NOTNULL(dirp, "opendir: opening coredump directory"); + dir = dirfd(dirp); + T_ASSERT_POSIX_SUCCESS(dir, "dirfd: getting file descriptor for coredump directory"); + kqfd = setup_coredump_kevent(&kev, dir); + + sysctl_enable_coredumps(); + + ret = sysctlbyname(corefile_ctl, buf, &oldlen, evil, EVILLEN); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set bad core dump location, old value was %s", buf); + memset(buf, 0, BUFFLEN); + oldlen = BUFFLEN; + + pid = fork_and_wait_for_segfault(); + look_for_coredump(default_dump_fmt, pid, kqfd, &kev); + + ret = sysctlbyname(corefile_ctl, buf, &oldlen, valid_dump_loc, strlen(valid_dump_loc)); + T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set valid core dump location, old value was %s", buf); + memset(buf, 0, BUFFLEN); + + pid = fork_and_wait_for_segfault(); + look_for_coredump(valid_dump_fmt, pid, kqfd, &kev); + + closedir(dirp); + close(kqfd); +#else + T_LOG("proc_core_name appears in OS X only, skipping test."); +#endif + T_PASS("proc_core_name_24152432 PASSED"); +} diff --git a/tools/tests/darwintests/proc_info.c b/tests/proc_info.c similarity index 95% rename from tools/tests/darwintests/proc_info.c rename to tests/proc_info.c index 3a1e73820..cb5799d29 100644 --- a/tools/tests/darwintests/proc_info.c +++ b/tests/proc_info.c @@ -9,9 +9,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -52,7 +54,22 @@ #define MAXPRI_USER 63 #define CONF_OPN_FILE_COUNT 3 -#define CONF_TMP_FILE_PATH "/tmp/testfile" +#define CONF_TMP_FILE_PFX "/tmp/xnu.tests.proc_info." 
+static int CONF_TMP_FILE_OPEN(char path[PATH_MAX]) +{ + static char stmp_path[PATH_MAX] = {}; + char *nm; + if (path) { + nm = path; + } else { + nm = stmp_path; + } + strlcpy(nm, CONF_TMP_FILE_PFX "XXXXXXXXXX", PATH_MAX); + int fd = mkstemp(nm); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(fd, "mkstemp(" CONF_TMP_FILE_PFX "XXXXXXXXXX)"); + return fd; +} uint32_t get_tty_dev(void); @@ -576,39 +593,82 @@ proc_info_caller(int proc_info_opts, void ** ret_structs, int * ret_child_pid) PROC_INFO_CALL(proc_archinfo, getpid(), PROC_PIDARCHINFO, 0); } + vm_map_size_t map_tmp_sz = 0; if ((proc_info_opts & PREGINFO) | (proc_info_opts & PREGINFO_PATH) | (proc_info_opts & PREGINFO_PATH_2) | (proc_info_opts & PREGINFO_PATH_3)) { - tmp_fd = open(CONF_TMP_FILE_PATH, O_RDWR | O_CREAT); + static char tmp_path[PATH_MAX] = {}; + tmp_fd = CONF_TMP_FILE_OPEN(tmp_path); - for (int j = 0; j < 100; j++) { - char buf[50]; - write(tmp_fd, buf, sizeof(buf)); - } - retval = fsync(tmp_fd); + /* + * subsequent checks assume that this data does *not* stay + * resident in the buffer cache, so set F_NOCACHE for direct + * to storage writing. NOTE: this works if the writes are + * page-aligned and > 2 pages in length. + */ + retval = fcntl(tmp_fd, F_NOCACHE, 1); T_QUIET; - T_ASSERT_POSIX_SUCCESS(retval, "file fsync()"); + T_ASSERT_POSIX_SUCCESS(retval, "fcntl(%d, F_NOCACHE) failed", tmp_fd); - map_tmp = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_PRIVATE, tmp_fd, (off_t)PAGE_SIZE); + int npages_to_write = 10; + map_tmp_sz = (vm_map_size_t)npages_to_write * (vm_map_size_t)PAGE_SIZE; + + /* + * To make sure we don't go through the cached write paths in + * the VM, we allocate a PAGE-aligned buffer that is > 2 + * pages, and perform a write of the entire buffer (not in + * small page-aligned chunks). 
+ */ + char *buf = valloc((size_t)map_tmp_sz); + T_QUIET; + T_ASSERT_NOTNULL(buf, "valloc(%d) failed", (int)map_tmp_sz); + + memset(buf, 0x5, map_tmp_sz); + ssize_t bw = write(tmp_fd, buf, (size_t)map_tmp_sz); + T_QUIET; + T_ASSERT_GT_INT((int)bw, 0, "write(%d, buf, %d) failed", tmp_fd, (int)map_tmp_sz); + + free(buf); + + map_tmp_sz -= PAGE_SIZE; + map_tmp = mmap(0, (size_t)map_tmp_sz, PROT_WRITE, MAP_PRIVATE, tmp_fd, (off_t)PAGE_SIZE); T_ASSERT_NE_PTR(map_tmp, MAP_FAILED, "mmap() for PROC_PIDREGIONINFO"); - T_LOG("file: %s is opened as fd %d and mapped at %llx with size %lu", CONF_TMP_FILE_PATH, tmp_fd, (uint64_t)map_tmp, + T_LOG("file: %s is opened as fd %d and mapped at %llx with size %lu", tmp_path, tmp_fd, (uint64_t)map_tmp, (unsigned long)PAGE_SIZE); + + /* + * unlink() the file to be nice, but do it _after_ we've + * already flushed and mapped the file. This will ensure that + * we don't end up writing to the buffer cache because the + * file is unlinked. + */ + if (!(proc_info_opts & PREGINFO_PATH_3)) { + retval = unlink(tmp_path); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(retval, "unlink(%s) failed", tmp_path); + } } if (proc_info_opts & PREGINFO) { PROC_INFO_CALL(proc_regioninfo, getpid(), PROC_PIDREGIONINFO, map_tmp); ret_structs[i] = map_tmp; i++; + ret_structs[i] = (void *)(uintptr_t)map_tmp_sz; + i++; } if (proc_info_opts & PREGINFO_PATH) { PROC_INFO_CALL(proc_regionwithpathinfo, getpid(), PROC_PIDREGIONPATHINFO, map_tmp); ret_structs[i] = map_tmp; i++; + ret_structs[i] = (void *)(uintptr_t)map_tmp_sz; + i++; } if (proc_info_opts & PREGINFO_PATH_2) { PROC_INFO_CALL(proc_regionwithpathinfo, getpid(), PROC_PIDREGIONPATHINFO2, map_tmp); ret_structs[i] = map_tmp; i++; + ret_structs[i] = (void *)(uintptr_t)map_tmp_sz; + i++; } if (proc_info_opts & PREGINFO_PATH_3) { @@ -628,6 +688,14 @@ proc_info_caller(int proc_info_opts, void ** ret_structs, int * ret_child_pid) T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_regionwithpathinfo), "__proc_info call for 
PROC_PIDREGIONPATHWITHINFO3"); ret_structs[i] = (void *)preginfo_path; i++; + ret_structs[i] = (void *)map_tmp; + i++; + ret_structs[i] = (void *)(uintptr_t)map_tmp_sz; + i++; + + retval = unlink(preginfo_path->prp_vip.vip_path); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(retval, "unlink(%s) failed", preginfo_path->prp_vip.vip_path); } if (proc_info_opts & PVNINFO) { @@ -1241,15 +1309,15 @@ T_DECL(proc_info_proc_pidregioninfo, T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT)) { - void * proc_info[2]; - void * map_tmp = NULL; + void * proc_info[3]; proc_info_caller(PREGINFO, proc_info, NULL); struct proc_regioninfo preginfo = *((struct proc_regioninfo *)proc_info[0]); /* * map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it */ - map_tmp = proc_info[1]; + void *map_tmp = proc_info[1]; + vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2]; T_EXPECT_EQ_ULLONG(preginfo.pri_offset, (unsigned long long)PAGE_SIZE, "PROC_PIDREGIONINFO returns valid value for pri_offset"); T_EXPECT_EQ_UINT((preginfo.pri_protection ^ (VM_PROT_READ | VM_PROT_WRITE)), 0U, @@ -1273,15 +1341,15 @@ T_DECL(proc_info_proc_pidregioninfo, T_EXPECT_EQ_UINT(preginfo.pri_share_mode, (unsigned int)SM_COW, "PROC_PIDREGIONINFO returns valid value for pri_share_mode"); T_EXPECT_EQ_UINT(preginfo.pri_private_pages_resident, 0U, "PROC_PIDREGIONINFO returns valid value for pri_private_pages_resident"); - T_EXPECT_GE_UINT(preginfo.pri_shared_pages_resident, 1U, + T_EXPECT_GE_UINT(preginfo.pri_shared_pages_resident, 0U, "PROC_PIDREGIONINFO returns valid value for pri_shared_pages_resident"); T_EXPECT_EQ_ULLONG(preginfo.pri_address, (uint64_t)map_tmp, "PROC_PIDREGIONINFO returns valid value for pri_addr"); T_EXPECT_NE_UINT(preginfo.pri_obj_id, 0U, "PROC_PIDREGIONINFO returns valid value for pri_obj_id"); - T_EXPECT_EQ_ULLONG(preginfo.pri_size, (unsigned long long)PAGE_SIZE, "PROC_PIDREGIONINFO returns valid value for pri_size"); + 
T_EXPECT_EQ_ULLONG(preginfo.pri_size, (unsigned long long)map_tmp_sz, "PROC_PIDREGIONINFO returns valid value for pri_size"); T_EXPECT_EQ_UINT(preginfo.pri_depth, 0U, "PROC_PIDREGIONINFO returns valid value for pri_depth"); int ret = 0; - ret = munmap(map_tmp, PAGE_SIZE); + ret = munmap(map_tmp, (size_t)map_tmp_sz); T_QUIET; T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp"); free_proc_info(proc_info, 1); @@ -1292,15 +1360,15 @@ T_DECL(proc_info_proc_pidregionpathinfo, T_META_ASROOT(true), T_META_LTEPHASE(LTE_INSTALLEDUSEROS)) { - void * proc_info[2]; - void * map_tmp = NULL; + void * proc_info[3]; proc_info_caller(PREGINFO_PATH, proc_info, NULL); struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]); /* * map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it */ - map_tmp = proc_info[1]; + void *map_tmp = proc_info[1]; + vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2]; T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE, "PROC_PIDREGIONPATHINFO returns valid value for pri_offset"); @@ -1333,12 +1401,12 @@ T_DECL(proc_info_proc_pidregionpathinfo, "PROC_PIDREGIONPATHINFO returns valid value for pri_share_mode"); T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_private_pages_resident, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_private_pages_resident"); - T_EXPECT_GE_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 1U, + T_EXPECT_GE_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_shared_pages_resident"); T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_address, (uint64_t)map_tmp, "PROC_PIDREGIONPATHINFO returns valid value for pri_addr"); T_EXPECT_NE_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_obj_id"); - T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)PAGE_SIZE, + 
T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)map_tmp_sz, "PROC_PIDREGIONPATHINFO returns valid value for pri_size"); T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_depth"); T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_type, VREG, "PROC_PIDREGIONPATHINFO returns valid value for vi_type"); @@ -1347,7 +1415,7 @@ T_DECL(proc_info_proc_pidregionpathinfo, "PROC_PIDREGIONPATHINFO returns valid value for vi_fsid.val[0]"); T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0, "PROC_PIDREGIONPATHINFO returns valid value for vi_fsid.val[1]"); - T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PATH)), NULL, + T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PFX)), NULL, "PROC_PIDREGIONPATHINFO returns valid value for vi_path"); /* * Basic sanity checks for vnode stat returned by the API @@ -1355,7 +1423,7 @@ T_DECL(proc_info_proc_pidregionpathinfo, T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO returns valid value for vst_dev"); T_EXPECT_EQ_INT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0, "PROC_PIDREGIONPATHINFO returns valid value for vst_mode"); - T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1, + T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)0, /* the file was unlink()'d! 
*/ "PROC_PIDREGIONPATHINFO returns valid value for vst_nlink"); T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL, "PROC_PIDREGIONPATHINFO returns valid value for vst_ino"); @@ -1369,7 +1437,7 @@ T_DECL(proc_info_proc_pidregionpathinfo, "PROC_PIDREGIONPATHINFO returns valid value for vst_blksize"); int ret = 0; - ret = munmap(map_tmp, PAGE_SIZE); + ret = munmap(map_tmp, (size_t)map_tmp_sz); T_QUIET; T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp"); free_proc_info(proc_info, 1); @@ -1380,15 +1448,15 @@ T_DECL(proc_info_proc_pidregionpathinfo2, T_META_ASROOT(true), T_META_LTEPHASE(LTE_INSTALLEDUSEROS)) { - void * proc_info[2]; - void * map_tmp = NULL; + void * proc_info[3]; proc_info_caller(PREGINFO_PATH_2, proc_info, NULL); struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]); /* * map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it */ - map_tmp = proc_info[1]; + void *map_tmp = proc_info[1]; + vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2]; T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_offset"); @@ -1428,7 +1496,7 @@ T_DECL(proc_info_proc_pidregionpathinfo2, T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_address, (uint64_t)map_tmp, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_addr"); T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_obj_id"); - T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (unsigned long long)PAGE_SIZE, + T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (unsigned long long)map_tmp_sz, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_size"); T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_depth"); @@ -1440,7 +1508,7 @@ T_DECL(proc_info_proc_pidregionpathinfo2, 
T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0, "PROC_PIDREGIONPATHINFO2 returns valid value for vi_fsid.val[1]:%d", preginfo_path.prp_vip.vip_vi.vi_fsid.val[1]); - T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PATH)), NULL, + T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PFX)), NULL, "PROC_PIDREGIONPATHINFO2 returns valid value for vi_path"); /* * Basic sanity checks for vnode stat returned by the API @@ -1448,7 +1516,7 @@ T_DECL(proc_info_proc_pidregionpathinfo2, T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_dev"); T_EXPECT_EQ_UINT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_mode"); - T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1, + T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)0, /* the file was unlink()'d! 
*/ "PROC_PIDREGIONPATHINFO2 returns valid value for vst_nlink"); T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_ino"); @@ -1462,7 +1530,7 @@ T_DECL(proc_info_proc_pidregionpathinfo2, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_blksize"); int ret = 0; - ret = munmap(map_tmp, PAGE_SIZE); + ret = munmap(map_tmp, (size_t)map_tmp_sz); T_QUIET; T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp"); free_proc_info(proc_info, 1); @@ -1473,10 +1541,12 @@ T_DECL(proc_info_proc_pidregionpathinfo3, T_META_ASROOT(true), T_META_LTEPHASE(LTE_INSTALLEDUSEROS)) { - void * proc_info[1]; + void * proc_info[3]; proc_info_caller(PREGINFO_PATH_3, proc_info, NULL); struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]); + void *map_tmp = proc_info[1]; + vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2]; T_EXPECT_GE_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_offset"); @@ -1516,7 +1586,7 @@ T_DECL(proc_info_proc_pidregionpathinfo3, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_shared_pages_resident"); T_EXPECT_NE_ULLONG(preginfo_path.prp_prinfo.pri_address, 0ULL, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_addr"); T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_obj_id"); - T_EXPECT_GE_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)PAGE_SIZE, + T_EXPECT_GE_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)map_tmp_sz, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_size"); T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_depth"); @@ -1532,7 +1602,7 @@ T_DECL(proc_info_proc_pidregionpathinfo3, T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_dev"); 
T_EXPECT_EQ_UINT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_mode"); - T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1, + T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1, /* the file was unlink()'d _after_ calling proc_info */ "PROC_PIDREGIONPATHINFO3 returns valid value for vst_nlink"); T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_ino"); @@ -1548,6 +1618,10 @@ T_DECL(proc_info_proc_pidregionpathinfo3, T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_blksize"); + int ret = 0; + ret = munmap(map_tmp, (size_t)map_tmp_sz); + T_QUIET; + T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp"); free_proc_info(proc_info, 1); } @@ -1573,8 +1647,8 @@ T_DECL(proc_info_proc_pidvnodepathinfo, T_EXPECT_GE_USHORT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_nlink, (unsigned short)2, "PROC_PIDVNODEPATHINFO returns valid value for vst_nlink"); T_EXPECT_NE_ULLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_ino, 0ULL, "PROC_PIDVNODEPATHINFO returns valid value for vst_ino"); - T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_uid"); - T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_gid"); + T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_uid"); + T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_gid"); T_EXPECT_GT_LLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_size, 0LL, "PROC_PIDVNODEPATHINFO returns valid value for vst_size"); T_EXPECT_GE_LLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_blocks, 0LL, "PROC_PIDVNODEPATHINFO returns valid value for 
vst_blocks"); T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE, @@ -1608,7 +1682,7 @@ T_DECL(proc_info_pidinfo_proc_pidlistfds, */ T_LOG("Test to verify PROC_PIDLISTFDS returns valid fd information"); fd_info = malloc(sizeof(*fd_info) * 5); - tmp_fd = open(CONF_TMP_FILE_PATH, O_RDONLY | O_CREAT); + tmp_fd = CONF_TMP_FILE_OPEN(NULL); T_LOG("tmp_fd val:%d", tmp_fd); T_QUIET; T_EXPECT_POSIX_SUCCESS(tmp_fd, "open() for PROC_PIDLISTFDS"); @@ -1666,7 +1740,7 @@ T_DECL(proc_info_proc_pidlistfileports, /* * Create a file port */ - tmp_fd = open(CONF_TMP_FILE_PATH, O_RDWR | O_CREAT); + tmp_fd = CONF_TMP_FILE_OPEN(NULL); int retval = fileport_makeport(tmp_fd, &tmp_file_port); T_EXPECT_POSIX_SUCCESS(retval, "fileport_makeport() for PROC_PIDLISTFILEPORTS"); @@ -1896,8 +1970,14 @@ T_DECL(proc_list_uptrs, "the kernel should return any up-pointers it knows about * Should find uptrs both on a kevent_id kqueue and in a workloop * kqueue's knote's udata field. */ - uptr_names[cur_uptr] = "dynamic kqueue non-file-backed knote"; - struct kevent_qos_s events_id[] = {{.filter = EVFILT_USER, .ident = 1, .flags = EV_ADD, .udata = uptrs[cur_uptr++]}}; + uptr_names[cur_uptr] = "dynamic kqueue non-file-backed knote"; + struct kevent_qos_s events_id[] = {{ + .filter = EVFILT_USER, + .ident = 1, + .flags = EV_ADD, + .qos = (int)_pthread_qos_class_encode(QOS_CLASS_DEFAULT, 0, 0), + .udata = uptrs[cur_uptr++] + }}; uptr_names[cur_uptr] = "dynamic kqueue ID"; kev_err = kevent_id(uptrs[cur_uptr++], events_id, 1, NULL, 0, NULL, NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE); @@ -1932,7 +2012,7 @@ T_DECL(proc_list_uptrs, "the kernel should return any up-pointers it knows about uint64_t up_overflow[2] = {0}; uptrs_count = proc_list_uptrs(getpid(), up_overflow, sizeof(uint64_t)+1); - T_ASSERT_EQ(up_overflow[1], 0 , "overflow check"); + T_ASSERT_EQ(up_overflow[1], (uint64_t)0 , "overflow check"); } #pragma mark dynamic kqueue info @@ -1946,7 +2026,13 @@ 
T_DECL(proc_list_uptrs, "the kernel should return any up-pointers it knows about static void setup_kevent_id(kqueue_id_t id) { - struct kevent_qos_s events_id[] = {{.filter = EVFILT_USER, .ident = 1, .flags = EV_ADD, .udata = EXPECTED_UDATA}}; + struct kevent_qos_s events_id[] = {{ + .filter = EVFILT_USER, + .ident = 1, + .flags = EV_ADD, + .qos = (int)_pthread_qos_class_encode(QOS_CLASS_DEFAULT, 0, 0), + .udata = EXPECTED_UDATA + }}; int err = kevent_id(id, events_id, 1, NULL, 0, NULL, NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE); T_ASSERT_POSIX_SUCCESS(err, "register event with kevent_id"); diff --git a/tests/proc_info_list_kthreads.c b/tests/proc_info_list_kthreads.c new file mode 100644 index 000000000..f7c410550 --- /dev/null +++ b/tests/proc_info_list_kthreads.c @@ -0,0 +1,110 @@ +/* + * proc_info_list_kthreads + * + * list 64 bit thread ids of kernel_task + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MAX_TRIES 20 +#define EXTRA_THREADS 15 + +#if TARGET_OS_OSX +T_DECL(proc_info_list_kthreads, + "Test to verify PROC_PIDLISTTHREADIDS returns kernel thread IDs for pid 0", + T_META_ASROOT(true), + T_META_CHECK_LEAKS(false)) +#else +T_DECL(proc_info_list_kthreads, + "Test to verify PROC_PIDLISTTHREADIDS returns kernel thread IDs for pid 0", + T_META_ASROOT(false), + T_META_CHECK_LEAKS(false)) +#endif /* TARGET_OS_OSX */ +{ + int buf_used = 0; + + int thread_count = 0; + uint64_t *thread_list = NULL; + + /* + * To use PROC_PIDLISTTHREADIDS, we must pass a buffer of uint64_t's for each thread ID. + * However, there is a TOCTOU race between asking for the thread count + * and asking for the array of identifiers. + * + * Because the process could have allocated more threads since last we asked + * how many threads there are, we instead pass an extra slot in the array, + * and try again if it used that slot. 
+ */ + + int attempt = 1; + while (!thread_count && (attempt < MAX_TRIES)) { + struct proc_taskinfo ti; + + buf_used = proc_pidinfo(0, PROC_PIDTASKINFO, 0, &ti, sizeof(ti)); + + T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(buf_used, 0, "proc_pidinfo(PROC_PIDTASKINFO) returned a value > 0"); + T_QUIET; T_ASSERT_EQ(buf_used, (int)sizeof(ti), "proc_pidinfo(PROC_PIDTASKINFO) returned size %d == %lu", buf_used, sizeof(ti)); + + T_LOG("The kernel says it has %d threads", ti.pti_threadnum); + + int expected_size = ti.pti_threadnum * (int)sizeof(uint64_t); + /* tack on five extra to detect newly allocated threads */ + int allocated_size = expected_size + EXTRA_THREADS*(int)sizeof(uint64_t); + uint64_t *thread_list_tmp = malloc((size_t)allocated_size); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(thread_list_tmp, "malloc(size = %d) failed", allocated_size); + + buf_used = proc_pidinfo(0, PROC_PIDLISTTHREADIDS, 0, thread_list_tmp, (int)allocated_size); + T_LOG("proc_pidinfo(PROC_PIDLISTTHREADIDS) buf_used = %d, expected_size = %d", buf_used, expected_size); + + if (buf_used == 0) { + T_WITH_ERRNO; T_ASSERT_FAIL("proc_pidinfo(PROC_PIDLISTTHREADIDS) failed"); + } + if (buf_used == expected_size) { + /* success, we found the expected number of threads */ + thread_list = thread_list_tmp; + thread_count = expected_size / (int)sizeof(uint64_t); + } else if (buf_used < expected_size) { + /* there were fewer threads than we expected, fix up the allocation */ + thread_list = realloc(thread_list_tmp, (size_t)buf_used); + thread_count = buf_used / (int)sizeof(uint64_t); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(thread_list, "realloc(size = %d) failed", buf_used); + } else if (buf_used > expected_size) { + if (buf_used < allocated_size) { + thread_list = realloc(thread_list_tmp, (size_t)buf_used); + thread_count = buf_used / (int)sizeof(uint64_t); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(thread_list, "realloc(size = %d) failed", buf_used); + } else { + /* + * it used all the extra slots, meaning 
there are more + * threads than we thought, try again! + */ + T_LOG("expected %d threads, but saw an extra thread: %d", + expected_size / (int)sizeof(uint64_t), buf_used / (int)sizeof(uint64_t)); + free(thread_list_tmp); + } + } + attempt++; + } + T_QUIET; T_ASSERT_LE(attempt, MAX_TRIES, "attempt <= MAX_TRIES"); + T_QUIET; T_ASSERT_NOTNULL(thread_list, "thread_list != NULL"); + T_QUIET; T_ASSERT_GT(thread_count, 0, "thread_count > 0"); + + struct proc_threadinfo pthinfo_64; + for (int i = 0 ; i < thread_count ; i++) { + bzero(&pthinfo_64, sizeof(struct proc_threadinfo)); + int retval = proc_pidinfo(0, PROC_PIDTHREADID64INFO, thread_list[i], + (void *)&pthinfo_64, (uint32_t)sizeof(pthinfo_64)); + T_QUIET; T_WITH_ERRNO; T_EXPECT_GT(retval, 0, "proc_pidinfo(PROC_PIDTASKINFO) returned %d", retval); + T_QUIET; T_EXPECT_EQ(retval, (int)sizeof(pthinfo_64), "proc_pidinfo(PROC_PIDTASKINFO) returned size %d == %lu", + retval, sizeof(pthinfo_64)); + } +} + diff --git a/tests/proc_info_list_kthreads.entitlements b/tests/proc_info_list_kthreads.entitlements new file mode 100644 index 000000000..a333f4755 --- /dev/null +++ b/tests/proc_info_list_kthreads.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.private.kernel.global-proc-info + + + diff --git a/tools/tests/darwintests/proc_info_udata.c b/tests/proc_info_udata.c similarity index 94% rename from tools/tests/darwintests/proc_info_udata.c rename to tests/proc_info_udata.c index f814be4e3..3a37cbf37 100644 --- a/tools/tests/darwintests/proc_info_udata.c +++ b/tests/proc_info_udata.c @@ -1,6 +1,6 @@ #include -#include "../../../bsd/sys/proc_info.h" -#include "../../../libsyscall/wrappers/libproc/libproc.h" +#include "../bsd/sys/proc_info.h" +#include "../libsyscall/wrappers/libproc/libproc.h" #include #include @@ -8,7 +8,7 @@ T_DECL(proc_udata_info, "Get and set a proc udata token"){ uint64_t token = mach_absolute_time(); proc_info_udata_t udata; int ret; - + udata = token; ret = proc_udata_info(getpid(), PROC_UDATA_INFO_SET, 
&udata, sizeof (udata)); diff --git a/tools/tests/darwintests/proc_uuid_policy_26567533.c b/tests/proc_uuid_policy_26567533.c similarity index 100% rename from tools/tests/darwintests/proc_uuid_policy_26567533.c rename to tests/proc_uuid_policy_26567533.c diff --git a/tools/tests/darwintests/pwrite_avoid_sigxfsz_28581610.c b/tests/pwrite_avoid_sigxfsz_28581610.c similarity index 100% rename from tools/tests/darwintests/pwrite_avoid_sigxfsz_28581610.c rename to tests/pwrite_avoid_sigxfsz_28581610.c diff --git a/tests/quiesce_counter.c b/tests/quiesce_counter.c new file mode 100644 index 000000000..563d13d04 --- /dev/null +++ b/tests/quiesce_counter.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2018 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Test to validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER ticks at least once per second + * + * + */ + +#include + +#include + +#include +#include +#include +#include + +#ifndef _COMM_PAGE_CPU_QUIESCENT_COUNTER + +T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER increments", + T_META_CHECK_LEAKS(false)) +{ + T_SKIP("_COMM_PAGE_CPU_QUIESCENT_COUNTER doesn't exist on this system"); +} + +#else /* _COMM_PAGE_CPU_QUIESCENT_COUNTER */ + +T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER increments", + T_META_CHECK_LEAKS(false)) +{ + int rv; + + uint32_t cpu_checkin_min_interval = 0; /* set by sysctl hw.ncpu */ + + size_t value_size = sizeof(cpu_checkin_min_interval); + rv = sysctlbyname("kern.cpu_checkin_interval", &cpu_checkin_min_interval, &value_size, NULL, 0); + T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(kern.cpu_checkin_interval)"); + + T_LOG("kern.cpu_checkin_interval is %d", cpu_checkin_min_interval); + + T_ASSERT_GT(cpu_checkin_min_interval, 0, "kern.cpu_checkin_interval should be > 0"); + + uint64_t* commpage_addr = (uint64_t *)(uintptr_t)_COMM_PAGE_CPU_QUIESCENT_COUNTER; + + T_LOG("address of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %p", (void*) commpage_addr); + + uint64_t counter = *commpage_addr; + uint64_t last_counter = counter; + T_LOG("first value of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %llu", counter); + + for (int i = 0 ; i < 10 ; i++) + { + sleep(1); + + last_counter = counter; + counter = *commpage_addr; + + T_LOG("value of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %llu", counter); + + T_ASSERT_GT(counter, last_counter, "_COMM_PAGE_CPU_QUIESCENT_COUNTER must monotonically increase at least once per second"); + } +} + +#endif /* _COMM_PAGE_CPU_QUIESCENT_COUNTER */ + diff --git a/tools/tests/darwintests/regression_17272465.c b/tests/regression_17272465.c similarity index 100% rename from tools/tests/darwintests/regression_17272465.c rename to 
tests/regression_17272465.c diff --git a/tools/tests/darwintests/remote_time.c b/tests/remote_time.c similarity index 100% rename from tools/tests/darwintests/remote_time.c rename to tests/remote_time.c diff --git a/tools/tests/darwintests/settimeofday_29193041.c b/tests/settimeofday_29193041.c similarity index 100% rename from tools/tests/darwintests/settimeofday_29193041.c rename to tests/settimeofday_29193041.c diff --git a/tools/tests/darwintests/settimeofday_29193041.entitlements b/tests/settimeofday_29193041.entitlements similarity index 100% rename from tools/tests/darwintests/settimeofday_29193041.entitlements rename to tests/settimeofday_29193041.entitlements diff --git a/tools/tests/darwintests/settimeofday_29193041_entitled.c b/tests/settimeofday_29193041_entitled.c similarity index 100% rename from tools/tests/darwintests/settimeofday_29193041_entitled.c rename to tests/settimeofday_29193041_entitled.c diff --git a/tools/tests/darwintests/sigchld_return.c b/tests/sigchld_return.c similarity index 100% rename from tools/tests/darwintests/sigchld_return.c rename to tests/sigchld_return.c diff --git a/tools/tests/darwintests/sigcont_return.c b/tests/sigcont_return.c similarity index 100% rename from tools/tests/darwintests/sigcont_return.c rename to tests/sigcont_return.c diff --git a/tools/tests/darwintests/socket_bind_35243417.c b/tests/socket_bind_35243417.c similarity index 100% rename from tools/tests/darwintests/socket_bind_35243417.c rename to tests/socket_bind_35243417.c diff --git a/tools/tests/darwintests/socket_bind_35685803.c b/tests/socket_bind_35685803.c similarity index 100% rename from tools/tests/darwintests/socket_bind_35685803.c rename to tests/socket_bind_35685803.c diff --git a/tools/tests/darwintests/socket_poll_close_25786011.c b/tests/socket_poll_close_25786011.c similarity index 100% rename from tools/tests/darwintests/socket_poll_close_25786011.c rename to tests/socket_poll_close_25786011.c diff --git a/tests/stackshot.m 
b/tests/stackshot.m new file mode 100644 index 000000000..7aef17c1d --- /dev/null +++ b/tests/stackshot.m @@ -0,0 +1,1022 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * mirrors the dyld_cache_header struct defined in dyld_cache_format.h from dyld source code + * TODO: remove once rdar://42361850 is in the build + */ +struct dyld_cache_header +{ + char magic[16]; // e.g. "dyld_v0 i386" + uint32_t mappingOffset; // file offset to first dyld_cache_mapping_info + uint32_t mappingCount; // number of dyld_cache_mapping_info entries + uint32_t imagesOffset; // file offset to first dyld_cache_image_info + uint32_t imagesCount; // number of dyld_cache_image_info entries + uint64_t dyldBaseAddress; // base address of dyld when cache was built + uint64_t codeSignatureOffset; // file offset of code signature blob + uint64_t codeSignatureSize; // size of code signature blob (zero means to end of file) + uint64_t slideInfoOffset; // file offset of kernel slid info + uint64_t slideInfoSize; // size of kernel slid info + uint64_t localSymbolsOffset; // file offset of where local symbols are stored + uint64_t localSymbolsSize; // size of local symbols information + uint8_t uuid[16]; // unique value for each shared cache file + uint64_t cacheType; // 0 for development, 1 for production + uint32_t branchPoolsOffset; // file offset to table of uint64_t pool addresses + uint32_t branchPoolsCount; // number of uint64_t entries + uint64_t accelerateInfoAddr; // (unslid) address of optimization info + uint64_t accelerateInfoSize; // size of optimization info + uint64_t imagesTextOffset; // file offset to first dyld_cache_image_text_info + uint64_t imagesTextCount; // number of dyld_cache_image_text_info entries + uint64_t dylibsImageGroupAddr; // (unslid) address of ImageGroup for dylibs in this cache + uint64_t dylibsImageGroupSize; // size of ImageGroup for dylibs in this cache + uint64_t otherImageGroupAddr; // (unslid) 
address of ImageGroup for other OS dylibs
+	uint64_t	otherImageGroupSize;	// size of oImageGroup for other OS dylibs
+	uint64_t	progClosuresAddr;	// (unslid) address of list of program launch closures
+	uint64_t	progClosuresSize;	// size of list of program launch closures
+	uint64_t	progClosuresTrieAddr;	// (unslid) address of trie of indexes into program launch closures
+	uint64_t	progClosuresTrieSize;	// size of trie of indexes into program launch closures
+	uint32_t	platform;	// platform number (macOS=1, etc)
+	uint32_t	formatVersion : 8,	// dyld3::closure::kFormatVersion
+			dylibsExpectedOnDisk : 1,	// dyld should expect the dylib exists on disk and to compare inode/mtime to see if cache is valid
+			simulator : 1,	// for simulator of specified platform
+			locallyBuiltCache : 1,	// 0 for B&I built cache, 1 for locally built cache
+			padding : 21;	// TBD
+};
+
+T_GLOBAL_META(
+		T_META_NAMESPACE("xnu.stackshot"),
+		T_META_CHECK_LEAKS(false),
+		T_META_ASROOT(true)
+		);
+
+static const char *current_process_name(void);
+static void verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count);
+static void parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid);
+static void parse_thread_group_stackshot(void **sbuf, size_t sslen);
+static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen);
+static void initialize_thread(void);
+
+#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024)
+#define MAX_STACKSHOT_BUFFER_SIZE     (6 * 1024 * 1024)
+
+/* bit flags for parse_stackshot */
+#define PARSE_STACKSHOT_DELTA 0x1
+#define PARSE_STACKSHOT_ZOMBIE 0x2
+#define PARSE_STACKSHOT_SHAREDCACHE_LAYOUT 0x4
+
+/*
+ * Call the microstackshot syscall, doubling the buffer (up to
+ * MAX_STACKSHOT_BUFFER_SIZE) while it reports ENOSPC, then check the magic
+ * value at the start of the returned buffer.
+ */
+T_DECL(microstackshots, "test the microstackshot syscall")
+{
+	void *buf = NULL;
+	unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE;
+
+	while (1) {
+		buf = malloc(size);
+		T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer");
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+		int len = syscall(SYS_microstackshot, buf, size,
+				STACKSHOT_GET_MICROSTACKSHOT);
+#pragma clang diagnostic pop
+		/*
+		 * syscall() reports failure as -1 with the reason in errno; the
+		 * return value itself is never an errno constant.
+		 */
+		if (len == -1 && errno == ENOSYS) {
+			T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY");
+		}
+		if (len == -1 && errno == ENOSPC) {
+			/* syscall failed because buffer wasn't large enough, try again */
+			free(buf);
+			buf = NULL;
+			size *= 2;
+			T_ASSERT_LE(size, (unsigned int)MAX_STACKSHOT_BUFFER_SIZE,
+					"growing stackshot buffer to sane size");
+			continue;
+		}
+		T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall");
+		break;
+	}
+
+	T_EXPECT_EQ(*(uint32_t *)buf,
+			(uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC,
+			"magic value for microstackshot matches");
+
+	free(buf);
+}
+
+/* Parameters for one stackshot capture performed by take_stackshot(). */
+struct scenario {
+	const char *name;	/* if set, the stackshot is written to "<name>.kcdata" */
+	uint32_t flags;	/* passed to stackshot_config_set_flags() */
+	bool should_fail;	/* the capture itself is expected to fail */
+	bool maybe_unsupported;	/* ENOTSUP from the capture skips the test */
+	pid_t target_pid;	/* if > 0, passed to stackshot_config_set_pid() */
+	uint64_t since_timestamp;	/* if > 0, passed to stackshot_config_set_delta_timestamp() */
+	uint32_t size_hint;	/* if > 0, passed to stackshot_config_set_size_hint() */
+	dt_stat_time_t timer;	/* if set, capture durations are recorded here and assertions are quieted */
+};
+
+/* Apply T_QUIET to the next check when the scenario is being timed. */
+static void
+quiet(struct scenario *scenario)
+{
+	if (scenario->timer) {
+		T_QUIET;
+	}
+}
+
+/*
+ * Build a stackshot config from the scenario, capture a stackshot (retrying
+ * on EBUSY/ETIMEDOUT), optionally write it to a result file named after the
+ * scenario, and pass the buffer and its size to cb.
+ */
+static void
+take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size))
+{
+	initialize_thread();
+
+	void *config = stackshot_config_create();
+	quiet(scenario);
+	T_ASSERT_NOTNULL(config, "created stackshot config");
+
+	int ret = stackshot_config_set_flags(config, scenario->flags);
+	quiet(scenario);
+	T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags);
+
+	if (scenario->size_hint > 0) {
+		ret = stackshot_config_set_size_hint(config, scenario->size_hint);
+		quiet(scenario);
+		T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config",
+				scenario->size_hint);
+	}
+
+	if (scenario->target_pid > 0) {
+		ret = stackshot_config_set_pid(config, scenario->target_pid);
+		quiet(scenario);
+		T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config",
+				scenario->target_pid);
+	}
+
+	if (scenario->since_timestamp > 0) {
+		ret = stackshot_config_set_delta_timestamp(config,
scenario->since_timestamp);
+		quiet(scenario);
+		T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config",
+				scenario->since_timestamp);
+	}
+
+	int retries_remaining = 5;
+
+retry: ;
+	/* time only the capture itself */
+	uint64_t start_time = mach_absolute_time();
+	ret = stackshot_capture_with_config(config);
+	uint64_t end_time = mach_absolute_time();
+
+	if (scenario->should_fail) {
+		T_EXPECTFAIL;
+		T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
+		/* release the config on this early exit as well; cb is deliberately not called */
+		(void)stackshot_config_dealloc(config);
+		return;
+	}
+
+	if (ret == EBUSY || ret == ETIMEDOUT) {
+		/* transient failures: try again up to retries_remaining times */
+		if (retries_remaining > 0) {
+			if (!scenario->timer) {
+				T_LOG("stackshot_capture_with_config failed with %s (%d), retrying",
+						strerror(ret), ret);
+			}
+
+			retries_remaining--;
+			goto retry;
+		} else {
+			T_ASSERT_POSIX_ZERO(ret,
+					"called stackshot_capture_with_config (no retries remaining)");
+		}
+	} else if ((ret == ENOTSUP) && scenario->maybe_unsupported) {
+		T_SKIP("kernel indicated this stackshot configuration is not supported");
+	} else {
+		quiet(scenario);
+		T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
+	}
+
+	if (scenario->timer) {
+		dt_stat_mach_time_add(scenario->timer, end_time - start_time);
+	}
+	void *buf = stackshot_config_get_stackshot_buffer(config);
+	size_t size = stackshot_config_get_stackshot_size(config);
+	if (scenario->name) {
+		char sspath[MAXPATHLEN];
+		strlcpy(sspath, scenario->name, sizeof(sspath));
+		strlcat(sspath, ".kcdata", sizeof(sspath));
+		T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(sspath, sizeof(sspath)),
+				"create result file path");
+
+		T_LOG("writing stackshot to %s", sspath);
+
+		FILE *f = fopen(sspath, "w");
+		T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(f,
+				"open stackshot output file");
+
+		/*
+		 * fwrite() returns the number of items written and never -1, so a
+		 * short write is only visible by comparing against the request.
+		 */
+		size_t written = fwrite(buf, 1, size, f);
+		T_QUIET; T_ASSERT_EQ(written, size, "wrote stackshot to file");
+
+		fclose(f);
+	}
+	cb(buf, size);
+
+	ret = stackshot_config_dealloc(config);
+	T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config");
+}
+
+T_DECL(kcdata, "test that kcdata
stackshots can be taken and parsed") +{ + struct scenario scenario = { + .name = "kcdata", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | + STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("taking kcdata stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(0, ssbuf, sslen, -1); + }); +} + +T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed") +{ + struct scenario scenario = { + .name = "faulting", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT + | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING), + }; + + T_LOG("taking faulting stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(0, ssbuf, sslen, -1); + }); +} + +T_DECL(bad_flags, "test a poorly-formed stackshot syscall") +{ + struct scenario scenario = { + .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */, + .should_fail = true, + }; + + T_LOG("attempting to take stackshot with kernel-only flag"); + take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) { + T_ASSERT_FAIL("stackshot data callback called"); + }); +} + +T_DECL(delta, "test delta stackshots") +{ + struct scenario scenario = { + .name = "delta", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("taking full stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); + + T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); + + parse_stackshot(0, ssbuf, sslen, -1); + + struct scenario delta_scenario = { + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT + | 
STACKSHOT_COLLECT_DELTA_SNAPSHOT), + .since_timestamp = stackshot_time + }; + + take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { + parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1); + }); + }); +} + +T_DECL(shared_cache_layout, "test stackshot inclusion of shared cache layout") +{ + struct scenario scenario = { + .name = "shared_cache_layout", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT | + STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT), + }; + + T_LOG("taking stackshot with STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT set"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_LAYOUT, ssbuf, sslen, -1); + }); +} + +static void *stuck_sysctl_thread(void *arg) { + int val = 1; + dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg; + + dispatch_semaphore_signal(child_thread_started); + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread"); + + return NULL; +} + +T_HELPER_DECL(zombie_child, "child process to sample as a zombie") +{ + pthread_t pthread; + dispatch_semaphore_t child_thread_started = dispatch_semaphore_create(0); + T_QUIET; T_ASSERT_NOTNULL(child_thread_started, "zombie child thread semaphore"); + + /* spawn another thread to get stuck in the kernel, then call exit() to become a zombie */ + T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&pthread, NULL, stuck_sysctl_thread, &child_thread_started), "pthread_create"); + + dispatch_semaphore_wait(child_thread_started, DISPATCH_TIME_FOREVER); + + /* sleep for a bit in the hope of ensuring that the other thread has called the sysctl before we signal the parent */ + usleep(100); + T_ASSERT_POSIX_SUCCESS(kill(getppid(), SIGUSR1), "signaled parent to take stackshot"); + + exit(0); +} + +T_DECL(zombie, "tests a stackshot of a zombie task with a thread stuck in the kernel") +{ + char 
path[PATH_MAX];
+	uint32_t path_size = sizeof(path);
+	T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
+	/* re-run this test binary, selecting the zombie_child helper */
+	char *args[] = { path, "-n", "zombie_child", NULL };
+
+	dispatch_source_t child_sig_src;
+	dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0);
+	T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "zombie child semaphore");
+
+	dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL);
+	/* check the queue that was just created, not the semaphore again */
+	T_QUIET; T_ASSERT_NOTNULL(signal_processing_q, "signal processing queue");
+
+	pid_t pid;
+
+	T_LOG("spawning a child");
+
+	/* ignore default SIGUSR1 delivery; the dispatch source below observes it instead */
+	signal(SIGUSR1, SIG_IGN);
+	child_sig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q);
+	T_QUIET; T_ASSERT_NOTNULL(child_sig_src, "dispatch_source_create (child_sig_src)");
+
+	dispatch_source_set_event_handler(child_sig_src, ^{ dispatch_semaphore_signal(child_ready_sem); });
+	dispatch_activate(child_sig_src);
+
+	int sp_ret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL);
+	T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid);
+
+	/* block until the child signals SIGUSR1 */
+	dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER);
+
+	T_LOG("received signal from child, capturing stackshot");
+
+	struct proc_bsdshortinfo bsdshortinfo;
+	int retval, iterations_to_wait = 10;
+
+	/* poll until the child is gone or flagged as exiting */
+	while (iterations_to_wait > 0) {
+		retval = proc_pidinfo(pid, PROC_PIDT_SHORTBSDINFO, 0, &bsdshortinfo, sizeof(bsdshortinfo));
+		if ((retval == 0) && errno == ESRCH) {
+			T_LOG("unable to find child using proc_pidinfo, assuming zombie");
+			break;
+		}
+
+		T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(retval, 0, "proc_pidinfo(PROC_PIDT_SHORTBSDINFO) returned a value > 0");
+		T_QUIET; T_ASSERT_EQ(retval, (int)sizeof(bsdshortinfo), "proc_pidinfo call for PROC_PIDT_SHORTBSDINFO returned expected size");
+
+		if (bsdshortinfo.pbsi_flags & PROC_FLAG_INEXIT) {
+			T_LOG("child proc info marked as in exit");
+			break;
+		}
+
+		iterations_to_wait--;
+		if (iterations_to_wait == 0) {
+			/*
+
* This will mark the test as failed but let it continue so we + * don't leave a process stuck in the kernel. + */ + T_FAIL("unable to discover that child is marked as exiting"); + } + + /* Give the child a few more seconds to make it to exit */ + sleep(5); + } + + /* Give the child some more time to make it through exit */ + sleep(10); + + struct scenario scenario = { + .name = "zombie", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) { + /* First unwedge the child so we can reap it */ + int val = 1, status; + T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child"); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on zombie child"); + + parse_stackshot(PARSE_STACKSHOT_ZOMBIE, ssbuf, sslen, pid); + }); +} + +static void +expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen) +{ + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + + bool in_task = false; + bool in_thread = false; + bool saw_instrs_cycles = false; + iter = kcdata_iter_next(iter); + + KCDATA_ITER_FOREACH(iter) { + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_CONTAINER_BEGIN: + switch (kcdata_iter_container_type(iter)) { + case STACKSHOT_KCCONTAINER_TASK: + in_task = true; + saw_instrs_cycles = false; + break; + + case STACKSHOT_KCCONTAINER_THREAD: + in_thread = true; + saw_instrs_cycles = false; + break; + + default: + break; + } + break; + + case STACKSHOT_KCTYPE_INSTRS_CYCLES: + saw_instrs_cycles = true; + break; + + case KCDATA_TYPE_CONTAINER_END: + if (in_thread) { + T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, + "saw instructions and cycles in thread"); + in_thread = false; + } else if (in_task) { + T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, + "saw instructions and cycles in task"); + in_task = false; + } + + default: + break; + } + } +} + +static void 
+skip_if_monotonic_unsupported(void) +{ + int supported = 0; + size_t supported_size = sizeof(supported); + int ret = sysctlbyname("kern.monotonic.supported", &supported, + &supported_size, 0, 0); + if (ret < 0 || !supported) { + T_SKIP("monotonic is unsupported"); + } +} + +T_DECL(instrs_cycles, "test a getting instructions and cycles in stackshot") +{ + skip_if_monotonic_unsupported(); + + struct scenario scenario = { + .name = "instrs-cycles", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES + | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("attempting to take stackshot with instructions and cycles"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_stackshot(0, ssbuf, sslen, -1); + expect_instrs_cycles_in_stackshot(ssbuf, sslen); + }); +} + +T_DECL(delta_instrs_cycles, + "test delta stackshots with instructions and cycles") +{ + skip_if_monotonic_unsupported(); + + struct scenario scenario = { + .name = "delta-instrs-cycles", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES + | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("taking full stackshot"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); + + T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); + + parse_stackshot(0, ssbuf, sslen, -1); + expect_instrs_cycles_in_stackshot(ssbuf, sslen); + + struct scenario delta_scenario = { + .name = "delta-instrs-cycles-next", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES + | STACKSHOT_KCDATA_FORMAT + | STACKSHOT_COLLECT_DELTA_SNAPSHOT), + .since_timestamp = stackshot_time, + }; + + take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { + parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1); + expect_instrs_cycles_in_stackshot(dssbuf, dsslen); + }); + }); +} + +static void +check_thread_groups_supported() +{ + int err; + int supported = 0; + size_t supported_size = sizeof(supported); + err = 
sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0); + + if (err || !supported) + T_SKIP("thread groups not supported on this system"); +} + +T_DECL(thread_groups, "test getting thread groups in stackshot") +{ + check_thread_groups_supported(); + + struct scenario scenario = { + .name = "thread-groups", + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP + | STACKSHOT_KCDATA_FORMAT), + }; + + T_LOG("attempting to take stackshot with thread group flag"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_thread_group_stackshot(ssbuf, sslen); + }); +} + +static void +parse_page_table_asid_stackshot(void **ssbuf, size_t sslen) +{ + bool seen_asid = false; + bool seen_page_table_snapshot = false; + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, + "buffer provided is a stackshot"); + + iter = kcdata_iter_next(iter); + KCDATA_ITER_FOREACH(iter) { + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_array_valid(iter), + "checked that array is valid"); + + if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_PAGE_TABLES) { + continue; + } + + T_ASSERT_FALSE(seen_page_table_snapshot, "check that we haven't yet seen a page table snapshot"); + seen_page_table_snapshot = true; + + T_ASSERT_EQ((size_t) kcdata_iter_array_elem_size(iter), sizeof(uint64_t), + "check that each element of the pagetable dump is the expected size"); + + uint64_t *pt_array = kcdata_iter_payload(iter); + uint32_t elem_count = kcdata_iter_array_elem_count(iter); + uint32_t j; + bool nonzero_tte = false; + for (j = 0; j < elem_count;) { + T_QUIET; T_ASSERT_LE(j + 4, elem_count, "check for valid page table segment header"); + uint64_t pa = pt_array[j]; + uint64_t num_entries = pt_array[j + 1]; + uint64_t start_va = pt_array[j + 2]; + uint64_t end_va = pt_array[j + 3]; + + T_QUIET; T_ASSERT_NE(pa, (uint64_t) 0, "check that the 
pagetable physical address is non-zero");
+				/* the entry count is the modulus below, so rule out zero before using it */
+				T_QUIET; T_ASSERT_NE(num_entries, (uint64_t) 0, "check that a pagetable region has more than 0 entries");
+				T_QUIET; T_ASSERT_EQ(pa % (num_entries * sizeof(uint64_t)), (uint64_t) 0, "check that the pagetable physical address is correctly aligned");
+				T_QUIET; T_ASSERT_LE(j + 4 + num_entries, (uint64_t) elem_count, "check for sufficient space in page table array");
+				T_QUIET; T_ASSERT_GT(end_va, start_va, "check for valid VA bounds in page table segment header");
+
+				/* entries follow the 4-word segment header */
+				for (uint32_t k = j + 4; k < (j + 4 + num_entries); ++k) {
+					if (pt_array[k] != 0) {
+						nonzero_tte = true;
+						T_QUIET; T_ASSERT_EQ((pt_array[k] >> 48) & 0xf, (uint64_t) 0, "check that bits[48:51] of arm64 TTE are clear");
+						// L0-L2 table and non-compressed L3 block entries should always have bit 1 set; assumes L0-L2 blocks will not be used outside the kernel
+						bool table = ((pt_array[k] & 0x2) != 0);
+						if (table) {
+							T_QUIET; T_ASSERT_NE(pt_array[k] & ((1ULL << 48) - 1) & ~((1ULL << 12) - 1), (uint64_t) 0, "check that arm64 TTE physical address is non-zero");
+						} else { // should be a compressed PTE
+							T_QUIET; T_ASSERT_NE(pt_array[k] & 0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has at least one of bits [63:62] set");
+							T_QUIET; T_ASSERT_EQ(pt_array[k] & ~0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has no other bits besides [63:62] set");
+						}
+					}
+				}
+
+				/* advance past this segment's header and entries */
+				j += (4 + num_entries);
+			}
+			T_ASSERT_TRUE(nonzero_tte, "check that we saw at least one non-empty TTE");
+			T_ASSERT_EQ(j, elem_count, "check that page table dump size matches extent of last header");
+			break;
+		}
+		case STACKSHOT_KCTYPE_ASID: {
+			T_ASSERT_FALSE(seen_asid, "check that we haven't yet seen an ASID");
+			seen_asid = true;
+			break;
+		}
+		}
+	}
+	T_ASSERT_TRUE(seen_page_table_snapshot, "check that we have seen a page table snapshot");
+	T_ASSERT_TRUE(seen_asid, "check that we have seen an ASID");
+}
+
+T_DECL(dump_page_tables, "test stackshot page table dumping
support") +{ + struct scenario scenario = { + .name = "asid-page-tables", + .flags = (STACKSHOT_KCDATA_FORMAT | STACKSHOT_ASID | STACKSHOT_PAGE_TABLES), + .size_hint = (1ULL << 23), // 8 MB + .target_pid = getpid(), + .maybe_unsupported = true, + }; + + T_LOG("attempting to take stackshot with ASID and page table flags"); + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + parse_page_table_asid_stackshot(ssbuf, sslen); + }); +} + +#pragma mark performance tests + +#define SHOULD_REUSE_SIZE_HINT 0x01 +#define SHOULD_USE_DELTA 0x02 +#define SHOULD_TARGET_SELF 0x04 + +static void +stackshot_perf(unsigned int options) +{ + struct scenario scenario = { + .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS + | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), + }; + + dt_stat_t size = dt_stat_create("bytes", "size"); + dt_stat_time_t duration = dt_stat_time_create("duration"); + scenario.timer = duration; + + if (options & SHOULD_TARGET_SELF) { + scenario.target_pid = getpid(); + } + + while (!dt_stat_stable(duration) || !dt_stat_stable(size)) { + __block uint64_t last_time = 0; + __block uint32_t size_hint = 0; + take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { + dt_stat_add(size, (double)sslen); + last_time = stackshot_timestamp(ssbuf, sslen); + size_hint = (uint32_t)sslen; + }); + if (options & SHOULD_USE_DELTA) { + scenario.since_timestamp = last_time; + scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT; + } + if (options & SHOULD_REUSE_SIZE_HINT) { + scenario.size_hint = size_hint; + } + } + + dt_stat_finalize(duration); + dt_stat_finalize(size); +} + +T_DECL(perf_no_size_hint, "test stackshot performance with no size hint", + T_META_TAG_PERF) +{ + stackshot_perf(0); +} + +T_DECL(perf_size_hint, "test stackshot performance with size hint", + T_META_TAG_PERF) +{ + stackshot_perf(SHOULD_REUSE_SIZE_HINT); +} + +T_DECL(perf_process, "test stackshot performance targeted at process", + T_META_TAG_PERF) +{ + 
stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF); +} + +T_DECL(perf_delta, "test delta stackshot performance", + T_META_TAG_PERF) +{ + stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA); +} + +T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process", + T_META_TAG_PERF) +{ + stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF); +} + +static uint64_t +stackshot_timestamp(void *ssbuf, size_t sslen) +{ + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + + uint32_t type = kcdata_iter_type(iter); + if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) { + T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter)); + } + + iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME); + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot"); + + return *(uint64_t *)kcdata_iter_payload(iter); +} + +#define TEST_THREAD_NAME "stackshot_test_thread" + +static void +parse_thread_group_stackshot(void **ssbuf, size_t sslen) +{ + bool seen_thread_group_snapshot = false; + kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); + T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, + "buffer provided is a stackshot"); + + NSMutableSet *thread_groups = [[NSMutableSet alloc] init]; + + iter = kcdata_iter_next(iter); + KCDATA_ITER_FOREACH(iter) { + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_array_valid(iter), + "checked that array is valid"); + + if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) { + continue; + } + + seen_thread_group_snapshot = true; + + if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) { + struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter); + for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) { + struct thread_group_snapshot_v2 *tgs = tgs_array + j; + 
[thread_groups addObject:@(tgs->tgs_id)];
+				}
+
+			}
+			else {
+				struct thread_group_snapshot *tgs_array = kcdata_iter_payload(iter);
+				for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
+					struct thread_group_snapshot *tgs = tgs_array + j;
+					[thread_groups addObject:@(tgs->tgs_id)];
+				}
+			}
+			break;
+		}
+		}
+	}
+
+	/*
+	 * The loop above advanced iter through the buffer, so start again from
+	 * the beginning; otherwise the per-thread check below never runs.
+	 */
+	iter = kcdata_iter(ssbuf, sslen);
+	iter = kcdata_iter_next(iter);
+	KCDATA_ITER_FOREACH(iter) {
+		NSError *error = nil;
+
+		switch (kcdata_iter_type(iter)) {
+
+		case KCDATA_TYPE_CONTAINER_BEGIN: {
+			T_QUIET;
+			T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
+					"checked that container is valid");
+
+			/* only thread containers are inspected here */
+			if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) {
+				break;
+			}
+
+			NSDictionary *container = parseKCDataContainer(&iter, &error);
+			T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
+			T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
+
+			int tg = [container[@"thread_snapshots"][@"thread_group"] intValue];
+
+			T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists");
+
+			break;
+		}
+
+		}
+	}
+	T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot");
+}
+
+/*
+ * Walk the current shared cache with dyld_shared_cache_iterate_text() and
+ * check that each library's UUID and unslid load address match the entry at
+ * the same index in the array reported by the kernel.
+ */
+static void
+verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count)
+{
+	uuid_t cur_shared_cache_uuid;
+	__block uint32_t lib_index = 0, libs_found = 0;
+
+	_dyld_get_shared_cache_uuid(cur_shared_cache_uuid);
+	int result = dyld_shared_cache_iterate_text(cur_shared_cache_uuid, ^(const dyld_shared_cache_dylib_text_info* info) {
+			T_QUIET; T_ASSERT_LT(lib_index, uuid_count, "dyld_shared_cache_iterate_text exceeded number of libraries returned by kernel");
+
+			libs_found++;
+			struct dyld_uuid_info_64 *cur_stackshot_uuid_entry = &uuids[lib_index];
+			T_QUIET; T_ASSERT_EQ(memcmp(info->dylibUuid, cur_stackshot_uuid_entry->imageUUID, sizeof(info->dylibUuid)), 0,
+					"dyld returned UUID doesn't match kernel returned UUID");
+			T_QUIET; T_ASSERT_EQ(info->loadAddressUnslid,
cur_stackshot_uuid_entry->imageLoadAddress,
+					"dyld returned load address doesn't match kernel returned load address");
+			lib_index++;
+		});
+
+	T_ASSERT_EQ(result, 0, "iterate shared cache layout");
+	T_ASSERT_EQ(libs_found, uuid_count, "dyld iterator returned same number of libraries as kernel");
+
+	T_LOG("verified %u libraries from dyld shared cache", libs_found); /* unsigned count, so %u */
+}
+
+/*
+ * Validate a kcdata stackshot buffer.
+ *
+ * stackshot_parsing_flags is a mask of PARSE_STACKSHOT_* bits: DELTA expects
+ * a delta-stackshot header, ZOMBIE expects the task child_pid to be present
+ * and marked terminated, and SHAREDCACHE_LAYOUT expects a shared cache layout
+ * array when the running cache is locally built.
+ */
+static void
+parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid)
+{
+	bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA);
+	bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE);
+	bool expect_shared_cache_layout = false;
+	bool expect_shared_cache_uuid = !delta; /* only full stackshots are required to carry the UUID */
+	bool found_zombie_child = false, found_shared_cache_layout = false, found_shared_cache_uuid = false;
+
+	if (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_LAYOUT) {
+		size_t shared_cache_length = 0;
+		const struct dyld_cache_header *cache_header = NULL;
+		cache_header = _dyld_get_shared_cache_range(&shared_cache_length);
+		T_QUIET; T_ASSERT_NOTNULL(cache_header, "current process running with shared cache");
+		/* size the check by the header type that is dereferenced just below */
+		T_QUIET; T_ASSERT_GT(shared_cache_length, sizeof(*cache_header), "valid shared cache length populated by _dyld_get_shared_cache_range");
+
+		if (cache_header->locallyBuiltCache) {
+			T_LOG("device running with locally built shared cache, expect shared cache layout");
+			expect_shared_cache_layout = true;
+		} else {
+			T_LOG("device running with B&I built shared-cache, no shared cache layout expected");
+		}
+	}
+
+	if (expect_zombie_child) {
+		T_QUIET; T_ASSERT_GT(child_pid, 0, "child pid greater than zero");
+	}
+
+	kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+	if (delta) {
+		T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
+				"buffer provided is a delta stackshot");
+	} else {
+		T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
+				"buffer provided is a stackshot");
+	}
+
+	iter =
kcdata_iter_next(iter); + KCDATA_ITER_FOREACH(iter) { + NSError *error = nil; + + switch (kcdata_iter_type(iter)) { + case KCDATA_TYPE_ARRAY: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_array_valid(iter), + "checked that array is valid"); + + NSMutableDictionary *array = parseKCDataArray(iter, &error); + T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot"); + T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array"); + + if (kcdata_iter_array_elem_type(iter) == STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT) { + struct dyld_uuid_info_64 *shared_cache_uuids = kcdata_iter_payload(iter); + uint32_t uuid_count = kcdata_iter_array_elem_count(iter); + T_ASSERT_NOTNULL(shared_cache_uuids, "parsed shared cache layout array"); + T_ASSERT_GT(uuid_count, 0, "returned valid number of UUIDs from shared cache"); + verify_stackshot_sharedcache_layout(shared_cache_uuids, uuid_count); + found_shared_cache_layout = true; + } + + break; + } + + case KCDATA_TYPE_CONTAINER_BEGIN: { + T_QUIET; + T_ASSERT_TRUE(kcdata_iter_container_valid(iter), + "checked that container is valid"); + + if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) { + break; + } + + NSDictionary *container = parseKCDataContainer(&iter, &error); + T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot"); + T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container"); + + int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue]; + if (expect_zombie_child && (pid == child_pid)) { + found_zombie_child = true; + + uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue]; + T_ASSERT_TRUE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "child zombie marked as terminated"); + + continue; + } else if (pid != getpid()) { + break; + } + + T_EXPECT_EQ_STR(current_process_name(), + [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String], + "current process name matches in 
stackshot"); + + uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue]; + T_ASSERT_FALSE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "current process not marked as terminated"); + + T_QUIET; + T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue], + "unique pid is greater than pid"); + + bool found_main_thread = false; + for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) { + NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key]; + NSDictionary *thread_snap = thread[@"thread_snapshot"]; + + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0, + "thread ID of thread in current task is valid"); + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0, + "base priority of thread in current task is valid"); + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0, + "scheduling priority of thread in current task is valid"); + + NSString *pth_name = thread[@"pth_name"]; + if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) { + found_main_thread = true; + + T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0, + "total syscalls of current thread is valid"); + + NSDictionary *cpu_times = thread[@"cpu_times"]; + T_EXPECT_GE([cpu_times[@"runnable_time"] intValue], + [cpu_times[@"system_time"] intValue] + + [cpu_times[@"user_time"] intValue], + "runnable time of current thread is valid"); + } + } + T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot"); + break; + } + case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: { + struct dyld_uuid_info_64_v2 *shared_cache_info = kcdata_iter_payload(iter); + uuid_t shared_cache_uuid; + T_QUIET; T_ASSERT_TRUE(_dyld_get_shared_cache_uuid(shared_cache_uuid), "retrieve current shared cache UUID"); + T_QUIET; T_ASSERT_EQ(memcmp(shared_cache_info->imageUUID, shared_cache_uuid, 
sizeof(shared_cache_uuid)), 0, + "dyld returned UUID doesn't match kernel returned UUID for system shared cache"); + found_shared_cache_uuid = true; + break; + } + } + } + + if (expect_zombie_child) { + T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata"); + } + + if (expect_shared_cache_layout) { + T_QUIET; T_ASSERT_TRUE(found_shared_cache_layout, "shared cache layout found in kcdata"); + } + + if (expect_shared_cache_uuid) { + T_QUIET; T_ASSERT_TRUE(found_shared_cache_uuid, "shared cache UUID found in kcdata"); + } + + T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata"); +} + +static const char * +current_process_name(void) +{ + static char name[64]; + + if (!name[0]) { + int ret = proc_name(getpid(), name, sizeof(name)); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "proc_name failed for current process"); + } + + return name; +} + +static void +initialize_thread(void) +{ + int ret = pthread_setname_np(TEST_THREAD_NAME); + T_QUIET; + T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME); +} diff --git a/tools/tests/darwintests/stackshot_block_owner_14362384.m b/tests/stackshot_block_owner_14362384.m similarity index 94% rename from tools/tests/darwintests/stackshot_block_owner_14362384.m rename to tests/stackshot_block_owner_14362384.m index bf4f3ae57..aabe544b8 100644 --- a/tools/tests/darwintests/stackshot_block_owner_14362384.m +++ b/tests/stackshot_block_owner_14362384.m @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -476,6 +477,16 @@ static int kmutex_action(int action) return NULL; } +static void * +waitpid_blocking_thread(void * arg) +{ + pid_t pid = (pid_t)arg; + + int ret = waitpid(pid, NULL, 0); + T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Reaping child."); + return NULL; +} + /* * Uses a debug sysctl to initialize a kernel mutex. 
*
@@ -821,6 +832,52 @@ static int kmutex_action(int action)
 	pthread_cond_destroy(&cond);
 }
 
+/*
+ * Forks a child that parks in pause(), starts a thread that blocks in
+ * waitpid() on that child, then takes a stackshot and checks that the
+ * blocked thread is reported with wait type kThreadWaitOnProcess and with
+ * the child's pid as owner.
+ */
+static void
+test_waitpid_blocking(void)
+{
+	int ret = 0;
+	pid_t pid = 0;
+	void *stackshot = NULL;
+	struct stackshot_thread_waitinfo waitinfo = { 0 };
+	int len = 1;
+	pthread_t tid;
+
+	T_LOG("Starting %s", __FUNCTION__);
+	if ((pid = fork()) == 0) {
+		pause();
+	} else {
+		T_ASSERT_POSIX_SUCCESS(pid, "Running in parent. Child pid is %d", pid); // pid is -1 when fork() failed
+
+		sleep(1); // allow enough time for child to run & sleep
+		ret = pthread_create(&tid, NULL, waitpid_blocking_thread, (void*)pid);
+		T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating waitpid blocking thread");
+
+		sleep(1); // allow enough time for reaping thread to waitpid & block
+		stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+		find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+		T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+		T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitOnProcess,
+				"Wait type should match expected WaitOnProcess value");
+
+		check_python(stackshot, "thread \\d+: waitpid, for pid %d", (int)pid);
+
+		stackshot_config_dealloc(stackshot);
+		T_EXPECT_EQ(waitinfo.owner, pid,
+				"Process ID of blocking process should match 'owner' field in stackshot");
+
+		ret = kill(pid, SIGUSR1); // wake up child so waitpid thread can reap it & exit
+		T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Send SIGUSR1 to child process");
+		ret = pthread_join(tid, NULL); // returns an error number, not -1/errno
+		T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Join on waitpid thread");
+	}
+}
+
 /*
  *
  * Test declarations
@@ -856,3 +913,7 @@ static int kmutex_action(int action)
 T_DECL(stackshot_block_owner_mach_msg, "tests stackshot block owner: mach messaging") {
 	test_mach_msg_blocking();
 }
+
+T_DECL(stackshot_block_owner_waitpid, "tests stackshot block owner: waitpid") {
+	test_waitpid_blocking();
+}
diff --git a/tools/tests/darwintests/stackshot_idle_25570396.m b/tests/stackshot_idle_25570396.m
similarity index 100%
rename from
tools/tests/darwintests/stackshot_idle_25570396.m rename to tests/stackshot_idle_25570396.m diff --git a/tests/stackshot_spawn_exit_stress.c b/tests/stackshot_spawn_exit_stress.c new file mode 100644 index 000000000..2a0be2b37 --- /dev/null +++ b/tests/stackshot_spawn_exit_stress.c @@ -0,0 +1,131 @@ +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +T_GLOBAL_META( + T_META_NAMESPACE("xnu.stackshot"), + T_META_CHECK_LEAKS(false), + T_META_ASROOT(true) + ); + +#if TARGET_OS_WATCH +#define SPAWN_ITERATIONS 1999 +#elif TARGET_OS_IPHONE +#define SPAWN_ITERATIONS 4999 +#else +#define SPAWN_ITERATIONS 9999 +#endif + +#define REAP_INTERVAL 10 + +static void* loop(__attribute__ ((unused)) void *arg) { + exit(0); +} + +T_HELPER_DECL(spawn_children_helper, "spawn_children helper") +{ + pthread_t pthread; + + T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&pthread, NULL, loop, NULL), "pthread_create"); + + while (1) { ; } +} + +static void +take_stackshot(void) +{ + uint32_t stackshot_flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | + STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT); + + void *config = stackshot_config_create(); + T_QUIET; T_ASSERT_NOTNULL(config, "created stackshot config"); + + int ret = stackshot_config_set_flags(config, stackshot_flags); + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "set flags on stackshot config"); + + int retries_remaining = 5; + +retry: + ret = stackshot_capture_with_config(config); + + if (ret == EBUSY || ret == ETIMEDOUT) { + if (retries_remaining > 0) { + retries_remaining--; + goto retry; + } else { + T_QUIET; T_ASSERT_POSIX_ZERO(ret, + "called stackshot_capture_with_config (no retries remaining)"); + } + } else { + T_QUIET; T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config"); + } + + ret = stackshot_config_dealloc(config); + T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config"); +} + +T_DECL(stackshot_spawn_exit, "tests 
taking many stackshots while children processes are spawning+exiting") +{ + char path[PATH_MAX]; + uint32_t path_size = sizeof(path); + T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath"); + char *args[] = { path, "-n", "spawn_children_helper", NULL }; + + dispatch_queue_t stackshot_queue = dispatch_queue_create("stackshot_queue", NULL); + dispatch_async(stackshot_queue, ^(void) { + int num_stackshots = 0; + + while (1) { + take_stackshot(); + num_stackshots++; + if ((num_stackshots % 100) == 0) { + T_LOG("completed %d stackshots", num_stackshots); + } + + // Sleep between each stackshot + usleep(100); + } + }); + + // META option for T_HELPER_DECL to not output test begin on start + posix_spawn_file_actions_t actions; + T_QUIET; T_ASSERT_POSIX_SUCCESS(posix_spawn_file_actions_init(&actions), "create spawn actions"); + T_QUIET; T_ASSERT_POSIX_SUCCESS(posix_spawn_file_actions_addopen (&actions, STDOUT_FILENO, "/dev/null", O_WRONLY, 0), + "set stdout of child to NULL"); + + int children_unreaped = 0, status; + for (int iterations_remaining = SPAWN_ITERATIONS; iterations_remaining > 0; iterations_remaining--) { + pid_t pid; + + int sp_ret = posix_spawn(&pid, args[0], &actions, NULL, args, NULL); + T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid); + + children_unreaped++; + + if (children_unreaped >= REAP_INTERVAL) { + while (children_unreaped) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(-1, &status, 0), "waitpid returned child pid"); + children_unreaped--; + } + } + + if ((iterations_remaining % 100) == 0) { + T_LOG("spawned %d children thus far", (SPAWN_ITERATIONS - iterations_remaining)); + } + } + + while (children_unreaped) { + T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(-1, &status, 0), "waitpid returned child pid"); + children_unreaped--; + } +} diff --git a/tools/tests/darwintests/suspended_spawn_26184412.c b/tests/suspended_spawn_26184412.c similarity index 100% rename from 
tools/tests/darwintests/suspended_spawn_26184412.c rename to tests/suspended_spawn_26184412.c diff --git a/tools/tests/darwintests/task_for_pid_entitlement.plist b/tests/task_for_pid_entitlement.plist similarity index 100% rename from tools/tests/darwintests/task_for_pid_entitlement.plist rename to tests/task_for_pid_entitlement.plist diff --git a/tools/tests/darwintests/task_info.c b/tests/task_info.c similarity index 99% rename from tools/tests/darwintests/task_info.c rename to tests/task_info.c index cb77c304f..c440036cb 100644 --- a/tools/tests/darwintests/task_info.c +++ b/tests/task_info.c @@ -395,7 +395,8 @@ T_DECL(task_flags_info, "tests task_flags_info", T_META_ASROOT(true), T_META_LTE T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded"); /* Change for 32-bit arch possibility?*/ - T_ASSERT_EQ((flags_info_data.flags & (unsigned int)(~TF_LP64)), 0U, "task_info should only give out 64-bit addr flag"); + T_ASSERT_EQ((flags_info_data.flags & (unsigned int)(~(TF_LP64 | TF_64B_DATA))), 0U, + "task_info should only give out 64-bit addr/data flags"); /* * This is a negative case. 
diff --git a/tools/tests/darwintests/task_info_28439149.c b/tests/task_info_28439149.c similarity index 100% rename from tools/tests/darwintests/task_info_28439149.c rename to tests/task_info_28439149.c diff --git a/tools/tests/darwintests/task_inspect.c b/tests/task_inspect.c similarity index 100% rename from tools/tests/darwintests/task_inspect.c rename to tests/task_inspect.c diff --git a/tools/tests/darwintests/task_inspect.entitlements b/tests/task_inspect.entitlements similarity index 100% rename from tools/tests/darwintests/task_inspect.entitlements rename to tests/task_inspect.entitlements diff --git a/tests/telemetry.c b/tests/telemetry.c new file mode 100644 index 000000000..ab45d147f --- /dev/null +++ b/tests/telemetry.c @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include +#include + +enum telemetry_pmi { + TELEMETRY_PMI_NONE, + TELEMETRY_PMI_INSTRS, + TELEMETRY_PMI_CYCLES, +}; +#define TELEMETRY_CMD_PMI_SETUP 3 + +T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging.telemetry"), + T_META_CHECK_LEAKS(false), + T_META_ASROOT(true)); + +extern int __telemetry(uint64_t cmd, uint64_t deadline, uint64_t interval, + uint64_t leeway, uint64_t arg4, uint64_t arg5); + +static void +telemetry_cleanup(void) +{ + int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_NONE, 0, 0, 0, 0); + T_EXPECT_POSIX_SUCCESS(ret, "telemetry(... 
NONE ...)"); +} + +volatile static bool spinning = true; +static void * +thread_spin(__unused void *arg) +{ + while (spinning) { + } + return NULL; +} + +#define MT_MICROSTACKSHOT KDBG_EVENTID(DBG_MONOTONIC, 2, 1) +#define MS_RECORD MACHDBG_CODE(DBG_MACH_STACKSHOT, \ + MICROSTACKSHOT_RECORD) +#if defined(__arm64__) || defined(__arm__) +#define INSTRS_PERIOD (100ULL * 1000 * 1000) +#else /* defined(__arm64__) || defined(__arm__) */ +#define INSTRS_PERIOD (1ULL * 1000 * 1000 * 1000) +#endif /* !defined(__arm64__) && !defined(__arm__) */ +#define SLEEP_SECS 10 + +T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI") +{ +#if TARGET_OS_WATCH + T_SKIP("unsupported platform"); +#endif /* TARGET_OS_WATCH */ + + T_SETUPBEGIN; + ktrace_session_t s = ktrace_session_create(); + T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "session create"); + + __block int pmi_events = 0; + __block int microstackshot_record_events = 0; + __block int pmi_records = 0; + __block int io_records = 0; + __block int interrupt_records = 0; + __block int timer_arm_records = 0; + __block int unknown_records = 0; + __block int multi_records = 0; + + ktrace_events_single(s, MT_MICROSTACKSHOT, ^(__unused struct trace_point *tp) { + pmi_events++; + }); + ktrace_events_single_paired(s, MS_RECORD, + ^(struct trace_point *start, __unused struct trace_point *end) { + if (start->arg1 & kPMIRecord) { + pmi_records++; + } + if (start->arg1 & kIORecord) { + io_records++; + } + if (start->arg1 & kInterruptRecord) { + interrupt_records++; + } + if (start->arg1 & kTimerArmingRecord) { + timer_arm_records++; + } + + const uint8_t any_record = kPMIRecord | kIORecord | kInterruptRecord | + kTimerArmingRecord; + if ((start->arg1 & any_record) == 0) { + unknown_records++; + } + if (__builtin_popcount(start->arg1 & any_record) != 1) { + multi_records++; + } + + microstackshot_record_events++; + }); + + ktrace_set_completion_handler(s, ^{ + ktrace_session_destroy(s); + T_EXPECT_GT(pmi_events, 0, + "saw 
non-zero PMIs (%g/sec)", pmi_events / (double)SLEEP_SECS); + T_EXPECT_GT(pmi_records, 0, "saw non-zero PMI record events (%g/sec)", + pmi_records / (double)SLEEP_SECS); + T_EXPECT_EQ(unknown_records, 0, "saw zero unknown record events"); + T_EXPECT_EQ(multi_records, 0, "saw zero multiple record events"); + T_EXPECT_GT(microstackshot_record_events, 0, + "saw non-zero microstackshot record events (%g/sec)", + microstackshot_record_events / (double)SLEEP_SECS); + + if (interrupt_records > 0) { + T_LOG("saw %g interrupt records per second", + interrupt_records / (double)SLEEP_SECS); + } else { + T_LOG("saw no interrupt records"); + } + if (io_records > 0) { + T_LOG("saw %g I/O records per second", + io_records / (double)SLEEP_SECS); + } else { + T_LOG("saw no I/O records"); + } + if (timer_arm_records > 0) { + T_LOG("saw %g timer arming records per second", + timer_arm_records / (double)SLEEP_SECS); + } else { + T_LOG("saw no timer arming records"); + } + + T_END; + }); + + T_SETUPEND; + + /* + * Start sampling via telemetry on the instructions PMI. 
+ */ + int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS, + INSTRS_PERIOD, 0, 0, 0); + if (ret < 0 && errno == EBUSY) { + T_PASS("telemetry is busy/active, maybe the events will be seen"); + } else { + T_ASSERT_POSIX_SUCCESS(ret, + "telemetry syscall succeeded, started microstackshots"); + T_LOG("installing cleanup handler"); + T_ATEND(telemetry_cleanup); + } + + pthread_t thread; + int error = pthread_create(&thread, NULL, thread_spin, NULL); + T_ASSERT_POSIX_ZERO(error, "started thread to spin"); + + error = ktrace_start(s, dispatch_get_main_queue()); + T_ASSERT_POSIX_ZERO(error, "started tracing"); + + dispatch_after(dispatch_time(DISPATCH_TIME_NOW, SLEEP_SECS * NSEC_PER_SEC), + dispatch_get_main_queue(), ^{ + spinning = false; + ktrace_end(s, 0); + (void)pthread_join(thread, NULL); + T_LOG("ending trace session after %d seconds", SLEEP_SECS); + }); + + dispatch_main(); +} + +T_DECL(error_handling, + "ensure that error conditions for the telemetry syscall are observed") +{ + int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS, + 1, 0, 0, 0); + T_EXPECT_EQ(ret, -1, "telemetry shouldn't allow PMI every instruction"); + + ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS, + 1000 * 1000, 0, 0, 0); + T_EXPECT_EQ(ret, -1, + "telemetry shouldn't allow PMI every million instructions"); + + ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_CYCLES, + 1, 0, 0, 0); + T_EXPECT_EQ(ret, -1, "telemetry shouldn't allow PMI every cycle"); + + ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_CYCLES, + 1000 * 1000, 0, 0, 0); + T_EXPECT_EQ(ret, -1, + "telemetry shouldn't allow PMI every million cycles"); +} diff --git a/tools/tests/darwintests/thread_group_set_32261625.c b/tests/thread_group_set_32261625.c similarity index 100% rename from tools/tests/darwintests/thread_group_set_32261625.c rename to tests/thread_group_set_32261625.c diff --git a/tests/tty_hang.c b/tests/tty_hang.c new file mode 100644 index 
000000000..19dc4d23a --- /dev/null +++ b/tests/tty_hang.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define TEST_TIMEOUT 10 + +/* + * Receiving SIGTTIN (from the blocked read) is the passing condition, we just + * catch it so that we don't get terminated when we receive this. + */ +void +handle_sigttin(int signal) +{ + return; +} + +/* + * Because of the way dt_fork_helpers work, we have to ensure any children + * created by this function calls exit instead of getting the fork handlers exit + * handling + */ +int +get_new_session_and_terminal_and_fork_child_to_read(char *pty_name) +{ + int sock_fd[2]; + int pty_fd; + pid_t pid; + char buf[10]; + + /* + * We use this to handshake certain actions between this process and its + * child. 
+ */ + T_ASSERT_POSIX_SUCCESS(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fd), + NULL); + + /* + * New session, lose any existing controlling terminal and become + * session leader. + */ + T_ASSERT_POSIX_SUCCESS(setsid(), NULL); + + /* now open pty, become controlling terminal of new session */ + T_ASSERT_POSIX_SUCCESS(pty_fd = open(pty_name, O_RDWR), NULL); + + T_ASSERT_POSIX_SUCCESS(pid = fork(), NULL); + + if (pid == 0) { /* child */ + int pty_fd_child; + char buf[10]; + + T_ASSERT_POSIX_SUCCESS(close(sock_fd[0]), NULL); + T_ASSERT_POSIX_SUCCESS(close(pty_fd), NULL); + + /* Make a new process group for ourselves */ + T_ASSERT_POSIX_SUCCESS(setpgid(0, 0), NULL); + + T_ASSERT_POSIX_SUCCESS(pty_fd_child = open(pty_name, O_RDWR), + NULL); + + /* now let parent know we've done open and setpgid */ + write(sock_fd[1], "done", sizeof("done")); + + /* wait for parent to set us to the foreground process group */ + read(sock_fd[1], buf, sizeof(buf)); + + /* + * We are the foreground process group now so we can read + * without getting a SIGTTIN. + * + * Once we are blocked though (we have a crude 1 second sleep on + * the parent to "detect" this), our parent is going to change + * us to be in the background. + * + * We'll be blocked until we get a signal and if that is signal + * is SIGTTIN, then the test has passed otherwise the test has + * failed. + */ + signal(SIGTTIN, handle_sigttin); + (void)read(pty_fd_child, buf, sizeof(buf)); + /* + * If we get here, we passed, if we get any other signal than + * SIGTTIN, we will not reach here. 
+ */ + exit(0); + } + + T_ASSERT_POSIX_SUCCESS(close(sock_fd[1]), NULL); + + /* wait for child to open slave side and set its pgid to its pid */ + T_ASSERT_POSIX_SUCCESS(read(sock_fd[0], buf, sizeof(buf)), NULL); + + /* + * We need this to happen and in the order shown + * + * parent (pgid = pid) child (child_pgid = child_pid) + * + * 1 - tcsetpgrp(child_pgid) + * 2 - block in read() + * 3 - tcsetpgrp(pgid) + * + * making sure 2 happens after 1 is easy, we use a sleep(1) in the + * parent to try and ensure 3 happens after 2. + */ + + T_ASSERT_POSIX_SUCCESS(tcsetpgrp(pty_fd, pid), NULL); + + /* let child know you have set it to be the foreground process group */ + T_ASSERT_POSIX_SUCCESS(write(sock_fd[0], "done", sizeof("done")), NULL); + + /* + * give it a second to do the read of the terminal in response. + * + * XXX : Find a way to detect that the child is blocked in read(2). + */ + sleep(1); + + /* + * now change the foreground process group to ourselves - + * Note we are now in the background process group and we need to ignore + * SIGTTOU for this call to succeed. + * + * Hopefully the child has gotten to run and blocked for read on the + * terminal in the 1 second we slept. + */ + signal(SIGTTOU, SIG_IGN); + T_ASSERT_POSIX_SUCCESS(tcsetpgrp(pty_fd, getpid()), NULL); + + return (0); +} + +/* + * We're running in a "fork helper", we can't do a waitpid on the child because + * the fork helper unhelpfully hides the pid of the child and in it kills itself. + * We will instead fork first and wait on the child. If it is + * able to emerge from the read of the terminal, the test passes and if it + * doesn't, the test fails. + * Since the test is testing for a deadlock in proc_exit of the child (caused + * by a background read in the "grandchild". 
+ */ +void +run_test(int do_revoke) +{ + int master_fd; + char *slave_pty; + pid_t pid; + + T_WITH_ERRNO; + T_QUIET; + + T_SETUPBEGIN; + + slave_pty= NULL; + T_ASSERT_POSIX_SUCCESS(master_fd = posix_openpt(O_RDWR | O_NOCTTY), + NULL); + (void)fcntl(master_fd, F_SETFL, O_NONBLOCK); + T_ASSERT_POSIX_SUCCESS(grantpt(master_fd), NULL); + T_ASSERT_POSIX_SUCCESS(unlockpt(master_fd), NULL); + slave_pty= ptsname(master_fd); + T_ASSERT_NOTNULL(slave_pty, NULL); + T_LOG("slave pty is %s\n", slave_pty); + + T_SETUPEND; + + /* + * We get the stdin and stdout redirection but we don't have visibility + * into the child (nor can we wait for it). To get around that, we fork + * and only let the parent to the caller and the child exits before + * returning to the caller. + */ + T_ASSERT_POSIX_SUCCESS(pid = fork(), NULL); + + if (pid == 0) { /* child */ + T_ASSERT_POSIX_SUCCESS(close(master_fd), NULL); + get_new_session_and_terminal_and_fork_child_to_read(slave_pty); + + /* + * These tests are for testing revoke and read hangs. This + * revoke can be explicit by a revoke(2) system call (test 2) + * or as part of exit(2) of the session leader (test 1). + * The exit hang is the common hang and can be fixed + * independently but fixing the revoke(2) hang requires us make + * changes in the tcsetpgrp path ( which also fixes the exit + * hang). In essence, we have 2 fixes. One which only addresses + * the exit hang and one which fixes both. + */ + if (do_revoke) { + /* This should not hang for the test to pass .. */ + T_ASSERT_POSIX_SUCCESS(revoke(slave_pty), NULL); + } + /* + * This child has the same dt_helper variables as its parent + * The way dt_fork_helpers work if we don't exit() from here, + * we will be killing the parent. So we have to exit() and not + * let the dt_fork_helpers continue. + * If we didn't do the revoke(2), This test passes if this exit + * doesn't hang waiting for its child to finish reading. 
+ */ + exit(0); + } + + int status; + int sig; + + dt_waitpid(pid, &status, &sig, 0); + if (sig) { + T_FAIL("Test failed because child received signal %s\n", + strsignal(sig)); + } else if (status) { + T_FAIL("Test failed because child exited with status %d\n", + status); + } else { + T_PASS("test_passed\n"); + } + /* + * we can let this process proceed with the regular darwintest process + * termination and cleanup. + */ +} + + +/*************************** TEST 1 ********************************/ +T_HELPER_DECL(create_new_session_and_exit, "create_new_session_and_exit") { + run_test(0); +} + +T_DECL(tty_exit_bgread_hang_test, "test for background read hang on ttys with proc exit") +{ + dt_helper_t helpers[1]; + + helpers[0] = dt_fork_helper("create_new_session_and_exit"); + dt_run_helpers(helpers, 1, TEST_TIMEOUT); +} +/*********************** END TEST 1 ********************************/ + +/************************** TEST 2 ***********************************/ +T_HELPER_DECL(create_new_session_and_revoke_terminal, "create_new_session_and_revoke_terminal") { + run_test(1); +} + +T_DECL(tty_revoke_bgread_hang_test, "test for background read hang on ttys with revoke") +{ + dt_helper_t helpers[1]; + + helpers[0] = dt_fork_helper("create_new_session_and_revoke_terminal"); + dt_run_helpers(helpers, 1, TEST_TIMEOUT); +} +/*********************** END TEST 2 *********************************/ + diff --git a/tests/turnstile_multihop.c b/tests/turnstile_multihop.c new file mode 100644 index 000000000..339cfe8c2 --- /dev/null +++ b/tests/turnstile_multihop.c @@ -0,0 +1,813 @@ +/* + * turnstile_multihop: Tests turnstile and multi hop priority propagation. 
+ */ + +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "turnstile_multihop_helper.h" + +T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstile_multihop")); + +#define HELPER_TIMEOUT_SECS (3000) + +static boolean_t spin_for_ever = false; + +static void +thread_create_at_qos(qos_class_t qos, void * (*function)(void *)); +static uint64_t +nanoseconds_to_absolutetime(uint64_t nanoseconds); +static int +sched_create_load_at_qos(qos_class_t qos, void **load_token); +static int +sched_terminate_load(void *load_token) __unused; +static void do_work(int num); +static void +dispatch_sync_cancel(mach_port_t owner_thread, qos_class_t promote_qos); + +static void *sched_load_thread(void *); + +struct load_token_context { + volatile int threads_should_exit; + int thread_count; + qos_class_t qos; + pthread_t *threads; +}; + +static struct mach_timebase_info sched_mti; +static pthread_once_t sched_mti_once_control = PTHREAD_ONCE_INIT; + +static void sched_mti_init(void) +{ + mach_timebase_info(&sched_mti); +} +uint64_t +nanoseconds_to_absolutetime(uint64_t nanoseconds) +{ + pthread_once(&sched_mti_once_control, sched_mti_init); + + return (uint64_t)(nanoseconds * (((double)sched_mti.denom) / ((double)sched_mti.numer))); +} + +static int +sched_create_load_at_qos(qos_class_t qos, void **load_token) +{ + struct load_token_context *context = NULL; + int ret; + int ncpu; + size_t ncpu_size = sizeof(ncpu); + int nthreads; + int i; + pthread_attr_t attr; + + ret = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0); + if (ret == -1) { + T_LOG("sysctlbyname(hw.ncpu)"); + return errno; + } + + T_QUIET; T_LOG("%s: Detected %d CPUs\n", __FUNCTION__, ncpu); + + nthreads = ncpu; + T_QUIET; T_LOG("%s: Will create %d threads\n", __FUNCTION__, nthreads); + + ret = pthread_attr_init(&attr); + 
T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_attr_init"); + + if (&pthread_attr_set_qos_class_np) { + ret = pthread_attr_set_qos_class_np(&attr, qos, 0); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_attr_set_qos_class_np"); + } + + context = calloc(1, sizeof(*context)); + if (context == NULL) { T_QUIET; T_LOG("calloc returned error"); return ENOMEM; } + + context->threads_should_exit = 0; + context->thread_count = nthreads; + context->qos = qos; + context->threads = calloc((unsigned int)nthreads, sizeof(pthread_t)); + + OSMemoryBarrier(); + + for (i=0; i < nthreads; i++) { + ret = pthread_create(&context->threads[i], &attr, sched_load_thread, context); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_create"); + T_QUIET; T_LOG("%s: Created thread %d (%p)\n", __FUNCTION__, i, (void *)context->threads[i]); + } + + ret = pthread_attr_destroy(&attr); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_attr_destroy"); + + *load_token = context; + + return 0; +} + +static void * +sched_load_thread(void *arg) +{ + struct load_token_context *context = (struct load_token_context *)arg; + + T_QUIET; T_LOG("%s: Thread started %p\n", __FUNCTION__, (void *)pthread_self()); + + while (!context->threads_should_exit) { + uint64_t start = mach_absolute_time(); + uint64_t end = start + nanoseconds_to_absolutetime(900ULL * NSEC_PER_MSEC); + + while ((mach_absolute_time() < end) && !context->threads_should_exit); + } + + T_QUIET; T_LOG("%s: Thread terminating %p\n", __FUNCTION__, (void *)pthread_self()); + + return NULL; +} + +static int +sched_terminate_load(void *load_token) +{ + int ret; + int i; + struct load_token_context *context = (struct load_token_context *)load_token; + + context->threads_should_exit = 1; + OSMemoryBarrier(); + + for (i=0; i < context->thread_count; i++) { + T_QUIET; T_LOG("%s: Joining thread %d (%p)\n", __FUNCTION__, i, (void *)context->threads[i]); + ret = pthread_join(context->threads[i], NULL); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_join"); + } + + 
free(context->threads); + free(context); + + return 0; +} + + +// Find the first num primes, simply as a means of doing work +static void do_work(int num) +{ + volatile int i = 3, count, c; + + for(count = 2; count <= num; ) { + for(c = 2; c <= i; c++) { + if(i%c == 0) { + break; + } + } + if(c == i) { + count++; + } + i++; + } +} + +#pragma mark pthread callbacks + +static void +worker_cb(pthread_priority_t __unused priority) +{ + T_FAIL("a worker thread was created"); +} + +static void +event_cb(void ** __unused events, int * __unused nevents) +{ + T_FAIL("a kevent routine was called instead of workloop"); +} + +static uint32_t +get_user_promotion_basepri(void) +{ + mach_msg_type_number_t count = THREAD_POLICY_STATE_COUNT; + struct thread_policy_state thread_policy; + boolean_t get_default = FALSE; + mach_port_t thread_port = pthread_mach_thread_np(pthread_self()); + + kern_return_t kr = thread_policy_get(thread_port, THREAD_POLICY_STATE, + (thread_policy_t)&thread_policy, &count, &get_default); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_get"); + return thread_policy.thps_user_promotion_basepri; +} + +static int messages_received = 0; +/* + * Basic WL handler callback, it checks the + * effective Qos of the servicer thread. + */ +static void +workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist __unused, int *events) +{ + messages_received++; + T_LOG("Workloop handler workloop_cb_test_intransit called. 
Received message no %d", + messages_received); + + + /* Skip the test if we can't check Qos */ + if (geteuid() != 0) { + T_SKIP("kevent_qos test requires root privileges to run."); + } + + if (messages_received == 1) { + + sleep(5); + T_LOG("Do some CPU work."); + do_work(5000); + + /* Check if the override now is IN + 60 boost */ + T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED, + "dispatch_source event handler QoS should be QOS_CLASS_USER_INITIATED"); + T_EXPECT_EQ(get_user_promotion_basepri(), 60u, + "dispatch_source event handler should be overridden at 60"); + + /* Enable the knote to get 2nd message */ + struct kevent_qos_s *kev = *eventslist; + kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED; + kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | + MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | + MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | + MACH_RCV_VOUCHER); + *events = 1; + } else { + *events = 0; + exit(0); + } +} + +static void +run_client_server(const char *server_name, const char *client_name) +{ + dt_helper_t helpers[] = { + dt_launchd_helper_domain("com.apple.xnu.test.turnstile_multihop.plist", + server_name, NULL, LAUNCH_SYSTEM_DOMAIN), + dt_fork_helper(client_name) + }; + dt_run_helpers(helpers, 2, HELPER_TIMEOUT_SECS); +} + +#pragma mark Mach receive + +#define TURNSTILE_MULTIHOP_SERVICE_NAME "com.apple.xnu.test.turnstile_multihop" + +static mach_port_t +get_server_port(void) +{ + mach_port_t port; + kern_return_t kr = bootstrap_check_in(bootstrap_port, + TURNSTILE_MULTIHOP_SERVICE_NAME, &port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server bootstrap_check_in"); + return port; +} + +static mach_voucher_t +create_pthpriority_voucher(mach_msg_priority_t qos) +{ + char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)]; + + mach_voucher_t voucher = MACH_PORT_NULL; + kern_return_t ret; + ipc_pthread_priority_value_t ipc_pthread_priority_value = + 
(ipc_pthread_priority_value_t)qos; + + mach_voucher_attr_raw_recipe_array_t recipes; + mach_voucher_attr_raw_recipe_size_t recipe_size = 0; + mach_voucher_attr_recipe_t recipe = + (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size]; + + recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY; + recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE; + recipe->previous_voucher = MACH_VOUCHER_NULL; + memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value)); + recipe->content_size = sizeof(ipc_pthread_priority_value_t); + recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size; + + recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0]; + + ret = host_create_mach_voucher(mach_host_self(), + recipes, + recipe_size, + &voucher); + + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher"); + return voucher; +} + +static void +send( + mach_port_t send_port, + mach_port_t reply_port, + mach_port_t msg_port, + mach_msg_priority_t qos, + mach_msg_option_t options) +{ + kern_return_t ret = 0; + + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + } send_msg = { + .header = { + .msgh_remote_port = send_port, + .msgh_local_port = reply_port, + .msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, + reply_port ? 
MACH_MSG_TYPE_MAKE_SEND_ONCE : 0, + MACH_MSG_TYPE_MOVE_SEND, + MACH_MSGH_BITS_COMPLEX), + .msgh_id = 0x100, + .msgh_size = sizeof(send_msg), + }, + .body = { + .msgh_descriptor_count = 1, + }, + .port_descriptor = { + .name = msg_port, + .disposition = MACH_MSG_TYPE_MOVE_RECEIVE, + .type = MACH_MSG_PORT_DESCRIPTOR, + }, + }; + + if (options & MACH_SEND_SYNC_USE_THRPRI) { + send_msg.header.msgh_voucher_port = create_pthpriority_voucher(qos); + } + + if (msg_port == MACH_PORT_NULL) { + send_msg.body.msgh_descriptor_count = 0; + } + + ret = mach_msg(&(send_msg.header), + MACH_SEND_MSG | + MACH_SEND_TIMEOUT | + MACH_SEND_OVERRIDE| + ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options), + send_msg.header.msgh_size, + 0, + MACH_PORT_NULL, + 10000, + 0); + + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg"); +} + +static void +receive( + mach_port_t rcv_port, + mach_port_t notify_port) +{ + kern_return_t ret = 0; + + struct { + mach_msg_header_t header; + mach_msg_body_t body; + mach_msg_port_descriptor_t port_descriptor; + } rcv_msg = { + .header = + { + .msgh_remote_port = MACH_PORT_NULL, + .msgh_local_port = rcv_port, + .msgh_size = sizeof(rcv_msg), + }, + }; + + T_LOG("Client: Starting sync receive\n"); + + ret = mach_msg(&(rcv_msg.header), + MACH_RCV_MSG | + MACH_RCV_SYNC_WAIT, + 0, + rcv_msg.header.msgh_size, + rcv_port, + 0, + notify_port); +} + +static lock_t lock_DEF; +static lock_t lock_IN; +static lock_t lock_UI; + +static mach_port_t main_thread_port; +static mach_port_t def_thread_port; +static mach_port_t in_thread_port; +static mach_port_t ui_thread_port; +static mach_port_t sixty_thread_port; + +static uint64_t dispatch_sync_owner; + +static int get_pri(thread_t thread_port) { + kern_return_t kr; + + thread_extended_info_data_t extended_info; + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, 
"thread_info"); + + return extended_info.pth_curpri; +} + +static void +set_thread_name(const char *fn_name) +{ + char name[50] = ""; + + thread_t thread_port = pthread_mach_thread_np(pthread_self()); + + int pri = get_pri(thread_port); + + snprintf(name, sizeof(name), "%s at pri %2d", fn_name, pri); + pthread_setname_np(name); +} + +static void +thread_wait_to_block(mach_port_t thread_port) +{ + thread_extended_info_data_t extended_info; + kern_return_t kr; + + while (1) { + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + + if (extended_info.pth_run_state == TH_STATE_WAITING) { + T_LOG("Target thread blocked\n"); + break; + } + thread_switch(thread_port, SWITCH_OPTION_DEPRESS, 0); + } +} + +static void +thread_wait_to_boost(mach_port_t thread_port, mach_port_t yield_thread, int priority) +{ + thread_extended_info_data_t extended_info; + kern_return_t kr; + + while (1) { + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + + if (extended_info.pth_priority >= priority) { + T_LOG("Target thread boosted\n"); + break; + } + thread_switch(yield_thread, SWITCH_OPTION_DEPRESS, 0); + } +} + +static void +dispatch_sync_wait(mach_port_t owner_thread, qos_class_t promote_qos) +{ + struct kevent_qos_s kev_err[] = {{ 0 }}; + uint32_t fflags = 0; + uint64_t mask = 0; + uint16_t action = 0; + int r; + + action = EV_ADD | EV_DISABLE; + fflags = NOTE_WL_SYNC_WAIT | NOTE_WL_DISCOVER_OWNER; + + dispatch_sync_owner = owner_thread; + + struct kevent_qos_s kev[] = {{ + .ident = mach_thread_self(), + .filter = EVFILT_WORKLOOP, + .flags = action, + .fflags = fflags, + .udata = (uintptr_t) &def_thread_port, + .qos = (int32_t)_pthread_qos_class_encode(promote_qos, 0, 0), 
+ .ext[EV_EXTIDX_WL_MASK] = mask, + .ext[EV_EXTIDX_WL_VALUE] = dispatch_sync_owner, + .ext[EV_EXTIDX_WL_ADDR] = (uint64_t)&dispatch_sync_owner, + }}; + + /* Setup workloop to fake dispatch sync wait on a workloop */ + r = kevent_id(30, kev, 1, kev_err, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); + T_QUIET; T_LOG("dispatch_sync_wait returned\n"); +} + +static void +dispatch_sync_cancel(mach_port_t owner_thread, qos_class_t promote_qos) +{ + struct kevent_qos_s kev_err[] = {{ 0 }}; + uint32_t fflags = 0; + uint64_t mask = 0; + uint16_t action = 0; + int r; + + action = EV_DELETE | EV_ENABLE; + fflags = NOTE_WL_SYNC_WAKE | NOTE_WL_END_OWNERSHIP; + + dispatch_sync_owner = owner_thread; + + struct kevent_qos_s kev[] = {{ + .ident = def_thread_port, + .filter = EVFILT_WORKLOOP, + .flags = action, + .fflags = fflags, + .udata = (uintptr_t) &def_thread_port, + .qos = (int32_t)_pthread_qos_class_encode(promote_qos, 0, 0), + .ext[EV_EXTIDX_WL_MASK] = mask, + .ext[EV_EXTIDX_WL_VALUE] = dispatch_sync_owner, + .ext[EV_EXTIDX_WL_ADDR] = (uint64_t)&dispatch_sync_owner, + }}; + + /* Setup workloop to fake dispatch sync wake on a workloop */ + r = kevent_id(30, kev, 1, kev_err, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); + T_QUIET; T_LOG("dispatch_sync_cancel returned\n"); + +} + +static void * +thread_at_sixty(void *arg __unused) +{ + int policy; + struct sched_param param; + int ret; + void *load_token; + uint64_t before_lock_time, after_lock_time; + + sixty_thread_port = mach_thread_self(); + + set_thread_name(__FUNCTION__); + + /* Change our priority to 60 */ + ret = pthread_getschedparam(pthread_self(), &policy, ¶m); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_getschedparam"); + + param.sched_priority = 60; + + ret = pthread_setschedparam(pthread_self(), policy, ¶m); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_setschedparam"); + + ret = pthread_getschedparam(pthread_self(), &policy, ¶m); + T_QUIET; T_ASSERT_MACH_SUCCESS(ret, 
"pthread_getschedparam"); + + T_LOG("My priority is %d", param.sched_priority); + + thread_wait_to_boost(in_thread_port, ui_thread_port, 46); + + if (spin_for_ever) { + /* Schedule load at Default */ + sched_create_load_at_qos(QOS_CLASS_DEFAULT, &load_token); + } + + T_LOG("Thread at priority 60 trying to acquire UI lock"); + + before_lock_time = mach_absolute_time(); + ull_lock(&lock_UI, 3, UL_UNFAIR_LOCK, 0); + after_lock_time = mach_absolute_time(); + + T_QUIET; T_LOG("The time for priority 60 thread to acquire lock was %llu \n", + (after_lock_time - before_lock_time)); + exit(0); +} + +static void * +thread_at_ui(void *arg __unused) +{ + ui_thread_port = mach_thread_self(); + + set_thread_name(__FUNCTION__); + + /* Grab the first ulock */ + ull_lock(&lock_UI, 2, UL_UNFAIR_LOCK, 0); + + thread_wait_to_boost(def_thread_port, in_thread_port, 37); + thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, thread_at_sixty); + + T_LOG("Thread at UI priority trying to acquire IN lock"); + ull_lock(&lock_IN, 2, UL_UNFAIR_LOCK, 0); + ull_unlock(&lock_UI, 2, UL_UNFAIR_LOCK, 0); + return NULL; +} + +static void * +thread_at_in(void *arg __unused) +{ + in_thread_port = mach_thread_self(); + + set_thread_name(__FUNCTION__); + + /* Grab the first ulock */ + ull_lock(&lock_IN, 2, UL_UNFAIR_LOCK, 0); + + T_LOG("Thread at IN priority got first lock "); + + thread_wait_to_boost(main_thread_port, def_thread_port, 31); + + /* Create a new thread at QOS_CLASS_USER_INTERACTIVE qos */ + thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, thread_at_ui); + + T_LOG("Thread at IN priority trying to acquire default lock"); + ull_lock(&lock_DEF, 1, UL_UNFAIR_LOCK, 0); + ull_unlock(&lock_IN, 2, UL_UNFAIR_LOCK, 0); + return NULL; +} + +static void * +thread_at_default(void *arg __unused) +{ + def_thread_port = mach_thread_self(); + + set_thread_name(__FUNCTION__); + + /* Grab the first ulock */ + ull_lock(&lock_DEF, 1, UL_UNFAIR_LOCK, 0); + + T_LOG("Thread at DEFAULT priority got first lock "); + + 
thread_wait_to_block(main_thread_port); + + /* Create a new thread at QOS_CLASS_USER_INITIATED qos */ + thread_create_at_qos(QOS_CLASS_USER_INITIATED, thread_at_in); + + T_LOG("Thread at Default priority trying to wait on dispatch sync for maintenance thread"); + dispatch_sync_wait(main_thread_port, QOS_CLASS_DEFAULT); + ull_unlock(&lock_DEF, 1, UL_UNFAIR_LOCK, 0); + return NULL; +} + +static void * +thread_at_maintenance(void *arg __unused) +{ + mach_port_t qos_send_port; + mach_port_t special_reply_port; + + main_thread_port = mach_thread_self(); + + set_thread_name(__FUNCTION__); + + kern_return_t kr = bootstrap_look_up(bootstrap_port, + TURNSTILE_MULTIHOP_SERVICE_NAME, &qos_send_port); + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); + + special_reply_port = thread_get_special_reply_port(); + T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port"); + + /* Become the dispatch sync owner, dispatch_sync_owner will be set in dispatch_sync_wait function */ + + /* Send an async message */ + send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0); + + /* Send a sync message */ + send(qos_send_port, special_reply_port, MACH_PORT_NULL, + (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0); + + /* Create a new thread at QOS_CLASS_DEFAULT qos */ + thread_create_at_qos(QOS_CLASS_DEFAULT, thread_at_default); + + /* Block on Sync IPC */ + receive(special_reply_port, qos_send_port); + + dispatch_sync_cancel(def_thread_port, QOS_CLASS_DEFAULT); + return NULL; +} + +T_HELPER_DECL(three_ulock_sync_ipc_hop, + "Create chain of 4 threads with 3 ulocks and 1 sync IPC at different qos") +{ + dt_stat_time_t roundtrip_stat = dt_stat_time_create("multihop_lock_acquire"); + + T_STAT_MEASURE_LOOP(roundtrip_stat) { + if (fork() == 0) { + thread_create_at_qos(QOS_CLASS_MAINTENANCE, thread_at_maintenance); + sigsuspend(0); + exit(0); + } + wait(NULL); + } 
+ + dt_stat_finalize(roundtrip_stat); + T_END; +} + +static void +thread_create_at_qos(qos_class_t qos, void * (*function)(void *)) +{ + qos_class_t qos_thread; + pthread_t thread; + pthread_attr_t attr; + int ret; + + ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL); + if (ret != 0) { + T_LOG("set priority failed\n"); + } + + pthread_attr_init(&attr); + pthread_attr_set_qos_class_np(&attr, qos, 0); + pthread_create(&thread, &attr, function, NULL); + + T_LOG("pthread created\n"); + pthread_get_qos_class_np(thread, &qos_thread, NULL); +} + +#pragma mark Mach receive - kevent_qos + +static void +expect_kevent_id_recv(mach_port_t port) +{ + int r; + + T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( + worker_cb, event_cb, + (pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL); + + struct kevent_qos_s kev[] = {{ + .ident = port, + .filter = EVFILT_MACHPORT, + .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED, + .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | + MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | + MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | + MACH_RCV_VOUCHER), + .data = 1, + .qos = (int32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0) + }}; + + struct kevent_qos_s kev_err[] = {{ 0 }}; + + /* Setup workloop for mach msg rcv */ + r = kevent_id(25, kev, 1, kev_err, 1, NULL, + NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); + + T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id"); + T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id"); +} + +T_HELPER_DECL(server_kevent_id, + "Reply with the QoS that a dispatch source event handler ran with") +{ + expect_kevent_id_recv(get_server_port()); + sigsuspend(0); + T_ASSERT_FAIL("should receive a message"); +} + +#define TEST_MULTIHOP(server_name, client_name, name) \ + T_DECL(server_kevent_id_##name, \ + "Event delivery using a kevent_id", \ + T_META_ASROOT(YES)) \ + { \ + 
run_client_server(server_name, client_name); \ + } + +#define TEST_MULTIHOP_SPIN(server_name, client_name, name) \ + T_DECL(server_kevent_id_##name, \ + "Event delivery using a kevent_id", \ + T_META_ASROOT(YES), T_META_ENABLED(FALSE)) \ + { \ + spin_for_ever = true; \ + run_client_server(server_name, client_name); \ + spin_for_ever = false; \ + } + +/* + * Test 1: Test multihop priority boosting with ulocks, dispatch sync and sync IPC. + * + * Create thread's at different Qos and acquire a ulock and block on next ulock/dispatch sync + * creating a sync chain. The last hop the chain is blocked on Sync IPC. + */ +TEST_MULTIHOP("server_kevent_id", "three_ulock_sync_ipc_hop", three_ulock_sync_ipc_hop) + +/* + * Test 2: Test multihop priority boosting with ulocks, dispatch sync and sync IPC. + * + * Create thread's at different Qos and acquire a ulock and block on next ulock/dispatch sync + * creating a sync chain. The last hop the chain is blocked on Sync IPC. + * Before the last priority 60 thread blocks on ulock, it also starts spinforeverd at priority 31. 
+ */ +TEST_MULTIHOP_SPIN("server_kevent_id", "three_ulock_sync_ipc_hop", three_ulock_sync_ipc_hop_spin) diff --git a/tests/turnstile_multihop_helper.h b/tests/turnstile_multihop_helper.h new file mode 100644 index 000000000..0652b27b0 --- /dev/null +++ b/tests/turnstile_multihop_helper.h @@ -0,0 +1,203 @@ +// vim:noexpandtab +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "turnstile_multihop_types.h" + +typedef _Atomic(u32) lock_t; + +__inline static void +yield(void) +{ +#if !defined(__x86_64__) && !defined(__i386__) + __asm volatile("yield"); +#else + __asm volatile("pause"); +#endif +} + +__inline static void +wfe(void) +{ +#if !defined(__x86_64__) && !defined(__i386__) + __asm volatile("wfe"); +#else + __asm volatile("pause"); +#endif +} + +__inline static void +wfi(void) +{ +#if !defined(__x86_64__) && !defined(__i386__) + __asm volatile("wfi"); +#else + __asm volatile("pause"); +#endif +} + +__inline static void +sev(void) +{ +#if !defined(__x86_64__) && !defined(__i386__) + __asm volatile("sev"); +#endif +} + +#include + +#ifndef __TSD_MACH_THREAD_SELF +#define __TSD_MACH_THREAD_SELF 3 +#endif + +__inline static mach_port_name_t +_os_get_self(void) +{ + mach_port_name_t self = (mach_port_name_t)(uintptr_t)(void *)_os_tsd_get_direct(__TSD_MACH_THREAD_SELF); + return self; +} + +#define ULL_WAITERS 1U + +static uint32_t lock_no_wait[4] = { 0, 0, 0, 0}; +static uint32_t lock_wait[4] = { 0, 0, 0, 0}; + +static mach_port_name_t main_thread_name = 0; + +__inline static void +ull_lock(lock_t *lock, int id, uint opcode, uint flags) +{ + u32 thread_id = _os_get_self() & ~0x3u; + u32 ull_locked = (opcode == UL_UNFAIR_LOCK) ? 
thread_id : 4u; + u32 mach_id = _os_get_self() >> 2; + u32 prev; + bool succeeded = false; + bool waiters = false; + bool called_wait = false; + u32 count = 0; + + do { + count++; + if ((count % 100000) == 0) { + printf("[%d,%d]%s>top of loop count=%d\n", id, mach_id, __FUNCTION__, count); + } + u32 new = waiters ? (ULL_WAITERS|ull_locked) : ull_locked; + prev = 0; + __c11_atomic_compare_exchange_strong(lock, &prev, new, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED); + if (prev == 0) { + /* Was unlocked, now locked */ + succeeded = true; + break; + } + + u32 value = prev; + if (!(value & ULL_WAITERS)) { + new = value | ULL_WAITERS; + __c11_atomic_compare_exchange_strong(lock, &prev, new, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + if (prev == value) { + /* succeeded in setting ULL_WAITERS */ + value = new; + } else if (prev & ULL_WAITERS) { + /* Didn't succeed, but someone else already set ULL_WAITERS */ + value = prev; + } else { + /* Something changed under us, so try again */ + if (count % 100000 == 0) { + printf("[%d,%d]%s>Something changed under us, prev=%d\n", id, mach_id, __FUNCTION__, prev); + } + continue; + } + } + /* Locked with waiters indication, so block */ + int ret = __ulock_wait(flags | opcode, lock, value, 0); + called_wait = true; + if (ret < 0) { + if (flags & ULF_NO_ERRNO) { + errno = -ret; + } + if (errno == EFAULT) { + continue; + } + printf("[%d,%d]%s>ull_wait() error: %s\n", id, mach_id, __FUNCTION__, strerror(errno)); + exit(1); + } + waiters = (ret > 0); + + if (count % 100000 == 0) { + printf("[%d,%d]%s>bottom of loop prev=%d\n", id, mach_id, __FUNCTION__, prev); + } + } while (!succeeded); + + if (called_wait) { + lock_wait[id]++; + } else { + lock_no_wait[id]++; + } +} + +static uint32_t unlock_no_waiters[4] = { 0, 0, 0, 0}; +static uint32_t unlock_waiters[4] = { 0, 0, 0, 0 }; +static uint32_t unlock_waiters_gone[4] = { 0, 0, 0, 0 }; +static uint32_t unlock_waiters_wake_thread[4] = { 0, 0, 0, 0 }; + +__inline static void +ull_unlock(lock_t *lock, int 
id, uint opcode, uint flags) +{ + u32 thread_id = _os_get_self() & ~0x3u; + u32 ull_locked = (opcode == UL_UNFAIR_LOCK) ? thread_id : 4u; + u32 mach_id = _os_get_self() >> 2; + u32 prev = ull_locked; + __c11_atomic_compare_exchange_strong(lock, &prev, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED); + if (prev == ull_locked) { + unlock_no_waiters[id]++; + return; + } + + if (prev == 0) { + printf("%s>already unlocked\n", __FUNCTION__); + exit(1); + } + + if (prev == (ULL_WAITERS|ull_locked)) { + /* locked with waiters */ + *lock = 0; + __c11_atomic_thread_fence(__ATOMIC_ACQ_REL); + + if ((flags & ULF_WAKE_THREAD) && (_os_get_self() == main_thread_name)) { + flags &= ~(uint)ULF_WAKE_THREAD; + } + int ret = __ulock_wake((flags | opcode), lock, main_thread_name); + if ((ret < 0) && (flags & ULF_NO_ERRNO)) { + errno = -ret; + } + if ((flags & ULF_WAKE_THREAD) && (ret < 0) && (errno == EALREADY)) { + flags &= ~(uint)ULF_WAKE_THREAD; + ret = __ulock_wake((flags | opcode), lock, 0); + if ((ret < 0) && (flags & ULF_NO_ERRNO)) { + errno = -ret; + } + } else if ((flags & ULF_WAKE_THREAD) && (ret == 0)) { + unlock_waiters_wake_thread[id]++; + } + if (ret < 0) { + if (errno == ENOENT) { + unlock_waiters_gone[id]++; + } else { + printf("[%d,%d]%s>ull_wake() error: %s\n", id, mach_id, __FUNCTION__, strerror(errno)); + exit(1); + } + } + unlock_waiters[id]++; + } else { + printf("%s>unexpected lock value %d\n", __FUNCTION__, prev); + exit(1); + } +} diff --git a/tests/turnstile_multihop_types.h b/tests/turnstile_multihop_types.h new file mode 100644 index 000000000..fc21b00e1 --- /dev/null +++ b/tests/turnstile_multihop_types.h @@ -0,0 +1,32 @@ +// vim:noexpandtab +#ifndef __TYPES_H__ +#define __TYPES_H__ + +#include +#include + +typedef signed char s8; +typedef unsigned char u8; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int32_t s32; +typedef int64_t s64; + +#if defined(__arm64__) || defined(__x86_64__) +typedef u64 un; +typedef 
s64 sn; +#else +typedef u32 un; +typedef s32 sn; +#endif + +#ifndef __DRT_H__ +typedef u32 uint; +#endif + +#define volatile_read(atom) (*((volatile typeof(*(atom)) *)(atom))) +#define volatile_write(atom, value) (*((volatile typeof(*(atom)) *)(atom)) = value) + +#endif diff --git a/tests/turnstiles_test.c b/tests/turnstiles_test.c new file mode 100644 index 000000000..0494ba146 --- /dev/null +++ b/tests/turnstiles_test.c @@ -0,0 +1,258 @@ +/* + * turnstiles_test: Tests turnstile kernel primitive. + */ + +#ifdef T_NAMESPACE +#undef T_NAMESPACE +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SYSCTL_TURNSTILE_TEST_DEFAULT 1 +#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE 2 + + +T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstiles_test")); + +static void +thread_create_at_qos(qos_class_t qos, void * (*function)(void *), int type) +{ + qos_class_t qos_thread; + pthread_t thread; + pthread_attr_t attr; + int ret; + + ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL); + if (ret != 0) { + T_LOG("set priority failed\n"); + } + + pthread_attr_init(&attr); + pthread_attr_set_qos_class_np(&attr, qos, 0); + pthread_create(&thread, &attr, function, (void *)type); + + T_LOG("pthread created\n"); + pthread_get_qos_class_np(thread, &qos_thread, NULL); + T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL); +} + +static int +get_pri(thread_t thread_port) { + kern_return_t kr; + + thread_extended_info_data_t extended_info; + mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT; + kr = thread_info(thread_port, THREAD_EXTENDED_INFO, + (thread_info_t)&extended_info, &count); + + T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info"); + return extended_info.pth_curpri; +} + +static void +turnstile_prim_lock(int type) +{ + int ret; + uint64_t tid; + int in_val = type; + pthread_threadid_np(NULL, &tid); + T_LOG("sysctlbyname lock called from thread %llu \n", tid); + ret = 
sysctlbyname("kern.turnstiles_test_lock", NULL, 0, &in_val, sizeof(in_val)); + T_LOG("sysctlbyname lock returned from thread %llu with value %d \n", tid, ret); +} + +static void +turnstile_prim_unlock(int type) +{ + int ret; + uint64_t tid; + int in_val = type; + pthread_threadid_np(NULL, &tid); + T_LOG("sysctlbyname unlock called from thread %llu \n", tid); + ret = sysctlbyname("kern.turnstiles_test_unlock", NULL, 0, &in_val, sizeof(in_val)); + T_LOG("sysctlbyname unlock returned from thread %llu with value %d \n", tid, ret); +} + +static void * +take_lock_check_priority(void * arg) +{ + int old_pri = get_pri(mach_thread_self()); + int unboosted_pri; + int boosted_pri; + int after_unlock_pri; + uint64_t tid; + int type = (int)arg; + + pthread_threadid_np(NULL, &tid); + + T_ASSERT_EQ(old_pri, 37, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri); + + /* Take the test lock */ + turnstile_prim_lock(type); + + unboosted_pri = get_pri(mach_thread_self()); + T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri); + + sleep(8); + + /* Check for elevated priority */ + boosted_pri = get_pri(mach_thread_self()); + T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri); + + /* Drop the lock */ + turnstile_prim_unlock(type); + + /* Check for regular priority */ + after_unlock_pri = get_pri(mach_thread_self()); + T_ASSERT_EQ(after_unlock_pri, 37, "thread(%llu) priority after dropping lock is %d\n", tid, after_unlock_pri); + + return NULL; +} + +static void * +try_to_take_lock_and_unlock(void *arg) +{ + uint64_t tid; + int type = (int)arg; + + pthread_threadid_np(NULL, &tid); + sleep(4); + + int old_pri = get_pri(mach_thread_self()); + T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri); + + /* Try taking the test lock */ + turnstile_prim_lock(type); + sleep (2); + 
turnstile_prim_unlock(type); + return NULL; +} + +static void * +take_lock_and_exit(void * arg) +{ + int old_pri = get_pri(mach_thread_self()); + int unboosted_pri; + int boosted_pri; + uint64_t tid; + int type = (int)arg; + + pthread_threadid_np(NULL, &tid); + + T_ASSERT_EQ(old_pri, 37, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri); + + /* Take the test lock */ + turnstile_prim_lock(type); + + unboosted_pri = get_pri(mach_thread_self()); + T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri); + + sleep(8); + + /* Check for elevated priority */ + boosted_pri = get_pri(mach_thread_self()); + T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri); + + /* return without unlocking the lock */ + return NULL; +} + +static void * +unlock_an_owner_exited_lock(void *arg) +{ + uint64_t tid; + int type = (int)arg; + + pthread_threadid_np(NULL, &tid); + sleep(12); + + int old_pri = get_pri(mach_thread_self()); + T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri); + + /* Unlock the test lock causing the turnstile code to call thread_deallocate_safe */ + turnstile_prim_unlock(type); + return NULL; +} + +/* + * Test 1: test if lock contended by a UI thread boosts the owner to UI qos. + */ +static void +test1(int type) +{ + T_LOG("Test 1: test if lock contended by a UI thread boosts the owner to UI qos"); + + /* Create a thread at IN and take lock */ + thread_create_at_qos(QOS_CLASS_USER_INITIATED, &take_lock_check_priority, type); + + /* Create a thread at UI and try to take lock */ + thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type); + + sleep(12); + return; +} + +/* + * Test 2: test if lock contended by a 2 UI thread boosts the owner to UI qos. 
+ */ +static void +test2(int type) +{ + T_LOG("Test 2: test if lock contended by a 2 UI thread boosts the owner to UI qos"); + + /* Create a thread at IN and take lock */ + thread_create_at_qos(QOS_CLASS_USER_INITIATED, &take_lock_check_priority, type); + + /* Create a thread at UI and try to take lock */ + thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type); + + /* Create a thread at UI and try to take lock */ + thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type); + + sleep(16); + return; +} + +/* + * Test 3: test if lock owner thread exiting without unlocking allows turnstile to work correctly. + */ +static void +test3(int type) +{ + T_LOG("Test 3: test if lock owner thread exiting without unlocking allows turnstile to work correctly"); + + /* Create a thread at IN and take lock */ + thread_create_at_qos(QOS_CLASS_USER_INITIATED, &take_lock_and_exit, type); + + /* Create a thread at UI and try to take lock */ + thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type); + + /* Create a thread at UI and try to take lock */ + thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &unlock_an_owner_exited_lock, type); + + sleep(16); + return; +} + +T_DECL(turnstile_test, "Turnstile test", T_META_ASROOT(YES)) +{ + test1(SYSCTL_TURNSTILE_TEST_DEFAULT); + test2(SYSCTL_TURNSTILE_TEST_DEFAULT); + test3(SYSCTL_TURNSTILE_TEST_DEFAULT); + + test1(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE); + test2(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE); + test3(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE); + +} diff --git a/tools/tests/darwintests/utimensat.c b/tests/utimensat.c similarity index 100% rename from tools/tests/darwintests/utimensat.c rename to tests/utimensat.c diff --git a/tools/tests/darwintests/verify_kalloc_config.c b/tests/verify_kalloc_config.c similarity index 100% rename from tools/tests/darwintests/verify_kalloc_config.c rename to tests/verify_kalloc_config.c diff --git 
a/tests/vm_set_max_addr_helper.c b/tests/vm_set_max_addr_helper.c new file mode 100644 index 000000000..5a06a3e22 --- /dev/null +++ b/tests/vm_set_max_addr_helper.c @@ -0,0 +1,18 @@ +#include +#include +#include + +int main(void) +{ + kern_return_t kr; + mach_vm_address_t addr = 50ULL * 1024ULL * 1024ULL * 1024ULL; + + kr = mach_vm_allocate(current_task(), &addr, 4096, VM_FLAGS_FIXED); + + if (kr == KERN_SUCCESS) { + return 0; + } else { + return 1; + } +} + diff --git a/tests/vm_set_max_addr_test.c b/tests/vm_set_max_addr_test.c new file mode 100644 index 000000000..325227d51 --- /dev/null +++ b/tests/vm_set_max_addr_test.c @@ -0,0 +1,57 @@ +#include +#include +#include + +#include +#include + +#include +#include + +extern char * testpath; + +T_DECL(set_max_addr, + "Description", + T_META_NAMESPACE("xnu.vm"), + T_META_CHECK_LEAKS(false)) +{ +#if (defined(__arm64__) && defined(__LP64__)) + int result = 0; + int code = 0; + int child_pid = 0; + int status = 0; + char * command_path = "./vm_set_max_addr_helper"; + char * command_args[] = { command_path, NULL }; + posix_spawnattr_t attrp; + + result = posix_spawnattr_init(&attrp); + T_ASSERT_POSIX_SUCCESS(result, "posix_spawnattr_init"); + + result = posix_spawn(&child_pid, command_path, NULL, &attrp, command_args, NULL); + T_ASSERT_POSIX_SUCCESS(result, "posix_spawn"); + + result = waitpid(child_pid, &status, 0); + T_ASSERT_POSIX_SUCCESS(result, "waitpid"); + + code = WEXITSTATUS(status); + T_ASSERT_NE_INT(code, 0, "Child should have failed"); + + result = posix_spawnattr_set_max_addr_np(&attrp, ~0ULL); + T_ASSERT_POSIX_SUCCESS(result, "posix_spawnattr_set_max_addr_np"); + + result = posix_spawn(&child_pid, command_path, NULL, &attrp, command_args, NULL); + T_ASSERT_POSIX_SUCCESS(result, "posix_spawn"); + + result = waitpid(child_pid, &status, 0); + T_ASSERT_POSIX_SUCCESS(result, "waitpid"); + + code = WEXITSTATUS(status); + T_ASSERT_EQ_INT(code, 0, "Child should have succeeded"); + + posix_spawnattr_destroy(&attrp); 
+ T_ASSERT_POSIX_SUCCESS(result, "posix_spawnattr_destroy"); +#else /* !defined(__arm64__) || !defined(__LP64__) */ + T_SKIP("Not supported on this architecture"); +#endif /* (defined(__arm64__) && defined(__LP64__)) */ +} + diff --git a/tools/tests/darwintests/voucher_entry_18826844.c b/tests/voucher_entry_18826844.c similarity index 100% rename from tools/tests/darwintests/voucher_entry_18826844.c rename to tests/voucher_entry_18826844.c diff --git a/tools/tests/darwintests/voucher_traps.c b/tests/voucher_traps.c similarity index 100% rename from tools/tests/darwintests/voucher_traps.c rename to tests/voucher_traps.c diff --git a/tests/wired_mem_bench.c b/tests/wired_mem_bench.c new file mode 100644 index 000000000..91fe03a22 --- /dev/null +++ b/tests/wired_mem_bench.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2015-2018 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include + +#include +#include +#include +#include +#include +#include + +#define WIRED_MEM_THRESHOLD_PERCENTAGE 30 + +T_DECL(wired_mem_bench, + "report the amount of wired memory consumed by the booted OS; guard against egregious or unexpected regressions", + T_META_CHECK_LEAKS(false), + T_META_ASROOT(true), + T_META_REQUIRES_REBOOT(true)) // Help reduce noise by asking for a clean boot +// T_META_TAG_PERF) +{ + vm_statistics64_data_t stat; + uint64_t memsize; + vm_size_t page_size = 0; + unsigned int count = HOST_VM_INFO64_COUNT; + kern_return_t ret; + int wired_mem_pct; + struct utsname uname_vers; + + T_SETUPBEGIN; + ret = uname(&uname_vers); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "uname()"); + + if (strnstr(uname_vers.version, "KASAN", sizeof(uname_vers.version)) != NULL) { + T_SKIP("wired memory metrics are not meaningful on KASAN kernels."); + } + + ret = host_statistics64(mach_host_self(), HOST_VM_INFO64, (host_info64_t)&stat, &count); + T_QUIET; + T_ASSERT_MACH_SUCCESS(ret, "wired memory query via host_statistics64()"); + + size_t s = sizeof(memsize); + ret = sysctlbyname("hw.memsize", &memsize, &s, NULL, 0); + T_QUIET; + T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname(\"hw.memsize\")"); + + T_QUIET; + T_EXPECT_NE(memsize, 0ULL, "hw.memsize sysctl failed to provide device DRAM size"); + + ret = host_page_size(mach_host_self(), &page_size); + T_QUIET; + T_ASSERT_MACH_SUCCESS(ret, "page size query via host_page_size()"); + + T_SETUPEND; + + T_PERF("wired_memory", (double)(stat.wire_count * (mach_vm_size_t)vm_kernel_page_size >> 10), "kB", + "Wired memory at boot"); + + T_LOG("\nwired memory: %llu kB (%llu MB)\n", stat.wire_count * (mach_vm_size_t)vm_kernel_page_size >> 10, + stat.wire_count * (mach_vm_size_t)vm_kernel_page_size >> 20); + +#if TARGET_OS_IOS || TARGET_OS_OSX + // zprint is not mastered onto other platforms. 
+ int r; + if ((r = system("zprint")) != 0) { + T_FAIL("couldn't run zprint: %d", r); + } +#endif + /* + * Poor-man's wired memory regression test: validate that wired memory consumes + * no more than some outrageously high fixed percentage of total device memory. + */ + wired_mem_pct = (int)((stat.wire_count * page_size * 100ULL) / memsize); + T_PERF("wired_memory_percentage", wired_mem_pct, "%", "Wired memory as percentage of device DRAM size"); + + T_ASSERT_LT(wired_mem_pct, WIRED_MEM_THRESHOLD_PERCENTAGE, + "Wired memory percentage is below allowable threshold (%llu bytes / %u pages / %llu total device memory)", + (uint64_t)stat.wire_count * page_size, stat.wire_count, memsize); +} diff --git a/tools/tests/darwintests/work_interval_test.c b/tests/work_interval_test.c similarity index 100% rename from tools/tests/darwintests/work_interval_test.c rename to tests/work_interval_test.c diff --git a/tools/tests/darwintests/work_interval_test.entitlements b/tests/work_interval_test.entitlements similarity index 100% rename from tools/tests/darwintests/work_interval_test.entitlements rename to tests/work_interval_test.entitlements diff --git a/tools/tests/darwintests/workq_sigprof.c b/tests/workq_sigprof.c similarity index 100% rename from tools/tests/darwintests/workq_sigprof.c rename to tests/workq_sigprof.c diff --git a/tools/tests/darwintests/xnu_quick_test.c b/tests/xnu_quick_test.c similarity index 100% rename from tools/tests/darwintests/xnu_quick_test.c rename to tests/xnu_quick_test.c diff --git a/tests/xnu_quick_test.entitlements b/tests/xnu_quick_test.entitlements new file mode 100644 index 000000000..ada01fb2a --- /dev/null +++ b/tests/xnu_quick_test.entitlements @@ -0,0 +1,8 @@ + + + + + com.apple.rootless.datavault.controller.internal + + + diff --git a/tests/xnu_quick_test_entitled.c b/tests/xnu_quick_test_entitled.c new file mode 100644 index 000000000..ec1252fb4 --- /dev/null +++ b/tests/xnu_quick_test_entitled.c @@ -0,0 +1,81 @@ +#include + +#include 
+#include +#include +#include +#include +#include +#include +#include + +#if !TARGET_OS_EMBEDDED +#include +#endif + +T_GLOBAL_META (T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false)); + + +/* ************************************************************************************************************** + * Test ioctl system calls. + * ************************************************************************************************************** + */ +T_DECL(ioctl, "Sanity check of ioctl by exercising DKIOCGETBLOCKCOUNT and DKIOCGETBLOCKSIZE", + T_META_ASROOT(true)) +{ + int my_err; + int my_fd = -1; + struct statfs * my_infop; + char * my_ptr; + int my_blksize; + long long my_block_count; + char my_name[ MAXPATHLEN ]; + +#if !TARGET_OS_EMBEDDED + /* + * this test won't be able to open the root disk device unless CSR is + * disabled or in AppleInternal mode + */ + if (csr_check( CSR_ALLOW_UNRESTRICTED_FS ) && + csr_check( CSR_ALLOW_APPLE_INTERNAL ) ) { + T_SKIP("System Integrity Protection is enabled"); + } +#endif + + T_SETUPBEGIN; + + T_WITH_ERRNO; + T_ASSERT_GT(getmntinfo( &my_infop, MNT_NOWAIT ), 0, "getmntinfo"); + + /* make this a raw device */ + strlcpy( &my_name[0], &my_infop->f_mntfromname[0], sizeof(my_name) ); + if ( (my_ptr = strrchr( &my_name[0], '/' )) != 0 ) { + if ( my_ptr[1] != 'r' ) { + my_ptr[ strlen( my_ptr ) ] = 0x00; + memmove( &my_ptr[2], &my_ptr[1], (strlen( &my_ptr[1] ) + 1) ); + my_ptr[1] = 'r'; + } + } + + T_ASSERT_POSIX_SUCCESS(my_fd = open( &my_name[0], O_RDONLY ), "open"); + + T_SETUPEND; + + /* obtain the size of the media (in blocks) */ + T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKCOUNT, &my_block_count ), + "ioctl DKIOCGETBLOCKCOUNT"); + + /* obtain the block size of the media */ + T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKSIZE, &my_blksize ), + "ioctl DKIOCGETBLOCKSIZE"); + + T_LOG( "my_block_count %qd my_blksize %d \n", my_block_count, my_blksize ); + + if (my_err != -1) { + /* make sure 
the returned data looks somewhat valid */ + T_EXPECT_GE(my_blksize, 0, NULL); + T_EXPECT_LE(my_blksize, 1024 * 1000, NULL); + } + + close( my_fd ); +} diff --git a/tools/tests/darwintests/xnu_quick_test_getsetpriority.c b/tests/xnu_quick_test_getsetpriority.c similarity index 100% rename from tools/tests/darwintests/xnu_quick_test_getsetpriority.c rename to tests/xnu_quick_test_getsetpriority.c diff --git a/tools/tests/darwintests/xnu_quick_test_helpers.c b/tests/xnu_quick_test_helpers.c similarity index 100% rename from tools/tests/darwintests/xnu_quick_test_helpers.c rename to tests/xnu_quick_test_helpers.c diff --git a/tools/tests/darwintests/xnu_quick_test_helpers.h b/tests/xnu_quick_test_helpers.h similarity index 100% rename from tools/tests/darwintests/xnu_quick_test_helpers.h rename to tests/xnu_quick_test_helpers.h diff --git a/tools/lldbmacros/Makefile b/tools/lldbmacros/Makefile index 30383a3db..26e79ffb4 100644 --- a/tools/lldbmacros/Makefile +++ b/tools/lldbmacros/Makefile @@ -14,6 +14,9 @@ LLDBMACROS_SOURCE:=$(SRCROOT)/tools/lldbmacros/ LLDBMACROS_BOOTSTRAP_DEST:=$(OBJPATH)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR) LLDBMACROS_DEST:=$(LLDBMACROS_BOOTSTRAP_DEST)/lldbmacros/ LLDBMACROS_USERDEBUG_FILES= +ifeq ($(BUILD_STATIC_LINK),1) +KERNEL_STATIC_DSYM_LLDBMACROS := $(OBJPATH)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros +endif LLDBMACROS_USERDEBUG_FILES:= \ usertaskdebugging/__init__.py \ @@ -37,7 +40,9 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ plugins/zprint_perf_log.py \ atm.py \ bank.py \ + turnstile.py \ kevent.py \ + workqueue.py \ xnu.py \ xnudefines.py \ ktrace.py \ @@ -55,6 +60,7 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \ memory.py \ mbufs.py \ net.py \ + skywalk.py \ ioreg.py \ utils.py \ kdp.py \ @@ -77,21 +83,21 @@ ifneq ($(PLATFORM),MacOSX) plugins/iosspeedtracer.sh endif +include $(MakeInc_rule) +include $(MakeInc_dir) INSTALL_LLDBMACROS_PYTHON_FILES=$(addprefix 
$(LLDBMACROS_DEST), $(LLDBMACROS_PYTHON_FILES)) +$(eval $(call INSTALLPYTHON_RULE_template,$(INSTALL_LLDBMACROS_PYTHON_FILES),$(LLDBMACROS_SOURCE)%,pydir,$(DATA_UNIFDEF),$(LLDBMACROS_DEST))) +$(eval $(call INSTALLPYTHON_RULE_template,$(LLDBMACROS_BOOTSTRAP_DEST)/$(KERNEL_LLDBBOOTSTRAP_NAME),$(LLDBMACROS_SOURCE)/core/xnu_lldb_init.py,kbpydir,$(DATA_UNIFDEF),$(LLDBMACROS_BOOTSTRAP_DEST)/)) -$(INSTALL_LLDBMACROS_PYTHON_FILES): $(LLDBMACROS_DEST)% : $(LLDBMACROS_SOURCE)% - $(_v)$(MKDIR) $(dir $@) - $(_v)$(PYTHON) $(LLDBMACROS_SOURCE)/core/syntax_checker.py $< $(_vstdout) - $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ - $(_v)$(TOUCH) $(LLDBMACROS_DEST) - -$(LLDBMACROS_BOOTSTRAP_DEST)/$(KERNEL_LLDBBOOTSTRAP_NAME): $(LLDBMACROS_SOURCE)/core/xnu_lldb_init.py - $(_v)$(MKDIR) $(dir $@) - $(_v)$(PYTHON) $(LLDBMACROS_SOURCE)/core/syntax_checker.py $< $(_vstdout) - $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@ +ifeq ($(BUILD_STATIC_LINK),1) +INSTALL_STATIC_DSYM_LLDBMACROS_PYTHON_FILES=$(addprefix $(KERNEL_STATIC_DSYM_LLDBMACROS), $(LLDBMACROS_PYTHON_FILES)) +$(eval $(call INSTALLPYTHON_RULE_template,$(INSTALL_STATIC_DSYM_LLDBMACROS_PYTHON_FILES),$(LLDBMACROS_SOURCE)%,sdpydir,$(DATA_UNIFDEF),$(KERNEL_STATIC_DSYM_LLDBMACROS))) +$(eval $(call INSTALLPYTHON_RULE_template,$(KERNEL_STATIC_DSYM_LLDBMACROS)/../$(KERNEL_LLDBBOOTSTRAP_NAME),$(LLDBMACROS_SOURCE)/core/xnu_lldb_init.py,kbsdpydir,$(DATA_UNIFDEF),$(KERNEL_STATIC_DSYM_LLDBMACROS)/../)) +endif lldbmacros_install: $(INSTALL_LLDBMACROS_PYTHON_FILES) $(LLDBMACROS_BOOTSTRAP_DEST)/$(KERNEL_LLDBBOOTSTRAP_NAME) - -include $(MakeInc_rule) -include $(MakeInc_dir) + $(_v)$(MKDIR) $(LLDBMACROS_DEST)/builtinkexts +ifeq ($(BUILD_STATIC_LINK),1) + $(_v)$(MKDIR) $(KERNEL_STATIC_DSYM_LLDBMACROS)/builtinkexts +endif diff --git a/tools/lldbmacros/core/cvalue.py b/tools/lldbmacros/core/cvalue.py index 0941f7530..3b1c4eadd 100755 --- a/tools/lldbmacros/core/cvalue.py +++ b/tools/lldbmacros/core/cvalue.py @@ -416,6 +416,18 @@ def cast(obj, 
target_type): print "ERROR: You cannot cast an 'int' to %s, please use kern.GetValueFromAddress() for such purposes." % str(target_type) raise TypeError("object of type %s cannot be casted to %s" % (str(type(obj)), str(target_type))) +def containerof(obj, target_type, field_name): + """ Type cast an object to another C type from a pointer to a field. + params: + obj - core.value object representing some C construct in lldb + target_type - str : ex 'struct thread' + - lldb.SBType : + field_name - the field name within the target_type obj is a pointer to + """ + addr = int(obj) - getfieldoffset(target_type, field_name) + obj = value(obj.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr))) + return cast(obj, target_type + " *") + _value_types_cache={} diff --git a/tools/lldbmacros/core/kernelcore.py b/tools/lldbmacros/core/kernelcore.py index 580887584..43a3bd864 100755 --- a/tools/lldbmacros/core/kernelcore.py +++ b/tools/lldbmacros/core/kernelcore.py @@ -6,6 +6,7 @@ from cvalue import * from lazytarget import * from configuration import * +from utils import * import caching import lldb @@ -222,6 +223,30 @@ def IterateRBTreeEntry(element, element_type, field_name): elt = cast(elt, element_type) +def IteratePriorityQueueEntry(root, element_type, field_name): + """ iterate over a priority queue as defined with struct priority_queue from osfmk/kern/priority_queue.h + root - value : Value object for the priority queue + element_type - str : Type of the link element + field_name - str : Name of the field in link element's structure + returns: + A generator does not return. It is used for iterating + value : an object thats of type (element_type). 
Always a pointer object + """ + def _make_pqe(addr): + return value(root.GetSBValue().CreateValueFromExpression(None,'(struct priority_queue_entry *)'+str(addr))) + + queue = [unsigned(root.pq_root_packed) & ~3] + + while len(queue): + elt = _make_pqe(queue.pop()) + + while elt: + yield containerof(elt, element_type, field_name) + addr = unsigned(elt.child) + if addr: queue.append(addr) + elt = elt.next + + class KernelTarget(object): """ A common kernel object that provides access to kernel objects and information. The class holds global lists for task, terminated_tasks, procs, zones, zombroc etc. @@ -399,9 +424,19 @@ def StraddlesPage(self, addr, size): val = ((addr + size) & (unsigned(self.GetGlobalVariable("page_size"))-1)) return (val < size and val > 0) + + def PhysToKVARM64(self, addr): + ptov_table = self.GetGlobalVariable('ptov_table') + for i in range(0, self.GetGlobalVariable('ptov_index')): + if (addr >= long(unsigned(ptov_table[i].pa))) and (addr < (long(unsigned(ptov_table[i].pa)) + long(unsigned(ptov_table[i].len)))): + return (addr - long(unsigned(ptov_table[i].pa)) + long(unsigned(ptov_table[i].va))) + return (addr - unsigned(self.GetGlobalVariable("gPhysBase")) + unsigned(self.GetGlobalVariable("gVirtBase"))) + def PhysToKernelVirt(self, addr): if self.arch == 'x86_64': return (addr + unsigned(self.GetGlobalVariable('physmap_base'))) + elif self.arch.startswith('arm64'): + return self.PhysToKVARM64(addr) elif self.arch.startswith('arm'): return (addr - unsigned(self.GetGlobalVariable("gPhysBase")) + unsigned(self.GetGlobalVariable("gVirtBase"))) else: @@ -548,7 +583,7 @@ def __getattribute__(self, name): self._ptrsize = caching.GetStaticCacheData("kern.ptrsize", None) if self._ptrsize != None : return self._ptrsize arch = LazyTarget.GetTarget().triple.split('-')[0] - if arch in ('x86_64', 'arm64'): + if arch == 'x86_64' or arch.startswith('arm64'): self._ptrsize = 8 else: self._ptrsize = 4 @@ -558,7 +593,7 @@ def __getattribute__(self, name): if 
name == 'VM_MIN_KERNEL_ADDRESS': if self.arch == 'x86_64': return unsigned(0xFFFFFF8000000000) - elif self.arch == 'arm64': + elif self.arch.startswith('arm64'): return unsigned(0xffffffe000000000) else: return unsigned(0x80000000) diff --git a/tools/lldbmacros/core/xnu_lldb_init.py b/tools/lldbmacros/core/xnu_lldb_init.py index c7f49ea18..e7f494b96 100755 --- a/tools/lldbmacros/core/xnu_lldb_init.py +++ b/tools/lldbmacros/core/xnu_lldb_init.py @@ -103,5 +103,23 @@ def __lldb_init_module(debugger, internal_dict): if source_map_cmd: print source_map_cmd debugger.HandleCommand(source_map_cmd) + + load_kexts = True + if "XNU_LLDBMACROS_NOBUILTINKEXTS" in os.environ and len(os.environ['XNU_LLDBMACROS_NOBUILTINKEXTS']) > 0: + load_kexts = False + builtinkexts_path = os.path.join(os.path.dirname(self_path), "lldbmacros", "builtinkexts") + if os.access(builtinkexts_path, os.F_OK): + kexts = os.listdir(builtinkexts_path) + if len(kexts) > 0: + print "\nBuiltin kexts: %s\n" % kexts + if load_kexts == False: + print "XNU_LLDBMACROS_NOBUILTINKEXTS is set, not loading:\n" + for kextdir in kexts: + script = os.path.join(builtinkexts_path, kextdir, kextdir.split('.')[-1] + ".py") + import_kext_cmd = "command script import \"%s\"" % script + print "%s" % import_kext_cmd + if load_kexts: + debugger.HandleCommand(import_kext_cmd) + print "\n" diff --git a/tools/lldbmacros/ioreg.py b/tools/lldbmacros/ioreg.py index 74d2e3baa..e2bdaf20e 100755 --- a/tools/lldbmacros/ioreg.py +++ b/tools/lldbmacros/ioreg.py @@ -1,5 +1,6 @@ from xnu import * from utils import * +from kdp import * import sys ###################################### diff --git a/tools/lldbmacros/ipc.py b/tools/lldbmacros/ipc.py index 9e5c48215..81090bbd8 100755 --- a/tools/lldbmacros/ipc.py +++ b/tools/lldbmacros/ipc.py @@ -11,8 +11,8 @@ from ioreg import * import xnudefines -@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <15s}".format("task", "pid", '#acts', "tablesize", "command")) -def GetTaskIPCSummary(task): 
+@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <20s}".format("task", "pid", '#acts', "tablesize", "command")) +def GetTaskIPCSummary(task, show_busy = False): """ Display a task's ipc summary. params: task : core.value represeting a Task in kernel @@ -20,12 +20,45 @@ def GetTaskIPCSummary(task): str - string of ipc info for the task """ out_string = '' - format_string = "{0: <#020x} {1: <6d} {2: <6d} {3: <10d} {4: <15s}" + format_string = "{0: <#020x} {1: <6d} {2: <6d} {3: <10d} {4: <20s}" + busy_format = " {0: <10d} {1: <6d}" + proc_name = '' + if not task.active: + proc_name = 'terminated: ' + if task.halting: + proc_name += 'halting: ' pval = Cast(task.bsd_info, 'proc *') + if int(pval) != 0: + proc_name += str(pval.p_comm) + elif int(task.task_imp_base) != 0 and hasattr(task.task_imp_base, 'iit_procname'): + proc_name += str(task.task_imp_base.iit_procname) table_size = int(task.itk_space.is_table_size) - proc_name = str(pval.p_comm) out_string += format_string.format(task, pval.p_pid, task.thread_count, table_size, proc_name) - return out_string + if show_busy: + nbusy, nmsgs = GetTaskBusyPortsSummary(task) + out_string += busy_format.format(nbusy, nmsgs) + return (out_string, table_size, nbusy, nmsgs) + return (out_string, table_size) + +@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <20s} {5: <10s} {6: <6s}".format("task", "pid", '#acts', "tablesize", "command", "#busyports", "#kmsgs")) +def GetTaskBusyIPCSummary(task): + return GetTaskIPCSummary(task, True) + +def GetTaskBusyPortsSummary(task): + isp = task.itk_space + i = 0 + nbusy = 0 + nmsgs = 0 + while i < isp.is_table_size: + iep = addressof(isp.is_table[i]) + if iep.ie_bits & 0x00020000: + port = Cast(iep.ie_object, 'ipc_port_t') + if port.ip_messages.data.port.msgcount > 0: + nbusy += 1 + nmsgs += port.ip_messages.data.port.msgcount + i = i + 1 + return (nbusy, nmsgs) + @header("{0: <20s} {1: <28s} {2: <12s} {3: <6s} {4: <4s} {5: <20s} {6: <4s}\n".format( "port", "mqueue", "recvname", 
"flags", "refs", "recvname", "dest")) @@ -87,6 +120,56 @@ def GetPortDestProc(portp): return out_str + +def GetPortDispositionString(disp): + if (disp < 0): ## use negative numbers for request ports + portname = 'notify' + if disp == -1: + disp_str = 'reqNS' + elif disp == -2: + disp_str = 'reqPD' + elif disp == -3: + disp_str = 'reqSPa' + elif disp == -4: + disp_str = 'reqSPr' + elif disp == -5: + disp_str = 'reqSPra' + else: + disp_str = '-X' + ## These dispositions should match those found in osfmk/mach/message.h + elif disp == 16: + disp_str = 'R' ## receive + elif disp == 24: + disp_str = 'dR' ## dispose receive + elif disp == 17: + disp_str = 'S' ## (move) send + elif disp == 19: + disp_str = 'cS' ## copy send + elif disp == 20: + disp_str = 'mS' ## make send + elif disp == 25: + disp_str = 'dS' ## dispose send + elif disp == 18: + disp_str = 'O' ## send-once + elif disp == 21: + disp_str = 'mO' ## make send-once + elif disp == 26: + disp_str = 'dO' ## dispose send-once + ## faux dispositions used to string-ify IPC entry types + elif disp == 100: + disp_str = 'PS' ## port set + elif disp == 101: + disp_str = 'dead' ## dead name + elif disp == 102: + disp_str = 'L' ## LABELH + elif disp == 103: + disp_str = 'V' ## Thread voucher (thread->ith_voucher->iv_port) + ## Catch-all + else: + disp_str = 'X' ## invalid + return disp_str + + @header("{:<20s} {:<28s} {:<12s} {:<8s} {:<6s} {:<19s} {:<26s} {:<26s}\n".format( "", "kmsg", "msgid", "disp", "size", "reply-port", "source", "destination")) def GetKMsgSummary(kmsgp, prefix_str=""): @@ -164,8 +247,8 @@ def GetKMsgSummary(kmsgp, prefix_str=""): GetKMsgSrc(kmsgp), dest_proc_name) if kmsgh.msgh_bits & 0x80000000: - out_string += prefix_str + "\t" + GetKMsgBody.header + "\n" - out_string += prefix_str + "\t" + GetKMsgBody(kmsgp, prefix_str + "\t") + "\n" + out_string += prefix_str + "\t" + GetKMsgComplexBodyDesc.header + "\n" + out_string += prefix_str + "\t" + GetKMsgComplexBodyDesc(kmsgp, prefix_str + "\t") + "\n" 
return out_string @@ -177,31 +260,56 @@ def GetMachMsgOOLDescriptorSummary(desc): out_string = format_string.format(desc, desc.address, desc.size) return out_string + +def GetKmsgDescriptors(kmsgp): + """ Get a list of descriptors in a complex message + """ + kmsghp = kmsgp.ikm_header + kmsgh = dereference(kmsghp) + if not (kmsgh.msgh_bits & 0x80000000): + return [] + ## Something in the python/lldb types is not getting alignment correct here. + ## I'm grabbing a pointer to the body manually, and using tribal knowledge + ## of the location of the descriptor count to get this correct + body = Cast(addressof(Cast(addressof(kmsgh), 'char *')[sizeof(kmsgh)]), 'mach_msg_body_t *') + #dsc_count = body.msgh_descriptor_count + dsc_count = dereference(Cast(body, 'uint32_t *')) + #dschead = Cast(addressof(body[1]), 'mach_msg_descriptor_t *') + dschead = Cast(addressof(Cast(addressof(body[0]), 'char *')[sizeof('uint32_t')]), 'mach_msg_descriptor_t *') + dsc_list = [] + for i in range(dsc_count): + dsc_list.append(dschead[i]) + return (body, dschead, dsc_list) + + @header("{: <20s} {: <8s} {: <20s} {: <10s} {: <20s}".format("kmsgheader", "size", "body", "ds_count", "dsc_head")) -def GetKMsgBody(kmsgp, prefix_str=""): +def GetKMsgComplexBodyDesc(kmsgp, prefix_str=""): """ Routine that prints a complex kmsg's body """ kmsghp = kmsgp.ikm_header kmsgh = dereference(kmsghp) + if not (kmsgh.msgh_bits & 0x80000000): + return "" format_string = "{: <#020x} {: <#08x} {: <#020x} {: <#010x} {: <#020x}" out_string = "" - body = Cast(addressof(kmsghp[1]), 'mach_msg_body_t *') - dsc_count = body.msgh_descriptor_count - dschead = Cast(addressof(body[1]), 'mach_msg_descriptor_t *') - out_string += format_string.format(kmsghp, sizeof(dereference(kmsghp)), body, unsigned(dsc_count), dschead) - - for i in range(dsc_count): - dsc = dschead[i] - out_string += "\n" + prefix_str + "Descriptor: " + xnudefines.mach_msg_type_descriptor_strings[unsigned(dsc.type.type)] - if unsigned(dsc.type.type) == 0: 
- # its a port. - p = dsc.port.name - out_string += " name: {: <#20x}".format(p) - elif unsigned(dsc.type.type) in (1,3): - # its OOL DESCRIPTOR or OOL VOLATILE DESCRIPTOR - ool = dsc.out_of_line - out_string += " " + GetMachMsgOOLDescriptorSummary(addressof(ool)) + (body, dschead, dsc_list) = GetKmsgDescriptors(kmsgp) + out_string += format_string.format(kmsghp, sizeof(dereference(kmsghp)), body, len(dsc_list), dschead) + for dsc in dsc_list: + try: + dsc_type = unsigned(dsc.type.type) + out_string += "\n" + prefix_str + "Descriptor: " + xnudefines.mach_msg_type_descriptor_strings[dsc_type] + if dsc_type == 0: + # its a port. + p = dsc.port.name + dstr = GetPortDispositionString(dsc.port.disposition) + out_string += " disp:{:s}, name:{: <#20x}".format(dstr, p) + elif unsigned(dsc.type.type) in (1,3): + # its OOL DESCRIPTOR or OOL VOLATILE DESCRIPTOR + ool = dsc.out_of_line + out_string += " " + GetMachMsgOOLDescriptorSummary(addressof(ool)) + except: + out_string += "\n" + prefix_str + "Invalid Descriptor: {}".format(dsc) return out_string def GetKMsgSrc(kmsgp): @@ -348,8 +456,9 @@ def ShowTaskIPC(cmd_args=None): print GetTaskSummary.header + " " + GetProcSummary.header pval = Cast(tval.bsd_info, 'proc *') print GetTaskSummary(tval) + " " + GetProcSummary(pval) - print GetTaskIPCSummary.header - print GetTaskIPCSummary(tval) + print GetTaskBusyIPCSummary.header + (summary, table_size, nbusy, nmsgs) = GetTaskBusyIPCSummary(tval) + print summary # EndMacro: showtaskipc @@ -376,8 +485,15 @@ def ShowIPCSummary(cmd_args=None): tasks that are candidates for further investigation. 
""" print GetTaskIPCSummary.header + ipc_table_size = 0 for t in kern.tasks: - print GetTaskIPCSummary(t) + (summary, table_size) = GetTaskIPCSummary(t) + ipc_table_size += table_size + print summary + for t in kern.terminated_tasks: + (summary, table_size) = GetTaskIPCSummary(t) + ipc_table_size += table_size + print "Total Table size: {:d}".format(ipc_table_size) return def GetKObjectFromPort(portval): @@ -461,7 +577,7 @@ def GetPortDestinationSummary(port): return out_str @lldb_type_summary(['ipc_entry_t']) -@header("{: <20s} {: <20s} {: <8s} {: <8s} {: <8s} {: <8s} {: <20s} {: <20s}".format("object", "name","rite", "urefs", "nsets", "nmsgs", "destname", "destination")) +@header("{: <20s} {: <12s} {: <8s} {: <8s} {: <8s} {: <8s} {: <20s} {: <20s}".format("object", "name", "rite", "urefs", "nsets", "nmsgs", "destname", "destination")) def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0): """ Get summary of a ipc entry. params: @@ -477,19 +593,19 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0): 'R' : Receive right 'O' : Send-once right types of notifications: + 'd' : Dead-Name notification requested 's' : Send-Possible notification armed - 'd' : Send-Possible notification requested - 'n' : Dead-Name notification requested - 'c' : ??? 
- 'x' : No-Senders notification requested + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ - out_str = '' + out_str = '' entry_ptr = int(hex(entry), 16) format_string = "{: <#020x} {: <12s} {: <8s} {: <8d} {: <8d} {: <8d} {: <20s} {: <20s}" right_str = '' destname_str = '' destination_str = '' - + ie_object = entry.ie_object ie_bits = int(entry.ie_bits) urefs = int(ie_bits & 0xffff) @@ -523,16 +639,31 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0): sorightval = requestsval[int(entry.index.request)].notify.port soright_ptr = unsigned(sorightval) if soright_ptr != 0: - # send-possible armed - if soright_ptr & 0x1 : right_str +='s' - # send-possible requested - elif soright_ptr & 0x2 : right_str +='d' - # dead-name notification requested - else : right_str +='n' - # XXX: What does this bit mean? - if ie_bits & 0x00800000 : right_str +='c' + # dead-name notification requested + right_str += 'd' + # send-possible armed + if soright_ptr & 0x1 : right_str +='s' + # send-possible requested + if soright_ptr & 0x2 : right_str +='r' # No-senders notification requested - if portval.ip_nsrequest != 0: right_str +='x' + if portval.ip_nsrequest != 0: right_str += 'n' + # port-destroy notification requested + if portval.ip_pdrequest != 0: right_str += 'x' + + # early-out if the rights-filter doesn't match + if rights_filter != 0 and rights_filter != right_str: + return '' + + # append the generation to the name value + # (from osfmk/ipc/ipc_entry.h) + # bits rollover period + # 0 0 64 + # 0 1 48 + # 1 0 32 + # 1 1 16 + ie_gen_roll = { 0:'.64', 1:'.48', 2:'.32', 3:'.16' } + ipc_name = '{:s}{:s}'.format(strip(ipc_name), ie_gen_roll[(ie_bits & 0x00c00000) >> 22]) + # now show the port destination part destname_str = GetPortDestinationSummary(Cast(ie_object, 'ipc_port_t')) # Get the number of sets to which this port belongs @@ -620,12 +751,12 @@ def ShowRights(cmd_args=None, 
cmd_options={}): 'S' : Send right 'R' : Receive right 'O' : Send-once right - types of notifications (append to rights type string): + types of notifications: + 'd' : Dead-Name notification requested 's' : Send-Possible notification armed - 'd' : Send-Possible notification requested - 'n' : Dead-Name notification requested - 'c' : ??? - 'x' : No-Senders notification requested + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ if not cmd_args: print "No arguments passed" @@ -655,12 +786,12 @@ def ShowTaskRights(cmd_args=None, cmd_options={}): 'S' : Send right 'R' : Receive right 'O' : Send-once right - types of notifications (append to rights type string): + types of notifications: + 'd' : Dead-Name notification requested 's' : Send-Possible notification armed - 'd' : Send-Possible notification requested - 'n' : Dead-Name notification requested - 'c' : ??? - 'x' : No-Senders notification requested + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ if cmd_args == None: print "No arguments passed" @@ -693,12 +824,12 @@ def ShowTaskRightsBt(cmd_args=None, cmd_options={}): 'S' : Send right 'R' : Receive right 'O' : Send-once right - types of notifications (append to rights type string): + types of notifications: + 'd' : Dead-Name notification requested 's' : Send-Possible notification armed - 'd' : Send-Possible notification requested - 'n' : Dead-Name notification requested - 'c' : ??? 
- 'x' : No-Senders notification requested + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ if cmd_args == None: print "No arguments passed" @@ -733,12 +864,12 @@ def ShowAllRights(cmd_args=None, cmd_options={}): 'S' : Send right 'R' : Receive right 'O' : Send-once right - types of notifications (append to rights type string): + types of notifications: + 'd' : Dead-Name notification requested 's' : Send-Possible notification armed - 'd' : Send-Possible notification requested - 'n' : Dead-Name notification requested - 'c' : ??? - 'x' : No-Senders notification requested + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested """ rights_type = 0 if "-R" in cmd_options: @@ -757,6 +888,525 @@ def ShowAllRights(cmd_args=None, cmd_options={}): # EndMacro: showallrights + +def GetInTransitPortSummary(port, disp, holding_port, holding_kmsg): + """ String-ify the in-transit dispostion of a port. + """ + ## This should match the summary generated by GetIPCEntrySummary + ## "object" "name" "rite" "urefs" "nsets" "nmsgs" "destname" "destination" + format_str = "\t{: <#20x} {: <12} {: <8s} {: <8d} {: <8d} {: <8d} p:{: <#19x} k:{: <#19x}" + portname = 'intransit' + + disp_str = GetPortDispositionString(disp) + + out_str = format_str.format(unsigned(port), 'in-transit', disp_str, 0, 0, port.ip_messages.data.port.msgcount, unsigned(holding_port), unsigned(holding_kmsg)) + return out_str + + +def GetDispositionFromEntryType(entry_bits): + """ Translate an IPC entry type into an in-transit disposition. This allows + the GetInTransitPortSummary function to be re-used to string-ify IPC + entry types. 
+ """ + ebits = int(entry_bits) + if (ebits & 0x003f0000) == 0: + return 0 + + if (ebits & 0x00010000) != 0: + return 17 ## MACH_PORT_RIGHT_SEND + elif (ebits & 0x00020000) != 0: + return 16 ## MACH_PORT_RIGHT_RECEIVE + elif (ebits & 0x00040000) != 0: + return 18 ## MACH_PORT_RIGHT_SEND_ONCE + elif (ebits & 0x00080000) != 0: + return 100 ## MACH_PORT_RIGHT_PORT_SET + elif (ebits & 0x00100000) != 0: + return 101 ## MACH_PORT_RIGHT_DEAD_NAME + elif (ebits & 0x00200000) != 0: + return 102 ## MACH_PORT_RIGHT_LABELH + else: + return 0 + +def GetDispositionFromVoucherPort(th_vport): + """ Translate a thread's voucher port into a 'disposition' + """ + if unsigned(th_vport) > 0: + return 103 ## Voucher type + return 0 + + +g_kmsg_prog = 0 +g_progmeter = { + 0 : '*', + 1 : '-', + 2 : '\\', + 3 : '|', + 4 : '/', + 5 : '-', + 6 : '\\', + 7 : '|', + 8 : '/', +} + +def PrintProgressForKmsg(): + global g_kmsg_prog + global g_progmeter + sys.stderr.write(" {:<1s}\r".format(g_progmeter[g_kmsg_prog % 9])) + g_kmsg_prog += 1 + + +def CollectPortsForAnalysis(port, disposition): + """ + """ + p = Cast(port, 'struct ipc_port *') + yield (p, disposition) + + # no-senders notification port + if unsigned(p.ip_nsrequest) != 0: + PrintProgressForKmsg() + yield (Cast(p.ip_nsrequest, 'struct ipc_port *'), -1) + + # port-death notification port + if unsigned(p.ip_pdrequest) != 0: + PrintProgressForKmsg() + yield (Cast(p.ip_pdrequest, 'struct ipc_port *'), -2) + + ## ports can have many send-possible notifications armed: go through the table! 
+ if unsigned(p.ip_requests) != 0: + table = Cast(p.ip_requests, 'struct ipc_port_request *') + table_sz = int(table.name.size.its_size) + for i in range(table_sz): + if i == 0: + continue + ipr = table[i] + if unsigned(ipr.name.name) != 0: + ipr_bits = unsigned(ipr.notify.port) & 3 + ipr_port = kern.GetValueFromAddress(int(ipr.notify.port) & ~3, 'struct ipc_port *') + ipr_disp = 0 + if ipr_bits & 3: ## send-possible armed and requested + ipr_disp = -5 + elif ipr_bits & 2: ## send-possible requested + ipr_disp = -4 + elif ipr_bits & 1: ## send-possible armed + ipr_disp = -3 + PrintProgressForKmsg() + yield (ipr_port, ipr_disp) + return + +def CollectKmsgPorts(task, task_port, kmsgp): + """ Look through a message, 'kmsgp' destined for 'task' + (enqueued on task_port). Collect any port descriptors, + remote, local, voucher, or other port references + into a (ipc_port_t, disposition) list. + """ + kmsgh = dereference(kmsgp.ikm_header) + + p_list = [] + + PrintProgressForKmsg() + if kmsgh.msgh_remote_port and unsigned(kmsgh.msgh_remote_port) != unsigned(task_port): + disp = kmsgh.msgh_bits & 0x1f + p_list += list(CollectPortsForAnalysis(kmsgh.msgh_remote_port, disp)) + + if kmsgh.msgh_local_port and unsigned(kmsgh.msgh_local_port) != unsigned(task_port) \ + and unsigned(kmsgh.msgh_local_port) != unsigned(kmsgh.msgh_remote_port): + disp = (kmsgh.msgh_bits & 0x1f00) >> 8 + p_list += list(CollectPortsForAnalysis(kmsgh.msgh_local_port, disp)) + + if kmsgp.ikm_voucher: + p_list += list(CollectPortsForAnalysis(kmsgp.ikm_voucher, 0)) + + if kmsgh.msgh_bits & 0x80000000: + ## Complex message - look for descriptors + PrintProgressForKmsg() + (body, dschead, dsc_list) = GetKmsgDescriptors(kmsgp) + for dsc in dsc_list: + PrintProgressForKmsg() + dsc_type = unsigned(dsc.type.type) + if dsc_type == 0 or dsc_type == 2: ## 0 == port, 2 == ool port + if dsc_type == 0: + ## its a port descriptor + dsc_disp = dsc.port.disposition + p_list += list(CollectPortsForAnalysis(dsc.port.name, 
dsc_disp)) + else: + ## it's an ool_ports descriptor which is an array of ports + dsc_disp = dsc.ool_ports.disposition + dispdata = Cast(dsc.ool_ports.address, 'struct ipc_port *') + for pidx in range(dsc.ool_ports.count): + PrintProgressForKmsg() + p_list += list(CollectPortsForAnalysis(dispdata[pidx], dsc_disp)) + return p_list + +def CollectKmsgPortRefs(task, task_port, kmsgp, p_refs): + """ Recursively collect all references to ports inside the kmsg 'kmsgp' + into the set 'p_refs' + """ + p_list = CollectKmsgPorts(task, task_port, kmsgp) + + ## Iterate over each ports we've collected, to see if they + ## have messages on them, and then recurse! + for p, pdisp in p_list: + ptype = (p.ip_object.io_bits & 0x7fff0000) >> 16 + p_refs.add((p, pdisp, ptype)) + if ptype != 0: ## don't bother with port sets + continue + ## If the port that's in-transit has messages already enqueued, + ## go through each of those messages and look for more ports! + if p.ip_messages.data.port.msgcount > 0: + p_kmsgp = Cast(p.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t') + kmsgheadp = p_kmsgp + while unsigned(p_kmsgp) > 0: + CollectKmsgPortRefs(task, p, p_kmsgp, p_refs) + p_kmsgp = p_kmsgp.ikm_next + if p_kmsgp == kmsgheadp: + break; + + +def FindKmsgPortRefs(instr, task, task_port, kmsgp, qport): + """ Look through a message, 'kmsgp' destined for 'task'. If we find + any port descriptors, remote, local, voucher, or other port that + matches 'qport', return a short description + which should match the format of GetIPCEntrySummary. + """ + + out_str = instr + p_list = CollectKmsgPorts(task, task_port, kmsgp) + + ## Run through all ports we've collected looking for 'qport' + for p, pdisp in p_list: + PrintProgressForKmsg() + if unsigned(p) == unsigned(qport): + ## the port we're looking for was found in this message! 
+ if len(out_str) > 0: + out_str += '\n' + out_str += GetInTransitPortSummary(p, pdisp, task_port, kmsgp) + + ptype = (p.ip_object.io_bits & 0x7fff0000) >> 16 + if ptype != 0: ## don't bother with port sets + continue + + ## If the port that's in-transit has messages already enqueued, + ## go through each of those messages and look for more ports! + if p.ip_messages.data.port.msgcount > 0: + p_kmsgp = Cast(p.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t') + kmsgheadp = p_kmsgp + while unsigned(p_kmsgp) > 0: + out_str = FindKmsgPortRefs(out_str, task, p, p_kmsgp, qport) + p_kmsgp = p_kmsgp.ikm_next + if p_kmsgp == kmsgheadp: + break + return out_str + + +port_iteration_do_print_taskname = False +registeredport_idx = -10 +excports_idx = -20 +intransit_idx = -1000 +taskports_idx = -2000 +thports_idx = -3000 + +def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should_log): + """ Iterate over all ports in the system, calling 'func' + for each port reference found (all tasks if 'tasklist' is None) + """ + global port_iteration_do_print_taskname + global intransit_idx, taskports_idx, thports_idx, registeredport_idx, excports_idx + + ## XXX: also host special ports + + entry_port_type_mask = 0x00070000 + if include_psets: + entry_port_type_mask = 0x000f0000 + + if tasklist is None: + tasklist = list(kern.tasks) + tasklist += kern.terminated_tasks + + tidx = 1 + + for t in tasklist: + # Write a progress line. Using stderr avoids automatic newline when + # writing to stdout from lldb. Blank spaces at the end clear out long + # lines. 
+ if should_log: + procname = "" + if not t.active: + procname = 'terminated: ' + if t.halting: + procname += 'halting: ' + t_p = Cast(t.bsd_info, 'proc *') + if unsigned(t_p) != 0: + procname += str(t_p.p_name) + elif unsigned(t.task_imp_base) != 0 and hasattr(t.task_imp_base, 'iit_procname'): + procname += str(t.task_imp_base.iit_procname) + sys.stderr.write(" checking {:s} ({}/{})...{:50s}\r".format(procname, tidx, len(tasklist), '')) + tidx += 1 + + port_iteration_do_print_taskname = True + space = t.itk_space + num_entries = int(space.is_table_size) + is_tableval = space.is_table + idx = 0 + while idx < num_entries: + entry_val = GetObjectAtIndexFromArray(is_tableval, idx) + entry_bits= unsigned(entry_val.ie_bits) + entry_obj = 0 + entry_str = '' + entry_name = "{:x}".format( (idx << 8 | entry_bits >> 24) ) + + entry_disp = GetDispositionFromEntryType(entry_bits) + + ## If the entry in the table represents a port of some sort, + ## then make the callback provided + if int(entry_bits) & entry_port_type_mask: + eport = Cast(entry_val.ie_object, 'ipc_port_t') + ## Make the callback + func(t, space, ctx, idx, entry_val, eport, entry_disp) + + ## if the port has pending messages, look through + ## each message for ports (and recurse) + if follow_busyports and unsigned(eport) > 0 and eport.ip_messages.data.port.msgcount > 0: + ## collect all port references from all messages + kmsgp = Cast(eport.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t') + kmsgheadp = kmsgp + while unsigned(kmsgp) > 0: + p_refs = set() + CollectKmsgPortRefs(t, eport, kmsgp, p_refs) + for (port, pdisp, ptype) in p_refs: + func(t, space, ctx, intransit_idx, None, port, pdisp) + kmsgp = kmsgp.ikm_next + if kmsgp == kmsgheadp: + break + + idx = idx + 1 + ## while (idx < num_entries) + + ## Task ports (send rights) + if unsigned(t.itk_sself) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_sself, 17) + if unsigned(t.itk_host) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_host, 17) + 
if unsigned(t.itk_bootstrap) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_bootstrap, 17) + if unsigned(t.itk_seatbelt) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_seatbelt, 17) + if unsigned(t.itk_gssd) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_gssd, 17) + if unsigned(t.itk_debug_control) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_debug_control, 17) + if unsigned(t.itk_task_access) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_task_access, 17) + + ## Task name port (not a send right, just a naked ref) + if unsigned(t.itk_nself) > 0: + func(t, space, ctx, taskports_idx, 0,t.itk_nself, 0) + + ## task resume port is a receive right to resume the task + if unsigned(t.itk_resume) > 0: + func(t, space, ctx, taskports_idx, 0, t.itk_resume, 16) + + ## registered task ports (all send rights) + tr_idx = 0 + tr_max = sizeof(t.itk_registered) / sizeof(t.itk_registered[0]) + while tr_idx < tr_max: + tport = t.itk_registered[tr_idx] + if unsigned(tport) > 0: + try: + func(t, space, ctx, registeredport_idx, 0, tport, 17) + except Exception, e: + print("\texception looking through registered port {:d}/{:d} in {:s}".format(tr_idx,tr_max,t)) + pass + tr_idx += 1 + + ## Task exception ports + exidx = 0 + exmax = sizeof(t.exc_actions) / sizeof(t.exc_actions[0]) + while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h + export = t.exc_actions[exidx].port ## send right + if unsigned(export) > 0: + try: + func(t, space, ctx, excports_idx, 0, export, 17) + except Exception, e: + print("\texception looking through exception port {:d}/{:d} in {:s}".format(exidx,exmax,t)) + pass + exidx += 1 + + ## XXX: any ports still valid after clearing IPC space?! 
+ + for thval in IterateQueue(t.threads, 'thread *', 'task_threads'): + ## XXX: look at block reason to see if it's in mach_msg_receive - then look at saved state / message + + ## Thread port (send right) + if unsigned(thval.ith_sself) > 0: + thport = thval.ith_sself + func(t, space, ctx, thports_idx, 0, thport, 17) ## see: osfmk/mach/message.h + ## Thread special reply port (send-once right) + if unsigned(thval.ith_special_reply_port) > 0: + thport = thval.ith_special_reply_port + func(t, space, ctx, thports_idx, 0, thport, 18) ## see: osfmk/mach/message.h + ## Thread voucher port + if unsigned(thval.ith_voucher) > 0: + vport = thval.ith_voucher.iv_port + if unsigned(vport) > 0: + vdisp = GetDispositionFromVoucherPort(vport) + func(t, space, ctx, thports_idx, 0, vport, vdisp) + ## Thread exception ports + if unsigned(thval.exc_actions) > 0: + exidx = 0 + while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h + export = thval.exc_actions[exidx].port ## send right + if unsigned(export) > 0: + try: + func(t, space, ctx, excports_idx, 0, export, 17) + except Exception, e: + print("\texception looking through exception port {:d}/{:d} in {:s}".format(exidx,exmax,t)) + pass + exidx += 1 + ## XXX: the message on a thread (that's currently being received) + ## for (thval in t.threads) + ## for (t in tasklist) + + +# Macro: findportrights +def FindPortRightsCallback(task, space, ctx, entry_idx, ipc_entry, ipc_port, port_disp): + """ Callback which uses 'ctx' as the (port,rights_types) tuple for which + a caller is seeking references. This should *not* be used from a + recursive call to IterateAllPorts. 
+ """ + global port_iteration_do_print_taskname + + (qport, rights_type) = ctx + entry_name = '' + entry_str = '' + if unsigned(ipc_entry) != 0: + entry_bits = unsigned(ipc_entry.ie_bits) + entry_name = "{:x}".format( (entry_idx << 8 | entry_bits >> 24) ) + if (int(entry_bits) & 0x001f0000) != 0 and unsigned(ipc_entry.ie_object) == unsigned(qport): + ## it's a valid entry, and it points to the port + entry_str = '\t' + GetIPCEntrySummary(ipc_entry, entry_name, rights_type) + + procname = GetProcNameForTask(task) + if unsigned(ipc_port) != 0 and ipc_port.ip_messages.data.port.msgcount > 0: + sys.stderr.write(" checking {:s} busy-port {}:{:#x}...{:30s}\r".format(procname, entry_name, unsigned(ipc_port), '')) + ## Search through busy ports to find descriptors which could + ## contain the only reference to this port! + kmsgp = Cast(ipc_port.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t') + kmsgheadp = kmsgp + while unsigned(kmsgp): + entry_str = FindKmsgPortRefs(entry_str, task, ipc_port, kmsgp, qport) + kmsgp = kmsgp.ikm_next + if kmsgp == kmsgheadp: + break; + if len(entry_str) > 0: + sys.stderr.write("{:80s}\r".format('')) + if port_iteration_do_print_taskname: + print "Task: {0: <#x} {1: ] + -S ipc_space : only search the specified ipc space + -R rights_type : only display rights matching the string 'rights_type' + + types of rights: + 'Dead' : Dead name + 'Set' : Port set + 'S' : Send right + 'R' : Receive right + 'O' : Send-once right + types of notifications: + 'd' : Dead-Name notification requested + 's' : Send-Possible notification armed + 'r' : Send-Possible notification requested + 'n' : No-Senders notification requested + 'x' : Port-destroy notification requested + """ + if not cmd_args: + raise ArgumentError("no port address provided") + port = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_port *') + + rights_type = 0 + if "-R" in cmd_options: + rights_type = cmd_options["-R"] + + tasklist = None + if "-S" in cmd_options: + space = 
kern.GetValueFromAddress(cmd_options["-S"], 'struct ipc_space *') + tasklist = [ space.is_task ] + + ## Don't include port sets + ## Don't recurse on busy ports (we do that manually) + ## DO log progress + IterateAllPorts(tasklist, FindPortRightsCallback, (port, rights_type), False, False, True) + sys.stderr.write("{:120s}\r".format(' ')) + + print "Done." + return +# EndMacro: findportrights + +# Macro: countallports + +def CountPortsCallback(task, space, ctx, entry_idx, ipc_entry, ipc_port, port_disp): + """ Callback which uses 'ctx' as the set of all ports found in the + iteration. This should *not* be used from a recursive + call to IterateAllPorts. + """ + global intransit_idx + + (p_set, p_intransit, p_bytask) = ctx + + ## Add the port address to the set of all port addresses + p_set.add(unsigned(ipc_port)) + + if entry_idx == intransit_idx: + p_intransit.add(unsigned(ipc_port)) + + if task.active or (task.halting and not task.active): + pname = str(Cast(task.bsd_info, 'proc *').p_name) + if not pname in p_bytask.keys(): + p_bytask[pname] = { 'transit':0, 'table':0, 'other':0 } + if entry_idx == intransit_idx: + p_bytask[pname]['transit'] += 1 + elif entry_idx >= 0: + p_bytask[pname]['table'] += 1 + else: + p_bytask[pname]['other'] += 1 + + +@lldb_command('countallports', 'P') +def CountAllPorts(cmd_args=None, cmd_options={}): + """ Routine to search for all as many references to ipc_port structures in the kernel + that we can find. 
+ Usage: countallports [-P] + -P : include port sets in the count (default: NO) + """ + p_set = set() + p_intransit = set() + p_bytask = {} + + find_psets = False + if "-P" in cmd_options: + find_psets = True + + ## optionally include port sets + ## DO recurse on busy ports + ## DO log progress + IterateAllPorts(None, CountPortsCallback, (p_set, p_intransit, p_bytask), find_psets, True, True) + sys.stderr.write("{:120s}\r".format(' ')) + + print "Total ports found: {:d}".format(len(p_set)) + print "In Transit: {:d}".format(len(p_intransit)) + print "By Task:" + for pname in sorted(p_bytask.keys()): + count = p_bytask[pname] + print "\t{: <20s}: table={: <5d}, transit={: <5d}, other={: <5d}".format(pname, count['table'], count['transit'], count['other']) + return +# EndMacro: countallports + # Macro: showpipestats @lldb_command('showpipestats') def ShowPipeStats(cmd_args=None): @@ -816,6 +1466,35 @@ def ShowAllBusyPorts(cmd_args=None): return # EndMacro: showallbusyports +# Macro: showbusyportsummary +@lldb_command('showbusyportsummary') +def ShowBusyPortSummary(cmd_args=None): + """ Routine to print a summary of information about all receive rights + on the system that have enqueued messages. 
+ """ + task_queue_head = kern.globals.tasks + + ipc_table_size = 0 + ipc_busy_ports = 0 + ipc_msgs = 0 + + print GetTaskBusyIPCSummary.header + for tsk in kern.tasks: + (summary, table_size, nbusy, nmsgs) = GetTaskBusyIPCSummary(tsk) + ipc_table_size += table_size + ipc_busy_ports += nbusy + ipc_msgs += nmsgs + print summary + for t in kern.terminated_tasks: + (summary, table_size, nbusy, nmsgs) = GetTaskBusyIPCSummary(t) + ipc_table_size += table_size + ipc_busy_ports += nbusy + ipc_msgs += nmsgs + print summary + print "Total Table Size: {:d}, Busy Ports: {:d}, Messages in-flight: {:d}".format(ipc_table_size, ipc_busy_ports, ipc_msgs) + return +# EndMacro: showbusyportsummary + # Macro: showport: @lldb_command('showport','K') def ShowPort(cmd_args=None, cmd_options={}): @@ -1372,21 +2051,8 @@ def ShowPortSendRights(cmd_args=[], cmd_options={}): port = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_port *') i = 1 - for t in kern.tasks: - # Write a progress line. Using stderr avoids automatic newline when - # writing to stdout from lldb. Blank spaces at the end clear out long - # lines. 
- sys.stderr.write("checking {:s} ({}/{})...{:30s}\r".format(Cast(t.bsd_info, 'proc_t').p_name, i, len(kern.tasks), '')) - i += 1 - entries = GetSpaceSendRightEntries(t.itk_space, port) - - if entries: - print GetTaskIPCSummary.header - print GetTaskIPCSummary(t) - print '\t' + GetIPCEntrySummary.header + return FindPortRights(cmd_args=[unsigned(port)], cmd_options={'-R':'S'}) - for entry in entries: - print "\t" + GetIPCEntrySummary(entry) @lldb_command('showtasksuspenders') def ShowTaskSuspenders(cmd_args=[], cmd_options={}): @@ -1411,4 +2077,4 @@ def ShowTaskSuspenders(cmd_args=[], cmd_options={}): print "task {:#x} ({:s}) is suspended but no resume port exists".format(unsigned(task), Cast(task.bsd_info, 'proc_t').p_name) return - return ShowPortSendRights(cmd_args=[unsigned(port)], cmd_options=cmd_options) + return FindPortRights(cmd_args=[unsigned(port)], cmd_options={'-R':'S'}) diff --git a/tools/lldbmacros/kasan.py b/tools/lldbmacros/kasan.py index d4bec35c4..d924521fe 100755 --- a/tools/lldbmacros/kasan.py +++ b/tools/lldbmacros/kasan.py @@ -109,7 +109,7 @@ def print_alloc_free_entry(addr, orig_ptr): print " #{:}: {}".format(btframes-i-1, GetSourceInformationForAddress(fr)) print "", - print_hexdump(addr, asz, 0) + print_hexdump(addr, asz, 1) alloc_header_sz = 16 @@ -177,7 +177,7 @@ def print_alloc_info(_addr): print " #{:}: {}".format(btframes-i-1, GetSourceInformationForAddress(fr)) print "", - print_hexdump(base, asz, 0) + print_hexdump(base, asz, 1) return elif magic_for_addr(addr, 0xf233) == unsigned(freeh.magic): @@ -202,8 +202,14 @@ def print_whatis(_addr, ctx): rightrz = None extra = "Live" - shbyte = get_shadow_byte(shadow_for_address(addr, shift)) - maxsearch = 4096 * 2 + shaddr = shadow_for_address(addr, shift) + try: + shbyte = get_shadow_byte(shaddr) + except: + print "Unmapped shadow 0x{:x} for address 0x{:x}".format(shaddr, addr) + return + + maxsearch = 8*4096 if shbyte in [0xfa, 0xfb, 0xfd, 0xf5]: print_alloc_info(_addr) @@ -266,9 +272,12 
@@ def print_whatis(_addr, ctx): print "Valid range: 0x{:x} -- 0x{:x} ({} bytes)".format(base, base+total_size-1, total_size) print "Offset: {} bytes".format(_addr - base) print "", - print_hexdump(base, total_size, 0) + print_hexdump(base, total_size, 1) def print_hexdump(base, size, ctx): + if size < 16: + size = 16 + base -= base % 16 start = base - 16*ctx size += size % 16 size = min(size + 16*2*ctx, 256) @@ -294,7 +303,7 @@ def kasan_subcommand(cmd, args, opts): print("0x{:02x} @ 0x{:016x} [{}]\n\n".format(sb, shadow, shadow_byte_to_string(sb))) ctx = long(opts.get("-C", 5)) print_shadow_context(addr, ctx) - elif cmd == 'legend': + elif cmd == 'key' or cmd == 'legend': print_legend() elif cmd == 'info': pages_used = unsigned(kern.globals.shadow_pages_used) @@ -308,6 +317,8 @@ def kasan_subcommand(cmd, args, opts): print_whatis(addr, ctx) elif cmd == 'alloc' or cmd == 'heap': print_alloc_info(addr) + else: + print "Unknown subcommand: `{}'".format(cmd) @lldb_command('kasan', 'C:') def Kasan(cmd_args=None, cmd_options={}): diff --git a/tools/lldbmacros/kcdata.py b/tools/lldbmacros/kcdata.py index 12a996c69..f6db996cc 100755 --- a/tools/lldbmacros/kcdata.py +++ b/tools/lldbmacros/kcdata.py @@ -30,9 +30,9 @@ class Globals(object): 'KCDATA_TYPE_INT64_DESC': 0x5, 'KCDATA_TYPE_BINDATA_DESC': 0x6, 'KCDATA_TYPE_ARRAY': 0x11, - 'KCDATA_TYPE_TYPEDEFINTION': 0x12, + 'KCDATA_TYPE_TYPEDEFINITION': 0x12, 'KCDATA_TYPE_CONTAINER_BEGIN': 0x13, - 'KCDATA_TYPE_CONTIANER_END': 0x14, + 'KCDATA_TYPE_CONTAINER_END': 0x14, 'KCDATA_TYPE_ARRAY_PAD0': 0x20, 'KCDATA_TYPE_ARRAY_PAD1': 0x21, @@ -74,9 +74,6 @@ class Globals(object): 'STACKSHOT_KCTYPE_KERN_PAGE_SIZE': 0x910, 'STACKSHOT_KCTYPE_JETSAM_LEVEL': 0x911, 'STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP': 0x912, - 'STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT': 0x940, - 'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT': 0x941, - 'STACKSHOT_KCTYPE_KERN_STACKLR': 0x913, 'STACKSHOT_KCTYPE_KERN_STACKLR64': 0x914, 'STACKSHOT_KCTYPE_USER_STACKLR': 0x915, @@ -92,9 
+89,16 @@ class Globals(object): 'STACKSHOT_KCTYPE_THREAD_GROUP' : 0x91f, 'STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT' : 0x920, 'STACKSHOT_KCTYPE_JETSAM_COALITION' : 0x921, + 'STACKSHOT_KCTYPE_THREAD_POLICY_VERSION': 0x922, 'STACKSHOT_KCTYPE_INSTRS_CYCLES' : 0x923, + 'STACKSHOT_KCTYPE_USER_STACKTOP' : 0x924, + 'STACKSHOT_KCTYPE_ASID' : 0x925, + 'STACKSHOT_KCTYPE_PAGE_TABLES' : 0x926, + 'STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT' : 0x927, + + 'STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT': 0x940, + 'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT': 0x941, - 'STACKSHOT_KCTYPE_THREAD_POLICY_VERSION': 0x922, 'KCDATA_TYPE_BUFFER_END': 0xF19158ED, @@ -291,7 +295,7 @@ def ShouldMerge(self): class KCTypeDescription(object): - def __init__(self, t_type_id, t_elements=[], t_name='anon', custom_repr=None, legacy_size=None, merge=False): + def __init__(self, t_type_id, t_elements=[], t_name='anon', custom_repr=None, legacy_size=None, merge=False, naked=False): self.type_id = t_type_id self.elements = t_elements self.name = t_name @@ -300,6 +304,7 @@ def __init__(self, t_type_id, t_elements=[], t_name='anon', custom_repr=None, le if legacy_size: self.legacy_size = legacy_size self.merge = merge + self.naked = naked for e in self.elements: self.totalsize += e.GetTotalSize() @@ -336,7 +341,10 @@ def GetJsonRepr(self, base_data, flags): base_data = base_data[:self.legacy_size] if self.custom_JsonRepr: return self.custom_JsonRepr([e.GetValue(base_data) for e in self.elements]) - o = ", ".join(['"%s": %s' % (e.GetName(), e.GetJsonRepr(base_data)) for e in self.elements if not e.ShouldSkip(base_data)]) + if self.naked: + o = ", ".join([e.GetJsonRepr(base_data) for e in self.elements if not e.ShouldSkip(base_data)]) + else: + o = ", ".join(['"%s": %s' % (e.GetName(), e.GetJsonRepr(base_data)) for e in self.elements if not e.ShouldSkip(base_data)]) if not self.merge: o = '{' + o + '}' return o @@ -427,7 +435,7 @@ def FromKCItem(kcitem): return KCObject(kcitem.i_type, kcitem.i_data, kcitem.i_offset, 
kcitem.i_flags) def IsContainerEnd(self): - return self.i_type == GetTypeForName('KCDATA_TYPE_CONTIANER_END') + return self.i_type == GetTypeForName('KCDATA_TYPE_CONTAINER_END') def IsBufferEnd(self): return self.i_type == GetTypeForName('KCDATA_TYPE_BUFFER_END') @@ -469,7 +477,7 @@ def ParseData(self): self.obj['typeID'] = self.i_type logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) - elif self.i_type == GetTypeForName('KCDATA_TYPE_CONTIANER_END'): + elif self.i_type == GetTypeForName('KCDATA_TYPE_CONTAINER_END'): self.obj['uniqID'] = self.i_flags logging.info("0x%08x: %sEND" % (self.offset, INDENT(end=True))) @@ -491,7 +499,7 @@ def ParseData(self): self.obj = u_d[1] logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name)) - elif self.i_type == GetTypeForName('KCDATA_TYPE_TYPEDEFINTION'): + elif self.i_type == GetTypeForName('KCDATA_TYPE_TYPEDEFINITION'): self.is_naked_type = True u_d = struct.unpack_from('II32s', self.i_data) self.obj['name'] = u_d[2].split(chr(0))[0] @@ -854,7 +862,12 @@ def _get_data_element(elementValues): KNOWN_TYPES_COLLECTION[0x909] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1) - +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT'), ( + KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), + KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1) +), + 'system_shared_cache_layout' +) KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO64')] = KCTypeDescription(GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO64'), ( KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0), @@ -1014,6 +1027,18 @@ def _get_data_element(elementValues): ), 'instrs_cycles_snapshot') +def set_type(name, *args): + typ = GetTypeForName(name) + KNOWN_TYPES_COLLECTION[typ] = 
KCTypeDescription(typ, *args) + + +set_type('STACKSHOT_KCTYPE_USER_STACKTOP', + ( + KCSubTypeElement.FromBasicCtype('sp', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), + KCSubTypeElement('stack_contents', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(8, 1), 8, 1), + ), + 'user_stacktop') + #KNOWN_TYPES_COLLECTION[0x907] = KCSubTypeElement('donating_pids', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0, KCSubTypeElement._get_naked_element_value) KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_PID')] = KCSubTypeElement('pid', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0) KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_PPID')] = KCSubTypeElement('ppid', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0) @@ -1095,6 +1120,7 @@ def _get_data_element(elementValues): ( KCSubTypeElement.FromBasicCtype('user_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 0), KCSubTypeElement.FromBasicCtype('system_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 8), + KCSubTypeElement.FromBasicCtype('runnable_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 16), ), 'cpu_times') KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION'), @@ -1150,6 +1176,16 @@ def _get_data_element(elementValues): KNOWN_TYPES_COLLECTION[GetTypeForName('EXIT_REASON_DISPATCH_QUEUE_NO')] = ( KCSubTypeElement('exit_reason_dispatch_queue_no', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0, KCSubTypeElement._get_naked_element_value)) +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_ASID')] = ( + KCSubTypeElement('ts_asid', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0)) + +KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_PAGE_TABLES')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_PAGE_TABLES'), ( + KCSubTypeElement(None, KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0, KCSubTypeElement._get_naked_element_value), ), + 'ts_pagetable', + merge=True, + naked=True +) + def GetSecondsFromMATime(mat, tb): return (float(mat) * tb['numer']) / tb['denom'] @@ -1194,6 +1230,7 @@ 
def GetStateDescription(s): TH_UNINT = 0x08 TH_TERMINATE = 0x10 TH_TERMINATE2 = 0x20 + TH_WAIT_REPORT = 0x40 TH_IDLE = 0x80 if (s & TH_WAIT): retval.append("TH_WAIT") @@ -1207,6 +1244,8 @@ def GetStateDescription(s): retval.append("TH_TERMINATE") if (s & TH_TERMINATE2): retval.append("TH_TERMINATE2") + if (s & TH_WAIT_REPORT): + retval.append("TH_WAIT_REPORT") if (s & TH_IDLE): retval.append("TH_IDLE") return retval @@ -1232,6 +1271,7 @@ def format_uuid(elementValues): kThreadWaitPThreadCondVar = 0x0e kThreadWaitParkedWorkQueue = 0x0f kThreadWaitWorkloopSyncWait = 0x10 +kThreadWaitOnProcess = 0x11 UINT64_MAX = 0xffffffffffffffff @@ -1329,6 +1369,9 @@ def formatWaitInfo(info): else: s += ", unknown owner" s += ", workloop id %x" % context + elif type == kThreadWaitOnProcess: + s += "waitpid, for pid %d" % owner + else: s += "unknown type %d (owner %d, context %x)" % (type, owner, context) @@ -1367,9 +1410,9 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete): return dsc_common = [format_uuid(ss.get('shared_cache_dyld_load_info')['imageUUID']), - shared_cache_base_addr, - "S" - ] + shared_cache_base_addr, "S" ] + + dsc_layout = ss.get('system_shared_cache_layout') dsc_libs = [] print "Shared cache UUID found from the binary data is <%s> " % str(dsc_common[0]) @@ -1381,11 +1424,17 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete): for i in dsc_libs_arr: _uuid = i[2].lower().replace('-','').strip() _addr = int(i[0], 16) + _load_addr - dsc_libs.append([_uuid, _addr, "P"]) + dsc_libs.append([_uuid, _addr, "C"]) #print "adding ", [_uuid, _addr, "C"] elif dsc_uuid: print "Provided shared cache UUID does not match. Skipping writing report." 
return + elif dsc_layout: + print "Found in memory system shared cache layout with {} images".format(len(dsc_layout)) + slide = ss.get('shared_cache_dyld_load_info')['imageLoadAddress'] + + for image in dsc_layout: + dsc_libs.append([format_uuid(image['imageUUID']), image['imageLoadAddress'] + slide, "C"]) AllImageCatalog = [] obj = {} @@ -1452,6 +1501,15 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete): continue tasksnap = piddata['task_snapshot'] tsnap["pid"] = tasksnap["ts_pid"] + if 'ts_asid' in piddata: + tsnap["asid"] = piddata["ts_asid"] + + if 'ts_pagetable' in piddata: + pagetables = [] + for tte in piddata["ts_pagetable"]: + pagetables.append(tte) + tsnap["pageTables"] = pagetables + tsnap["residentMemoryBytes"] = tasksnap["ts_task_size"] tsnap["timesDidThrottle"] = tasksnap["ts_did_throttle"] tsnap["systemTimeTask"] = GetSecondsFromMATime(tasksnap["ts_system_time_in_terminated_th"], timebase) @@ -1493,6 +1551,11 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete): for f in thdata["user_stack_frames"]: uframes.append(GetSymbolInfoForFrame(AllImageCatalog, pr_libs, f['lr'])) thsnap["userFrames"] = uframes + + if "user_stacktop" in thdata: + (address,) = struct.unpack(" ledger_peak): - ledger_peak = phys_footprint_entry._le.le_maxtracking.le_peaks[0].le_max + if hasattr(phys_footprint_entry._le._le_max, 'le_interval_max') and (phys_footprint_entry._le._le_max.le_interval_max > ledger_peak): + ledger_peak = phys_footprint_entry._le._le_max.le_interval_max return ledger_peak @header("{: >8s} {: >12s} {: >12s} {: >10s} {: >12s} {: >14s} {: >10s} {: >12s} {: >10s} {: >10s} {: >10s} {: <20s}\n".format( 'pid', 'effective', 'requested', 'state', 'user_data', 'physical', 'iokit', 'footprint', -'spike', 'lifemax', 'limit', 'command')) +'recent peak', 'lifemax', 'limit', 'command')) def GetMemoryStatusNode(proc_val): """ Internal function to get memorystatus information from the given proc params: proc - 
value representing struct proc * @@ -87,7 +87,7 @@ def GetMemoryStatusNode(proc_val): phys_footprint_limit = task_phys_footprint_ledger_entry.le_limit / page_size ledger_peak = CalculateLedgerPeak(task_phys_footprint_ledger_entry) phys_footprint_spike = ledger_peak / page_size - phys_footprint_lifetime_max = task_phys_footprint_ledger_entry._le.le_maxtracking.le_lifetime_max / page_size + phys_footprint_lifetime_max = task_phys_footprint_ledger_entry._le._le_max.le_lifetime_max / page_size format_string = '{0: >8d} {1: >12d} {2: >12d} {3: #011x} {4: #011x} {5: >12d} {6: >10d} {7: >13d}' out_str += format_string.format(proc_val.p_pid, proc_val.p_memstat_effectivepriority, @@ -232,11 +232,109 @@ def WhatIsHelper(cmd_args=None): pass return +# Macro: showzcache + +@lldb_type_summary(['zone','zone_t']) +@header("{:^18s} {:<40s} {:>10s} {:>10s} {:>10s} {:>10s}".format( +'ZONE', 'NAME', 'CACHE_ELTS', 'DEP_VALID', 'DEP_EMPTY','DEP_FULL')) + +def GetZoneCacheSummary(zone): + """ Summarize a zone's cache with important information. 
+ params: + zone: value - obj representing a zone in kernel + returns: + str - summary of the zone's cache contents + """ + out_string = "" + format_string = '{:#018x} {:<40s} {:>10d} {:>10s} {:>10d} {:>10d}' + cache_elem_count = 0 + mag_capacity = kern.GetGlobalVariable('magazine_element_count') + depot_capacity = kern.GetGlobalVariable('depot_element_count') + + + if zone.__getattr__('cpu_cache_enabled') : + for i in range(0, kern.globals.machine_info.physical_cpu): + cache = zone.zcache[0].zcc_per_cpu_caches[i] + cache_elem_count += cache.current.zcc_magazine_index + cache_elem_count += cache.previous.zcc_magazine_index + + if zone.zcache[0].zcc_depot_index != -1: + cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity + out_string += format_string.format(zone, zone.zone_name, cache_elem_count, "Y", depot_capacity - zone.zcache[0].zcc_depot_index, zone.zcache[0].zcc_depot_index) + else: + out_string += format_string.format(zone, zone.zone_name, cache_elem_count, "N", 0, 0) + + return out_string + +@lldb_command('showzcache') +def ZcachePrint(cmd_args=None): + """ Routine to print a summary listing of all the kernel zones cache contents + All columns are printed in decimal + """ + global kern + print GetZoneCacheSummary.header + for zval in kern.zones: + if zval.__getattr__('cpu_cache_enabled') : + print GetZoneCacheSummary(zval) + +# EndMacro: showzcache + +# Macro: showzcachecpu + +@lldb_type_summary(['zone','zone_t']) +@header("{:^18s} {:40s} {:>10s} {:>10s}".format( +'ZONE', 'NAME', 'CACHE_ELTS', 'CPU_INFO')) + +def GetZoneCacheCPUSummary(zone): + """ Summarize a zone's cache broken up per cpu + params: + zone: value - obj representing a zone in kernel + returns: + str - summary of the zone's per CPU cache contents + """ + out_string = "" + format_string = '{:#018x} {:40s} {:10d} {cpuinfo:s}' + cache_elem_count = 0 + cpu_info = "" + per_cpu_count = 0 + mag_capacity = kern.GetGlobalVariable('magazine_element_count') + depot_capacity = 
kern.GetGlobalVariable('depot_element_count') + + + if zone.__getattr__('cpu_cache_enabled') : + for i in range(0, kern.globals.machine_info.physical_cpu): + if i != 0: + cpu_info += ", " + cache = zone.zcache[0].zcc_per_cpu_caches[i] + per_cpu_count = cache.current.zcc_magazine_index + per_cpu_count += cache.previous.zcc_magazine_index + cache_elem_count += per_cpu_count + cpu_info += "CPU {:d}: {:5}".format(i,per_cpu_count) + if zone.zcache[0].zcc_depot_index != -1: + cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity + + out_string += format_string.format(zone, zone.zone_name, cache_elem_count,cpuinfo = cpu_info) + + return out_string + +@lldb_command('showzcachecpu') +def ZcacheCPUPrint(cmd_args=None): + """ Routine to print a summary listing of all the kernel zones cache contents + All columns are printed in decimal + """ + global kern + print GetZoneCacheCPUSummary.header + for zval in kern.zones: + if zval.__getattr__('cpu_cache_enabled') : + print GetZoneCacheCPUSummary(zval) + +# EndMacro: showzcachecpu + # Macro: zprint @lldb_type_summary(['zone','zone_t']) -@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s}({:>6s} {:>6s} {:>6s}) {:^15s} {:<20s}".format( -'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ALL_FREE_PGS', 'ELT_SZ', 'ALLOC', 'ELTS', 'PGS', 'WASTE', 'FLAGS', 'NAME')) +@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:^6s} {:^6s} {:^6s} {:>10s} {:^15s} {:<20s}".format( +'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ALL_FREE_PGS', 'ELT_SZ', 'ALLOC', '(ELTS', 'PGS', 'WASTE)', 'CACHE_ELTS', 'FLAGS', 'NAME')) def GetZoneSummary(zone): """ Summarize a zone with important information. 
See help zprint for description of each field params: @@ -245,11 +343,12 @@ def GetZoneSummary(zone): str - summary of the zone """ out_string = "" - format_string = '{:#018x} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:6d} {:6d} {:6d} {markings} {name:s} ' + format_string = '{:#018x} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:6d} {:6d} {:6d} {:10d} {markings} {name:s} ' pagesize = kern.globals.page_size free_elements = zone.countfree free_size = free_elements * zone.elem_size + mag_capacity = kern.GetGlobalVariable('magazine_element_count') alloc_pages = zone.alloc_size / pagesize alloc_count = zone.alloc_size / zone.elem_size @@ -267,7 +366,8 @@ def GetZoneSummary(zone): ["zleak_on", "L"], ["doing_alloc_without_vm_priv", "A"], ["doing_alloc_with_vm_priv", "S"], - ["waiting", "W"] + ["waiting", "W"], + ["cpu_cache_enabled", "E"] ] if kern.arch == 'x86_64': marks.append(["gzalloc_exempt", "M"]) @@ -281,10 +381,19 @@ def GetZoneSummary(zone): markings+=mark[1] else: markings+=" " + cache_elem_count = 0 + if zone.__getattr__('cpu_cache_enabled') : + for i in range(0, kern.globals.machine_info.physical_cpu): + cache = zone.zcache[0].zcc_per_cpu_caches[i] + cache_elem_count += cache.current.zcc_magazine_index + cache_elem_count += cache.previous.zcc_magazine_index + if zone.zcache[0].zcc_depot_index != -1: + cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity + out_string += format_string.format(zone, zone.cur_size, zone.page_count, zone.count, free_elements, free_size, zone.count_all_free_pages, zone.elem_size, zone.alloc_size, alloc_count, - alloc_pages, alloc_waste, name = zone.zone_name, markings=markings) + alloc_pages, alloc_waste, cache_elem_count, name = zone.zone_name, markings=markings) if zone.exhaustible : out_string += "(max: {:d})".format(zone.max_size) @@ -309,6 +418,7 @@ def Zprint(cmd_args=None): A - currently trying to allocate more backing memory from kernel_memory_allocate without VM priv S - currently trying to 
allocate more backing memory from kernel_memory_allocate with VM priv W - another thread is waiting for more memory + E - Per-cpu caching is enabled for this zone L - zone is being monitored by zleaks G - currently running GC I - zone was destroyed and is no longer valid @@ -1117,10 +1227,10 @@ def ShowAllVMStats(cmd_args=None): vmstats.compressed_lifetime = 0 vmstats.error = '' - hdr_format = "{0: >10s} {1: <20s} {2: >6s} {3: >10s} {4: >10s} {5: >10s} {6: >10s} {7: >10s} {8: >10s} {9: >10s} {10: >10s} {11: >10s} {12: >10s} {13: >10s} {14:}" - print hdr_format.format('pid', 'command', '#ents', 'wired', 'vsize', 'rsize', 'NEW RSIZE', 'max rsize', 'internal', 'external', 'reusable', 'compressed', 'compressed', 'compressed', '') - print hdr_format.format('', '', '', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(current)', '(peak)', '(lifetime)', '') - entry_format = "{p.p_pid: >10d} {p.p_comm: <20s} {m.hdr.nentries: >6d} {s.wired_count: >10d} {vsize: >10d} {s.resident_count: >10d} {s.new_resident_count: >10d} {s.resident_max: >10d} {s.internal: >10d} {s.external: >10d} {s.reusable: >10d} {s.compressed: >10d} {s.compressed_peak: >10d} {s.compressed_lifetime: >10d} {s.error}" + hdr_format = "{:>6s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:<20s} {:1s}" + print hdr_format.format('#ents', 'wired', 'vsize', 'rsize', 'NEW RSIZE', 'max rsize', 'internal', 'external', 'reusable', 'compressed', 'compressed', 'compressed', 'pid', 'command', '') + print hdr_format.format('', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(current)', '(peak)', '(lifetime)', '', '', '') + entry_format = "{m.hdr.nentries: >6d} {s.wired_count: >10d} {vsize: >10d} {s.resident_count: >10d} {s.new_resident_count: >10d} {s.resident_max: >10d} {s.internal: >10d} {s.external: >10d} {s.reusable: >10d} {s.compressed: >10d} {s.compressed_peak: >10d} 
{s.compressed_lifetime: >10d} {p.p_pid: >10d} {p.p_comm: <20s} {s.error}" for task in kern.tasks: proc = Cast(task.bsd_info, 'proc *') @@ -1195,7 +1305,7 @@ def ShowMapVME(cmd_args=None): usage: showmapvme """ if cmd_args == None or len(cmd_args) < 1: - print "Invalid argument.", ShowMap.__doc__ + print "Invalid argument.", ShowMapVME.__doc__ return map_val = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t') print GetVMMapSummary.header @@ -1217,7 +1327,7 @@ def GetVMMapSummary(vmmap): resident_pages = 0 if vmmap.pmap != 0: resident_pages = int(vmmap.pmap.stats.resident_count) first_free = 0 - if int(vmmap.holelistenabled) == 0: first_free = vmmap.f_s.first_free + if int(vmmap.holelistenabled) == 0: first_free = vmmap.f_s._first_free out_string += format_string.format(vmmap, vmmap.pmap, vm_size, vmmap.hdr.nentries, resident_pages, vmmap.hint, first_free) return out_string @@ -1485,6 +1595,7 @@ def AddKextAddr(cmd_args=[]): addr = ArgumentStringToInt(cmd_args[0]) all_kexts_info = GetKextLoadInformation() + kernel_uuid = str(kern.globals.kernel_uuid_string).lower() found_kinfo = None found_segment = None for kinfo in all_kexts_info: @@ -1493,14 +1604,17 @@ def AddKextAddr(cmd_args=[]): print GetKextSummary.header print GetKextSummary(kinfo[7]) + " segment: {} offset = {:#0x}".format(segment.name, (addr - segment.vmaddr)) cur_uuid = kinfo[0].lower() - print "Fetching dSYM for %s" % cur_uuid - info = dsymForUUID(cur_uuid) - if info and 'DBGSymbolRichExecutable' in info: - print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable']) - addDSYM(cur_uuid, info) - loadDSYM(cur_uuid, int(kinfo[1],16), kinfo[4]) + if (kernel_uuid == cur_uuid): + print "(builtin)" else: - print "Failed to get symbol info for %s" % cur_uuid + print "Fetching dSYM for %s" % cur_uuid + info = dsymForUUID(cur_uuid) + if info and 'DBGSymbolRichExecutable' in info: + print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable']) + addDSYM(cur_uuid, info) + 
loadDSYM(cur_uuid, int(kinfo[1],16), kinfo[4]) + else: + print "Failed to get symbol info for %s" % cur_uuid return @@ -1594,6 +1708,7 @@ def AddKextSyms(cmd_args=[], cmd_options={}): return True all_kexts_info = GetKextLoadInformation() + kernel_uuid = str(kern.globals.kernel_uuid_string).lower() if "-N" in cmd_options: kext_name = cmd_options["-N"] @@ -1608,14 +1723,17 @@ def AddKextSyms(cmd_args=[], cmd_options={}): for x in all_kexts_info: if cur_knm == x[2]: cur_uuid = x[0].lower() - print "Fetching dSYM for {:s}".format(cur_uuid) - info = dsymForUUID(cur_uuid) - if info and 'DBGSymbolRichExecutable' in info: - print "Adding dSYM ({0:s}) for {1:s}".format(cur_uuid, info['DBGSymbolRichExecutable']) - addDSYM(cur_uuid, info) - loadDSYM(cur_uuid, int(x[1],16), x[4]) + if (kernel_uuid == cur_uuid): + print "(builtin)" else: - print "Failed to get symbol info for {:s}".format(cur_uuid) + print "Fetching dSYM for {:s}".format(cur_uuid) + info = dsymForUUID(cur_uuid) + if info and 'DBGSymbolRichExecutable' in info: + print "Adding dSYM ({0:s}) for {1:s}".format(cur_uuid, info['DBGSymbolRichExecutable']) + addDSYM(cur_uuid, info) + loadDSYM(cur_uuid, int(x[1],16), x[4]) + else: + print "Failed to get symbol info for {:s}".format(cur_uuid) break kern.symbolicator = None return @@ -1635,14 +1753,15 @@ def AddKextSyms(cmd_args=[], cmd_options={}): for k_info in all_kexts_info: cur_uuid = k_info[0].lower() if load_all_kexts or (uuid == cur_uuid): - print "Fetching dSYM for %s" % cur_uuid - info = dsymForUUID(cur_uuid) - if info and 'DBGSymbolRichExecutable' in info: - print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable']) - addDSYM(cur_uuid, info) - loadDSYM(cur_uuid, int(k_info[1],16), k_info[4]) - else: - print "Failed to get symbol info for %s" % cur_uuid + if (kernel_uuid != cur_uuid): + print "Fetching dSYM for %s" % cur_uuid + info = dsymForUUID(cur_uuid) + if info and 'DBGSymbolRichExecutable' in info: + print "Adding dSYM (%s) for %s" % 
(cur_uuid, info['DBGSymbolRichExecutable']) + addDSYM(cur_uuid, info) + loadDSYM(cur_uuid, int(k_info[1],16), k_info[4]) + else: + print "Failed to get symbol info for %s" % cur_uuid #end of for loop kern.symbolicator = None return True @@ -2259,10 +2378,10 @@ def GetMutexLockSummary(mtx): out_str += "Pri : {mtx.lck_mtx_pri:#x}\n".format(mtx=mtx) out_str += "Spin : {mtx.lck_mtx_spin:#x}\n".format(mtx=mtx) out_str += "Ext : {mtx.lck_mtx_is_ext:#x}\n".format(mtx=mtx) - if mtx.lck_mtxd_pad32 == 0xFFFFFFFF : - out_str += "Canary (valid) : {mtx.lck_mtxd_pad32:#x}\n".format(mtx=mtx) + if mtx.lck_mtx_pad32 == 0xFFFFFFFF : + out_str += "Canary (valid) : {mtx.lck_mtx_pad32:#x}\n".format(mtx=mtx) else: - out_str += "Canary (INVALID) : {mtx.lck_mtxd_pad32:#x}\n".format(mtx=mtx) + out_str += "Canary (INVALID) : {mtx.lck_mtx_pad32:#x}\n".format(mtx=mtx) return out_str out_str = "Lock Type\t\t: MUTEX\n" @@ -2319,33 +2438,34 @@ def ShowLock(cmd_args=None, cmd_options={}): return summary_str = "" - lock = kern.GetValueFromAddress(cmd_args[0], 'uintptr_t *') - - if kern.arch == "x86_64" and lock: + addr = cmd_args[0] + # from osfmk/arm/locks.h + LCK_SPIN_TYPE = 0x11 + LCK_MTX_TYPE = 0x22 + if kern.arch == "x86_64": if "-M" in cmd_options: - lock_mtx = kern.GetValueFromAddress(lock, 'lck_mtx_t *') + lock_mtx = kern.GetValueFromAddress(addr, 'lck_mtx_t *') summary_str = GetMutexLockSummary(lock_mtx) elif "-S" in cmd_options: - lock_spin = kern.GetValueFromAddress(lock, 'lck_spin_t *') + lock_spin = kern.GetValueFromAddress(addr, 'lck_spin_t *') summary_str = GetSpinLockSummary(lock_spin) else: summary_str = "Please specify supported lock option(-M/-S)" print summary_str - return - - if lock: - lock_mtx = Cast(lock, 'lck_mtx_t*') - if lock_mtx.lck_mtx_type == 0x22: - summary_str = GetMutexLockSummary(lock_mtx) - - lock_spin = Cast(lock, 'lck_spin_t*') - if lock_spin.type == 0x11: - summary_str = GetSpinLockSummary(lock_spin) - - if summary_str == "": - summary_str = "Lock Type\t\t: 
INVALID LOCK" - print summary_str + else: + lock = kern.GetValueFromAddress(addr, 'uintptr_t *') + if lock: + lock_mtx = Cast(lock, 'lck_mtx_t*') + if lock_mtx.lck_mtx_type == LCK_MTX_TYPE: + summary_str = GetMutexLockSummary(lock_mtx) + + lock_spin = Cast(lock, 'lck_spin_t*') + if lock_spin.type == LCK_SPIN_TYPE: + summary_str = GetSpinLockSummary(lock_spin) + if summary_str == "": + summary_str = "Lock Type\t\t: INVALID LOCK" + print summary_str #EndMacro: showlock @@ -2451,10 +2571,10 @@ def ShowAllPurgeableNonVolatileVmObjects(cmd_args=None): queue_len = kern.globals.purgeable_nonvolatile_count queue_head = kern.globals.purgeable_nonvolatile_queue - print 'purgeable_nonvolatile_queue:{:#018x} purgeable_volatile_count:{:d}\n'.format(kern.GetLoadAddressForSymbol('purgeable_nonvolatile_queue'),queue_len) + print 'purgeable_nonvolatile_queue:{: <#018x} purgeable_volatile_count:{:d}\n'.format(kern.GetLoadAddressForSymbol('purgeable_nonvolatile_queue'),queue_len) print 'N:non-volatile V:volatile E:empty D:deny\n' - print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","owner","pid","process") + print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:>3s} {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","tag","owner","pid","process") idx = 0 for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'): idx += 1 @@ -2487,14 +2607,14 @@ def ShowPurgeableNonVolatileVmObject(object, idx, queue_len, nonvolatile_total): compressor_pager = Cast(object.pager, 'compressor_pager *') compressed_count = compressor_pager.cpgr_num_slots_occupied - print "{:>6d}/{:<6d} {:#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:#018x} {:>6d} 
{:<20s}\n".format(idx,queue_len,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_un2.vou_purgeable_owner,GetProcPIDForTask(object.vo_un2.vou_purgeable_owner),GetProcNameForTask(object.vo_un2.vou_purgeable_owner)) + print "{:>6d}/{:<6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:>3d} {: <#018x} {:>6d} {:<20s}\n".format(idx,queue_len,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_ledger_tag, object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner)) nonvolatile_total.objects += 1 nonvolatile_total.vsize += object.vo_un1.vou_size/page_size nonvolatile_total.rsize += object.resident_page_count nonvolatile_total.wsize += object.wired_page_count nonvolatile_total.csize += compressed_count - if object.vo_un2.vou_purgeable_owner == 0: + if object.vo_un2.vou_owner == 0: nonvolatile_total.disowned_objects += 1 nonvolatile_total.disowned_vsize += object.vo_un1.vou_size/page_size nonvolatile_total.disowned_rsize += object.resident_page_count @@ -2561,7 +2681,7 @@ def ShowPurgeableGroup(qhead, volatile_total): for object in IterateQueue(qhead, 'struct vm_object *', 'objq'): if idx == 0: # print "{:>6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:18s} {:>6s} {:<20s} {:18s} {:>6s} {:<20s} {:s}\n".format("#","object","P","refcnt","size (pages)","resid","wired","compressed","owner","pid","process","volatilizer","pid","process","") - print "{:>6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:18s} {:>6s} {:<20s}\n".format("#","object","P","refcnt","size (pages)","resid","wired","compressed","owner","pid","process") + print "{:>6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:>3s} {:18s} {:>6s} {:<20s}\n".format("#","object","P","refcnt","size 
(pages)","resid","wired","compressed","tag","owner","pid","process") idx += 1 ShowPurgeableVolatileVmObject(object, idx, volatile_total) @@ -2572,7 +2692,7 @@ def ShowPurgeableVolatileVmObject(object, idx, volatile_total): returns: None """ -## if int(object.vo_un2.vou_purgeable_owner) != int(object.vo_purgeable_volatilizer): +## if int(object.vo_un2.vou_owner) != int(object.vo_purgeable_volatilizer): # diff=" !=" ## else: # diff=" " @@ -2592,14 +2712,14 @@ def ShowPurgeableVolatileVmObject(object, idx, volatile_total): else: compressor_pager = Cast(object.pager, 'compressor_pager *') compressed_count = compressor_pager.cpgr_num_slots_occupied -# print "{:>6d} {:#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:#018x} {:>6d} {:<20s} {:#018x} {:>6d} {:<20s} {:s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count,object.vo_un2.vou_purgeable_owner,GetProcPIDForTask(object.vo_un2.vou_purgeable_owner),GetProcNameForTask(object.vo_un2.vou_purgeable_owner),object.vo_purgeable_volatilizer,GetProcPIDForTask(object.vo_purgeable_volatilizer),GetProcNameForTask(object.vo_purgeable_volatilizer),diff) - print "{:>6d} {:#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:#018x} {:>6d} {:<20s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_un2.vou_purgeable_owner,GetProcPIDForTask(object.vo_un2.vou_purgeable_owner),GetProcNameForTask(object.vo_un2.vou_purgeable_owner)) +# print "{:>6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {: <#018x} {:>6d} {:<20s} {: <#018x} {:>6d} {:<20s} 
{:s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count,object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner),object.vo_purgeable_volatilizer,GetProcPIDForObjectOwner(object.vo_purgeable_volatilizer),GetProcNameForObjectOwner(object.vo_purgeable_volatilizer),diff) + print "{:>6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:>3d} {: <#018x} {:>6d} {:<20s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_ledger_tag, object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner)) volatile_total.objects += 1 volatile_total.vsize += object.vo_un1.vou_size/page_size volatile_total.rsize += object.resident_page_count volatile_total.wsize += object.wired_page_count volatile_total.csize += compressed_count - if object.vo_un2.vou_purgeable_owner == 0: + if object.vo_un2.vou_owner == 0: volatile_total.disowned_objects += 1 volatile_total.disowned_vsize += object.vo_un1.vou_size/page_size volatile_total.disowned_rsize += object.resident_page_count @@ -2651,48 +2771,150 @@ def ShowTaskVMEntries(task, show_pager_info, show_all_shadows): if not task.map: print "Task {0: <#020x} has map = 0x0" return None - showmapvme(task.map, show_pager_info, show_all_shadows) + showmapvme(task.map, 0, 0, show_pager_info, show_all_shadows, False) -@lldb_command("showmapvme", "PS") +@lldb_command("showmapvme", "A:B:PRST") def ShowMapVME(cmd_args=None, cmd_options={}): """Routine to print out info about the specified vm_map and its vm entries - usage: showmapvme + usage: showmapvme [-A start] [-B end] [-S] [-P] + Use -A flag to start at virtual address + Use -B flag to end at virtual address Use -S flag to show VM object shadow chains Use -P flag to 
show pager info (mapped file, compressed pages, ...) + Use -R flag to reverse order + Use -T to show red-black tree pointers """ if cmd_args == None or len(cmd_args) < 1: - print "Invalid argument.", ShowMap.__doc__ + print "Invalid argument.", ShowMapVME.__doc__ return show_pager_info = False show_all_shadows = False + show_rb_tree = False + start_vaddr = 0 + end_vaddr = 0 + reverse_order = False + if "-A" in cmd_options: + start_vaddr = unsigned(int(cmd_options['-A'], 16)) + if "-B" in cmd_options: + end_vaddr = unsigned(int(cmd_options['-B'], 16)) if "-P" in cmd_options: show_pager_info = True if "-S" in cmd_options: show_all_shadows = True + if "-R" in cmd_options: + reverse_order = True + if "-T" in cmd_options: + show_rb_tree = True map = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t') - showmapvme(map, show_pager_info, show_all_shadows) + showmapvme(map, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree) + +@lldb_command("showvmobject", "A:B:PRST") +def ShowVMObject(cmd_args=None, cmd_options={}): + """Routine to print out a VM object and its shadow chain + usage: showvmobject [-S] [-P] + -S: show VM object shadow chain + -P: show pager info (mapped file, compressed pages, ...) 
+ """ + if cmd_args == None or len(cmd_args) < 1: + print "Invalid argument.", ShowMapVME.__doc__ + return + show_pager_info = False + show_all_shadows = False + if "-P" in cmd_options: + show_pager_info = True + if "-S" in cmd_options: + show_all_shadows = True + object = kern.GetValueFromAddress(cmd_args[0], 'vm_object_t') + showvmobject(object, 0, 0, show_pager_info, show_all_shadows) -def showmapvme(map, show_pager_info, show_all_shadows): +def showvmobject(object, offset=0, size=0, show_pager_info=False, show_all_shadows=False): page_size = kern.globals.page_size vnode_pager_ops = kern.globals.vnode_pager_ops vnode_pager_ops_addr = unsigned(addressof(vnode_pager_ops)) + depth = 0 + if size == 0 and object != 0 and object.internal: + size = object.vo_un1.vou_size + while object != 0: + depth += 1 + if show_all_shadows == False and depth != 1 and object.shadow != 0: + offset += unsigned(object.vo_un2.vou_shadow_offset) + object = object.shadow + continue + if object.copy_strategy == 0: + copy_strategy="N" + elif object.copy_strategy == 2: + copy_strategy="D" + elif object.copy_strategy == 4: + copy_strategy="S" + + else: + copy_strategy=str(object.copy_strategy) + if object.internal: + internal = "internal" + else: + internal = "external" + purgeable = "NVED"[int(object.purgable)] + pager_string = "" + if object.phys_contiguous: + pager_string = pager_string + "phys_contig {:#018x}:{:#018x} ".format(unsigned(object.vo_un2.vou_shadow_offset), unsigned(object.vo_un1.vou_size)) + pager = object.pager + if show_pager_info and pager != 0: + if object.internal: + pager_string = pager_string + "-> compressed:{:d}".format(GetCompressedPagesForObject(object)) + elif unsigned(pager.mo_pager_ops) == vnode_pager_ops_addr: + vnode_pager = Cast(pager,'vnode_pager *') + pager_string = pager_string + "-> " + GetVnodePath(vnode_pager.vnode_handle) + else: + pager_string = pager_string + "-> {:s}:{: <#018x}".format(pager.mo_pager_ops.memory_object_pager_name, pager) + print 
"{:>18d} {:#018x}:{:#018x} {: <#018x} ref:{:<6d} ts:{:1d} strat:{:1s} purg:{:1s} {:s} wtag:{:d} ({:d} {:d} {:d}) {:s}".format(depth,offset,offset+size,object,object.ref_count,object.true_share,copy_strategy,purgeable,internal,object.wire_tag,unsigned(object.vo_un1.vou_size)/page_size,object.resident_page_count,object.wired_page_count,pager_string) +# print " #{:<5d} obj {: <#018x} ref:{:<6d} ts:{:1d} strat:{:1s} {:s} size:{:<10d} wired:{:<10d} resident:{:<10d} reusable:{:<10d}".format(depth,object,object.ref_count,object.true_share,copy_strategy,internal,object.vo_un1.vou_size/page_size,object.wired_page_count,object.resident_page_count,object.reusable_page_count) + offset += unsigned(object.vo_un2.vou_shadow_offset) + object = object.shadow + +def showmapvme(map, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order=False, show_rb_tree=False): rsize = 0 if map.pmap != 0: rsize = int(map.pmap.stats.resident_count) print "{:<18s} {:<18s} {:<18s} {:>10s} {:>18s} {:>18s}:{:<18s}".format("vm_map","pmap","size","#ents","rsize","start","end") - print "{:#018x} {:#018x} {:#018x} {:>10d} {:>18d} {:#018x}:{:#018x}".format(map,map.pmap,unsigned(map.size),map.hdr.nentries,rsize,map.hdr.links.start,map.hdr.links.end) - vme_list_head = map.hdr.links + print "{: <#018x} {: <#018x} {:#018x} {:>10d} {:>18d} {:#018x}:{:#018x}".format(map,map.pmap,unsigned(map.size),map.hdr.nentries,rsize,map.hdr.links.start,map.hdr.links.end) + showmaphdrvme(map.hdr, map.pmap, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree) + +def showmapcopyvme(mapcopy, start_vaddr=0, end_vaddr=0, show_pager_info=True, show_all_shadows=True, reverse_order=False, show_rb_tree=False): + print "{:<18s} {:<18s} {:<18s} {:>10s} {:>18s} {:>18s}:{:<18s}".format("vm_map_copy","pmap","size","#ents","rsize","start","end") + print "{: <#018x} {:#018x} {:#018x} {:>10d} {:>18d} 
{:#018x}:{:#018x}".format(mapcopy,0,0,mapcopy.c_u.hdr.nentries,0,mapcopy.c_u.hdr.links.start,mapcopy.c_u.hdr.links.end) + showmaphdrvme(mapcopy.c_u.hdr, 0, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree) + +def showmaphdrvme(maphdr, pmap, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree): + page_size = kern.globals.page_size + vnode_pager_ops = kern.globals.vnode_pager_ops + vnode_pager_ops_addr = unsigned(addressof(vnode_pager_ops)) + if hasattr(kern.globals, 'compressor_object'): + compressor_object = kern.globals.compressor_object + else: + compressor_object = -1; + vme_list_head = maphdr.links vme_ptr_type = GetType('vm_map_entry *') - print "{:<18s} {:>18s}:{:<18s} {:>10s} {:<8s} {:<10s} {:<18s} {:<18s}".format("entry","start","end","#pgs","tag.kmod","prot&flags","object","offset") - last_end = unsigned(map.hdr.links.start) - for vme in IterateQueue(vme_list_head, vme_ptr_type, "links"): + print "{:<18s} {:>18s}:{:<18s} {:>10s} {:<8s} {:<16s} {:<18s} {:<18s}".format("entry","start","end","#pgs","tag.kmod","prot&flags","object","offset") + last_end = unsigned(maphdr.links.start) + skipped_entries = 0 + for vme in IterateQueue(vme_list_head, vme_ptr_type, "links", reverse_order): + if start_vaddr != 0 and end_vaddr != 0: + if unsigned(vme.links.start) > end_vaddr: + break + if unsigned(vme.links.end) <= start_vaddr: + last_end = unsigned(vme.links.end) + skipped_entries = skipped_entries + 1 + continue + if skipped_entries != 0: + print "... 
skipped {:d} entries ...".format(skipped_entries) + skipped_entries = 0 if unsigned(vme.links.start) != last_end: print "{:18s} {:#018x}:{:#018x} {:>10d}".format("------------------",last_end,vme.links.start,(unsigned(vme.links.start) - last_end)/page_size) last_end = unsigned(vme.links.end) size = unsigned(vme.links.end) - unsigned(vme.links.start) object = vme.vme_object.vmo_object if object == 0: - object_str = "{:<#018x}".format(object) + object_str = "{: <#018x}".format(object) elif vme.is_sub_map: if object == kern.globals.bufferhdr_map: object_str = "BUFFERHDR_MAP" @@ -2717,70 +2939,73 @@ def showmapvme(map, show_pager_info, show_all_shadows): elif hasattr(kern.globals, 'vector_upl_submap') and object == kern.globals.vector_upl_submap: object_str = "VECTOR_UPL_SUBMAP" else: - object_str = "submap:{:<#018x}".format(object) + object_str = "submap:{: <#018x}".format(object) else: if object == kern.globals.kernel_object: object_str = "KERNEL_OBJECT" elif object == kern.globals.vm_submap_object: object_str = "VM_SUBMAP_OBJECT" - elif object == kern.globals.compressor_object: + elif object == compressor_object: object_str = "COMPRESSOR_OBJECT" else: - object_str = "{:<#018x}".format(object) + object_str = "{: <#018x}".format(object) offset = unsigned(vme.vme_offset) & ~0xFFF tag = unsigned(vme.vme_offset & 0xFFF) + protection = "" + if vme.protection & 0x1: + protection +="r" + else: + protection += "-" + if vme.protection & 0x2: + protection += "w" + else: + protection += "-" + if vme.protection & 0x4: + protection += "x" + else: + protection += "-" + max_protection = "" + if vme.max_protection & 0x1: + max_protection +="r" + else: + max_protection += "-" + if vme.max_protection & 0x2: + max_protection += "w" + else: + max_protection += "-" + if vme.max_protection & 0x4: + max_protection += "x" + else: + max_protection += "-" vme_flags = "" if vme.is_sub_map: vme_flags += "s" if vme.needs_copy: vme_flags += "n" - if vme.is_sub_map and vme.use_pmap: + if 
vme.use_pmap: vme_flags += "p" + if vme.wired_count: + vme_flags += "w" + if vme.used_for_jit: + vme_flags += "j" tagstr = "" - if map.pmap == kern.globals.kernel_pmap: + if pmap == kern.globals.kernel_pmap: xsite = Cast(kern.globals.vm_allocation_sites[tag],'OSKextAccount *') if xsite and xsite.site.flags & 0x0200: tagstr = ".{:<3d}".format(xsite.loadTag) - print "{:#018x} {:#018x}:{:#018x} {:>10d} {:>3d}{:<4s} {:1d}{:1d}{:<8s} {:<18s} {:<#18x}".format(vme,vme.links.start,vme.links.end,(unsigned(vme.links.end)-unsigned(vme.links.start))/page_size,tag,tagstr,vme.protection,vme.max_protection,vme_flags,object_str,offset) + rb_info = "" + if show_rb_tree: + rb_info = "l={: <#018x} r={: <#018x} p={: <#018x}".format(vme.store.entry.rbe_left, vme.store.entry.rbe_right, vme.store.entry.rbe_parent) + print "{: <#018x} {:#018x}:{:#018x} {:>10d} {:>3d}{:<4s} {:3s}/{:3s}/{:<8s} {:<18s} {:<#18x} {:s}".format(vme,vme.links.start,vme.links.end,(unsigned(vme.links.end)-unsigned(vme.links.start))/page_size,tag,tagstr,protection,max_protection,vme_flags,object_str,offset, rb_info) if (show_pager_info or show_all_shadows) and vme.is_sub_map == 0 and vme.vme_object.vmo_object != 0: object = vme.vme_object.vmo_object else: object = 0 - depth = 0 - while object != 0: - depth += 1 - if show_all_shadows == False and depth != 1 and object.shadow != 0: - offset += unsigned(object.vo_un2.vou_shadow_offset) - object = object.shadow - continue - if object.copy_strategy == 0: - copy_strategy="N" - elif object.copy_strategy == 2: - copy_strategy="D" - elif object.copy_strategy == 4: - copy_strategy="S" - else: - copy_strategy=str(object.copy_strategy) - if object.internal: - internal = "internal" - else: - internal = "external" - pager_string = "" - pager = object.pager - if show_pager_info and pager != 0: - if object.internal: - pager_string = "-> compressed:{:d}".format(GetCompressedPagesForObject(object)) - elif unsigned(pager.mo_pager_ops) == vnode_pager_ops_addr: - vnode_pager = 
Cast(pager,'vnode_pager *') - pager_string = "-> " + GetVnodePath(vnode_pager.vnode_handle) - else: - pager_string = "-> {:s}:{:#018x}".format(pager.mo_pager_ops.memory_object_pager_name, pager.mo_pager_ops) - print "{:>18d} {:#018x}:{:#018x} {:#018x} ref:{:<6d} ts:{:1d} strat:{:1s} {:s} ({:d} {:d} {:d}) {:s}".format(depth,offset,offset+size,object,object.ref_count,object.true_share,copy_strategy,internal,unsigned(object.vo_un1.vou_size)/page_size,object.resident_page_count,object.wired_page_count,pager_string) -# print " #{:<5d} obj {:#018x} ref:{:<6d} ts:{:1d} strat:{:1s} {:s} size:{:<10d} wired:{:<10d} resident:{:<10d} reusable:{:<10d}".format(depth,object,object.ref_count,object.true_share,copy_strategy,internal,object.vo_un1.vou_size/page_size,object.wired_page_count,object.resident_page_count,object.reusable_page_count) - offset += unsigned(object.vo_un2.vou_shadow_offset) - object = object.shadow - if unsigned(map.hdr.links.end) > last_end: - print "{:18s} {:#018x}:{:#018x} {:>10d}".format("------------------",last_end,map.hdr.links.end,(unsigned(map.hdr.links.end) - last_end)/page_size) + showvmobject(object, offset, size, show_pager_info, show_all_shadows) + if start_vaddr != 0 or end_vaddr != 0: + print "..." 
+ elif unsigned(maphdr.links.end) > last_end: + print "{:18s} {:#018x}:{:#018x} {:>10d}".format("------------------",last_end,maphdr.links.end,(unsigned(maphdr.links.end) - last_end)/page_size) return None def CountMapTags(map, tagcounts, slow): @@ -2802,11 +3027,11 @@ def CountMapTags(map, tagcounts, slow): page = _vm_page_unpack_ptr(page_list) while (page != 0): vmpage = kern.GetValueFromAddress(page, 'vm_page_t') - if (addr == unsigned(vmpage.offset)) and (object == vm_object_t(_vm_page_unpack_ptr(vmpage.vm_page_object))): - if (not vmpage.local) and (vmpage.wire_count > 0): + if (addr == unsigned(vmpage.vmp_offset)) and (object == vm_object_t(_vm_page_unpack_ptr(vmpage.vmp_object))): + if (not vmpage.vmp_local) and (vmpage.vmp_wire_count > 0): count += 1 break - page = _vm_page_unpack_ptr(vmpage.next_m) + page = _vm_page_unpack_ptr(vmpage.vmp_next_m) addr += page_size tagcounts[tag] += count elif vme.is_sub_map: @@ -2817,21 +3042,6 @@ def CountWiredObject(object, tagcounts): tagcounts[unsigned(object.wire_tag)] += object.wired_page_count return None -def CountWiredPurgeableGroup(qhead, tagcounts): - for object in IterateQueue(qhead, 'struct vm_object *', 'objq'): - CountWiredObject(object, tagcounts) - return None - -def CountWiredPurgeableQueue(qhead, tagcounts): - CountWiredPurgeableGroup(qhead.objq[0], tagcounts) - CountWiredPurgeableGroup(qhead.objq[1], tagcounts) - CountWiredPurgeableGroup(qhead.objq[2], tagcounts) - CountWiredPurgeableGroup(qhead.objq[3], tagcounts) - CountWiredPurgeableGroup(qhead.objq[4], tagcounts) - CountWiredPurgeableGroup(qhead.objq[5], tagcounts) - CountWiredPurgeableGroup(qhead.objq[6], tagcounts) - CountWiredPurgeableGroup(qhead.objq[7], tagcounts) - def GetKmodIDName(kmod_id): kmod_val = kern.globals.kmod for kmod in IterateLinkedList(kmod_val, 'next'): @@ -2874,17 +3084,22 @@ def GetKmodIDName(kmod_id): def GetVMKernName(tag): return FixedTags[tag] -@lldb_command("showvmtags", "S") +@lldb_command("showvmtags", "AS") def 
showvmtags(cmd_args=None, cmd_options={}): """Routine to print out info about kernel wired page allocations usage: showvmtags iterates kernel map and vm objects totaling allocations by tag. usage: showvmtags -S also iterates kernel object pages individually - slow. + usage: showvmtags -A + show all tags, even tags that have no wired count """ slow = False if "-S" in cmd_options: slow = True + all_tags = False + if "-A" in cmd_options: + all_tags = True page_size = unsigned(kern.globals.page_size) tagcounts = [] tagpeaks = [] @@ -2901,25 +3116,16 @@ def showvmtags(cmd_args=None, cmd_options={}): tagpeaks[unsigned(tag)] = unsigned(site.peak) else: queue_head = kern.globals.vm_objects_wired - for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'): + for object in IterateQueue(queue_head, 'struct vm_object *', 'wired_objq'): if object != kern.globals.kernel_object: CountWiredObject(object, tagcounts) - queue_head = kern.globals.purgeable_nonvolatile_queue - for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'): - CountWiredObject(object, tagcounts) - - purgeable_queues = kern.globals.purgeable_queues - CountWiredPurgeableQueue(purgeable_queues[0], tagcounts) - CountWiredPurgeableQueue(purgeable_queues[1], tagcounts) - CountWiredPurgeableQueue(purgeable_queues[2], tagcounts) - CountMapTags(kern.globals.kernel_map, tagcounts, slow) total = 0 print " {:<7s} {:>7s} {:>7s} {:<50s}".format("tag.kmod","peak","size","name") for tag in range(256): - if tagcounts[tag]: + if all_tags or tagcounts[tag]: total += tagcounts[tag] tagstr = "" sitestr = "" @@ -3023,8 +3229,8 @@ def VMPageLookup(cmd_args=None): page = _vm_page_unpack_ptr(page_list) while (page != 0) : pg_t = kern.GetValueFromAddress(page, 'vm_page_t') - print format_string.format(page, pg_t.offset, _vm_page_unpack_ptr(pg_t.vm_page_object)) - page = _vm_page_unpack_ptr(pg_t.next_m) + print format_string.format(page, pg_t.vmp_offset, _vm_page_unpack_ptr(pg_t.vmp_object)) + page = 
_vm_page_unpack_ptr(pg_t.vmp_next_m) @@ -3044,7 +3250,7 @@ def VmPageGetPhysPage(cmd_args=None): def _vm_page_get_phys_page(page): if kern.arch == 'x86_64': - return page.phys_page + return page.vmp_phys_page if page == 0 : return 0 @@ -3181,7 +3387,7 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): page_found = False pages_seen = set() - for vmp in IterateQueue(obj.memq, "vm_page_t", "listq", walk_backwards, unpack_ptr_fn=_vm_page_unpack_ptr): + for vmp in IterateQueue(obj.memq, "vm_page_t", "vmp_listq", walk_backwards, unpack_ptr_fn=_vm_page_unpack_ptr): page_count += 1 out_string = "" if (page != 0 and not(page_found) and vmp == page): @@ -3192,41 +3398,45 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): if (page_count % 1000) == 0: print "traversed %d pages ...\n" % (page_count) else: - out_string += format_string.format(page_count, res_page_count, vmp, vmp.offset, _vm_page_unpack_ptr(vmp.listq.next), _vm_page_get_phys_page(vmp), vmp.wire_count) - out_string += first_bitfield_format_string.format(vmp.vm_page_q_state, vmp.vm_page_in_background, vmp.vm_page_on_backgroundq, vmp.gobbled, vmp.laundry, vmp.no_cache, - vmp.private, vmp.reference) + out_string += format_string.format(page_count, res_page_count, vmp, vmp.vmp_offset, _vm_page_unpack_ptr(vmp.listq.next), _vm_page_get_phys_page(vmp), vmp.vmp_wire_count) + out_string += first_bitfield_format_string.format(vmp.vmp_q_state, vmp.vmp_in_background, vmp.vmp_on_backgroundq, vmp.vmp_gobbled, vmp.vmp_laundry, vmp.vmp_no_cache, + vmp.vmp_private, vmp.vmp_reference) - out_string += second_bitfield_format_string.format(vmp.busy, vmp.wanted, vmp.tabled, vmp.hashed, vmp.fictitious, vmp.clustered, - vmp.pmapped, vmp.xpmapped, vmp.wpmapped, vmp.free_when_done, vmp.absent, - vmp.error, vmp.dirty, vmp.cleaning, vmp.precious, vmp.overwriting, - vmp.restart, vmp.unusual, 0, 0, - vmp.cs_validated, vmp.cs_tainted, vmp.cs_nx, vmp.reusable, vmp.lopage, vmp.slid, - vmp.written_by_kernel) + if hasattr(vmp,'slid'): + 
vmp_slid = vmp.slid + else: + vmp_slid = 0 + out_string += second_bitfield_format_string.format(vmp.vmp_busy, vmp.vmp_wanted, vmp.vmp_tabled, vmp.vmp_hashed, vmp.vmp_fictitious, vmp.vmp_clustered, + vmp.vmp_pmapped, vmp.vmp_xpmapped, vmp.vmp_wpmapped, vmp.vmp_free_when_done, vmp.vmp_absent, + vmp.vmp_error, vmp.vmp_dirty, vmp.vmp_cleaning, vmp.vmp_precious, vmp.vmp_overwriting, + vmp.vmp_restart, vmp.vmp_unusual, 0, 0, + vmp.vmp_cs_validated, vmp.vmp_cs_tainted, vmp.vmp_cs_nx, vmp.vmp_reusable, vmp.vmp_lopage, vmp_slid, + vmp.vmp_written_by_kernel) if (vmp in pages_seen): print out_string + "cycle detected! we've seen vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " twice. stopping...\n" return - if (_vm_page_unpack_ptr(vmp.vm_page_object) != unsigned(obj)): - print out_string + " vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " points to different vm_object_t: " + "{0: <#020x}".format(unsigned(_vm_page_unpack_ptr(vmp.vm_page_object))) + if (_vm_page_unpack_ptr(vmp.vmp_object) != unsigned(obj)): + print out_string + " vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " points to different vm_object_t: " + "{0: <#020x}".format(unsigned(_vm_page_unpack_ptr(vmp.vmp_object))) return - if (vmp.vm_page_q_state == VM_PAGE_IS_WIRED) and (vmp.wire_count == 0): + if (vmp.vmp_q_state == VM_PAGE_IS_WIRED) and (vmp.vmp_wire_count == 0): print out_string + " page in wired state with wire_count of 0\n" print "vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + "\n" print "stopping...\n" return - if ((vmp.__unused_pageq_bits != 0) or (vmp.__unused_object_bits != 0)): - print out_string + " unused bits not zero for vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " unused__pageq_bits: %d unused_object_bits : %d\n" % (vmp.__unused_pageq_bits, - vmp.__unused_object_bits) + if ((vmp.vmp_unused_page_bits != 0) or (vmp.vmp_unused_object_bits != 0)): + print out_string + " unused bits not zero for vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " 
unused__pageq_bits: %d unused_object_bits : %d\n" % (vmp.vmp_unused_page_bits, + vmp.vmp_unused_object_bits) print "stopping...\n" return pages_seen.add(vmp) if False: - hash_id = _calc_vm_page_hash(obj, vmp.offset) + hash_id = _calc_vm_page_hash(obj, vmp.vmp_offset) hash_page_list = kern.globals.vm_page_buckets[hash_id].page_list hash_page = _vm_page_unpack_ptr(hash_page_list) hash_page_t = 0 @@ -3235,11 +3445,11 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}): hash_page_t = kern.GetValueFromAddress(hash_page, 'vm_page_t') if hash_page_t == vmp: break - hash_page = _vm_page_unpack_ptr(hash_page_t.next_m) + hash_page = _vm_page_unpack_ptr(hash_page_t.vmp_next_m) if (unsigned(vmp) != unsigned(hash_page_t)): print out_string + "unable to find page: " + "{0: <#020x}".format(unsigned(vmp)) + " from object in kernel page bucket list\n" - print lldb_run_command("vm_page_info %s 0x%x" % (cmd_args[0], unsigned(vmp.offset))) + print lldb_run_command("vm_page_info %s 0x%x" % (cmd_args[0], unsigned(vmp.vmp_offset))) return if (page_count >= limit and not(ignore_limit)): @@ -3278,9 +3488,9 @@ def ShowAppleProtectPager(cmd_args=None): usage: show_apple_protect_pager """ if cmd_args == None or len(cmd_args) < 1: - print "Invalid argument.", ShowMap.__doc__ + print "Invalid argument.", ShowAppleProtectPager.__doc__ return - pager = kern.GetValueFromAddress(cmd_ars[0], 'apple_protect_pager_t') + pager = kern.GetValueFromAddress(cmd_args[0], 'apple_protect_pager_t') show_apple_protect_pager(pager, 1, 1) def show_apple_protect_pager(pager, qcnt, idx): @@ -3291,7 +3501,7 @@ def show_apple_protect_pager(pager, qcnt, idx): shadow = object.shadow vnode_pager = Cast(object.pager,'vnode_pager *') filename = GetVnodePath(vnode_pager.vnode_handle) - print "{:>3}/{:<3d} {:#018x} {:>5d} {:>5d} {:>6d} {:#018x} {:#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{:#018x} \n\tvnode:{:#018x} {:s}\n".format(idx, qcnt, pager, pager.ref_count, pager.is_ready, pager.is_mapped, 
pager.pager_control, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename) + print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {: <#018x} {: <#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{: <#018x} \n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, pager.ref_count, pager.is_ready, pager.is_mapped, pager.pager_control, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename) @lldb_command("show_console_ring") def ShowConsoleRingData(cmd_args=None): @@ -3449,3 +3659,619 @@ def ShowVnodeDirtyBlocks(cmd_args=None): _ShowVnodeBlocks(True, cmd_args) # EndMacro: showvnodecleanblk/showvnodedirtyblk + + +@lldb_command("vm_page_lookup_in_map") +def VmPageLookupInMap(cmd_args=None): + """Lookup up a page at a virtual address in a VM map + usage: vm_page_lookup_in_map + """ + if cmd_args == None or len(cmd_args) < 2: + print "Invalid argument.", VmPageLookupInMap.__doc__ + return + map = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t') + vaddr = kern.GetValueFromAddress(cmd_args[1], 'vm_map_offset_t') + print "vaddr {:#018x} in map {: <#018x}".format(vaddr, map) + vm_page_lookup_in_map(map, vaddr) + +def vm_page_lookup_in_map(map, vaddr): + vaddr = unsigned(vaddr) + vme_list_head = map.hdr.links + vme_ptr_type = GetType('vm_map_entry *') + for vme in IterateQueue(vme_list_head, vme_ptr_type, "links"): + if unsigned(vme.links.start) > vaddr: + break + if unsigned(vme.links.end) <= vaddr: + continue + offset_in_vme = vaddr - unsigned(vme.links.start) + print " offset {:#018x} in map entry {: <#018x} 
[{:#018x}:{:#018x}] object {: <#018x} offset {:#018x}".format(offset_in_vme, vme, unsigned(vme.links.start), unsigned(vme.links.end), vme.vme_object.vmo_object, unsigned(vme.vme_offset) & ~0xFFF) + offset_in_object = offset_in_vme + (unsigned(vme.vme_offset) & ~0xFFF) + if vme.is_sub_map: + print "vaddr {:#018x} in map {: <#018x}".format(offset_in_object, vme.vme_object.vmo_submap) + vm_page_lookup_in_map(vme.vme_object.vmo_submap, offset_in_object) + else: + vm_page_lookup_in_object(vme.vme_object.vmo_object, offset_in_object) + +@lldb_command("vm_page_lookup_in_object") +def VmPageLookupInObject(cmd_args=None): + """Lookup up a page at a given offset in a VM object + usage: vm_page_lookup_in_object + """ + if cmd_args == None or len(cmd_args) < 2: + print "Invalid argument.", VmPageLookupInObject.__doc__ + return + object = kern.GetValueFromAddress(cmd_args[0], 'vm_object_t') + offset = kern.GetValueFromAddress(cmd_args[1], 'vm_object_offset_t') + print "offset {:#018x} in object {: <#018x}".format(offset, object) + vm_page_lookup_in_object(object, offset) + +def vm_page_lookup_in_object(object, offset): + offset = unsigned(offset) + page_size = kern.globals.page_size + trunc_offset = offset & ~(page_size - 1) + print " offset {:#018x} in VM object {: <#018x}".format(offset, object) + hash_id = _calc_vm_page_hash(object, trunc_offset) + page_list = kern.globals.vm_page_buckets[hash_id].page_list + page = _vm_page_unpack_ptr(page_list) + while page != 0: + m = kern.GetValueFromAddress(page, 'vm_page_t') + m_object_val = _vm_page_unpack_ptr(m.vmp_object) + m_object = kern.GetValueFromAddress(m_object_val, 'vm_object_t') + if unsigned(m_object) != unsigned(object) or unsigned(m.vmp_offset) != unsigned(trunc_offset): + page = _vm_page_unpack_ptr(m.vmp_next_m) + continue + print " resident page {: <#018x} phys {:#010x}".format(m, _vm_page_get_phys_page(m)) + return + if object.pager and object.pager_ready: + offset_in_pager = trunc_offset + 
unsigned(object.paging_offset) + if not object.internal: + print " offset {:#018x} in external '{:s}' {: <#018x}".format(offset_in_pager, object.pager.mo_pager_ops.memory_object_pager_name, object.pager) + return + pager = Cast(object.pager, 'compressor_pager *') + ret = vm_page_lookup_in_compressor_pager(pager, offset_in_pager) + if ret: + return + if object.shadow and not object.phys_contiguous: + offset_in_shadow = offset + unsigned(object.vo_un2.vou_shadow_offset) + vm_page_lookup_in_object(object.shadow, offset_in_shadow) + return + print " page is absent and will be zero-filled on demand" + return + +@lldb_command("vm_page_lookup_in_compressor_pager") +def VmPageLookupInCompressorPager(cmd_args=None): + """Lookup up a page at a given offset in a compressor pager + usage: vm_page_lookup_in_compressor_pager + """ + if cmd_args == None or len(cmd_args) < 2: + print "Invalid argument.", VmPageLookupInCompressorPager.__doc__ + return + pager = kern.GetValueFromAddress(cmd_args[0], 'compressor_pager_t') + offset = kern.GetValueFromAddress(cmd_args[1], 'memory_object_offset_t') + print "offset {:#018x} in compressor pager {: <#018x}".format(offset, pager) + vm_page_lookup_in_compressor_pager(pager, offset) + +def vm_page_lookup_in_compressor_pager(pager, offset): + offset = unsigned(offset) + page_size = unsigned(kern.globals.page_size) + page_num = unsigned(offset / page_size) + if page_num > pager.cpgr_num_slots: + print " *** ERROR: vm_page_lookup_in_compressor_pager({: <#018x},{:#018x}): page_num {:#x} > num_slots {:#x}".format(pager, offset, page_num, pager.cpgr_num_slots) + return 0 + slots_per_chunk = 512 / sizeof ('compressor_slot_t') + num_chunks = unsigned((pager.cpgr_num_slots+slots_per_chunk-1) / slots_per_chunk) + if num_chunks > 1: + chunk_idx = unsigned(page_num / slots_per_chunk) + chunk = pager.cpgr_slots.cpgr_islots[chunk_idx] + slot_idx = unsigned(page_num % slots_per_chunk) + slot = GetObjectAtIndexFromArray(chunk, slot_idx) + slot_str = 
"islots[{:d}][{:d}]".format(chunk_idx, slot_idx) + elif pager.cpgr_num_slots > 2: + slot_idx = page_num + slot = GetObjectAtIndexFromArray(pager.cpgr_slots.cpgr_dslots, slot_idx) + slot_str = "dslots[{:d}]".format(slot_idx) + else: + slot_idx = page_num + slot = GetObjectAtIndexFromArray(pager.cpgr_slots.cpgr_eslots, slot_idx) + slot_str = "eslots[{:d}]".format(slot_idx) + print " offset {:#018x} in compressor pager {: <#018x} {:s} slot {: <#018x}".format(offset, pager, slot_str, slot) + if slot == 0: + return 0 + slot_value = dereference(slot) + print " value {:#010x}".format(slot_value) + vm_page_lookup_in_compressor(Cast(slot, 'c_slot_mapping_t')) + return 1 + +@lldb_command("vm_page_lookup_in_compressor") +def VmPageLookupInCompressor(cmd_args=None): + """Lookup up a page in a given compressor slot + usage: vm_page_lookup_in_compressor + """ + if cmd_args == None or len(cmd_args) < 1: + print "Invalid argument.", VmPageLookupInCompressor.__doc__ + return + slot = kern.GetValueFromAddress(cmd_args[0], 'compressor_slot_t *') + print "compressor slot {: <#018x}".format(slot) + vm_page_lookup_in_compressor(slot) + +C_SV_CSEG_ID = ((1 << 22) - 1) + +def vm_page_lookup_in_compressor(slot_ptr): + slot_ptr = Cast(slot_ptr, 'compressor_slot_t *') + slot_value = dereference(slot_ptr) + slot = Cast(slot_value, 'c_slot_mapping') + print slot + print "compressor slot {: <#018x} -> {:#010x} cseg {:d} cindx {:d}".format(unsigned(slot_ptr), unsigned(slot_value), slot.s_cseg, slot.s_cindx) + if slot_ptr == 0: + return + if slot.s_cseg == C_SV_CSEG_ID: + sv = kern.globals.c_segment_sv_hash_table + print "single value[{:#d}]: ref {:d} value {:#010x}".format(slot.s_cindx, sv[slot.s_cindx].c_sv_he_un.c_sv_he.c_sv_he_ref, sv[slot.s_cindx].c_sv_he_un.c_sv_he.c_sv_he_data) + return + if slot.s_cseg == 0 or unsigned(slot.s_cseg) > unsigned(kern.globals.c_segments_available): + print "*** ERROR: s_cseg {:d} is out of bounds (1 - {:d})".format(slot.s_cseg, 
unsigned(kern.globals.c_segments_available)) + return + c_segments = kern.globals.c_segments + c_segments_elt = GetObjectAtIndexFromArray(c_segments, slot.s_cseg-1) + c_seg = c_segments_elt.c_seg + c_no_data = 0 + if hasattr(c_seg, 'c_state'): + c_state = c_seg.c_state + if c_state == 0: + c_state_str = "C_IS_EMPTY" + c_no_data = 1 + elif c_state == 1: + c_state_str = "C_IS_FREE" + c_no_data = 1 + elif c_state == 2: + c_state_str = "C_IS_FILLING" + elif c_state == 3: + c_state_str = "C_ON_AGE_Q" + elif c_state == 4: + c_state_str = "C_ON_SWAPOUT_Q" + elif c_state == 5: + c_state_str = "C_ON_SWAPPEDOUT_Q" + c_no_data = 1 + elif c_state == 6: + c_state_str = "C_ON_SWAPPEDOUTSPARSE_Q" + c_no_data = 1 + elif c_state == 7: + c_state_str = "C_ON_SWAPPEDIN_Q" + elif c_state == 8: + c_state_str = "C_ON_MAJORCOMPACT_Q" + elif c_state == 9: + c_state_str = "C_ON_BAD_Q" + c_no_data = 1 + else: + c_state_str = "" + else: + c_state = -1 + c_state_str = "" + print "c_segments[{:d}] {: <#018x} c_seg {: <#018x} c_state {:#x}={:s}".format(slot.s_cseg-1, c_segments_elt, c_seg, c_state, c_state_str) + c_indx = unsigned(slot.s_cindx) + if hasattr(c_seg, 'c_slot_var_array'): + c_seg_fixed_array_len = kern.globals.c_seg_fixed_array_len + if c_indx < c_seg_fixed_array_len: + cs = c_seg.c_slot_fixed_array[c_indx] + else: + cs = GetObjectAtIndexFromArray(c_seg.c_slot_var_array, c_indx - c_seg_fixed_array_len) + else: + C_SEG_SLOT_ARRAY_SIZE = 64 + C_SEG_SLOT_ARRAY_MASK = C_SEG_SLOT_ARRAY_SIZE - 1 + cs = GetObjectAtIndexFromArray(c_seg.c_slots[c_indx / C_SEG_SLOT_ARRAY_SIZE], c_indx & C_SEG_SLOT_ARRAY_MASK) + print cs + c_slot_unpacked_ptr = (unsigned(cs.c_packed_ptr) << 2) + vm_min_kernel_and_kext_address() + print "c_slot {: <#018x} c_offset {:#x} c_size {:#x} c_packed_ptr {:#x} (unpacked: {: <#018x})".format(cs, cs.c_offset, cs.c_size, cs.c_packed_ptr, unsigned(c_slot_unpacked_ptr)) + if unsigned(slot_ptr) != unsigned(c_slot_unpacked_ptr): + print "*** ERROR: compressor slot {: <#018x} 
points back to {: <#018x} instead of itself".format(slot_ptr, c_slot_unpacked_ptr) + if c_no_data == 0: + c_data = c_seg.c_store.c_buffer + (4 * cs.c_offset) + c_size = cs.c_size + cmd = "memory read {: <#018x} {: <#018x} --force".format(c_data, c_data + c_size) + print cmd + print lldb_run_command(cmd) + else: + print "" + +def vm_min_kernel_and_kext_address(cmd_args=None): + if hasattr(kern.globals, 'vm_min_kernel_and_kext_address'): + return unsigned(kern.globals.vm_min_kernel_and_kext_address) + elif kern.arch == 'x86_64': + return unsigned(0xffffff7f80000000) + elif kern.arch == 'arm64': + return unsigned(0xffffff8000000000) + elif kern.arch == 'arm': + return unsigned(0x80000000) + else: + print "vm_min_kernel_and_kext_address(): unknown arch '{:s}'".format(kern.arch) + return unsigned(0) + +def print_hex_data(data, begin_offset=0, desc=""): + """ print on stdout "hexdump -C < data" like output + params: + data - bytearray or array of int where each int < 255 + begin_offset - int offset that should be printed in left column + desc - str optional description to print on the first line to describe data + """ + if desc: + print "{}:".format(desc) + index = 0 + total_len = len(data) + hex_buf = "" + char_buf = "" + while index < total_len: + hex_buf += " {:02x}".format(data[index]) + if data[index] < 0x20 or data[index] > 0x7e: + char_buf += "." 
+ else: + char_buf += "{:c}".format(data[index]) + index += 1 + if index and index % 8 == 0: + hex_buf += " " + if index > 1 and (index % 16) == 0: + print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf) + hex_buf = "" + char_buf = "" + print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf) + return + +@lldb_command('vm_scan_all_pages') +def VMScanAllPages(cmd_args=None): + """Scans the vm_pages[] array + """ + vm_pages_count = kern.globals.vm_pages_count + vm_pages = kern.globals.vm_pages + + free_count = 0 + local_free_count = 0 + active_count = 0 + local_active_count = 0 + inactive_count = 0 + speculative_count = 0 + throttled_count = 0 + wired_count = 0 + compressor_count = 0 + pageable_internal_count = 0 + pageable_external_count = 0 + secluded_count = 0 + secluded_free_count = 0 + secluded_inuse_count = 0 + + i = 0 + while i < vm_pages_count: + + if i % 10000 == 0: + print "{:d}/{:d}...\n".format(i,vm_pages_count) + + m = vm_pages[i] + + internal = 0 + external = 0 + m_object_val = _vm_page_unpack_ptr(m.vmp_object) + + if m_object: + if m_object.internal: + internal = 1 + else: + external = 1 + + if m.vmp_wire_count != 0 and m.vmp_local == 0: + wired_count = wired_count + 1 + pageable = 0 + elif m.vmp_throttled: + throttled_count = throttled_count + 1 + pageable = 0 + elif m.vmp_active: + active_count = active_count + 1 + pageable = 1 + elif m.vmp_local: + local_active_count = local_active_count + 1 + pageable = 0 + elif m.vmp_inactive: + inactive_count = inactive_count + 1 + pageable = 1 + elif m.vmp_speculative: + speculative_count = speculative_count + 1 + pageable = 0 + elif m.vmp_free: + free_count = free_count + 1 + pageable = 0 + elif m.vmp_secluded: + secluded_count = secluded_count + 1 + if m_object == 0: + secluded_free_count = secluded_free_count + 1 + else: + secluded_inuse_count = secluded_inuse_count + 1 + pageable = 0 + elif m_object == 0 and m.vmp_busy: + local_free_count = 
local_free_count + 1 + pageable = 0 + elif m.vmp_compressor: + compressor_count = compressor_count + 1 + pageable = 0 + else: + print "weird page vm_pages[{:d}]?\n".format(i) + pageable = 0 + + if pageable: + if internal: + pageable_internal_count = pageable_internal_count + 1 + else: + pageable_external_count = pageable_external_count + 1 + i = i + 1 + + print "vm_pages_count = {:d}\n".format(vm_pages_count) + + print "wired_count = {:d}\n".format(wired_count) + print "throttled_count = {:d}\n".format(throttled_count) + print "active_count = {:d}\n".format(active_count) + print "local_active_count = {:d}\n".format(local_active_count) + print "inactive_count = {:d}\n".format(inactive_count) + print "speculative_count = {:d}\n".format(speculative_count) + print "free_count = {:d}\n".format(free_count) + print "local_free_count = {:d}\n".format(local_free_count) + print "compressor_count = {:d}\n".format(compressor_count) + + print "pageable_internal_count = {:d}\n".format(pageable_internal_count) + print "pageable_external_count = {:d}\n".format(pageable_external_count) + print "secluded_count = {:d}\n".format(secluded_count) + print "secluded_free_count = {:d}\n".format(secluded_free_count) + print "secluded_inuse_count = {:d}\n".format(secluded_inuse_count) + + +@lldb_command('show_all_vm_named_entries') +def ShowAllVMNamedEntries(cmd_args=None): + """ Routine to print a summary listing of all the VM named entries + """ + queue_len = kern.globals.vm_named_entry_count + queue_head = kern.globals.vm_named_entry_list + + print 'vm_named_entry_list:{: <#018x} vm_named_entry_count:{:d}\n'.format(kern.GetLoadAddressForSymbol('vm_named_entry_list'),queue_len) + + print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:>3s} {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","tag","owner","pid","process") + idx = 0 + for entry in IterateQueue(queue_head, 'struct vm_named_entry *', 
'named_entry_list'): + idx += 1 + showmemoryentry(entry, idx, queue_len) + +@lldb_command('show_vm_named_entry') +def ShowVMNamedEntry(cmd_args=None): + """ Routine to print a VM named entry + """ + if cmd_args == None or len(cmd_args) < 1: + print "Invalid argument.", ShowMapVMNamedEntry.__doc__ + return + named_entry = kern.GetValueFromAddress(cmd_args[0], 'vm_named_entry_t') + showmemoryentry(named_entry, 0, 0) + +def showmemoryentry(entry, idx=0, queue_len=0): + """ Routine to print out a summary a VM memory entry + params: + entry - core.value : a object of type 'struct vm_named_entry *' + returns: + None + """ + show_pager_info = True + show_all_shadows = True + + backing = "" + if entry.is_sub_map == 1: + backing += "SUBMAP" + if entry.is_copy == 1: + backing += "COPY" + if entry.is_sub_map == 0 and entry.is_copy == 0: + backing += "OBJECT" + prot="" + if entry.protection & 0x1: + prot += "r" + else: + prot += "-" + if entry.protection & 0x2: + prot += "w" + else: + prot += "-" + if entry.protection & 0x4: + prot += "x" + else: + prot += "-" + extra_str = "" + if hasattr(entry, 'named_entry_alias'): + extra_str += " alias={:d}".format(entry.named_entry_alias) + if hasattr(entry, 'named_entry_port'): + extra_str += " port={:#016x}".format(entry.named_entry_port) + print "{:>6d}/{:<6d} {: <#018x} ref={:d} prot={:d}/{:s} type={:s} backing={: <#018x} offset={:#016x} dataoffset={:#016x} size={:#016x}{:s}\n".format(idx,queue_len,entry,entry.ref_count,entry.protection,prot,backing,entry.backing.object,entry.offset,entry.data_offset,entry.size,extra_str) + if entry.is_sub_map == 1: + showmapvme(entry.backing.map, 0, 0, show_pager_info, show_all_shadows) + if entry.is_copy == 1: + showmapcopyvme(entry.backing.copy, 0, 0, 0, show_pager_info, show_all_shadows, 0) + if entry.is_sub_map == 0 and entry.is_copy == 0: + showvmobject(entry.backing.object, entry.offset, entry.size, show_pager_info, show_all_shadows) + + +def IterateRBTreeEntry2(element, element_type, 
field_name1, field_name2): + """ iterate over a rbtree as defined with RB_HEAD in libkern/tree.h + element - value : Value object for rbh_root + element_type - str : Type of the link element + field_name - str : Name of the field in link element's structure + returns: + A generator does not return. It is used for iterating + value : an object thats of type (element_type) head->sle_next. Always a pointer object + """ + elt = element.__getattr__('rbh_root') + if type(element_type) == str: + element_type = gettype(element_type) + charp_type = gettype('char *'); + + # Walk to find min + parent = elt + while unsigned(elt) != 0: + parent = elt + elt = cast(elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_left'), element_type) + elt = parent + + # Now elt is min + while unsigned(elt) != 0: + yield elt + # implementation cribbed from RB_NEXT in libkern/tree.h + right = cast(elt.__getattr__(field_name1).__getattr__(fieldname2).__getattr__('rbe_right'), element_type) + if unsigned(right) != 0: + elt = right + left = cast(elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_left'), element_type) + while unsigned(left) != 0: + elt = left + left = cast(elt.__getattr__(field_name1).__getattr(__field_name2).__getattr__('rbe_left'), element_type) + else: + + # avoid using GetValueFromAddress + addr = elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_parent')&~1 + parent = value(elt.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr))) + parent = cast(parent, element_type) + + if unsigned(parent) != 0: + left = cast(parent.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_left'), element_type) + if (unsigned(parent) != 0) and (unsigned(elt) == unsigned(left)): + elt = parent + else: + if unsigned(parent) != 0: + right = cast(parent.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_right'), element_type) + while unsigned(parent) != 0 and (unsigned(elt) == unsigned(right)): + 
elt = parent + + # avoid using GetValueFromAddress + addr = elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_parent')&~1 + parent = value(elt.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr))) + parent = cast(parent, element_type) + + right = cast(parent.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_right'), element_type) + + # avoid using GetValueFromAddress + addr = elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_parent')&~1 + elt = value(elt.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr))) + elt = cast(elt, element_type) + + +@lldb_command("showmaprb") +def ShowMapRB(cmd_args=None): + """Routine to print out a VM map's RB tree + usage: showmaprb + """ + if cmd_args == None or len(cmd_args) < 1: + print "Invalid argument.", ShowMapRB.__doc__ + return + map_val = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t') + print GetVMMapSummary.header + print GetVMMapSummary(map_val) + vme_rb_root = map_val.hdr.rb_head_store + vme_ptr_type = GetType('struct vm_map_entry *') + print GetVMEntrySummary.header + for vme in IterateRBTreeEntry2(vme_rb_root, 'struct vm_map_entry *', 'store', 'entry'): + print GetVMEntrySummary(vme) + return None + +@lldb_command('show_all_owned_objects', 'T') +def ShowAllOwnedObjects(cmd_args=None, cmd_options={}): + """ Routine to print the list of VM objects owned by each task + -T: show only ledger-tagged objects + """ + showonlytagged = False + if "-T" in cmd_options: + showonlytagged = True + for task in kern.tasks: + ShowTaskOwnedVmObjects(task, showonlytagged) + +@lldb_command('show_task_owned_objects', 'T') +def ShowTaskOwnedObjects(cmd_args=None, cmd_options={}): + """ Routine to print the list of VM objects owned by the specified task + -T: show only ledger-tagged objects + """ + showonlytagged = False + if "-T" in cmd_options: + showonlytagged = True + task = kern.GetValueFromAddress(cmd_args[0], 'task *') + ShowTaskOwnedVmObjects(task, 
showonlytagged) + +def ShowTaskOwnedVmObjects(task, showonlytagged=False): + """ Routine to print out a summary listing of all the entries in a vm_map + params: + task - core.value : a object of type 'task *' + returns: + None + """ + taskobjq_total = lambda:None + taskobjq_total.objects = 0 + taskobjq_total.vsize = 0 + taskobjq_total.rsize = 0 + taskobjq_total.wsize = 0 + taskobjq_total.csize = 0 + vmo_list_head = task.task_objq + vmo_ptr_type = GetType('vm_object *') + idx = 0 + for vmo in IterateQueue(vmo_list_head, vmo_ptr_type, "task_objq"): + idx += 1 + if not showonlytagged or vmo.vo_ledger_tag != 0: + if taskobjq_total.objects == 0: + print ' \n' + print GetTaskSummary.header + ' ' + GetProcSummary.header + print GetTaskSummary(task) + ' ' + GetProcSummary(Cast(task.bsd_info, 'proc *')) + print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:>2s} {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","tg","owner","pid","process") + ShowOwnedVmObject(vmo, idx, 0, taskobjq_total) + if taskobjq_total.objects != 0: + print " total:{:<10d} [ virtual:{:<10d} resident:{:<10d} wired:{:<10d} compressed:{:<10d} ]\n".format(taskobjq_total.objects, taskobjq_total.vsize, taskobjq_total.rsize, taskobjq_total.wsize, taskobjq_total.csize) + return None + +def ShowOwnedVmObject(object, idx, queue_len, taskobjq_total): + """ Routine to print out a VM object owned by a task + params: + object - core.value : a object of type 'struct vm_object *' + returns: + None + """ + page_size = kern.globals.page_size + if object.purgable == 0: + purgable = "N" + elif object.purgable == 1: + purgable = "V" + elif object.purgable == 2: + purgable = "E" + elif object.purgable == 3: + purgable = "D" + else: + purgable = "?" 
+ if object.pager == 0: + compressed_count = 0 + else: + compressor_pager = Cast(object.pager, 'compressor_pager *') + compressed_count = compressor_pager.cpgr_num_slots_occupied + + print "{:>6d}/{:<6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:>2d} {: <#018x} {:>6d} {:<20s}\n".format(idx,queue_len,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_ledger_tag, object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner)) + + taskobjq_total.objects += 1 + taskobjq_total.vsize += object.vo_un1.vou_size/page_size + taskobjq_total.rsize += object.resident_page_count + taskobjq_total.wsize += object.wired_page_count + taskobjq_total.csize += compressed_count + +def GetProcPIDForObjectOwner(owner): + """ same as GetProcPIDForTask() but deals with -1 for a disowned object + """ + if unsigned(Cast(owner, 'int')) == unsigned(int(0xffffffff)): + return -1 + return GetProcPIDForTask(owner) + +def GetProcNameForObjectOwner(owner): + """ same as GetProcNameForTask() but deals with -1 for a disowned object + """ + if unsigned(Cast(owner, 'int')) == unsigned(int(0xffffffff)): + return "" + return GetProcNameForTask(owner) + +def GetDescForNamedEntry(mem_entry): + out_str = "\n" + out_str += "\t\tmem_entry {:#08x} ref:{:d} offset:{:#08x} size:{:#08x} prot{:d} backing {:#08x}".format(mem_entry, mem_entry.ref_count, mem_entry.offset, mem_entry.size, mem_entry.protection, mem_entry.backing.object) + if mem_entry.is_sub_map: + out_str += " is_sub_map" + elif mem_entry.is_copy: + out_str += " is_copy" + else: + out_str += " is_object" + return out_str diff --git a/tools/lldbmacros/misc.py b/tools/lldbmacros/misc.py index 88b1d7673..fd5382f1a 100755 --- a/tools/lldbmacros/misc.py +++ b/tools/lldbmacros/misc.py @@ -95,7 +95,7 @@ def GetCpuDataForCpuID(cpu_id): if kern.arch == 'x86_64': cpu_data = 
kern.globals.cpu_data_ptr[cpu_id] return cpu_data - elif kern.arch in ['arm', 'arm64'] : + elif kern.arch.startswith('arm'): data_entries_addr = kern.GetLoadAddressForSymbol('CpuDataEntries') data_entries = kern.GetValueFromAddress(data_entries_addr, 'cpu_data_entry_t *') data_entry = data_entries[cpu_id]; @@ -689,7 +689,7 @@ def DumpRawTraceFile(cmd_args=[], cmd_options={}): print "Trace buffer not enabled\n" return - if ((kern.arch == "x86_64") or (kern.arch == "arm64")) : + if ((kern.arch == "x86_64") or kern.arch.startswith("arm64")) : lp64 = True elif kern.arch == "arm" : lp64 = False diff --git a/tools/lldbmacros/pmap.py b/tools/lldbmacros/pmap.py index 6f9bbb6ec..40529c70c 100755 --- a/tools/lldbmacros/pmap.py +++ b/tools/lldbmacros/pmap.py @@ -920,6 +920,10 @@ def DecodeTTE(cmd_args=None): else: raise NotImplementedError("decode_tte does not support {0}".format(kern.arch)) + +PVH_HIGH_FLAGS_ARM64 = (1 << 62) | (1 << 61) | (1 << 60) | (1 << 59) +PVH_HIGH_FLAGS_ARM32 = (1 << 31) + def PVWalkARM(pa): """ Walk a physical-to-virtual reverse mapping list maintained by the arm pmap pa: physical address (NOT page number). 
Does not need to be page-aligned @@ -928,10 +932,19 @@ def PVWalkARM(pa): vm_last_phys = unsigned(kern.globals.vm_last_phys) if pa < vm_first_phys or pa >= vm_last_phys: raise ArgumentError("PA {:#x} is outside range of managed physical addresses: [{:#x}, {:#x})".format(pa, vm_first_phys, vm_last_phys)) - page_size = kern.globals.arm_hardware_page_size + page_size = kern.globals.page_size pn = (pa - unsigned(kern.globals.vm_first_phys)) / page_size pvh = unsigned(kern.globals.pv_head_table[pn]) pvh_type = pvh & 0x3 + print "PVH raw value: ({:#x})".format(pvh) + if kern.arch.startswith('arm64'): + iommu_flag = 0x4 + iommu_table_flag = 1 << 63 + pvh = pvh | PVH_HIGH_FLAGS_ARM64 + else: + iommu_flag = 0 + iommu_table_flag = 0 + pvh = pvh | PVH_HIGH_FLAGS_ARM32 if pvh_type == 0: print "PVH type: NULL" return @@ -940,8 +953,16 @@ def PVWalkARM(pa): return elif pvh_type == 2: ptep = pvh & ~0x3 + pte_str = '' print "PVH type: single PTE" - print "PTE {:#x}: {:#x}".format(ptep, dereference(kern.GetValueFromAddress(ptep, 'pt_entry_t *'))) + if ptep & iommu_flag: + ptep = ptep & ~iommu_flag + if ptep & iommu_table_flag: + pte_str = ' (IOMMU table), entry' + else: + pte_str = ' (IOMMU state), descriptor' + ptep = ptep | iommu_table_flag + print "PTE {:#x}{:s}: {:#x}".format(ptep, pte_str, dereference(kern.GetValueFromAddress(ptep, 'pt_entry_t *'))) elif pvh_type == 1: pvep = pvh & ~0x3 print "PVH type: PTE list" @@ -954,6 +975,13 @@ def PVWalkARM(pa): current_pvep = pvep pvep = unsigned(pve.pve_next) & ~0x1 ptep = unsigned(pve.pve_ptep) & ~0x3 + if ptep & iommu_flag: + ptep = ptep & ~iommu_flag + if ptep & iommu_table_flag: + pve_str = ' (IOMMU table), entry' + else: + pve_str = ' (IOMMU state), descriptor' + ptep = ptep | iommu_table_flag print "PVE {:#x}, PTE {:#x}{:s}: {:#x}".format(current_pvep, ptep, pve_str, dereference(kern.GetValueFromAddress(ptep, 'pt_entry_t *'))) @lldb_command('pv_walk') @@ -967,15 +995,50 @@ def PVWalk(cmd_args=None): raise 
NotImplementedError("pv_walk does not support {0}".format(kern.arch)) PVWalkARM(kern.GetValueFromAddress(cmd_args[0], 'unsigned long')) +@lldb_command('kvtophys') +def KVToPhys(cmd_args=None): + """ Translate a kernel virtual address to the corresponding physical address. + Assumes the virtual address falls within the kernel static region. + Syntax: (lldb) kvtophys + """ + if cmd_args == None or len(cmd_args) < 1: + raise ArgumentError("Too few arguments to kvtophys.") + if kern.arch.startswith('arm'): + print "{:#x}".format(KVToPhysARM(long(unsigned(kern.GetValueFromAddress(cmd_args[0], 'unsigned long'))))) + elif kern.arch == 'x86_64': + print "{:#x}".format(long(unsigned(kern.GetValueFromAddress(cmd_args[0], 'unsigned long'))) - unsigned(kern.globals.physmap_base)) + +@lldb_command('phystokv') +def PhysToKV(cmd_args=None): + """ Translate a physical address to the corresponding static kernel virtual address. + Assumes the physical address corresponds to managed DRAM. + Syntax: (lldb) phystokv + """ + if cmd_args == None or len(cmd_args) < 1: + raise ArgumentError("Too few arguments to phystokv.") + print "{:#x}".format(kern.PhysToKernelVirt(long(unsigned(kern.GetValueFromAddress(cmd_args[0], 'unsigned long'))))) + +def KVToPhysARM(addr): + if kern.arch.startswith('arm64'): + ptov_table = kern.globals.ptov_table + for i in range(0, kern.globals.ptov_index): + if (addr >= long(unsigned(ptov_table[i].va))) and (addr < (long(unsigned(ptov_table[i].va)) + long(unsigned(ptov_table[i].len)))): + return (addr - long(unsigned(ptov_table[i].va)) + long(unsigned(ptov_table[i].pa))) + return (addr - unsigned(kern.globals.gVirtBase) + unsigned(kern.globals.gPhysBase)) + def ShowPTEARM(pte): """ Display vital information about an ARM page table entry pte: kernel virtual address of the PTE. Should be L3 PTE. May also work with L2 TTEs for certain devices. 
""" page_size = kern.globals.arm_hardware_page_size - pn = (pte - unsigned(kern.globals.gVirtBase) + unsigned(kern.globals.gPhysBase) - unsigned(kern.globals.vm_first_phys)) / page_size - pvh = kern.globals.pv_head_table[pn] + pn = (KVToPhysARM(pte) - unsigned(kern.globals.vm_first_phys)) / page_size + pvh = unsigned(kern.globals.pv_head_table[pn]) + if kern.arch.startswith('arm64'): + pvh = pvh | PVH_HIGH_FLAGS_ARM64 + else: + pvh = pvh | PVH_HIGH_FLAGS_ARM32 pvh_type = pvh & 0x3 - if pvh_type != 0x3 and pvh_type != 0x0: + if pvh_type != 0x3: raise ValueError("PV head {:#x} does not correspond to a page-table descriptor".format(pvh)) ptd = kern.GetValueFromAddress(pvh & ~0x3, 'pt_desc_t *') print "descriptor: {:#x}".format(ptd) @@ -1137,7 +1200,7 @@ def printMatchedMapping(pmap, level, type, tte, paddr, granule): ScanPageTables(printMatchedMapping, targetPmap) def checkPVList(pmap, level, type, tte, paddr, granule): - """ Checks an ARM physical-to-virtual mapping list for consistency error. + """ Checks an ARM physical-to-virtual mapping list for consistency errors. pmap: owner of the translation table level: translation table level. PV lists will only be checked for L2 (arm32) or L3 (arm64) tables. 
type: unused @@ -1147,20 +1210,22 @@ def checkPVList(pmap, level, type, tte, paddr, granule): """ vm_first_phys = unsigned(kern.globals.vm_first_phys) vm_last_phys = unsigned(kern.globals.vm_last_phys) - page_size = kern.globals.arm_hardware_page_size + page_size = kern.globals.page_size if kern.arch.startswith('arm64'): page_offset_mask = (page_size - 1) page_base_mask = ((1 << ARM64_VMADDR_BITS) - 1) & (~page_offset_mask) paddr = paddr & page_base_mask max_level = 3 + pvh_set_bits = PVH_HIGH_FLAGS_ARM64 elif kern.arch == 'arm': page_base_mask = 0xFFFFF000 paddr = paddr & page_base_mask max_level = 2 + pvh_set_bits = PVH_HIGH_FLAGS_ARM32 if level < max_level or paddr < vm_first_phys or paddr >= vm_last_phys: return pn = (paddr - vm_first_phys) / page_size - pvh = unsigned(kern.globals.pv_head_table[pn]) + pvh = unsigned(kern.globals.pv_head_table[pn]) | pvh_set_bits pvh_type = pvh & 0x3 if pmap is not None: pmap_str = "pmap: {:#x}: ".format(pmap) @@ -1207,7 +1272,7 @@ def PVCheck(cmd_args=None, cmd_options={}): -P : Interpret as a physical address rather than a PTE """ if cmd_args == None or len(cmd_args) < 1: - raise ArgumentError("Too few arguments to showallmappings.") + raise ArgumentError("Too few arguments to pv_check.") if kern.arch == 'arm': level = 2 elif kern.arch.startswith('arm64'): diff --git a/tools/lldbmacros/process.py b/tools/lldbmacros/process.py index b86e1a80d..f37169a3b 100755 --- a/tools/lldbmacros/process.py +++ b/tools/lldbmacros/process.py @@ -179,13 +179,15 @@ def GetASTSummary(ast): D - AST_DTRACE I - AST_TELEMETRY_IO E - AST_KEVENT + R - AST_REBALANCE + N - AST_UNQUIESCE """ out_string = "" state = int(ast) thread_state_chars = {0x0:'', 0x1:'P', 0x2:'Q', 0x4:'U', 0x8:'H', 0x10:'Y', 0x20:'A', 0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M', 0x1000:'G', 0x2000:'T', 0x4000:'T', 0x8000:'T', 0x10000:'S', - 0x20000: 'D', 0x40000: 'I', 0x80000: 'E'} + 0x20000: 'D', 0x40000: 'I', 0x80000: 'E', 0x100000: 'R', 0x200000: 'N'} state_str = '' mask = 0x1 
while mask <= 0x80000: @@ -553,7 +555,7 @@ def ShowAllCoalitions(cmd_args=None): # Macro: showallthreadgroups -@lldb_type_summary(['thread_group_t', 'thread_group *']) +@lldb_type_summary(['struct thread_group *', 'thread_group *']) @header("{0: <20s} {1: <5s} {2: <16s} {3: <5s} {4: <8s} {5: <20s}".format("thread_group", "id", "name", "refc", "flags", "recommendation")) def GetThreadGroupSummary(tg): if unsigned(tg) == 0: @@ -1500,7 +1502,7 @@ def GetProcessorSummary(processor): preemption_disable_str) return out_str -def GetLedgerEntrySummary(ledger_template, ledger, i): +def GetLedgerEntrySummary(ledger_template, ledger, i, show_footprint_interval_max=False): """ Internal function to get internals of a ledger entry (*not* a ledger itself) params: ledger_template - value representing struct ledger_template_t for the task or thread ledger - value representing struct ledger_entry * @@ -1517,14 +1519,13 @@ def GetLedgerEntrySummary(ledger_template, ledger, i): out_str += "{: >32s} {:<2d}:".format(ledger_template.lt_entries[i].et_key, i) out_str += "{: >15d} ".format(unsigned(ledger.le_credit) - unsigned(ledger.le_debit)) if (ledger.le_flags & lf_tracking_max): - out_str += "{:9d} {:5d} ".format(ledger._le.le_maxtracking.le_peaks[0].le_max, now - unsigned(ledger._le.le_maxtracking.le_peaks[0].le_time)) + if (show_footprint_interval_max): + out_str += "{:12d} ".format(ledger._le._le_max.le_interval_max) + out_str += "{:14d} ".format(ledger._le._le_max.le_lifetime_max) else: - out_str += " - -" - - if (ledger.le_flags & lf_tracking_max): - out_str += "{:12d} ".format(ledger._le.le_maxtracking.le_lifetime_max) - else: - out_str += " -" + if (show_footprint_interval_max): + out_str += " - " + out_str += " - " out_str += "{:12d} {:12d} ".format(unsigned(ledger.le_credit), unsigned(ledger.le_debit)) if (unsigned(ledger.le_limit) != ledger_limit_infinity): out_str += "{:12d} ".format(unsigned(ledger.le_limit)) @@ -1569,11 +1570,7 @@ def GetThreadLedgerSummary(thread_val): i 
= i + 1 return out_str -@header("{0: <15s} {1: >16s} {2: <2s} {3: >15s} {4: >9s} {5: >6s} {6: >12s} {7: >11s} \ - {8: >7s} {9: >13s} {10: <15s} {11: <8s} {12: <9s} {13: <6s} {14: >6s}".format( - "task [thread]", "entry", "#", "balance", "peakA", "(age)", "lifemax", "credit", - "debit", "limit", "refill period", "lim pct", "warn pct", "over?", "flags")) -def GetTaskLedgers(task_val): +def GetTaskLedgers(task_val, show_footprint_interval_max=False): """ Internal function to get summary of ledger entries from the task and its threads params: task_val - value representing struct task * return: str - formatted output information for ledger entries of the input task @@ -1588,7 +1585,7 @@ def GetTaskLedgers(task_val): else: out_str += "Invalid process:\n" while i != task_ledgerp.l_template.lt_cnt: - out_str += GetLedgerEntrySummary(kern.globals.task_ledger_template, task_ledgerp.l_entries[i], i) + out_str += GetLedgerEntrySummary(kern.globals.task_ledger_template, task_ledgerp.l_entries[i], i, show_footprint_interval_max) i = i + 1 # Now walk threads @@ -1599,11 +1596,14 @@ def GetTaskLedgers(task_val): # Macro: showtaskledgers -@lldb_command('showtaskledgers', 'F:') +@lldb_command('showtaskledgers', 'F:I') def ShowTaskLedgers(cmd_args=None, cmd_options={}): """ Routine to print a summary of ledger entries for the task and all of its threads - Usage: showtaskledgers
- or : showtaskledgers -F + or : showtaskledgers [ -I ] [ -F ] + options: + -I: show footprint interval max (DEV/DEBUG only) + -F: specify task via name instead of address + - """ if "-F" in cmd_options: task_list = FindTasksByName(cmd_options["-F"]) @@ -1614,24 +1614,34 @@ def ShowTaskLedgers(cmd_args=None, cmd_options={}): if not cmd_args: raise ArgumentError("No arguments passed.") + show_footprint_interval_max = False + if "-I" in cmd_options: + show_footprint_interval_max = True tval = kern.GetValueFromAddress(cmd_args[0], 'task *') if not tval: raise ArgumentError("unknown arguments: %r" %cmd_args) - print GetTaskLedgers.header - print GetTaskLedgers(tval) + if (show_footprint_interval_max): + print "{0: <15s} {1: >16s} {2: <2s} {3: >15s} {4: >12s} {5: >14s} {6: >12s} {7: >12s} {8: >12s} {9: <15s} {10: <8s} {11: <9s} {12: <6s} {13: >6s}".format( + "task [thread]", "entry", "#", "balance", "intrvl_max", "lifetime_max", "credit", + "debit", "limit", "refill period", "lim pct", "warn pct", "over?", "flags") + else: + print "{0: <15s} {1: >16s} {2: <2s} {3: >15s} {4: >14s} {5: >12s} {6: >12s} {7: >12s} {8: <15s} {9: <8s} {10: <9s} {11: <6s} {12: >6s}".format( + "task [thread]", "entry", "#", "balance", "lifetime_max", "credit", + "debit", "limit", "refill period", "lim pct", "warn pct", "over?", "flags") + print GetTaskLedgers(tval, show_footprint_interval_max) # EndMacro: showtaskledgers # Macro: showalltaskledgers @lldb_command('showalltaskledgers') -def ShowAllTaskLedgers(cmd_args=None): +def ShowAllTaskLedgers(cmd_args=None, cmd_options={}): """ Routine to print a summary of ledger entries for all tasks and respective threads Usage: showalltaskledgers """ for t in kern.tasks: task_val = unsigned(t) - ShowTaskLedgers([task_val]) + ShowTaskLedgers([task_val], cmd_options=cmd_options) # EndMacro: showalltaskledgers diff --git a/tools/lldbmacros/scheduler.py b/tools/lldbmacros/scheduler.py index d60dd0e4e..36a37c328 100755 --- a/tools/lldbmacros/scheduler.py +++ 
b/tools/lldbmacros/scheduler.py @@ -147,18 +147,13 @@ def ShowCurremtAbsTime(cmd_args=None): Usage: showcurrentabstime """ pset = addressof(kern.globals.pset0) + processor_array = kern.globals.processor_array cur_abstime = 0 while unsigned(pset) != 0: - for processor in ParanoidIterateLinkageChain(pset.active_queue, "processor_t", "processor_queue"): - if unsigned(processor.last_dispatch) > cur_abstime: - cur_abstime = unsigned(processor.last_dispatch) - - for processor in ParanoidIterateLinkageChain(pset.idle_queue, "processor_t", "processor_queue"): - if unsigned(processor.last_dispatch) > cur_abstime: - cur_abstime = unsigned(processor.last_dispatch) - - for processor in ParanoidIterateLinkageChain(pset.idle_secondary_queue, "processor_t", "processor_queue"): + cpu_bitmap = int(pset.cpu_bitmask) + for cpuid in IterateBitmap(cpu_bitmap): + processor = processor_array[cpuid] if unsigned(processor.last_dispatch) > cur_abstime: cur_abstime = unsigned(processor.last_dispatch) @@ -377,20 +372,22 @@ def ShowSchedHistory(cmd_args=None, cmd_options=None): run_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')] fixpri_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')] share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] + share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')] share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] sched_pri_shifts = kern.globals.sched_run_buckets share_fg_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] + share_df_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')] share_ut_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] share_bg_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] print "Processors: {g.processor_avail_count:d} Runnable 
threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals) - print "FG Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_ut_count, share_bg_count) + print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count) print "Mach factor: {g.sched_mach_factor:d} Load factor: {g.sched_load_average:d} Sched tick: {g.sched_tick:d} timestamp: {g.sched_tick_last_abstime:d} interval:{g.sched_tick_interval:d}\n".format(g=kern.globals) - print "Fixed shift: {g.sched_fixed_shift:d} FG shift: {:d} UT shift: {:d} BG shift: {:d}\n".format(share_fg_shift, share_ut_shift, share_bg_shift, g=kern.globals) + print "Fixed shift: {g.sched_fixed_shift:d} FG shift: {:d} DF shift: {:d} UT shift: {:d} BG shift: {:d}\n".format(share_fg_shift, share_df_shift, share_ut_shift, share_bg_shift, g=kern.globals) print "sched_pri_decay_band_limit: {g.sched_pri_decay_band_limit:d} sched_decay_usage_age_factor: {g.sched_decay_usage_age_factor:d}\n".format(g=kern.globals) if kern.arch == 'x86_64': @@ -572,11 +569,12 @@ def ShowScheduler(cmd_args=None): run_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')] fixpri_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')] share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')] + share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')] share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')] share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')] print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals) - print "FG Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_ut_count, 
share_bg_count) + print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count) if show_group_pset_runq: if hasattr(kern.globals, "multiq_sanity_check"): @@ -620,41 +618,69 @@ def ShowScheduler(cmd_args=None): print "Group {: <#012x} Task {: <#012x}\n".format(unsigned(group), unsigned(task)) ShowRunQSummary(group.runq) print " \n" + + processor_array = kern.globals.processor_array print "Active Processors:\n" - for processor in ParanoidIterateLinkageChain(pset.active_queue, "processor_t", "processor_queue"): - print " " + GetProcessorSummary(processor) - ShowActiveThread(processor) - ShowNextThread(processor) - - if show_priority_runq: - runq = processor.runq - ShowRunQSummary(runq) - if show_grrr: - grrr_runq = processor.grrr_runq - ShowGrrrSummary(grrr_runq) + active_bitmap = int(pset.cpu_state_map[5]) | int(pset.cpu_state_map[6]) + for cpuid in IterateBitmap(active_bitmap): + processor = processor_array[cpuid] + if processor != 0: + print " " + GetProcessorSummary(processor) + ShowActiveThread(processor) + ShowNextThread(processor) + + if show_priority_runq: + runq = processor.runq + ShowRunQSummary(runq) + if show_grrr: + grrr_runq = processor.grrr_runq + ShowGrrrSummary(grrr_runq) print " \n" print "Idle Processors:\n" - for processor in ParanoidIterateLinkageChain(pset.idle_queue, "processor_t", "processor_queue"): - print " " + GetProcessorSummary(processor) - ShowActiveThread(processor) - ShowNextThread(processor) - - if show_priority_runq: - ShowRunQSummary(processor.runq) + idle_bitmap = int(pset.cpu_state_map[4]) & int(pset.primary_map) + for cpuid in IterateBitmap(idle_bitmap): + processor = processor_array[cpuid] + if processor != 0: + print " " + GetProcessorSummary(processor) + ShowActiveThread(processor) + ShowNextThread(processor) + + if show_priority_runq: + ShowRunQSummary(processor.runq) print " \n" print "Idle Secondary 
Processors:\n" - for processor in ParanoidIterateLinkageChain(pset.idle_secondary_queue, "processor_t", "processor_queue"): - print " " + GetProcessorSummary(processor) - ShowActiveThread(processor) - ShowNextThread(processor) + idle_bitmap = int(pset.cpu_state_map[4]) & ~(int(pset.primary_map)) + for cpuid in IterateBitmap(idle_bitmap): + processor = processor_array[cpuid] + if processor != 0: + print " " + GetProcessorSummary(processor) + ShowActiveThread(processor) + ShowNextThread(processor) + + if show_priority_runq: + print ShowRunQSummary(processor.runq) + print " \n" - if show_priority_runq: - print ShowRunQSummary(processor.runq) + + print "Other Processors:\n" + other_bitmap = 0 + for i in range(0, 4): + other_bitmap |= int(pset.cpu_state_map[i]) + other_bitmap &= int(pset.cpu_bitmask) + for cpuid in IterateBitmap(other_bitmap): + processor = processor_array[cpuid] + if processor != 0: + print " " + GetProcessorSummary(processor) + ShowActiveThread(processor) + ShowNextThread(processor) + + if show_priority_runq: + ShowRunQSummary(processor.runq) print " \n" @@ -791,6 +817,32 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst ParanoidIterateLinkageChain.enable_paranoia = True ParanoidIterateLinkageChain.enable_debug = False +def bit_first(bitmap): + return bitmap.bit_length() - 1 + +def lsb_first(bitmap): + bitmap = bitmap & -bitmap + return bit_first(bitmap) + +def IterateBitmap(bitmap): + """ Iterate over a bitmap, returning the index of set bits starting from 0 + + params: + bitmap - value : bitmap + returns: + A generator does not return. It is used for iterating. 
+ value : index of a set bit + example usage: + for cpuid in IterateBitmap(running_bitmap): + print processor_array[cpuid] + """ + i = lsb_first(bitmap) + while (i >= 0): + yield i + bitmap = bitmap & ~((1 << (i + 1)) - 1) + i = lsb_first(bitmap) + + # Macro: showallcallouts def ShowThreadCall(prefix, call): diff --git a/tools/lldbmacros/skywalk.py b/tools/lldbmacros/skywalk.py new file mode 100755 index 000000000..2119bc010 --- /dev/null +++ b/tools/lldbmacros/skywalk.py @@ -0,0 +1,566 @@ + +""" Please make sure you read the README COMPLETELY BEFORE reading anything below. + It is very critical that you read coding guidelines in Section E in README file. +""" + +from xnu import * +from utils import * +from string import * + +import xnudefines + +def IterateProcChannels(proc): + """ Iterate through all channels in the given process + + params: + proc - the proc object + returns: nothing, this is meant to be used as a generator function + kc - yields each kern_channel in the process + """ + + proc_filedesc = proc.p_fd + proc_lastfile = unsigned(proc_filedesc.fd_lastfile) + proc_ofiles = proc_filedesc.fd_ofiles + + count = 0 + while count <= proc_lastfile: + if unsigned(proc_ofiles[count]) != 0: + proc_fd_fglob = proc_ofiles[count].f_fglob + if (unsigned(proc_fd_fglob.fg_ops.fo_type) == 10): + yield Cast(proc_fd_fglob.fg_data, 'kern_channel *') + count += 1 + +def IterateKernChannelRings(kc, kind): + """ Iterate through all rings on a given channel + """ + + NR_RX = 0 + NR_TX = 1 + NR_A = 2 + NR_F = 3 + + if kind == NR_RX: + rings = kc.ch_na.na_rx_rings + elif kind == NR_TX : + rings = kc.ch_na.na_tx_rings + elif kind == NR_A : + rings = kc.ch_na.na_alloc_rings + else : + rings = kc.ch_na.na_free_rings + + # note that ch_last is actually one greater than the last + # as per the comment in ch_connect + for i in xrange(kc.ch_first[kind], kc.ch_last[kind]): + yield addressof(rings[i]) + +# Note this is broken if you have type summaries enabled +# because we are 
summarizing the pointer to the structure +# and not the structure itself. Unfortunately, that's +# the pattern used elsewhere. +# Trying to actually use the type summary will blow up +# because it has a linked list pointer to itself +# +@lldb_type_summary(['kern_channel_t', 'kern_channel *']) +@header('{:<20s} {:<36s}'.format('kern_channel', 'uuid')) +def GetKernChannelSummary(kc): + """ Summarizes a kern_channel and related information + + returns: str - summary of kern_channel + """ + + format_string = '{o: <#020x} {u: <36s}' + return format_string.format( + o=kc, + u=GetUUIDSummary(kc.ch_info.cinfo_ch_id)) + +@lldb_type_summary(['__kern_channel_ring *']) +@header('{:<20s} {:<65s} {:>10s} | {:<5s} {:<5s} | {:<5s} {:<5s} {:<5s} | {:<5s} {:<5s} {:<5s}'.format( + 'kernchannelring', 'name', 'flags', 'kc', 'kt', 'rc', 'rh', 'rt', 'c', 'h', 't')) +def GetKernChannelRingSummary(kring): + """ Summarizes a __kern_channel_ring and related information + + returns: str - summary of kern_channel_ring + """ + + format_string = '{o: <#020x} "{name: <63s}" {flags: >#010x} | {kh: <5d} {kt: <5d} | {rh: <5d} {rt: <5d} | {h: <5d} {t: <5d}' + return format_string.format( + o=kring, + name=kring.ckr_name, + flags=kring.ckr_flags, + kh=kring.ckr_khead, + kt=kring.ckr_ktail, + rh=kring.ckr_rhead, + rt=kring.ckr_rtail, + h=kring.ckr_ring.ring_head, + t=kring.ckr_ring.ring_tail) + +@lldb_command('showprocchannels') +def ShowProcChannels(cmd_args=None): + """ Show the skywalk channels for a given process. + + usage: showprocchannels + """ + + if not cmd_args: + raise ArgumentError('missing struct proc * argument') + + proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t') + + print GetKernChannelSummary.header + for kc in IterateProcChannels(proc): + print GetKernChannelSummary(kc) + +@lldb_command('showchannelrings') +def ShowChannelRings(cmd_args=None): + """ Show the skywalk rings for a given channel. 
+ + usage: showchannelrings + """ + + if not cmd_args: + raise ArgumentError('missing struct kern_channel * argument') + + kc = kern.GetValueFromAddress(cmd_args[0], 'kern_channel *') + + print "RX rings:" + print GetKernChannelRingSummary.header + for ring in IterateKernChannelRings(kc, 0) : + print GetKernChannelRingSummary(ring) + + print "TX rings:" + print GetKernChannelRingSummary.header + for ring in IterateKernChannelRings(kc, 1) : + print GetKernChannelRingSummary(ring) + + print "ALLOC rings:" + print GetKernChannelRingSummary.header + for ring in IterateKernChannelRings(kc, 2) : + print GetKernChannelRingSummary(ring) + + print "FREE rings:" + print GetKernChannelRingSummary.header + for ring in IterateKernChannelRings(kc, 3) : + print GetKernChannelRingSummary(ring) + +def SkmemCacheModeAsString(mode) : + out_string = "" + SKM_MODE_NOCACHE = 0x1 + SKM_MODE_AUDIT = 0x2 + + if (mode & SKM_MODE_NOCACHE) : + out_string += "n" + else : + out_string += "-" + if (mode & SKM_MODE_AUDIT) : + out_string += "a" + else : + out_string += "-" + + return out_string + +@lldb_command('showskmemcache') +def ShowSkmemCache(cmd_args=None) : + """ Show the global list of skmem caches + """ + + format_string = "{:<4s} {:<18s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:> 10, str(ar.ar_name)) + i += 1 + +@lldb_command('showskmemregion') +def ShowSkmemRegion(cmd_args=None) : + """ Show the global list of skmem regions + """ + + i = 1 + skrhead = kern.globals.skmem_region_head + + for skr in IterateTAILQ_HEAD(skrhead, "skr_link") : + format_string = "{:>4d}: 0x{:<08x} \"{: 70): + ports_string += "\n\t" + offs = len(ports_string) + ports_string += " %u" % f.nsr_port + """ + + return format_string.format( + o=ns, + p=proto, + a=addr, + n=ns.ns_n_reservations) + ports_string + +@lldb_command('shownetns') +def ShowNetNS(cmd_args=None): + """ Show the netns table + """ + print"\nnetns_namespaces:" + print GetStructNsSummary.header + + namespaces = 
kern.globals.netns_namespaces + for ns in IterateRBTreeEntry(namespaces, 'struct ns *', 'ns_link'): + print GetStructNsSummary(ns) + + print "\nwild: (these should be duplicated above)" + print GetStructNsSummary.header + for i in range(0,4): + print GetStructNsSummary(kern.globals.netns_global_wild[i]) + + print "\nnon wild:" + print GetStructNsSummary.header + for i in range(0,4): + print GetStructNsSummary(kern.globals.netns_global_non_wild[i]) + + +@lldb_type_summary(['struct ns_token *']) +@header('{:<20s} {:<5s} {:<48s} {:<12s} {:<8s} {:<38s} {:<38s} {:<12s}'.format('nt', 'proto', 'addr', 'port', 'owner', 'ifp', 'parent', 'flags')) +def GetNsTokenSummary(nt): + """ Summarizes a struct ns from the netns + + returns: str - summary of struct ns + """ + + if (nt.nt_proto == IPPROTO_TCP): + proto = "tcp" + elif (nt.nt_proto == IPPROTO_UDP): + proto = "udp" + else: + proto = str(nt.nt_proto) + + if (nt.nt_addr_len == sizeof('struct in_addr')): + addr = GetInAddrAsString(addressof(nt.nt_inaddr)) + elif (nt.nt_addr_len == sizeof('struct in6_addr')): + addr = GetIn6AddrAsString(nt.nt_in6addr.__u6_addr.__u6_addr8) + else: + addr = str(nt_addr) + " bad len {:u}".format(nt.nt_addr_len) + + format_string = '{o:#020x} {p:<5s} {a:<48s} {pt:<12s} {wn:<8s} {ifp:38s} {pa:38s} {f:#012x}' + + ports = "%u" % nt.nt_port + + ifp = "(struct ifnet *)" + hex(nt.nt_ifp) + + if ((nt.nt_flags & 0x7) == 0x00): + owner = "LISTENER" + parent = "(void *)" + hex(nt.nt_parent) + elif ((nt.nt_flags & 0x7) == 0x01): + owner = "SKYWALK" + parent = "(struct flow_entry *)" + hex(nt.nt_parent_skywalk) + elif ((nt.nt_flags & 0x7) == 0x02): # XXX xnudefines? + owner = "BSD" + parent = "(struct inpcb *)" + hex(nt.nt_parent_bsd) + elif ((nt.nt_flags & 0x7) == 0x03): # XXX xnudefines? 
+ owner = "PF" + parent = "(void *)" + hex(nt.nt_parent) + + return format_string.format( + o=nt, + p=proto, + a=addr, + pt=ports, + wn=owner, + ifp=ifp, + pa=parent, + f=nt.nt_flags) + +@lldb_command("showallnetnstokens") +def ShowAllNetNSTokens(cmd_args=None): + """ show all netns tokens + """ + + tokenhead = kern.globals.netns_all_tokens + print GetNsTokenSummary.header + for nt in IterateListEntry(tokenhead, 'struct ns_token *', 'nt_all_link', list_prefix='s'): + print GetNsTokenSummary(nt) + +@lldb_command("shownetnstokens") +def ShowNetNSTokens(cmd_args=None): + """ show netns tokens attached to an ifp + with no args, shows unbound tokens + """ + + if (cmd_args == None or len(cmd_args) == 0): + print "No ifp argument provided, showing unbound tokens" + tokenhead = kern.globals.netns_unbound_tokens + elif len(cmd_args) > 0: + ifp = kern.GetValueFromAddress(cmd_args[0], 'ifnet *') + print "Showing tokens for ifp %r" % ifp + tokenhead = ifp.if_netns_tokens + else: + print "Missing ifp argument 0 in shownetnstokens" + print cmd_args + return + + print GetNsTokenSummary.header + for nt in IterateListEntry(tokenhead, 'struct ns_token *', 'nt_ifp_link', list_prefix='s'): + print GetNsTokenSummary(nt) + +def IterateSTAILQ_HEAD(headval, element_name): + iter_val = headval.stqh_first + while unsigned(iter_val) != 0 : + yield iter_val + iter_val = iter_val.__getattr__(element_name).stqe_next + #end of yield loop + +@lldb_command("shownexuschannels") +def ShowNexusChannels(cmd_args=None): + """ show nexus channels + """ + if (cmd_args == None or len(cmd_args) == 0): + print "Missing argument 0 (kern_nexus address)." 
+ return + + nx = kern.GetValueFromAddress(cmd_args[0], 'kern_nexus *') + i = 1 + + format_string = "{:>4s} {:<18s} {:>4s} {:<7s} {:<7s} {:<18s} {:<18s} {:<18s} {:>8s} {:6s} {:<18s} {:>4s} {:s}" + print format_string.format("", "addr", "refs", "txrings", "rxrings", "arena", "ioskmap", "mapaddr", "mapsize", "maprdr", "na", "fd", "process") + + for ch in IterateSTAILQ_HEAD(nx.nx_ch_head, "ch_link"): + format_string = "{:>4d}: 0x{:<08x} {:>4d} [{:2d},{:2d}] [{:2d},{:2d}] 0x{:<08x} 0x{:<08x} 0x{:<16x} {:>8d} {:>6d} 0x{:<08x} {:>4d} {:s}({:d})" + print format_string.format(i, ch, ch.ch_refcnt, ch.ch_first[0], ch.ch_last[0], ch.ch_first[1], ch.ch_last[1], ch.ch_mmap.ami_arena, ch.ch_mmap.ami_mapref, ch.ch_mmap.ami_mapaddr, ch.ch_mmap.ami_mapsize, ch.ch_mmap.ami_redirect, ch.ch_na, ch.ch_fd, ch.ch_name, ch.ch_pid) + i += 1 + + for ch in IterateSTAILQ_HEAD(nx.nx_ch_nonxref_head, "ch_link"): + format_string = "{:>4d}: 0x{:<08x} {:>4d} [{:2d},{:2d}] [{:2d},{:2d}] 0x{:<08x} 0x{:<08x} 0x{:<16x} {:>8d} {:>6d} 0x{:<08x} {:>4d} {:s}({:d})" + print format_string.format(i, ch, ch.ch_refcnt, ch.ch_first[0], ch.ch_last[0], ch.ch_first[1], ch.ch_last[1], ch.ch_mmap.ami_arena, ch.ch_mmap.ami_mapref, ch.ch_mmap.ami_mapaddr, ch.ch_mmap.ami_mapsize, ch.ch_mmap.ami_redirect, ch.ch_na, ch.ch_fd, ch.ch_name, ch.ch_pid) + i += 1 + +def IterateProcNECP(proc): + """ Iterate through all NECP descriptors in the given process + + params: + proc - the proc object + returns: nothing, this is meant to be used as a generator function + necp - yields each necp_fd_data in the process + """ + + proc_filedesc = proc.p_fd + proc_lastfile = unsigned(proc_filedesc.fd_lastfile) + proc_ofiles = proc_filedesc.fd_ofiles + + count = 0 + while count <= proc_lastfile: + if unsigned(proc_ofiles[count]) != 0: + proc_fd_fglob = proc_ofiles[count].f_fglob + if (unsigned(proc_fd_fglob.fg_ops.fo_type) == 9): + yield Cast(proc_fd_fglob.fg_data, 'necp_fd_data *') + count += 1 + +def GetNECPClientBitFields(necp): + """ 
Return the bit fields in necp_client as string + + returns: str - string representation of necp_client bit fields + """ + + bitfields_string = '' + if necp.result_read != 0: + bitfields_string += 'r' + else: + bitfields_string += '-' + if necp.allow_multiple_flows != 0: + bitfields_string += 'm' + else: + bitfields_string += '-' + if necp.background != 0: + bitfields_string += 'b' + else: + bitfields_string += '-' + if necp.background_update != 0: + bitfields_string += 'B' + else: + bitfields_string += '-' + if necp.platform_binary != 0: + bitfields_string += 'p' + else: + bitfields_string += '-' + + return bitfields_string + +def GetNECPFlowBitFields(flow_registration): + """ Return the bit fields in necp_client_flow_registration as string + + returns: str - string representation of necp_client_flow_registration bit fields + """ + + bitfields_string = '' + if flow_registration.flow_result_read != 0: + bitfields_string += 'r' + else: + bitfields_string += '-' + if flow_registration.defunct != 0: + bitfields_string += 'd' + else: + bitfields_string += '-' + + return bitfields_string + +@lldb_type_summary(['necp_fd_data *']) +@header('{:<20s} {:<8s}'.format('necp_fd_data', "flags")) +def GetNECPSummary(necp): + """ Summarizes a necp_fd_data and related information + + returns: str - summary of necp_fd_data + """ + + format_string = '{o: <#020x} {u:<#08x}' + + stats_arenas_string = "\n\n\t%-18s %-39s %-4s %-10s\n" % ("stats_arenas", "mmap", "refs", "flags") + for sa in IterateListEntry(necp.stats_arena_list, 'struct necp_arena_info *', 'nai_chain'): + stats_arenas_string += "\t0x%016x " % sa + stats_arenas_string += "[0x%016x-0x%016x) " % (sa.nai_mmap.ami_mapaddr,(sa.nai_mmap.ami_mapaddr+sa.nai_mmap.ami_mapsize)) + stats_arenas_string += "%4u " % sa.nai_use_count + stats_arenas_string += "0x%08x " % sa.nai_flags + stats_arenas_string += "\n" + + clients_string = "" + for c in IterateRBTreeEntry(necp.clients, 'struct necp_client *', 'link'): + clients_string += 
"\n\t%-18s %-36s %-4s %-5s\n" % ("necp_clients", "client_id", "refs", "flags") + clients_string += "\t0x%016x " % c + clients_string += "%36s " % GetUUIDSummary(c.client_id) + clients_string += "%4u " % c.reference_count + clients_string += "%5s " % GetNECPClientBitFields(c) + count = 0; + for f in IterateRBTreeEntry(c.flow_registrations, 'struct necp_client_flow_registration *', 'client_link'): + if count == 0: + clients_string += "\n\t\t%-18s %-36s %-2s %-18s %-18s %-18s\n" % ("flow_registration", "registraton_id", "flags", "stats_arena", "kstats_obj", "ustats_obj") + clients_string += "\t\t0x%016x " % f + clients_string += "%36s " % GetUUIDSummary(f.registration_id) + clients_string += "%2s " % GetNECPFlowBitFields(f) + clients_string += "0x%016x " % f.stats_arena + clients_string += "0x%016x " % f.kstats_kaddr + clients_string += "0x%016x " % f.ustats_uaddr + clients_string += "\n" + + return format_string.format( + o=necp, + u=necp.flags) + stats_arenas_string + clients_string + +@lldb_command('showprocnecp') +def ShowProcNECP(cmd_args=None): + """ Show NECP descriptors for a given process. 
+ + usage: showprocnecp + """ + + if not cmd_args: + raise ArgumentError('missing struct proc * argument') + + proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t') + + print GetNECPSummary.header + for kc in IterateProcNECP(proc): + print GetNECPSummary(kc) diff --git a/tools/lldbmacros/turnstile.py b/tools/lldbmacros/turnstile.py new file mode 100755 index 000000000..372e13ec4 --- /dev/null +++ b/tools/lldbmacros/turnstile.py @@ -0,0 +1,147 @@ +from xnu import * +import sys, shlex +from utils import * +from waitq import * +import xnudefines + +@lldb_type_summary(['struct turnstile *']) +@header("{0: <20s} {1: <5s} {2: <20s} {3: <8s} {4: <8s} {5: <23s} {6: <20s} {7: <16s} {8: <20s} {9: <20s}".format("turnstile", "pri", "waitq", "type", "state", "inheritor", "proprietor", "gen count", "thread", "prev_thread")) +def GetTurnstileSummary(turnstile): + """ Summarizes the turnstile + params: turnstile = value of the object of type struct turnstile * + returns: String with summary of the type. 
+ """ + + type_and_gencount = Cast(addressof(turnstile.ts_type_gencount), 'union turnstile_type_gencount *') + turnstile_type = "" + + if type_and_gencount.ts_type == 0: + turnstile_type = "none " + elif type_and_gencount.ts_type == 1: + turnstile_type = "knl_mtx" + elif type_and_gencount.ts_type == 2: + turnstile_type = "ulock " + elif type_and_gencount.ts_type == 3: + turnstile_type = "pth_mtx" + elif type_and_gencount.ts_type == 4: + turnstile_type = "syn_ipc" + elif type_and_gencount.ts_type == 5: + turnstile_type = "kqwl " + elif type_and_gencount.ts_type == 6: + turnstile_type = "workq " + elif type_and_gencount.ts_type == 7: + turnstile_type = "knote " + + turnstile_state = "" + if turnstile.ts_state & 0x1: + turnstile_state += "T" + elif turnstile.ts_state & 0x2: + turnstile_state += "F" + elif turnstile.ts_state & 0x4: + turnstile_state += "H" + elif turnstile.ts_state & 0x8: + turnstile_state += "P" + + if turnstile.ts_inheritor_flags & 0x4: + inheritor_type = "th" + elif turnstile.ts_inheritor_flags & 0x8: + inheritor_type = "ts" + elif turnstile.ts_inheritor_flags & 0x40: + inheritor_type = "wq" + else: + inheritor_type = "--" + + format_str = "{0: <#020x} {1: <5d} {2: <#020x} {3: <8s} {4: <8s} {6: <2s}:{5: <#020x} {7: <#020x} {8: <16d}" + out_string = format_str.format(turnstile, turnstile.ts_priority, addressof(turnstile.ts_waitq), + turnstile_type, turnstile_state, turnstile.ts_inheritor, inheritor_type, + turnstile.ts_proprietor, type_and_gencount.ts_gencount) + + #if DEVELOPMENT + format_str = " {0: <#020x} {1: <#020x}" + if hasattr(turnstile, 'ts_thread'): + out_string += format_str.format(turnstile.ts_thread, turnstile.ts_prev_thread) + #endif + return out_string + +def PrintTurnstile(turnstile): + """ print turnstile and it's free list. 
+ params: + turnstile - turnstile to print + """ + print GetTurnstileSummary(turnstile) + + """ print turnstile freelist if its not on a thread or freelist """ + if turnstile.ts_state & 0x3 == 0: + needsHeader = True + for free_turnstile in IterateListEntry(turnstile.ts_free_turnstiles, 'struct turnstile *', 'ts_free_elm', 's'): + if needsHeader: + print " Turnstile free List" + header_str = " " + GetTurnstileSummary.header + print header_str + needsHeader = False + print " " + GetTurnstileSummary(free_turnstile) + print "" + return + +# Macro: showturnstile +@lldb_command('showturnstile') +def ShowTurnstile(cmd_args=None, cmd_options={}): + """ show the turnstile and all free turnstiles hanging off the turnstile. + Usage: (lldb)showturnstile + """ + if not cmd_args: + raise ArgumentError("Please provide arguments") + + turnstile = kern.GetValueFromAddress(cmd_args[0], 'struct turnstile *') + print GetTurnstileSummary.header + PrintTurnstile(turnstile) + return +# EndMacro: showturnstile + +@lldb_command('showturnstilehashtable') +def ShowTurnstileHashTable(cmd_args=None, cmd_options={}): + """ show the global hash table for turnstiles. + Usage: (lldb)showturnstilehashtable + """ + print GetTurnstileSummary.header + turnstile_htable_buckets = kern.globals.ts_htable_buckets + for index in range(0, turnstile_htable_buckets): + turnstile_bucket = GetObjectAtIndexFromArray(kern.globals.turnstile_htable, index) + for turnstile in IterateQueue(turnstile_bucket.ts_ht_bucket_list, 'struct turnstile *', 'ts_htable_link'): + PrintTurnstile(turnstile) + return True + +#if DEVELOPMENT +# Macro: showallturnstiles +@lldb_command('showallturnstiles') +def ShowAllTurnstiles(cmd_args=None, cmd_options={}): + """ A DEVELOPMENT macro that walks the list of all allocated turnstile objects + and prints them. 
+ usage: (lldb) showallturnstiles + """ + if not hasattr(kern.globals, 'turnstiles_list'): + print "It seems you are running a build of kernel that does not have the list of all turnstiles." + return False + print GetTurnstileSummary.header + for turnstile in IterateQueue(kern.globals.turnstiles_list, 'struct turnstile *', 'ts_global_elm'): + PrintTurnstile(turnstile) + return True +# EndMacro showallturnstiles + +# Macro: showallbusyturnstiles +@lldb_command('showallbusyturnstiles') +def ShowAllTurnstiles(cmd_args=None, cmd_options={}): + """ A DEVELOPMENT macro that walks the list of all allocated turnstile objects + and prints them. + usage: (lldb) showallbusyturnstiles + """ + if not hasattr(kern.globals, 'turnstiles_list'): + print "It seems you are running a build of kernel that does not have the list of all turnstiles." + return False + print GetTurnstileSummary.header + for turnstile in IterateQueue(kern.globals.turnstiles_list, 'struct turnstile *', 'ts_global_elm'): + if turnstile.ts_state & 0x3 == 0: + PrintTurnstile(turnstile) + return True +# EndMacro showallbusyturnstiles +#endif diff --git a/tools/lldbmacros/userspace.py b/tools/lldbmacros/userspace.py index 88a8858fc..3413fff96 100755 --- a/tools/lldbmacros/userspace.py +++ b/tools/lldbmacros/userspace.py @@ -195,7 +195,6 @@ def PrintUserspaceData(cmd_args=None, cmd_options={}): return True - @lldb_command('showtaskuserargs') def ShowTaskUserArgs(cmd_args=None, cmd_options={}): """ Read the process argv, env, and apple strings from the user stack @@ -208,8 +207,9 @@ def ShowTaskUserArgs(cmd_args=None, cmd_options={}): task = kern.GetValueFromAddress(cmd_args[0], 'task *') proc = Cast(task.bsd_info, 'proc *') + ptrsize = 8 if int(task.t_flags) & 0x1 else 4 - format_string = "Q" if kern.ptrsize == 8 else "I" + format_string = "Q" if ptrsize == 8 else "I" string_area_size = proc.p_argslen string_area_addr = proc.user_stack - string_area_size @@ -220,7 +220,7 @@ def ShowTaskUserArgs(cmd_args=None, 
cmd_options={}): return False i = 0 - pos = string_area_addr - kern.ptrsize + pos = string_area_addr - ptrsize for name in ["apple", "env", "argv"] : while True: @@ -229,9 +229,9 @@ def ShowTaskUserArgs(cmd_args=None, cmd_options={}): break i += 1 - pos -= kern.ptrsize + pos -= ptrsize - user_data_string = GetUserDataAsString(task, pos, kern.ptrsize) + user_data_string = GetUserDataAsString(task, pos, ptrsize) ptr = struct.unpack(format_string, user_data_string)[0] if ptr == 0: diff --git a/tools/lldbmacros/usertaskdebugging/userprocess.py b/tools/lldbmacros/usertaskdebugging/userprocess.py index 5a5079e7b..74e54223e 100755 --- a/tools/lldbmacros/usertaskdebugging/userprocess.py +++ b/tools/lldbmacros/usertaskdebugging/userprocess.py @@ -11,17 +11,8 @@ CPU_TYPE_ARM = 0x0000000c CPU_TYPE_ARM64 = 0x0100000c - -CPU_SUBTYPE_X86_64_ALL = 3 -CPU_SUBTYPE_X86_64_H = 8 -CPU_SUBTYPE_ARMV8 = 13 -CPU_SUBTYPE_ARM_V7 = 9 -CPU_SUBTYPE_ARM_V7S = 11 -CPU_SUBTYPE_ARM_V7K = 12 - - def GetRegisterSetForCPU(cputype, subtype): - if cputype == CPU_TYPE_ARM64: + if cputype == CPU_TYPE_ARM64: retval = Armv8_RegisterSet elif cputype == CPU_TYPE_ARM: retval = Armv7_RegisterSet @@ -37,13 +28,12 @@ def GetRegisterSetForCPU(cputype, subtype): class UserThreadObject(object): """representation of userspace thread""" - def __init__(self, thr_obj, cputype, cpusubtype, kern_cputype): + def __init__(self, thr_obj, cputype, cpusubtype, is_kern_64bit): super(UserThreadObject, self).__init__() self.thread = thr_obj self.registerset = GetRegisterSetForCPU(cputype, cpusubtype) self.thread_id = unsigned(self.thread.thread_id) self.is64Bit = bool(cputype & 0x01000000) - isKern64Bit = bool(kern_cputype & 0x01000000) if self.is64Bit: if cputype == CPU_TYPE_X86_64: @@ -58,12 +48,13 @@ def __init__(self, thr_obj, cputype, cpusubtype, kern_cputype): self.saved_state = Cast(self.thread.machine.iss, 'x86_saved_state_t *').uss.ss_32 if cputype == CPU_TYPE_ARM: self.reg_type = "arm" - if not isKern64Bit: + if not 
is_kern_64bit: self.saved_state = self.thread.machine.PcbData else: self.saved_state = self.thread.machine.contextData.ss.uss.ss_32 - logging.debug("created thread id 0x%x of type %s, kern_cputype 0x%x cputype 0x%x" - % (self.thread_id, self.reg_type, kern_cputype, cputype)) + + logging.debug("created thread id 0x%x of type %s, is_kern_64bit 0x%x cputype 0x%x" + % (self.thread_id, self.reg_type, is_kern_64bit, cputype)) def getRegisterValueByName(self, name): if self.reg_type == 'arm64': @@ -108,30 +99,21 @@ def __init__(self, task): if task.t_flags & 0x1: ptrsize = 8 if task.t_flags & 0x2: - dataregisters64bit = 8 - - cputype = CPU_TYPE_X86_64 - cpusubtype = CPU_SUBTYPE_X86_64_ALL + dataregisters64bit = True + is_kern_64bit = kern.arch in ['x86_64', 'x86_64h', 'arm64' + ] - """ these computations should come out of the macho header i think """ - """ where does kern.arch come from? what's kern.arch == armv8?? """ - if kern.arch in ('arm'): - cputype = CPU_TYPE_ARM - cpusubtype = CPU_SUBTYPE_ARM_V7 - elif kern.arch in ('armv8', 'arm64'): - cputype = CPU_TYPE_ARM64 - cpusubtype = CPU_SUBTYPE_ARMV8 + self.cputype = unsigned(self.proc.p_cputype) + self.cpusubtype = unsigned(self.proc.p_cpusubtype) - super(UserProcess, self).__init__(cputype, cpusubtype, ptrsize) + super(UserProcess, self).__init__(self.cputype, self.cpusubtype, ptrsize) self.hinfo['ostype'] = 'macosx' - if cputype != CPU_TYPE_X86_64: + if self.cputype != CPU_TYPE_X86_64 and self.cputype != CPU_TYPE_I386: self.hinfo['ostype'] = 'ios' - self.cputype = unsigned(self.proc.p_cputype) - self.cpusubtype = unsigned(self.proc.p_cpusubtype) - self.registerset = GetRegisterSetForCPU(cputype, cpusubtype) + self.registerset = GetRegisterSetForCPU(self.cputype, self.cpusubtype) logging.debug("process %s is64bit: %d ptrsize: %d cputype: %d cpusubtype:%d", hex(self.proc), int(dataregisters64bit), ptrsize, self.cputype, self.cpusubtype @@ -140,7 +122,7 @@ def __init__(self, task): self.threads_ids_list = [] 
logging.debug("iterating over threads in process") for thval in IterateQueue(task.threads, 'thread *', 'task_threads'): - self.threads[unsigned(thval.thread_id)] = UserThreadObject(thval, self.cputype, self.cpusubtype, cputype) + self.threads[unsigned(thval.thread_id)] = UserThreadObject(thval, self.cputype, self.cpusubtype, is_kern_64bit) self.threads_ids_list.append(unsigned(thval.thread_id)) def getRegisterDataForThread(self, th_id, reg_num): diff --git a/tools/lldbmacros/utils.py b/tools/lldbmacros/utils.py index eaa3bcc00..33d601f8d 100755 --- a/tools/lldbmacros/utils.py +++ b/tools/lldbmacros/utils.py @@ -140,7 +140,16 @@ def Cast(obj, target_type): """ return cast(obj, target_type) - +def ContainerOf(obj, target_type, field_name): + """ Type cast an object to another C type from a pointer to a field. + params: + obj - core.value object representing some C construct in lldb + target_type - str : ex 'struct thread' + - lldb.SBType : + field_name - the field name within the target_type obj is a pointer to + """ + return containerof(obj, target_type, field_name) + def loadLLDB(): """ Util function to load lldb python framework in case not available in common include paths. """ @@ -461,3 +470,7 @@ def print_hex_data(data, begin_offset=0, desc=""): char_buf = "" print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf) return + +def Ones(x): + return (1 << x)-1 + diff --git a/tools/lldbmacros/waitq.py b/tools/lldbmacros/waitq.py index 8bfa63919..6768635c0 100755 --- a/tools/lldbmacros/waitq.py +++ b/tools/lldbmacros/waitq.py @@ -832,6 +832,10 @@ def GetWaitqPreposts(waitq): wqset = Cast(waitq, 'waitq_set *') if wqset.wqset_prepost_id == 0: return [] + if not wqset.wqset_q.waitq_prepost: + # If the "prepost_id" is non-zero, but the 'waitq_prepost' bit is + # *not* set, then this waitq actually has a prepost hook! 
+ return [ "{0: <#18x}:{1: <18s}".format(wqset.wqset_prepost_id, "") ] return GetPrepostChain(wqset.wqset_prepost_id) diff --git a/tools/lldbmacros/workqueue.py b/tools/lldbmacros/workqueue.py new file mode 100755 index 000000000..dae699f27 --- /dev/null +++ b/tools/lldbmacros/workqueue.py @@ -0,0 +1,176 @@ +from xnu import * +from scheduler import GetRecentTimestamp +import xnudefines + +def GetProcWorkqueue(proc): + wq = proc.p_wqptr; + if unsigned(wq): + return Cast(wq, "struct workqueue *"); + return None + +@header("{:<20s} {:<20s} {:<20s} {:<10s} {:<10s} {:<10s} {:<10s} {:<10s} {:<10s} {:<30s}".format( + 'task', 'proc', 'wq', 'sched', 'pending', 'idle', 'dying', 'creations', 'fulfilled', 'wq_flags')) +def GetWorkqueueSummary(proc, wq): + wq_flags = [] + if wq.wq_flags & GetEnumValue("workq_state_flags_t::WQ_EXITING"): + wq_flags.append("EXITING") + if wq.wq_flags & GetEnumValue("workq_state_flags_t::WQ_PROC_SUSPENDED"): + wq_flags.append("PROC_SUSPENDED") + if wq.wq_flags & GetEnumValue("workq_state_flags_t::WQ_DEATH_CALL_SCHEDULED"): + wq_flags.append("DEATH_CALL") + + scheduled = GetEnumValue("workq_state_flags_t::WQ_DELAYED_CALL_SCHEDULED") + pended = GetEnumValue("workq_state_flags_t::WQ_DELAYED_CALL_PENDED") + if wq.wq_flags & (scheduled | pended): + s = "DELAYED_CALL[" + if wq.wq_flags & scheduled: s += 'S' + if wq.wq_flags & pended: s += 'P' + s += ']' + wq_flags.append(s) + + scheduled = GetEnumValue("workq_state_flags_t::WQ_IMMEDIATE_CALL_SCHEDULED") + pended = GetEnumValue("workq_state_flags_t::WQ_IMMEDIATE_CALL_PENDED") + if wq.wq_flags & (scheduled | pended): + s = "IMMEDIATE_CALL[" + if wq.wq_flags & scheduled: s += 'S' + if wq.wq_flags & pended: s += 'P' + s += ']' + wq_flags.append(s) + + return "{p.task: <#020x} {p: <#020x} {wq: <#020x} {wq.wq_threads_scheduled: <10d} {wq.wq_reqcount: <10d} {wq.wq_thidlecount: <10d} {wq.wq_thdying_count: <10d} {wq.wq_creations: <10d} {wq.wq_fulfilled: <10d} {wq_flags: <30s}".format(p=proc, wq=wq, wq_flags=" 
".join(wq_flags)); + +@header("{:<20s} {:<20s} {:>10s} {:9s} {:<20s} {:<10s} {:<30s}".format( + 'thread', 'uthread', 'thport', 'kind', 'kqueue', 'idle (ms)', 'uu_workq_flags')) +def GetWQThreadSummary(th, uth): + p = Cast(th.task.bsd_info, 'proc *') + wq = p.p_wqptr + + uu_workq_flags = [] + if uth.uu_workq_flags & 0x01: uu_workq_flags.append("NEW") + if uth.uu_workq_flags & 0x02: + uu_workq_flags.append("RUNNING") + if wq.wq_creator == uth: + kind = "creator" + else: + kind = "workq" + idle = "" + else: + ts = kern.GetNanotimeFromAbstime(GetRecentTimestamp() - uth.uu_save.uus_workq_park_data.idle_stamp) / 1e9 + kind = "idle" + idle = "%#.03f" % (ts) + if uth.uu_workq_flags & 0x04: uu_workq_flags.append("DYING") + if uth.uu_workq_flags & 0x08: uu_workq_flags.append("OVERCOMMIT") + if uth.uu_workq_flags & 0x10: uu_workq_flags.append("OUTSIDE_QOS") + if uth.uu_workq_flags & 0x20: uu_workq_flags.append("IDLE_CLEANUP") + if uth.uu_workq_flags & 0x40: uu_workq_flags.append("EARLY_BOUND") + if uth.uu_workq_flags & 0x80: uu_workq_flags.append("CPU%") + + kqr = uth.uu_kqr_bound + if not kqr: + kq = 0 + elif kqr.kqr_state & 0x1: # workloop + kq = ContainerOf(kqr, 'struct kqworkloop', 'kqwl_request') + kind = "workloop" + else: + kq = p.p_fd.fd_wqkqueue + kind = "kqwq[%s]" % (xnudefines.thread_qos_short_strings[int(kqr.kqr_qos_index)]) + + return "{th: <#020x} {uth: <#020x} {thport: >#010x} {kind: <9s} {kq: <#020x} {idle: <10s} {uu_workq_flags: <30s}".format(th=th, uth=uth, thport=uth.uu_workq_thport, kind=kind, kq=kq, idle=idle, uu_workq_flags=" ".join(uu_workq_flags)) + +@header("{:<20s} {:<20s} {:<10s} {:<3s} {:<4s} {:<30s}".format( + 'request', 'kqueue', 'state', '#', 'qos', 'tr_flags')) +def GetWorkqueueThreadRequestSummary(proc, req): + kq = 0 + tr_flags = [] + + if req.tr_flags & 0x01: + tr_flags.append("KEVENT") + kq = proc.p_fd.fd_wqkqueue + if req.tr_flags & 0x02: + tr_flags.append("WORKLOOP") + kq = ContainerOf(req, 'struct kqworkloop', 'kqwl_request.kqr_req') + 
if req.tr_flags & 0x04: tr_flags.append("OVERCOMMIT") + if req.tr_flags & 0x08: tr_flags.append("PARAMS") + if req.tr_flags & 0x10: tr_flags.append("OUTSIDE_QOS") + + state = {0: "IDLE", 1: "NEW", 2: "QUEUED", 4: "BINDING" }[int(req.tr_state)] + + qos = int(req.tr_qos) + if qos == 8: + qos = "MG" + elif qos == 7: + qos = "SP" + else: + qos = xnudefines.thread_qos_short_strings[qos] + + return "{req: <#020x} {kq: <#020x} {state: <10s} {req.tr_count: <3d} {qos: <4s} {tr_flags: <30s}".format(req=req, kq=kq, state=state, qos=qos, tr_flags=" ".join(tr_flags)) + +@lldb_command('showwqthread') +def ShowWQThread(cmd_args=None): + """ Shows info about a workqueue thread + + usage: showworkqthread + """ + + if not cmd_args: + raise ArgumentError('missing struct proc * argument') + + th = kern.GetValueFromAddress(cmd_args[0], "struct thread *") + if not (th.thread_tag & 0x20): + raise ArgumentError('not a workqueue thread') + + print GetWQThreadSummary.header + print GetWQThreadSummary(th, Cast(th.uthread, 'struct uthread *')) + + +@lldb_command('showprocworkqueue') +def ShowProcWorkqueue(cmd_args=None): + """ Shows the process workqueue + + usage: showprocworkqueue + """ + + if not cmd_args: + raise ArgumentError('missing struct proc * argument') + + proc = kern.GetValueFromAddress(cmd_args[0], "proc_t") + wq = Cast(proc.p_wqptr, "struct workqueue *"); + if wq: + print GetWorkqueueSummary.header + print GetWorkqueueSummary(proc, wq) + + if wq.wq_reqcount: + print " " + print " " + GetWorkqueueThreadRequestSummary.header + if wq.wq_event_manager_threadreq: + print " " + GetWorkqueueThreadRequestSummary(proc, wq.wq_event_manager_threadreq) + for req in IteratePriorityQueueEntry(wq.wq_overcommit_queue, 'struct workq_threadreq_s', 'tr_entry'): + print " " + GetWorkqueueThreadRequestSummary(proc, req) + for req in IteratePriorityQueueEntry(wq.wq_constrained_queue, 'struct workq_threadreq_s', 'tr_entry'): + print " " + GetWorkqueueThreadRequestSummary(proc, req) + for req in 
IteratePriorityQueueEntry(wq.wq_special_queue, 'struct workq_threadreq_s', 'tr_entry'): + print " " + GetWorkqueueThreadRequestSummary(proc, req) + + print " " + print " " + GetWQThreadSummary.header + for uth in IterateTAILQ_HEAD(wq.wq_thrunlist, "uu_workq_entry"): + print " " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) + for uth in IterateTAILQ_HEAD(wq.wq_thidlelist, "uu_workq_entry"): + print " " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) + for uth in IterateTAILQ_HEAD(wq.wq_thnewlist, "uu_workq_entry"): + print " " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth) + +@lldb_command('showallworkqueues') +def ShowAllWorkqueues(cmd_args=None): + """ Display a summary of all the workqueues in the system + + usage: showallworkqueues + """ + + print GetWorkqueueSummary.header + + for t in kern.tasks: + proc = Cast(t.bsd_info, 'proc *') + wq = Cast(proc.p_wqptr, "struct workqueue *"); + if wq: + print GetWorkqueueSummary(proc, wq) diff --git a/tools/lldbmacros/xnu.py b/tools/lldbmacros/xnu.py index 1806464fd..1d30ad890 100755 --- a/tools/lldbmacros/xnu.py +++ b/tools/lldbmacros/xnu.py @@ -802,6 +802,7 @@ def WalkList(cmd_args=[], cmd_options={}): from ioreg import * from mbufs import * from net import * +from skywalk import * from kdp import * from userspace import * from pci import * @@ -812,6 +813,7 @@ def WalkList(cmd_args=[], cmd_options={}): from structanalyze import * from ipcimportancedetail import * from bank import * +from turnstile import * from kasan import * from kauth import * from waitq import * @@ -820,6 +822,6 @@ def WalkList(cmd_args=[], cmd_options={}): from pgtrace import * from xnutriage import * from kevent import * +from workqueue import * from ntstat import * from zonetriage import * - diff --git a/tools/lldbmacros/xnudefines.py b/tools/lldbmacros/xnudefines.py index 9db538d38..9ae470173 100755 --- a/tools/lldbmacros/xnudefines.py +++ b/tools/lldbmacros/xnudefines.py @@ -88,7 +88,7 @@ 
def GetStateString(strings_dict, state): 0x0002: 'QUEUED', 0x0004: 'DISABLED', 0x0008: 'DROPPING', - 0x0010: 'USERWAIT', + 0x0010: 'LOCKED', 0x0020: 'ATTACHING', 0x0040: 'STAYACTIVE', 0x0080: 'DEFERDROP', @@ -96,17 +96,16 @@ def GetStateString(strings_dict, state): 0x0200: 'DISPATCH', 0x0400: 'UDATASPEC', 0x0800: 'SUPPRESS', - 0x1000: 'STOLENDROP', + 0x1000: 'MERGE_QOS', 0x2000: 'REQVANISH', 0x4000: 'VANISHED' } -kqrequest_state_strings = { 0x01: 'PROCESSING', +kqrequest_state_strings = { 0x01: 'WORKLOOP', 0x02: 'THREQUESTED', 0x04: 'WAKEUP', - 0x08: 'BOUND', - 0x20: 'THOVERCOMMIT', - 0x40: 'DRAIN' } - + 0x08: 'THOVERCOMMIT', + 0x10: 'R2K_ARMED', + 0x20: 'ALLOC_TURNSTILE' } thread_qos_short_strings = { 0: '--', 1: 'MT', 2: 'BG', @@ -118,7 +117,7 @@ def GetStateString(strings_dict, state): KQ_WORKQ = 0x40 KQ_WORKLOOP = 0x80 -KQWQ_NBUCKETS = 22 +KQWQ_NBUCKETS = 8 KQWL_NBUCKETS = 8 DTYPE_VNODE = 1 @@ -182,7 +181,8 @@ def GetStateString(strings_dict, state): # string representations for Kobject types kobject_types = ['', 'THREAD', 'TASK', 'HOST', 'HOST_PRIV', 'PROCESSOR', 'PSET', 'PSET_NAME', 'TIMER', 'PAGER_REQ', 'DEVICE', 'XMM_OBJECT', 'XMM_PAGER', 'XMM_KERNEL', 'XMM_REPLY', 'NOTDEF 15', 'NOTDEF 16', 'HOST_SEC', 'LEDGER', 'MASTER_DEV', 'TASK_NAME', 'SUBSYTEM', 'IO_DONE_QUE', 'SEMAPHORE', 'LOCK_SET', 'CLOCK', 'CLOCK_CTRL' , 'IOKIT_SPARE', - 'NAMED_MEM', 'IOKIT_CON', 'IOKIT_OBJ', 'UPL', 'MEM_OBJ_CONTROL', 'AU_SESSIONPORT', 'FILEPORT', 'LABELH', 'TASK_RESUME', 'VOUCHER', 'VOUCHER_ATTR_CONTROL', 'IKOT_WORK_INTERVAL'] + 'NAMED_MEM', 'IOKIT_CON', 'IOKIT_OBJ', 'UPL', 'MEM_OBJ_CONTROL', 'AU_SESSIONPORT', 'FILEPORT', 'LABELH', 'TASK_RESUME', 'VOUCHER', 'VOUCHER_ATTR_CONTROL', 'WORK_INTERVAL', + 'UX_HANDLER'] def populate_kobject_types(xnu_dir_path): """ Function to read data from header file xnu/osfmk/kern/ipc_kobject.h @@ -196,6 +196,9 @@ def populate_kobject_types(xnu_dir_path): kobject_found_types.append(v[0]) return kobject_found_types +FSHIFT = 11 +FSCALE = 1 << FSHIFT + 
KDBG_BFINIT = 0x80000000 KDBG_WRAPPED = 0x008 KDCOPYBUF_COUNT = 8192 diff --git a/tools/tests/MPMMTest/Makefile b/tools/tests/MPMMTest/Makefile index d24156460..369fbcace 100644 --- a/tools/tests/MPMMTest/Makefile +++ b/tools/tests/MPMMTest/Makefile @@ -17,9 +17,15 @@ ifdef RC_ARCHS endif endif -ARCH_32 := $(filter-out %64, $(ARCHS)) +# These are convenience functions for filtering based on substrings, as the +# normal filter functions only accept one wildcard. +FILTER_OUT_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),,$(string)))) +FILTER_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),$(string),))) + +ARCH_32:=$(call FILTER_OUT_SUBSTRING,64,$(ARCHS)) +ARCH_64:=$(call FILTER_SUBSTRING,64,$(ARCHS)) + ARCH_32_FLAGS := $(patsubst %, -arch %, $(ARCH_32)) -ARCH_64 := $(filter %64, $(ARCHS)) ARCH_64_FLAGS := $(patsubst %, -arch %, $(ARCH_64)) DSTROOT?=$(shell /bin/pwd) diff --git a/tools/tests/Makefile b/tools/tests/Makefile index 0cc99e0dc..09d13dfc4 100644 --- a/tools/tests/Makefile +++ b/tools/tests/Makefile @@ -25,12 +25,11 @@ COMMON_TARGETS = unit_tests \ MPMMTest \ packetdrill \ affinity \ - execperf \ superpages \ zero-to-n \ jitter \ perf_index \ - darwintests \ + personas \ unixconf \ testkext/pgokext.kext diff --git a/tools/tests/affinity/Makefile b/tools/tests/affinity/Makefile index c4d1a9bc4..5f45973ab 100644 --- a/tools/tests/affinity/Makefile +++ b/tools/tests/affinity/Makefile @@ -14,9 +14,15 @@ ifdef RC_ARCHS endif endif -ARCH_32 := $(filter-out %64, $(ARCHS)) +# These are convenience functions for filtering based on substrings, as the +# normal filter functions only accept one wildcard. 
+FILTER_OUT_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),,$(string)))) +FILTER_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),$(string),))) + +ARCH_32:=$(call FILTER_OUT_SUBSTRING,64,$(ARCHS)) +ARCH_64:=$(call FILTER_SUBSTRING,64,$(ARCHS)) + ARCH_32_FLAGS := $(patsubst %, -arch %, $(ARCH_32)) -ARCH_64 := $(filter %64, $(ARCHS)) ARCH_64_FLAGS := $(patsubst %, -arch %, $(ARCH_64)) CFLAGS :=-g -isysroot $(SDKROOT) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders diff --git a/tools/tests/darwintests/kdebug.c b/tools/tests/darwintests/kdebug.c deleted file mode 100644 index 3cc0e2200..000000000 --- a/tools/tests/darwintests/kdebug.c +++ /dev/null @@ -1,690 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define KDBG_TEST_MACROS 1 -#define KDBG_TEST_OLD_TIMES 2 - -static void -assert_kdebug_test(unsigned int flavor) -{ - size_t size = flavor; - int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDTEST }; - T_ASSERT_POSIX_SUCCESS( - sysctl(mib, sizeof(mib) / sizeof(mib[0]), NULL, &size, NULL, 0), - "KERN_KDTEST sysctl"); -} - -#pragma mark kdebug syscalls - -#define TRACE_DEBUGID (0xfedfed00U) - -T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events", - T_META_ASROOT(true)) -{ - ktrace_session_t s; - __block int events_seen = 0; - - s = ktrace_session_create(); - os_assert(s != NULL); - - ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){}); - ktrace_events_single(s, TRACE_DEBUGID, ^void(struct trace_point *tp) { - events_seen++; - T_PASS("saw traced event"); - - T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of traced event is correct"); - T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of traced event is correct"); - T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of traced event is correct"); - T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of traced event is correct"); - - ktrace_end(s, 1); - 
}); - - ktrace_set_completion_handler(s, ^(void) { - T_EXPECT_GE(events_seen, 1, NULL); - ktrace_session_destroy(s); - T_END; - }); - - ktrace_filter_pid(s, getpid()); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 1, 2, 3, 4), NULL); - ktrace_end(s, 0); - - dispatch_main(); -} - -#define SIGNPOST_SINGLE_CODE (0x10U) -#define SIGNPOST_PAIRED_CODE (0x20U) - -T_DECL(kdebug_signpost_syscall, - "test that kdebug_signpost(2) emits correct events", - T_META_ASROOT(true)) -{ - ktrace_session_t s; - __block int single_seen = 0; - __block int paired_seen = 0; - - s = ktrace_session_create(); - T_ASSERT_NOTNULL(s, NULL); - - /* make sure to get enough events for the KDBUFWAIT to trigger */ - // ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){}); - ktrace_events_single(s, - APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_SINGLE_CODE), - ^void(struct trace_point *tp) - { - single_seen++; - T_PASS("single signpost is traced"); - - T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of single signpost is correct"); - T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of single signpost is correct"); - T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of single signpost is correct"); - T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of single signpost is correct"); - }); - - ktrace_events_single_paired(s, - APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_PAIRED_CODE), - ^void(struct trace_point *start, struct trace_point *end) - { - paired_seen++; - T_PASS("paired signposts are traced"); - - T_EXPECT_EQ(start->arg1, 5UL, "argument 1 of start signpost is correct"); - T_EXPECT_EQ(start->arg2, 6UL, "argument 2 of start signpost is correct"); - T_EXPECT_EQ(start->arg3, 7UL, "argument 3 of start signpost is correct"); - T_EXPECT_EQ(start->arg4, 8UL, "argument 4 of start signpost is correct"); - - T_EXPECT_EQ(end->arg1, 9UL, "argument 1 of end signpost is correct"); - T_EXPECT_EQ(end->arg2, 10UL, "argument 2 of end signpost is correct"); - 
T_EXPECT_EQ(end->arg3, 11UL, "argument 3 of end signpost is correct"); - T_EXPECT_EQ(end->arg4, 12UL, "argument 4 of end signpost is correct"); - - T_EXPECT_EQ(single_seen, 1, - "signposts are traced in the correct order"); - - ktrace_end(s, 1); - }); - - ktrace_set_completion_handler(s, ^(void) { - if (single_seen == 0) { - T_FAIL("did not see single tracepoint before timeout"); - } - if (paired_seen == 0) { - T_FAIL("did not see paired tracepoints before timeout"); - } - ktrace_session_destroy(s); - T_END; - }); - - ktrace_filter_pid(s, getpid()); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - - T_EXPECT_POSIX_SUCCESS(kdebug_signpost( - SIGNPOST_SINGLE_CODE, 1, 2, 3, 4), NULL); - T_EXPECT_POSIX_SUCCESS(kdebug_signpost_start( - SIGNPOST_PAIRED_CODE, 5, 6, 7, 8), NULL); - T_EXPECT_POSIX_SUCCESS(kdebug_signpost_end( - SIGNPOST_PAIRED_CODE, 9, 10, 11, 12), NULL); - ktrace_end(s, 0); - - dispatch_main(); -} - -#pragma mark kdebug behaviors - -#define WRAPPING_EVENTS_COUNT (150000) -#define TRACE_ITERATIONS (5000) -#define WRAPPING_EVENTS_THRESHOLD (100) - -T_DECL(wrapping, - "ensure that wrapping traces lost events and no events prior to the wrap", - T_META_ASROOT(true), T_META_CHECK_LEAKS(false)) -{ - ktrace_session_t s; - __block int events = 0; - int mib[4]; - size_t needed; - kbufinfo_t buf_info; - int wait_wrapping_secs = (WRAPPING_EVENTS_COUNT / TRACE_ITERATIONS) + 5; - int current_secs = wait_wrapping_secs; - - /* use sysctls manually to bypass libktrace assumptions */ - - mib[0] = CTL_KERN; mib[1] = KERN_KDEBUG; mib[2] = KERN_KDSETUP; mib[3] = 0; - needed = 0; - T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, NULL, &needed, NULL, 0), - "KERN_KDSETUP"); - - mib[2] = KERN_KDSETBUF; mib[3] = WRAPPING_EVENTS_COUNT; - T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDSETBUF"); - - mib[2] = KERN_KDENABLE; mib[3] = 1; - T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDENABLE"); - - /* wrapping is on by default */ - - 
/* wait until wrapped */ - T_LOG("waiting for trace to wrap"); - mib[2] = KERN_KDGETBUF; - needed = sizeof(buf_info); - do { - sleep(1); - for (int i = 0; i < TRACE_ITERATIONS; i++) { - T_QUIET; - T_ASSERT_POSIX_SUCCESS(kdebug_trace(0xfefe0000, 0, 0, 0, 0), NULL); - } - T_QUIET; - T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, &buf_info, &needed, NULL, 0), - NULL); - } while (!(buf_info.flags & KDBG_WRAPPED) && --current_secs > 0); - - T_ASSERT_TRUE(buf_info.flags & KDBG_WRAPPED, - "trace wrapped (after %d seconds within %d second timeout)", - wait_wrapping_secs - current_secs, wait_wrapping_secs); - - s = ktrace_session_create(); - T_QUIET; T_ASSERT_NOTNULL(s, NULL); - T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(s), NULL); - - ktrace_events_all(s, ^void(struct trace_point *tp) { - if (events == 0) { - T_EXPECT_EQ(tp->debugid, (unsigned int)TRACE_LOST_EVENTS, - "first event's debugid 0x%08x (%s) should be TRACE_LOST_EVENTS", - tp->debugid, - ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK)); - } else { - T_QUIET; - T_EXPECT_NE(tp->debugid, (unsigned int)TRACE_LOST_EVENTS, - "event debugid 0x%08x (%s) should not be TRACE_LOST_EVENTS", - tp->debugid, - ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK)); - } - - events++; - if (events > WRAPPING_EVENTS_THRESHOLD) { - ktrace_end(s, 1); - } - }); - - ktrace_set_completion_handler(s, ^(void) { - ktrace_session_destroy(s); - T_END; - }); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - - dispatch_main(); -} - -T_DECL(reject_old_events, - "ensure that kdebug rejects events from before tracing began", - T_META_ASROOT(true), T_META_CHECK_LEAKS(false)) -{ - __block uint64_t event_horizon_ts; - __block int events = 0; - - ktrace_session_t s = ktrace_session_create(); - T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create"); - - ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0), - KDBG_EVENTID(DBG_BSD + 1, 0, 0), - ^(struct trace_point *tp) - { - events++; - 
T_EXPECT_GT(tp->timestamp, event_horizon_ts, - "events in trace should be from after tracing began"); - }); - - ktrace_set_completion_handler(s, ^{ - T_EXPECT_EQ(events, 2, "should see only two events"); - ktrace_session_destroy(s); - T_END; - }); - - event_horizon_ts = mach_absolute_time(); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - /* first, try an old event at the beginning of trace */ - assert_kdebug_test(KDBG_TEST_OLD_TIMES); - /* after a good event has been traced, old events should be rejected */ - assert_kdebug_test(KDBG_TEST_OLD_TIMES); - ktrace_end(s, 0); - - dispatch_main(); -} - -#define ORDERING_TIMEOUT_SEC 5 - -T_DECL(ascending_time_order, - "ensure that kdebug events are in ascending order based on time", - T_META_ASROOT(true), T_META_CHECK_LEAKS(false)) -{ - __block uint64_t prev_ts = 0; - __block uint32_t prev_debugid = 0; - __block unsigned int prev_cpu = 0; - __block bool in_order = true; - - ktrace_session_t s = ktrace_session_create(); - T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create"); - - ktrace_events_all(s, ^(struct trace_point *tp) { - if (tp->timestamp < prev_ts) { - in_order = false; - T_FAIL("found timestamps out of order"); - T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)", - prev_ts, prev_debugid, prev_cpu); - T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)", - tp->timestamp, tp->debugid, tp->cpuid); - } - }); - - ktrace_set_completion_handler(s, ^{ - ktrace_session_destroy(s); - T_EXPECT_TRUE(in_order, "event timestamps were in-order"); - T_END; - }); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - - /* try to inject old timestamps into trace */ - assert_kdebug_test(KDBG_TEST_OLD_TIMES); - - dispatch_after(dispatch_time(DISPATCH_TIME_NOW, ORDERING_TIMEOUT_SEC * NSEC_PER_SEC), - dispatch_get_main_queue(), ^{ - T_LOG("ending test after timeout"); - ktrace_end(s, 1); - }); - - dispatch_main(); - -} - -#pragma mark dyld tracing - -__attribute__((aligned(8))) -static const char 
map_uuid[16] = "map UUID"; - -__attribute__((aligned(8))) -static const char unmap_uuid[16] = "unmap UUID"; - -__attribute__((aligned(8))) -static const char sc_uuid[16] = "shared UUID"; - -static fsid_t map_fsid = { .val = { 42, 43 } }; -static fsid_t unmap_fsid = { .val = { 44, 45 } }; -static fsid_t sc_fsid = { .val = { 46, 47 } }; - -static fsobj_id_t map_fsobjid = { .fid_objno = 42, .fid_generation = 43 }; -static fsobj_id_t unmap_fsobjid = { .fid_objno = 44, .fid_generation = 45 }; -static fsobj_id_t sc_fsobjid = { .fid_objno = 46, .fid_generation = 47 }; - -#define MAP_LOAD_ADDR 0xabadcafe -#define UNMAP_LOAD_ADDR 0xfeedface -#define SC_LOAD_ADDR 0xfedfaced - -__unused -static void -expect_dyld_image_info(struct trace_point *tp, const uint64_t *exp_uuid, - uint64_t exp_load_addr, fsid_t *exp_fsid, fsobj_id_t *exp_fsobjid, - int order) -{ -#if defined(__LP64__) - if (order == 0) { - uint64_t uuid[2]; - uint64_t load_addr; - fsid_t fsid; - - uuid[0] = (uint64_t)tp->arg1; - uuid[1] = (uint64_t)tp->arg2; - load_addr = (uint64_t)tp->arg3; - fsid.val[0] = (int32_t)(tp->arg4 & UINT32_MAX); - fsid.val[1] = (int32_t)((uint64_t)tp->arg4 >> 32); - - T_QUIET; T_EXPECT_EQ(uuid[0], exp_uuid[0], NULL); - T_QUIET; T_EXPECT_EQ(uuid[1], exp_uuid[1], NULL); - T_QUIET; T_EXPECT_EQ(load_addr, exp_load_addr, NULL); - T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL); - T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL); - } else if (order == 1) { - fsobj_id_t fsobjid; - - fsobjid.fid_objno = (uint32_t)(tp->arg1 & UINT32_MAX); - fsobjid.fid_generation = (uint32_t)((uint64_t)tp->arg1 >> 32); - - T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL); - T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation, - exp_fsobjid->fid_generation, NULL); - } else { - T_ASSERT_FAIL("unrecognized order of events %d", order); - } -#else /* defined(__LP64__) */ - if (order == 0) { - uint32_t uuid[4]; - - uuid[0] = (uint32_t)tp->arg1; - uuid[1] = (uint32_t)tp->arg2; - uuid[2] = 
(uint32_t)tp->arg3; - uuid[3] = (uint32_t)tp->arg4; - - T_QUIET; T_EXPECT_EQ(uuid[0], (uint32_t)exp_uuid[0], NULL); - T_QUIET; T_EXPECT_EQ(uuid[1], (uint32_t)(exp_uuid[0] >> 32), NULL); - T_QUIET; T_EXPECT_EQ(uuid[2], (uint32_t)exp_uuid[1], NULL); - T_QUIET; T_EXPECT_EQ(uuid[3], (uint32_t)(exp_uuid[1] >> 32), NULL); - } else if (order == 1) { - uint32_t load_addr; - fsid_t fsid; - fsobj_id_t fsobjid; - - load_addr = (uint32_t)tp->arg1; - fsid.val[0] = (int32_t)tp->arg2; - fsid.val[1] = (int32_t)tp->arg3; - fsobjid.fid_objno = (uint32_t)tp->arg4; - - T_QUIET; T_EXPECT_EQ(load_addr, (uint32_t)exp_load_addr, NULL); - T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL); - T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL); - T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL); - } else if (order == 2) { - fsobj_id_t fsobjid; - - fsobjid.fid_generation = tp->arg1; - - T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation, - exp_fsobjid->fid_generation, NULL); - } else { - T_ASSERT_FAIL("unrecognized order of events %d", order); - } -#endif /* defined(__LP64__) */ -} - -#if defined(__LP64__) -#define DYLD_CODE_OFFSET (0) -#define DYLD_EVENTS (2) -#else -#define DYLD_CODE_OFFSET (2) -#define DYLD_EVENTS (3) -#endif - -static void -expect_dyld_events(ktrace_session_t s, const char *name, uint32_t base_code, - const char *exp_uuid, uint64_t exp_load_addr, fsid_t *exp_fsid, - fsobj_id_t *exp_fsobjid, uint8_t *saw_events) -{ - for (int i = 0; i < DYLD_EVENTS; i++) { - ktrace_events_single(s, - KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID, - base_code + DYLD_CODE_OFFSET + (unsigned int)i), - ^(struct trace_point *tp) - { - T_LOG("checking %s event %c", name, 'A' + i); - expect_dyld_image_info(tp, (const void *)exp_uuid, exp_load_addr, - exp_fsid, exp_fsobjid, i); - *saw_events |= (1U << i); - }); - } -} - -T_DECL(dyld_events, "test that dyld registering libraries emits events", - T_META_ASROOT(true)) -{ - ktrace_session_t s; - dyld_kernel_image_info_t info; - - /* 
- * Use pointers instead of __block variables in order to use these variables - * in the completion block below _and_ pass pointers to them to the - * expect_dyld_events function. - */ - uint8_t saw_events[3] = { 0 }; - uint8_t *saw_mapping = &(saw_events[0]); - uint8_t *saw_unmapping = &(saw_events[1]); - uint8_t *saw_shared_cache = &(saw_events[2]); - - s = ktrace_session_create(); - T_ASSERT_NOTNULL(s, NULL); - T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL); - - expect_dyld_events(s, "mapping", DBG_DYLD_UUID_MAP_A, map_uuid, - MAP_LOAD_ADDR, &map_fsid, &map_fsobjid, saw_mapping); - expect_dyld_events(s, "unmapping", DBG_DYLD_UUID_UNMAP_A, unmap_uuid, - UNMAP_LOAD_ADDR, &unmap_fsid, &unmap_fsobjid, saw_unmapping); - expect_dyld_events(s, "shared cache", DBG_DYLD_UUID_SHARED_CACHE_A, - sc_uuid, SC_LOAD_ADDR, &sc_fsid, &sc_fsobjid, saw_shared_cache); - - ktrace_set_completion_handler(s, ^(void) { - T_EXPECT_EQ(__builtin_popcount(*saw_mapping), DYLD_EVENTS, NULL); - T_EXPECT_EQ(__builtin_popcount(*saw_unmapping), DYLD_EVENTS, NULL); - T_EXPECT_EQ(__builtin_popcount(*saw_shared_cache), DYLD_EVENTS, NULL); - ktrace_session_destroy(s); - T_END; - }); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - - info.load_addr = MAP_LOAD_ADDR; - memcpy(info.uuid, map_uuid, sizeof(info.uuid)); - info.fsid = map_fsid; - info.fsobjid = map_fsobjid; - T_EXPECT_MACH_SUCCESS(task_register_dyld_image_infos(mach_task_self(), - &info, 1), NULL); - - info.load_addr = UNMAP_LOAD_ADDR; - memcpy(info.uuid, unmap_uuid, sizeof(info.uuid)); - info.fsid = unmap_fsid; - info.fsobjid = unmap_fsobjid; - T_EXPECT_MACH_SUCCESS(task_unregister_dyld_image_infos(mach_task_self(), - &info, 1), NULL); - - info.load_addr = SC_LOAD_ADDR; - memcpy(info.uuid, sc_uuid, sizeof(info.uuid)); - info.fsid = sc_fsid; - info.fsobjid = sc_fsobjid; - T_EXPECT_MACH_SUCCESS(task_register_dyld_shared_cache_image_info( - mach_task_self(), info, FALSE, FALSE), NULL); - - ktrace_end(s, 
0); - - dispatch_main(); -} - -#pragma mark kdebug kernel macros - -#define EXP_KERNEL_EVENTS 5U - -static const uint32_t dev_evts[EXP_KERNEL_EVENTS] = { - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 0), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 1), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 2), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 3), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 4), -}; - -static const uint32_t rel_evts[EXP_KERNEL_EVENTS] = { - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 5), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 6), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 7), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 8), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 9), -}; - -static const uint32_t filt_evts[EXP_KERNEL_EVENTS] = { - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 10), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 11), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 12), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 13), - BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 14), -}; - -static bool -is_development_kernel(void) -{ - static dispatch_once_t is_development_once; - static bool is_development; - - dispatch_once(&is_development_once, ^(void) { - int dev; - size_t dev_size = sizeof(dev); - - T_QUIET; - T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, - &dev_size, NULL, 0), NULL); - is_development = (dev != 0); - }); - - return is_development; -} - -static void -expect_event(struct trace_point *tp, unsigned int *events, - const uint32_t *event_ids, size_t event_ids_len) -{ - unsigned int event_idx = *events; - bool event_found = false; - size_t i; - for (i = 0; i < event_ids_len; i++) { - if (event_ids[i] == (tp->debugid & KDBG_EVENTID_MASK)) { - T_LOG("found event 0x%x", tp->debugid); - event_found = true; - } - } - - if (!event_found) { - return; - } - - *events += 1; - for (i = 0; i < event_idx; i++) { - T_QUIET; T_EXPECT_EQ(((uintptr_t *)&tp->arg1)[i], (uintptr_t)i + 1, - NULL); - } - for (; i < 4; i++) { - T_QUIET; T_EXPECT_EQ(((uintptr_t *)&tp->arg1)[i], (uintptr_t)0, NULL); - } -} - -static void -expect_release_event(struct trace_point *tp, unsigned int *events) 
-{ - expect_event(tp, events, rel_evts, - sizeof(rel_evts) / sizeof(rel_evts[0])); -} - -static void -expect_development_event(struct trace_point *tp, unsigned int *events) -{ - expect_event(tp, events, dev_evts, - sizeof(dev_evts) / sizeof(dev_evts[0])); -} - -static void -expect_filtered_event(struct trace_point *tp, unsigned int *events) -{ - expect_event(tp, events, filt_evts, - sizeof(filt_evts) / sizeof(filt_evts[0])); -} - -T_DECL(kernel_events, "ensure kernel macros work", - T_META_ASROOT(true)) -{ - ktrace_session_t s; - - - s = ktrace_session_create(); - T_QUIET; T_ASSERT_NOTNULL(s, NULL); - - __block unsigned int dev_seen = 0; - __block unsigned int rel_seen = 0; - __block unsigned int filt_seen = 0; - ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0), - KDBG_EVENTID(DBG_BSD + 1, 0, 0), - ^(struct trace_point *tp) - { - expect_development_event(tp, &dev_seen); - expect_release_event(tp, &rel_seen); - expect_filtered_event(tp, &filt_seen); - }); - - ktrace_set_completion_handler(s, ^(void) { - /* - * Development-only events are only filtered if running on an embedded - * OS. - */ - unsigned int dev_exp; -#if TARGET_OS_EMBEDDED - dev_exp = is_development_kernel() ? 
EXP_KERNEL_EVENTS : 0U; -#else - dev_exp = EXP_KERNEL_EVENTS; -#endif - - T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, - "release and development events seen"); - T_EXPECT_EQ(dev_seen, dev_exp, "development-only events seen/not seen"); - T_EXPECT_EQ(filt_seen, dev_exp, "filter-only events seen"); - ktrace_session_destroy(s); - T_END; - }); - - ktrace_filter_pid(s, getpid()); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - assert_kdebug_test(KDBG_TEST_MACROS); - - ktrace_end(s, 0); - - dispatch_main(); -} - -T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work", - T_META_ASROOT(true)) -{ - ktrace_session_t s; - - s = ktrace_session_create(); - T_QUIET; T_ASSERT_NOTNULL(s, NULL); - - __block unsigned int dev_seen = 0; - __block unsigned int rel_seen = 0; - __block unsigned int filt_seen = 0; - ktrace_events_all(s, ^(struct trace_point *tp) { - expect_development_event(tp, &dev_seen); - expect_release_event(tp, &rel_seen); - /* to make sure no filtered events are emitted */ - expect_filtered_event(tp, &filt_seen); - }); - - ktrace_set_completion_handler(s, ^(void) { - ktrace_session_destroy(s); - - T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, NULL); -#if defined(__arm__) || defined(__arm64__) - T_EXPECT_EQ(dev_seen, is_development_kernel() ? EXP_KERNEL_EVENTS : 0U, - NULL); -#else - T_EXPECT_EQ(dev_seen, EXP_KERNEL_EVENTS, NULL); -#endif /* defined(__arm__) || defined(__arm64__) */ - T_EXPECT_EQ(filt_seen, 0U, NULL); - T_END; - }); - - T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL); - assert_kdebug_test(KDBG_TEST_MACROS); - - ktrace_end(s, 0); - - dispatch_main(); -} diff --git a/tools/tests/darwintests/kevent_qos.c b/tools/tests/darwintests/kevent_qos.c deleted file mode 100644 index 823bf1a93..000000000 --- a/tools/tests/darwintests/kevent_qos.c +++ /dev/null @@ -1,908 +0,0 @@ -/* - * kevent_qos: Tests Synchronous IPC QOS override. 
- */ - -#ifdef T_NAMESPACE -#undef T_NAMESPACE -#endif - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -T_GLOBAL_META(T_META_NAMESPACE("xnu.kevent_qos")); - -#define ARRAYLEN(arr) (sizeof(arr) / sizeof(arr[0])) - -#define RECV_TIMEOUT_SECS (4) -#define SEND_TIMEOUT_SECS (6) -#define HELPER_TIMEOUT_SECS (15) - -#define ENV_VAR_QOS (3) -static const char *qos_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_BO", "XNU_TEST_QOS_QO", "XNU_TEST_QOS_AO"}; -static const char *qos_name_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_NAME_BO", "XNU_TEST_QOS_NAME_QO", "XNU_TEST_QOS_NAME_AO"}; - -#define ENV_VAR_FUNCTION (1) -static const char *wl_function_name = "XNU_TEST_WL_FUNCTION"; - -static qos_class_t g_expected_qos[ENV_VAR_QOS]; -static const char *g_expected_qos_name[ENV_VAR_QOS]; - -#define ENV_QOS_BEFORE_OVERRIDE (0) -#define ENV_QOS_QUEUE_OVERRIDE (1) -#define ENV_QOS_AFTER_OVERRIDE (2) - -#pragma mark pthread callbacks - -static void -worker_cb(pthread_priority_t __unused priority) -{ - T_FAIL("a worker thread was created"); -} - -static void -event_cb(void ** __unused events, int * __unused nevents) -{ - T_FAIL("a kevent routine was called instead of workloop"); -} - -/* - * Basic WL handler callback, it sleeps for n seconds and then checks the - * effective Qos of the servicer thread. - */ -static void -workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist __unused, int *events) -{ - T_LOG("Workloop handler workloop_cb_test_intransit called. 
" - "Will wait for %d seconds to make sure client enqueues the sync msg \n", - 2 * RECV_TIMEOUT_SECS); - - /* Wait for the client to send the high priority message to override the qos */ - sleep(2 * RECV_TIMEOUT_SECS); - - /* Skip the test if we can't check Qos */ - if (geteuid() != 0) { - T_SKIP("kevent_qos test requires root privileges to run."); - } - - /* The effective Qos should be the one expected after override */ - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); - - T_END; - *events = 0; -} - -/* - * WL handler which checks if the servicer thread has correct Qos. - */ -static void -workloop_cb_test_sync_send(uint64_t *workloop_id __unused, void **eventslist __unused, int *events) -{ - T_LOG("Workloop handler workloop_cb_test_sync_send called"); - - if (geteuid() != 0) { - T_SKIP("kevent_qos test requires root privileges to run."); - } - - /* The effective Qos should be the one expected after override */ - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); - - T_END; - *events = 0; -} - -/* - * WL handler which checks the overridden Qos and then enables the knote and checks - * for the Qos again if that dropped the sync ipc override. 
- */ -static void -workloop_cb_test_sync_send_and_enable(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events) -{ - int r; - T_LOG("Workloop handler workloop_cb_test_sync_send_and_enable called"); - - if (geteuid() != 0) { - T_SKIP("kevent_qos test requires root privileges to run."); - } - - /* The effective Qos should be the one expected after override */ - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); - - /* Enable the knote */ - struct kevent_qos_s *kev = *eventslist; - kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED; - struct kevent_qos_s kev_err[] = {{ 0 }}; - - r = kevent_id(*workloop_id, kev, 1, kev_err, 1, NULL, - NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST); - T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id"); - - /* Sync override should have been removed */ - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]); - - T_END; - *events = 0; -} - -/* - * WL handler receives the first message and checks sync ipc override, then enables the knote - * and receives 2nd message and checks it sync ipc override. 
- */ -static int send_two_sync_handler_called = 0; -static void -workloop_cb_test_send_two_sync(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events) -{ - T_LOG("Workloop handler workloop_cb_test_send_two_sync called for %d time", send_two_sync_handler_called + 1); - - if (geteuid() != 0) { - T_SKIP("kevent_qos test requires root privileges to run."); - } - - T_LOG("Number of events received is %d\n", *events); - - if (send_two_sync_handler_called == 0) { - /* The effective Qos should be the one expected after override */ - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); - - /* Enable the knote to get 2nd message */ - struct kevent_qos_s *kev = *eventslist; - kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED; - kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | - MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | - MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | - MACH_RCV_VOUCHER); - *events = 1; - } else { - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]); - T_END; - *events = 0; - } - send_two_sync_handler_called++; -} - -/* - * Checks the sync ipc override and then waits for client to destroy the - * special reply port and checks if that removes the sync ipc override. 
- */ -static boolean_t two_send_and_destroy_test_passed = FALSE; -static int two_send_and_destroy_handler = 0; -static void -workloop_cb_test_two_send_and_destroy(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist __unused, int *events) -{ - T_LOG("Workloop handler workloop_cb_test_two_send_and_destroy called %d times", two_send_and_destroy_handler + 1); - - if (geteuid() != 0) { - T_SKIP("kevent_qos test requires root privileges to run."); - } - - if (two_send_and_destroy_handler == 0) { - /* The effective Qos should be the one expected after override */ - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); - - sleep(2 * RECV_TIMEOUT_SECS); - - /* Special reply port should have been destroyed, check Qos again */ - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], - "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]); - - two_send_and_destroy_test_passed = TRUE; - } else { - if (two_send_and_destroy_test_passed) { - T_END; - } - } - - /* Enable the knote to get next message */ - struct kevent_qos_s *kev = *eventslist; - kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED; - kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | - MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | - MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | - MACH_RCV_VOUCHER); - *events = 1; - two_send_and_destroy_handler++; - T_LOG("Handler returning \n"); -} - -#pragma mark Mach receive - -#define KEVENT_QOS_SERVICE_NAME "com.apple.xnu.test.kevent_qos" - -static mach_port_t -get_server_port(void) -{ - mach_port_t port; - kern_return_t kr = bootstrap_check_in(bootstrap_port, - KEVENT_QOS_SERVICE_NAME, &port); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server bootstrap_check_in"); - return port; -} - -static void -env_set_qos(char **env, qos_class_t qos[], const char 
*qos_name[], const char *wl_function) -{ - int i; - char *qos_str, *qos_name_str; - for (i = 0; i < ENV_VAR_QOS; i++) { - T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&qos_str, "%s=%d", qos_env[i] , qos[i]), - NULL); - T_QUIET; T_ASSERT_POSIX_SUCCESS( - asprintf(&qos_name_str, "%s=%s", qos_name_env[i], qos_name[i]), NULL); - env[2 * i] = qos_str; - env[2 * i + 1] = qos_name_str; - } - T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&env[2 * i], "%s=%s", wl_function_name, wl_function), - NULL); - env[2 * i + 1] = NULL; -} - -static void -environ_get_qos(qos_class_t qos[], const char *qos_name[], const char **wl_function) -{ - char *qos_str; - char *qos_end; - int i; - - for (i = 0; i < ENV_VAR_QOS; i++) { - qos_str = getenv(qos_env[i]); - T_QUIET; T_ASSERT_NOTNULL(qos_str, "getenv(%s)", qos_env[i]); - - unsigned long qos_l = strtoul(qos_str, &qos_end, 10); - T_QUIET; T_ASSERT_EQ(*qos_end, '\0', "getenv(%s) = '%s' should be an " - "integer", qos_env[i], qos_str); - - T_QUIET; T_ASSERT_LT(qos_l, (unsigned long)100, "getenv(%s) = '%s' should " - "be less than 100", qos_env[i], qos_str); - - qos[i] = (qos_class_t)qos_l; - qos_name[i] = getenv(qos_name_env[i]); - T_QUIET; T_ASSERT_NOTNULL(qos_name[i], "getenv(%s)", qos_name_env[i]); - } - *wl_function = getenv(wl_function_name); - T_QUIET; T_ASSERT_NOTNULL(*wl_function, "getenv(%s)", wl_function_name); -} - -static mach_voucher_t -create_pthpriority_voucher(mach_msg_priority_t qos) -{ - char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)]; - - mach_voucher_t voucher = MACH_PORT_NULL; - kern_return_t ret; - ipc_pthread_priority_value_t ipc_pthread_priority_value = - (ipc_pthread_priority_value_t)qos; - - mach_voucher_attr_raw_recipe_array_t recipes; - mach_voucher_attr_raw_recipe_size_t recipe_size = 0; - mach_voucher_attr_recipe_t recipe = - (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size]; - - recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY; - recipe->command = 
MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE; - recipe->previous_voucher = MACH_VOUCHER_NULL; - memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value)); - recipe->content_size = sizeof(ipc_pthread_priority_value_t); - recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size; - - recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0]; - - ret = host_create_mach_voucher(mach_host_self(), - recipes, - recipe_size, - &voucher); - - T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher"); - return voucher; -} - -static void -send( - mach_port_t send_port, - mach_port_t reply_port, - mach_port_t msg_port, - mach_msg_priority_t qos) -{ - kern_return_t ret = 0; - - struct { - mach_msg_header_t header; - mach_msg_body_t body; - mach_msg_port_descriptor_t port_descriptor; - } send_msg = { - .header = - { - .msgh_remote_port = send_port, - .msgh_local_port = reply_port, - .msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, - reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0, - MACH_MSG_TYPE_MOVE_SEND, - MACH_MSGH_BITS_COMPLEX), - .msgh_id = 0x100, - .msgh_size = sizeof(send_msg), - .msgh_voucher_port = create_pthpriority_voucher(qos), - }, - .body = - { - .msgh_descriptor_count = 1, - }, - .port_descriptor = - { - .name = msg_port, .disposition = MACH_MSG_TYPE_MOVE_RECEIVE, .type = MACH_MSG_PORT_DESCRIPTOR, - }, - }; - - if (msg_port == MACH_PORT_NULL) { - send_msg.body.msgh_descriptor_count = 0; - } - - ret = mach_msg(&(send_msg.header), - MACH_SEND_MSG | - MACH_SEND_TIMEOUT | - MACH_SEND_OVERRIDE| - (reply_port ? 
MACH_SEND_SYNC_OVERRIDE : 0) , - send_msg.header.msgh_size, - 0, - MACH_PORT_NULL, - 0, - 0); - - T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg"); -} - -static void -receive( - mach_port_t rcv_port, - mach_port_t notify_port) -{ - kern_return_t ret = 0; - - struct { - mach_msg_header_t header; - mach_msg_body_t body; - mach_msg_port_descriptor_t port_descriptor; - } rcv_msg = { - .header = - { - .msgh_remote_port = MACH_PORT_NULL, - .msgh_local_port = rcv_port, - .msgh_size = sizeof(rcv_msg), - }, - }; - - T_LOG("Client: Starting sync receive\n"); - - ret = mach_msg(&(rcv_msg.header), - MACH_RCV_MSG | - MACH_RCV_TIMEOUT | - MACH_RCV_SYNC_WAIT, - 0, - rcv_msg.header.msgh_size, - rcv_port, - SEND_TIMEOUT_SECS * 1000, - notify_port); - - if (!(ret == MACH_RCV_TIMED_OUT || ret == MACH_MSG_SUCCESS)) { - T_ASSERT_FAIL("Sync rcv failed \n"); - } -} - -T_HELPER_DECL(qos_get_special_reply_port, - "Test get_special_reply_port and it's corner cases.") -{ - mach_port_t special_reply_port; - mach_port_t new_special_reply_port; - - special_reply_port = thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port"); - - new_special_reply_port = thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(new_special_reply_port , "get_thread_special_reply_port"); - - mach_port_destroy(mach_task_self(), special_reply_port); - mach_port_destroy(mach_task_self(), new_special_reply_port); - - new_special_reply_port = thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(new_special_reply_port , "get_thread_special_reply_port"); - - T_END; -} - -T_HELPER_DECL(qos_client_send_to_intransit, - "Send synchronous messages to an intransit port") -{ - mach_port_t qos_send_port; - mach_port_t msg_port; - mach_port_t special_reply_port; - - kern_return_t kr = bootstrap_look_up(bootstrap_port, - KEVENT_QOS_SERVICE_NAME, &qos_send_port); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); - - special_reply_port = 
thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port"); - - /* Create a rcv right to send in a msg */ - kr = mach_port_allocate(mach_task_self(), - MACH_PORT_RIGHT_RECEIVE, - &msg_port); - - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_allocate"); - - kr = mach_port_insert_right(mach_task_self(), - msg_port, - msg_port, - MACH_MSG_TYPE_MAKE_SEND); - - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_insert_right"); - - /* Send an empty msg on the port to fire the WL thread */ - send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0)); - - sleep(SEND_TIMEOUT_SECS); - - /* Send the message with msg port as in-transit port, this msg will not be dequeued */ - send(qos_send_port, MACH_PORT_NULL, msg_port, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0)); - - /* Send the message to the in-transit port, it should override the rcv's workloop */ - send(msg_port, special_reply_port, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0)); - T_LOG("Client done sending messages, now waiting for server to end the test"); - sleep(2 * SEND_TIMEOUT_SECS); - - T_ASSERT_FAIL("client timed out"); -} - -T_HELPER_DECL(qos_client_send_sync_and_enqueue_rcv, - "Send synchronous messages and enqueue the rcv right") -{ - mach_port_t qos_send_port; - mach_port_t msg_port; - mach_port_t special_reply_port; - - kern_return_t kr = bootstrap_look_up(bootstrap_port, - KEVENT_QOS_SERVICE_NAME, &qos_send_port); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); - - special_reply_port = thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port"); - - /* Create a rcv right to send in a msg */ - kr = mach_port_allocate(mach_task_self(), - MACH_PORT_RIGHT_RECEIVE, - &msg_port); - - T_QUIET; 
T_ASSERT_MACH_SUCCESS(kr, "client mach_port_allocate"); - - kr = mach_port_insert_right(mach_task_self(), - msg_port, - msg_port, - MACH_MSG_TYPE_MAKE_SEND); - - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_insert_right"); - - /* Send the message to msg port */ - send(msg_port, special_reply_port, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0)); - - /* Send the message with msg port as in-transit port, copyin of in-transit will cause sync override */ - send(qos_send_port, MACH_PORT_NULL, msg_port, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0)); - - T_LOG("Client done sending messages, now waiting for server to end the test"); - sleep(3 * SEND_TIMEOUT_SECS); - - T_ASSERT_FAIL("client timed out"); -} - -static void -thread_create_at_qos(qos_class_t qos, void * (*function)(void *)) -{ - qos_class_t qos_thread; - pthread_t thread; - pthread_attr_t attr; - int ret; - - ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL); - if (ret != 0) { - T_LOG("set priority failed\n"); - } - - pthread_attr_init(&attr); - pthread_attr_set_qos_class_np(&attr, qos, 0); - pthread_create(&thread, &attr, function, NULL); - - T_LOG("pthread created\n"); - pthread_get_qos_class_np(thread, &qos_thread, NULL); - T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL); -} - -static void * -qos_send_and_sync_rcv(void *arg __unused) -{ - mach_port_t qos_send_port; - mach_port_t special_reply_port; - - T_LOG("Client: from created thread\n"); - - T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], - "pthread QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]); - - kern_return_t kr = bootstrap_look_up(bootstrap_port, - KEVENT_QOS_SERVICE_NAME, &qos_send_port); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); - - special_reply_port = thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port"); - - /* 
enqueue two messages to make sure that mqueue is not empty */ - send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0)); - - send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0)); - - sleep(SEND_TIMEOUT_SECS); - - /* sync wait on msg port */ - receive(special_reply_port, qos_send_port); - - T_LOG("Client done doing sync rcv, now waiting for server to end the test"); - sleep(SEND_TIMEOUT_SECS); - - T_ASSERT_FAIL("client timed out"); - return 0; -} - -T_HELPER_DECL(qos_client_send_sync_and_sync_rcv, - "Send messages and syncronously wait for rcv") -{ - thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_send_and_sync_rcv); - sleep(HELPER_TIMEOUT_SECS); -} - -T_HELPER_DECL(qos_client_send_sync_msg, - "Send synchronous messages") -{ - mach_port_t qos_send_port; - mach_port_t special_reply_port; - - kern_return_t kr = bootstrap_look_up(bootstrap_port, - KEVENT_QOS_SERVICE_NAME, &qos_send_port); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); - - special_reply_port = thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port"); - - /* Send the message to msg port */ - send(qos_send_port, special_reply_port, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0)); - - T_LOG("Client done sending messages, now waiting for server to end the test"); - sleep(2 * SEND_TIMEOUT_SECS); - - T_ASSERT_FAIL("client timed out"); -} - -T_HELPER_DECL(qos_client_send_two_sync_msg, - "Send two synchronous messages at different qos") -{ - mach_port_t qos_send_port; - mach_port_t special_reply_port; - - kern_return_t kr = bootstrap_look_up(bootstrap_port, - KEVENT_QOS_SERVICE_NAME, &qos_send_port); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); - - special_reply_port = 
thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port"); - - /* Send the message to msg port */ - send(qos_send_port, special_reply_port, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0)); - - /* Send the message to msg port */ - send(qos_send_port, special_reply_port, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0)); - - T_LOG("Client done sending messages, now waiting for server to end the test"); - sleep(SEND_TIMEOUT_SECS); - - T_ASSERT_FAIL("client timed out"); -} - -T_HELPER_DECL(qos_client_send_two_msg_and_destroy, - "Send two messages with 2nd one as sync and then destory the special reply port") -{ - mach_port_t qos_send_port; - mach_port_t special_reply_port; - - kern_return_t kr = bootstrap_look_up(bootstrap_port, - KEVENT_QOS_SERVICE_NAME, &qos_send_port); - T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up"); - - special_reply_port = thread_get_special_reply_port(); - T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port"); - - /* Send an async message to msg port */ - send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0)); - - /* Send the message to msg port */ - send(qos_send_port, special_reply_port, MACH_PORT_NULL, - (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0)); - - T_LOG("Client done sending messages, waiting for destroy the special reply_port"); - sleep(SEND_TIMEOUT_SECS); - - mach_port_destroy(mach_task_self(), special_reply_port); - sleep(SEND_TIMEOUT_SECS); - - T_ASSERT_FAIL("client timed out"); -} - -static void -run_client_server(const char *server_name, const char *client_name, qos_class_t qos[], - const char *qos_name[], const char *wl_function) -{ - char *env[2 * ENV_VAR_QOS + ENV_VAR_FUNCTION + 1]; - env_set_qos(env, qos, 
qos_name, wl_function); - - for (int i = 0; i < ENV_VAR_QOS; i++) { - g_expected_qos[i] = qos[i]; - g_expected_qos_name[i] = qos_name[i]; - } - - dt_helper_t helpers[] = { - dt_launchd_helper_env("com.apple.xnu.test.kevent_qos.plist", - server_name, env), - dt_fork_helper(client_name) - }; - dt_run_helpers(helpers, 2, HELPER_TIMEOUT_SECS); -} - -#pragma mark Mach receive - kevent_qos - - -static void -expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[], const char *wl_function) -{ - int r; - - /* Qos expected by workloop thread */ - for (int i = 0; i < ENV_VAR_QOS; i++) { - g_expected_qos[i] = qos[i]; - g_expected_qos_name[i] = qos_name[i]; - } - - if (strcmp(wl_function, "workloop_cb_test_intransit") == 0) { - T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( - worker_cb, event_cb, - (pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL); - } else if (strcmp(wl_function, "workloop_cb_test_sync_send") == 0) { - T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( - worker_cb, event_cb, - (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send, 0, 0), NULL); - } else if (strcmp(wl_function, "workloop_cb_test_sync_send_and_enable") == 0) { - T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( - worker_cb, event_cb, - (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable, 0, 0), NULL); - } else if (strcmp(wl_function, "workloop_cb_test_send_two_sync") == 0) { - T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( - worker_cb, event_cb, - (pthread_workqueue_function_workloop_t)workloop_cb_test_send_two_sync, 0, 0), NULL); - } else if (strcmp(wl_function, "workloop_cb_test_two_send_and_destroy") == 0) { - T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop( - worker_cb, event_cb, - (pthread_workqueue_function_workloop_t)workloop_cb_test_two_send_and_destroy, 0, 0), NULL); - } else { - T_ASSERT_FAIL("no 
workloop function specified \n"); - } - - struct kevent_qos_s kev[] = {{ - .ident = port, - .filter = EVFILT_MACHPORT, - .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED, - .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | - MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) | - MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) | - MACH_RCV_VOUCHER), - .data = 1, - .qos = (int32_t)_pthread_qos_class_encode(qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0) - }}; - - struct kevent_qos_s kev_err[] = {{ 0 }}; - - /* Setup workloop for mach msg rcv */ - r = kevent_id(25, kev, 1, kev_err, 1, NULL, - NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS); - - T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id"); - T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id"); - sleep(HELPER_TIMEOUT_SECS); -} - -T_HELPER_DECL(server_kevent_id, - "Reply with the QoS that a dispatch source event handler ran with") -{ - qos_class_t qos[ENV_VAR_QOS]; - const char *qos_name[ENV_VAR_QOS]; - const char *wl_function; - environ_get_qos(qos, qos_name, &wl_function); - - expect_kevent_id_recv(get_server_port(), qos, qos_name, wl_function); - sleep(HELPER_TIMEOUT_SECS); - T_ASSERT_FAIL("should receive a message within %d seconds", - RECV_TIMEOUT_SECS); -} - -#define TEST_QOS(server_name, client_name, name, wl_function_name, qos_bo, qos_bo_name, qos_qo, qos_qo_name, qos_ao, qos_ao_name) \ - T_DECL(server_kevent_id_##name, \ - "Event delivery at " qos_ao_name " QoS using a kevent_id", \ - T_META_ASROOT(YES)) \ - { \ - qos_class_t qos_array[ENV_VAR_QOS] = {qos_bo, qos_qo, qos_ao}; \ - const char *qos_name_array[ENV_VAR_QOS] = {qos_bo_name, qos_qo_name, qos_ao_name}; \ - run_client_server(server_name, client_name, qos_array, qos_name_array, wl_function_name); \ - } - -/* - * Test 1: Test special reply port SPI - * - * Create thread special reply port and check any subsequent calls to - * the same should return MACH_PORT_NULL, unless the reply port is destroyed. 
- */ -TEST_QOS("server_kevent_id", "qos_get_special_reply_port", special_reply_port, "workloop_cb_test_intransit", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_DEFAULT, "default") - -/* - * Test 2: Test sync ipc send to an in-transit port - * - * Send a sync ipc message (at IN qos) to an in-transit port enqueued in a port - * attached to a workloop. Test that the servicer of the workloop gets - * sync ipc override. - */ -TEST_QOS("server_kevent_id", "qos_client_send_to_intransit", transit_IN, "workloop_cb_test_intransit", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INITIATED, "user initiated") - -/* - * Test 3: Test sync ipc send to an in-transit port - * - * Send a sync ipc message (at UI qos) to an in-transit port enqueued in a port - * attached to a workloop. Test that the servicer of the workloop gets - * sync ipc override. - */ -TEST_QOS("server_kevent_id", "qos_client_send_to_intransit", transit_UI, "workloop_cb_test_intransit", - QOS_CLASS_USER_INITIATED, "user initiated", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INTERACTIVE, "user interactive") - -/* - * Test 4: Test enqueue of a receive right having sync ipc override - * - * Enqueue a receive right which has a sync ipc override (at IN qos) - * and test that servicer of the workloop on other side gets sync ipc - * override. - */ -TEST_QOS("server_kevent_id", "qos_client_send_sync_and_enqueue_rcv", enqueue_IN, "workloop_cb_test_intransit", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INITIATED, "user initiated") - -/* - * Test 5: Test enqueue of a receive right having sync ipc override - * - * Enqueue a receive right which has a sync ipc override (at UI qos) - * and test that servicer of the workloop on other side gets sync ipc - * override. 
- */ -TEST_QOS("server_kevent_id", "qos_client_send_sync_and_enqueue_rcv", enqueue_UI, "workloop_cb_test_intransit", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INTERACTIVE, "user interactive") - -/* - * Test 6: Test starting a sync rcv overrides the servicer - * - * Send an async message to a port and then start waiting on - * the port in mach msg rcv (at IN qos) with sync wait and test if the - * servicer of the workloop gets sync ipc override. - */ -TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_IN, "workloop_cb_test_intransit", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INITIATED, "user initiated") - -/* - * Test 7: Test starting a sync rcv overrides the servicer - * - * Send an async message to a port and then start waiting on - * the port in mach msg rcv (at UI qos) with sync wait and test if the - * servicer of the workloop gets sync ipc override. - */ -TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_UI, "workloop_cb_test_intransit", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INTERACTIVE, "user interactive") - -/* - * Test 8: test sending sync ipc message (at IN qos) to port will override the servicer - * - * Send a message with sync ipc override to a port and check if the servicer - * of the workloop on other side gets sync ipc override. - */ -TEST_QOS("server_kevent_id", "qos_client_send_sync_msg", send_sync_IN, "workloop_cb_test_sync_send", - QOS_CLASS_DEFAULT, "default", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INITIATED, "user initiated") - -/* - * Test 9: test sending sync ipc message (at UI qos) to port will override the servicer - * - * Send a message with sync ipc override to a port and check if the servicer - * of the workloop on other side gets sync ipc override. 
- */ -TEST_QOS("server_kevent_id", "qos_client_send_sync_msg", send_sync_UI, "workloop_cb_test_sync_send", - QOS_CLASS_USER_INITIATED, "user initiated", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INTERACTIVE, "user interactive") - -/* - * Test 10: test enabling a knote in workloop handler will drop the sync ipc override of delivered message - * - * Send a sync ipc message to port and check the servicer of the workloop - * on other side gets sync ipc override and once the handler enables the knote, - * that sync ipc override is dropped. - */ -TEST_QOS("server_kevent_id", "qos_client_send_sync_msg", send_sync_UI_and_enable, "workloop_cb_test_sync_send_and_enable", - QOS_CLASS_USER_INITIATED, "user initiated", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INTERACTIVE, "user interactive") - -/* - * Test 11: test returning to begin processing drops sync ipc override of delivered message - * - * Send a sync ipc message and check if enabling the knote clears the override of - * the delivered message, but should still have the override of an enqueued message. - */ -TEST_QOS("server_kevent_id", "qos_client_send_two_sync_msg", send_two_sync_UI, "workloop_cb_test_send_two_sync", - QOS_CLASS_USER_INITIATED, "user initiated", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INTERACTIVE, "user interactive") - -/* - * Test 12: test destorying the special reply port drops the override - * - * Send two async messages and a sync ipc message, the workloop handler - * should get a sync ipc override, now test if destroying the special - * reply port drops the sync ipc override on the servicer. 
- */ -TEST_QOS("server_kevent_id", "qos_client_send_two_msg_and_destroy", send_two_UI_and_destroy, "workloop_cb_test_two_send_and_destroy", - QOS_CLASS_USER_INITIATED, "user initiated", - QOS_CLASS_MAINTENANCE, "maintenance", - QOS_CLASS_USER_INTERACTIVE, "user interactive") diff --git a/tools/tests/darwintests/perf_exit.c b/tools/tests/darwintests/perf_exit.c deleted file mode 100644 index 0caafdad5..000000000 --- a/tools/tests/darwintests/perf_exit.c +++ /dev/null @@ -1,166 +0,0 @@ -#ifdef T_NAMESPACE -#undef T_NAMESPACE -#endif -#include - -#include -#include -#include -#include -#include -#include - -T_GLOBAL_META( - T_META_NAMESPACE("xnu.perf.exit"), - T_META_ASROOT(true), - T_META_LTEPHASE(LTE_SINGLEUSER) -); - -// From osfmk/kern/sched.h -#define BASEPRI_FOREGROUND 47 -#define BASEPRI_USER_INITIATED 37 -#define BASEPRI_UTILITY 20 -#define MAXPRI_THROTTLE 4 - -// From bsd/sys/proc_internal.h -#define PID_MAX 99999 - -#define EXIT_BINARY "perf_exit_proc" -#define EXIT_BINARY_PATH "./" EXIT_BINARY - -static ktrace_session_t session; -static dispatch_queue_t spawn_queue; -static uint64_t *begin_ts; -static dt_stat_time_t s; -static bool started_tracing = false; - -void run_exit_test(int proc_wired_mem, int thread_priority, int nthreads); - -static void cleanup(void) { - free(begin_ts); - dt_stat_finalize(s); - dispatch_release(spawn_queue); - if (started_tracing) { - ktrace_end(session, 1); - } -} - -void run_exit_test(int proc_wired_mem, int thread_priority, int nthreads) { - static atomic_bool ended = false; - - s = dt_stat_time_create("time"); - T_QUIET; T_ASSERT_NOTNULL(s, "created time statistic"); - - begin_ts = malloc(sizeof(uint64_t) * PID_MAX); - T_QUIET; T_ASSERT_NOTNULL(begin_ts, "created pid array"); - - T_ATEND(cleanup); - - session = ktrace_session_create(); - T_QUIET; T_ASSERT_NOTNULL(session, "created a trace session"); - - spawn_queue = dispatch_queue_create("spawn_queue", NULL); - - ktrace_set_completion_handler(session, ^{ - 
ktrace_session_destroy(session); - T_END; - }); - - ktrace_set_signal_handler(session); - ktrace_set_execnames_enabled(session, KTRACE_FEATURE_ENABLED); - - // We are only interested in the process we launched - ktrace_filter_process(session, EXIT_BINARY); - - ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_EXCP_SC, 1) | DBG_FUNC_START), ^(ktrace_event_t e) { - T_QUIET; T_ASSERT_LE(e->pid, PID_MAX, "valid pid for tracepoint"); - begin_ts[e->pid] = e->timestamp; - }); - ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END), ^(ktrace_event_t e) { - T_QUIET; T_ASSERT_LE(e->pid, PID_MAX, "valid pid for tracepoint"); - - if (begin_ts[e->pid] == 0) { - return; - } - T_QUIET; T_ASSERT_LE(begin_ts[e->pid], e->timestamp, "timestamps are monotonically increasing"); - dt_stat_mach_time_add(s, e->timestamp - begin_ts[e->pid]); - - if (dt_stat_stable(s)) { - ended = true; - ktrace_end(session, 1); - } - }); - - int ret = ktrace_start(session, dispatch_get_main_queue()); - T_ASSERT_POSIX_ZERO(ret, "starting trace"); - started_tracing = true; - - // Spawn processes continuously until the test is over - dispatch_async(spawn_queue, ^(void) { - char priority_buf[32], nthreads_buf[32], mem_buf[32]; - - snprintf(priority_buf, 32, "%d", thread_priority); - snprintf(nthreads_buf, 32, "%d", nthreads); - snprintf(mem_buf, 32, "%d", proc_wired_mem); - - char *args[] = {EXIT_BINARY_PATH, priority_buf, nthreads_buf, mem_buf, NULL}; - int status; - while (!ended) { - pid_t pid; - int bret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); - T_QUIET; T_ASSERT_POSIX_ZERO(bret, "spawned process '%s'", args[0]); - - bret = waitpid(pid, &status, 0); - T_QUIET; T_ASSERT_POSIX_SUCCESS(bret, "waited for process %d\n", pid); - - if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) - T_ASSERT_FAIL("child process failed to run"); - - // Avoid saturating the CPU with new processes - usleep(1); - } - }); - - dispatch_main(); -} - - -T_DECL(exit, "exit(2) time from 
syscall start to end") { - run_exit_test(0, BASEPRI_FOREGROUND, 0); -} - -T_DECL(exit_pri_4, "exit(2) time at priority 4 (throttled)") { - run_exit_test(0, MAXPRI_THROTTLE, 0); -} - -T_DECL(exit_pri_20, "exit(2) time at priority 20 (utility)") { - run_exit_test(0, BASEPRI_UTILITY, 0); -} - -T_DECL(exit_pri_37, "exit(2) time at priority 37 (user initiated)") { - run_exit_test(0, BASEPRI_USER_INITIATED, 0); -} - -T_DECL(exit_10_threads, "exit(2) time with 10 threads") { - run_exit_test(0, BASEPRI_FOREGROUND, 10); -} - -T_DECL(exit_1mb, "exit(2) time with 1MB of wired memory") { - run_exit_test(10000000, BASEPRI_FOREGROUND, 0); -} - -T_DECL(exit_10mb, "exit(2) time with 10MB of wired memory") { - run_exit_test(10000000, BASEPRI_FOREGROUND, 0); -} - -T_DECL(exit_100_threads, "exit(2) time with 100 threads", T_META_ENABLED(false), T_META_TIMEOUT(1800)) { - run_exit_test(0, BASEPRI_FOREGROUND, 100); -} - -T_DECL(exit_1000_threads, "exit(2) time with 1000 threads", T_META_ENABLED(false), T_META_TIMEOUT(1800)) { - run_exit_test(0, BASEPRI_FOREGROUND, 1000); -} - -T_DECL(exit_100mb, "exit(2) time with 100MB of wired memory", T_META_ENABLED(false), T_META_TIMEOUT(1800)) { - run_exit_test(100000000, BASEPRI_FOREGROUND, 0); -} diff --git a/tools/tests/darwintests/proc_core_name_24152432.c b/tools/tests/darwintests/proc_core_name_24152432.c deleted file mode 100644 index aad5ee62d..000000000 --- a/tools/tests/darwintests/proc_core_name_24152432.c +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define BUFFLEN 2048 -#define EVILLEN 19 - -static const char corefile_ctl[] = "kern.corefile"; -static const char coredump_ctl[] = "kern.coredump"; -/* The default coredump location if the kern.coredump ctl is invalid */ -static const char default_dump_fmt[] = "/cores/core.%d"; -/* The coredump location when we set kern.coredump ctl to something valid */ -static const char valid_dump_fmt[] = 
"/cores/test-core.%d"; - -/* /cores/core.%(null), then BORK immediately after. */ -static char evil[] = {'/', 'c', 'o', 'r', 'e', 's', '/', 'c', 'o', 'r', 'e', '.', '%', '\0', 'B', 'O', 'R', 'K', '\0'}; -/* A valid coredump location to test. */ -static char valid_dump_loc[] = "/cores/test-core.%P"; - -static const struct rlimit lim_infty = { - RLIM_INFINITY, - RLIM_INFINITY -}; - -#if TARGET_OS_OSX -static int fork_and_wait_for_segfault(void); - -static int fork_and_wait_for_segfault() { - int pid, ret; - pid = fork(); - if (pid == 0) { - unsigned int *ptr = NULL; /* Cause a segfault so that we get a coredump */ - *ptr = 0xdeadd00d; - T_FAIL("Expected segmentation fault on write to NULL pointer"); - } - T_ASSERT_TRUE(pid != -1, "Checking fork success in parent"); - - ret = wait(NULL); - T_ASSERT_TRUE(ret != -1, "Waited for child to segfault and dump core"); - return pid; -} -#endif - -T_DECL( - proc_core_name_24152432, - "Tests behavior of core dump when kern.corefile ends in %, e.g., /cores/core.%", - T_META_ASROOT(true)) -{ -#if TARGET_OS_OSX - int ret, pid; - int enable_core_dump = 1; - char buf[BUFFLEN]; - memset(buf, 0, BUFFLEN); - size_t oldlen = BUFFLEN; - - ret = sysctlbyname(coredump_ctl, buf, &oldlen, &enable_core_dump, sizeof(int)); - T_ASSERT_POSIX_SUCCESS(ret, "sysctl: enable core dumps"); - memset(buf, 0, BUFFLEN); - oldlen = BUFFLEN; - - ret = setrlimit(RLIMIT_CORE, &lim_infty); - T_ASSERT_POSIX_SUCCESS(ret, "setrlimit: remove limit on maximum coredump size"); - - ret = sysctlbyname(corefile_ctl, buf, &oldlen, evil, EVILLEN); - T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set bad core dump location, old value was %s", buf); - memset(buf, 0, BUFFLEN); - oldlen = BUFFLEN; - - pid = fork_and_wait_for_segfault(); - - snprintf(buf, BUFFLEN, default_dump_fmt, pid); - ret = remove(buf); - T_ASSERT_TRUE(ret != -1, "Removing coredump file (should be in fallback location)"); - memset(buf, 0, BUFFLEN); - - ret = sysctlbyname(corefile_ctl, buf, &oldlen, valid_dump_loc, 
strlen(valid_dump_loc)); - T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set valid core dump location, old value was %s", buf); - memset(buf, 0, BUFFLEN); - - pid = fork_and_wait_for_segfault(); - - snprintf(buf, BUFFLEN, valid_dump_fmt, pid); - ret = remove(buf); - T_ASSERT_TRUE(ret != -1, "Removing coredump file (should be in valid location)"); -#else - T_LOG("proc_core_name appears in OS X only, skipping test."); -#endif - T_PASS("proc_core_name_24152432 PASSED"); -} diff --git a/tools/tests/darwintests/stackshot.m b/tools/tests/darwintests/stackshot.m deleted file mode 100644 index 2c5b37d5c..000000000 --- a/tools/tests/darwintests/stackshot.m +++ /dev/null @@ -1,619 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -T_GLOBAL_META( - T_META_NAMESPACE("xnu.stackshot"), - T_META_CHECK_LEAKS(false), - T_META_ASROOT(true) - ); - -static const char *current_process_name(void); -static void parse_stackshot(bool delta, void *ssbuf, size_t sslen); -static void parse_thread_group_stackshot(void **sbuf, size_t sslen); -static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen); -static void initialize_thread(void); - -#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024) -#define MAX_STACKSHOT_BUFFER_SIZE (6 * 1024 * 1024) - -T_DECL(microstackshots, "test the microstackshot syscall") -{ - void *buf = NULL; - unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE; - - while (1) { - buf = malloc(size); - T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer"); - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wdeprecated-declarations" - int len = syscall(SYS_microstackshot, buf, size, - STACKSHOT_GET_MICROSTACKSHOT); -#pragma clang diagnostic pop - if (len == ENOSYS) { - T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY"); - } - if (len == -1 && errno == ENOSPC) { - /* syscall failed because buffer wasn't large enough, try again */ - free(buf); - buf = NULL; - size *= 2; - T_ASSERT_LE(size, 
(unsigned int)MAX_STACKSHOT_BUFFER_SIZE, - "growing stackshot buffer to sane size"); - continue; - } - T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall"); - break; - } - - T_EXPECT_EQ(*(uint32_t *)buf, - (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC, - "magic value for microstackshot matches"); - - free(buf); -} - -struct scenario { - uint32_t flags; - bool should_fail; - pid_t target_pid; - uint64_t since_timestamp; - uint32_t size_hint; - dt_stat_time_t timer; -}; - -static void -quiet(struct scenario *scenario) -{ - if (scenario->timer) { - T_QUIET; - } -} - -static void -take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size)) -{ - void *config = stackshot_config_create(); - quiet(scenario); - T_ASSERT_NOTNULL(config, "created stackshot config"); - - int ret = stackshot_config_set_flags(config, scenario->flags); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags); - - if (scenario->size_hint > 0) { - ret = stackshot_config_set_size_hint(config, scenario->size_hint); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config", - scenario->size_hint); - } - - if (scenario->target_pid > 0) { - ret = stackshot_config_set_pid(config, scenario->target_pid); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config", - scenario->target_pid); - } - - if (scenario->since_timestamp > 0) { - ret = stackshot_config_set_delta_timestamp(config, scenario->since_timestamp); - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config", - scenario->since_timestamp); - } - - int retries_remaining = 5; - -retry: ; - uint64_t start_time = mach_absolute_time(); - ret = stackshot_capture_with_config(config); - uint64_t end_time = mach_absolute_time(); - - if (scenario->should_fail) { - T_EXPECTFAIL; - T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config"); - return; - } - - if (ret == EBUSY || ret == 
ETIMEDOUT) { - if (retries_remaining > 0) { - if (!scenario->timer) { - T_LOG("stackshot_capture_with_config failed with %s (%d), retrying", - strerror(ret), ret); - } - - retries_remaining--; - goto retry; - } else { - T_ASSERT_POSIX_ZERO(ret, - "called stackshot_capture_with_config (no retries remaining)"); - } - } else { - quiet(scenario); - T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config"); - } - - if (scenario->timer) { - dt_stat_mach_time_add(scenario->timer, end_time - start_time); - } - cb(stackshot_config_get_stackshot_buffer(config), stackshot_config_get_stackshot_size(config)); - - ret = stackshot_config_dealloc(config); - T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config"); -} - -T_DECL(kcdata, "test that kcdata stackshots can be taken and parsed") -{ - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS | - STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT) - }; - - initialize_thread(); - T_LOG("taking kcdata stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_stackshot(false, ssbuf, sslen); - }); -} - -T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed") -{ - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT - | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING), - }; - - initialize_thread(); - T_LOG("taking faulting stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_stackshot(false, ssbuf, sslen); - }); -} - -T_DECL(bad_flags, "test a poorly-formed stackshot syscall") -{ - struct scenario scenario = { - .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */, - .should_fail = true - }; - - T_LOG("attempting to take stackshot with kernel-only flag"); - take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) { - 
T_ASSERT_FAIL("stackshot data callback called"); - }); -} - -T_DECL(delta, "test delta stackshots") -{ - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT) - }; - - initialize_thread(); - T_LOG("taking full stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); - - T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); - - parse_stackshot(false, ssbuf, sslen); - - struct scenario delta_scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT - | STACKSHOT_COLLECT_DELTA_SNAPSHOT), - .since_timestamp = stackshot_time - }; - - take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { - parse_stackshot(true, dssbuf, dsslen); - }); - }); -} - -static void -expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen) -{ - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - - bool in_task = false; - bool in_thread = false; - bool saw_instrs_cycles = false; - iter = kcdata_iter_next(iter); - - KCDATA_ITER_FOREACH(iter) { - switch (kcdata_iter_type(iter)) { - case KCDATA_TYPE_CONTAINER_BEGIN: - switch (kcdata_iter_container_type(iter)) { - case STACKSHOT_KCCONTAINER_TASK: - in_task = true; - saw_instrs_cycles = false; - break; - - case STACKSHOT_KCCONTAINER_THREAD: - in_thread = true; - saw_instrs_cycles = false; - break; - - default: - break; - } - break; - - case STACKSHOT_KCTYPE_INSTRS_CYCLES: - saw_instrs_cycles = true; - break; - - case KCDATA_TYPE_CONTAINER_END: - if (in_thread) { - T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, "saw instructions and cycles in thread"); - in_thread = false; - } else if (in_task) { - T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, "saw instructions and cycles in task"); - in_task = false; - } - - default: - break; - } - } -} - -static void 
-skip_if_monotonic_unsupported(void) -{ - int supported = 0; - size_t supported_size = sizeof(supported); - int ret = sysctlbyname("kern.monotonic.supported", &supported, &supported_size, 0, 0); - if (ret < 0 || !supported) { - T_SKIP("monotonic is unsupported"); - } -} - -T_DECL(instrs_cycles, "test a getting instructions and cycles in stackshot") -{ - skip_if_monotonic_unsupported(); - - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES - | STACKSHOT_KCDATA_FORMAT) - }; - - T_LOG("attempting to take stackshot with instructions and cycles"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_stackshot(false, ssbuf, sslen); - expect_instrs_cycles_in_stackshot(ssbuf, sslen); - }); -} - -T_DECL(delta_instrs_cycles, "test delta stackshots with instructions and cycles") -{ - skip_if_monotonic_unsupported(); - - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES - | STACKSHOT_KCDATA_FORMAT) - }; - - initialize_thread(); - T_LOG("taking full stackshot"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen); - - T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time); - - parse_stackshot(false, ssbuf, sslen); - expect_instrs_cycles_in_stackshot(ssbuf, sslen); - - struct scenario delta_scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES - | STACKSHOT_KCDATA_FORMAT - | STACKSHOT_COLLECT_DELTA_SNAPSHOT), - .since_timestamp = stackshot_time - }; - - take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) { - parse_stackshot(true, dssbuf, dsslen); - expect_instrs_cycles_in_stackshot(dssbuf, dsslen); - }); - }); -} - -static void -check_thread_groups_supported() -{ - int err; - int supported = 0; - size_t supported_size = sizeof(supported); - err = sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0); - - if (err || !supported) - T_SKIP("thread 
groups not supported on this system"); -} - -T_DECL(thread_groups, "test getting thread groups in stackshot") -{ - check_thread_groups_supported(); - - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP - | STACKSHOT_KCDATA_FORMAT) - }; - - T_LOG("attempting to take stackshot with thread group flag"); - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - parse_thread_group_stackshot(ssbuf, sslen); - }); - -} - -#pragma mark performance tests - -#define SHOULD_REUSE_SIZE_HINT 0x01 -#define SHOULD_USE_DELTA 0x02 -#define SHOULD_TARGET_SELF 0x04 - -static void -stackshot_perf(unsigned int options) -{ - struct scenario scenario = { - .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS - | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT), - }; - - dt_stat_t size = dt_stat_create("bytes", "size"); - dt_stat_time_t duration = dt_stat_time_create("duration"); - scenario.timer = duration; - - if (options & SHOULD_TARGET_SELF) { - scenario.target_pid = getpid(); - } - - while (!dt_stat_stable(duration) || !dt_stat_stable(size)) { - __block uint64_t last_time = 0; - __block uint32_t size_hint = 0; - take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) { - dt_stat_add(size, (double)sslen); - last_time = stackshot_timestamp(ssbuf, sslen); - size_hint = (uint32_t)sslen; - }); - if (options & SHOULD_USE_DELTA) { - scenario.since_timestamp = last_time; - scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT; - } - if (options & SHOULD_REUSE_SIZE_HINT) { - scenario.size_hint = size_hint; - } - } - - dt_stat_finalize(duration); - dt_stat_finalize(size); -} - -T_DECL(perf_no_size_hint, "test stackshot performance with no size hint") -{ - stackshot_perf(0); -} - -T_DECL(perf_size_hint, "test stackshot performance with size hint") -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT); -} - -T_DECL(perf_process, "test stackshot performance targeted at process") -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF); 
-} - -T_DECL(perf_delta, "test delta stackshot performance") -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA); -} - -T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process") -{ - stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF); -} - -static uint64_t -stackshot_timestamp(void *ssbuf, size_t sslen) -{ - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - - uint32_t type = kcdata_iter_type(iter); - if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) { - T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter)); - } - - iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME); - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot"); - - return *(uint64_t *)kcdata_iter_payload(iter); -} - -#define TEST_THREAD_NAME "stackshot_test_thread" - -static void -parse_thread_group_stackshot(void **ssbuf, size_t sslen) -{ - bool seen_thread_group_snapshot = false; - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, - "buffer provided is a stackshot"); - - NSMutableSet *thread_groups = [[NSMutableSet alloc] init]; - - iter = kcdata_iter_next(iter); - KCDATA_ITER_FOREACH(iter) { - switch (kcdata_iter_type(iter)) { - case KCDATA_TYPE_ARRAY: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_array_valid(iter), - "checked that array is valid"); - - if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) { - continue; - } - - seen_thread_group_snapshot = true; - - if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) { - struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter); - for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) { - struct thread_group_snapshot_v2 *tgs = tgs_array + j; - [thread_groups addObject:@(tgs->tgs_id)]; - } - - } - else { - struct thread_group_snapshot *tgs_array = 
kcdata_iter_payload(iter); - for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) { - struct thread_group_snapshot *tgs = tgs_array + j; - [thread_groups addObject:@(tgs->tgs_id)]; - } - } - break; - } - } - } - KCDATA_ITER_FOREACH(iter) { - NSError *error = nil; - - switch (kcdata_iter_type(iter)) { - - case KCDATA_TYPE_CONTAINER_BEGIN: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_container_valid(iter), - "checked that container is valid"); - - NSDictionary *container = parseKCDataContainer(&iter, &error); - T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot"); - T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container"); - - if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) { - break; - } - - int tg = [container[@"thread_snapshots"][@"thread_group"] intValue]; - - T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists"); - - break; - }; - - } - } - T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot"); -} - -static void -parse_stackshot(bool delta, void *ssbuf, size_t sslen) -{ - kcdata_iter_t iter = kcdata_iter(ssbuf, sslen); - if (delta) { - T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT, - "buffer provided is a delta stackshot"); - } else { - T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT, - "buffer provided is a stackshot"); - } - - iter = kcdata_iter_next(iter); - KCDATA_ITER_FOREACH(iter) { - NSError *error = nil; - - switch (kcdata_iter_type(iter)) { - case KCDATA_TYPE_ARRAY: { - T_QUIET; - T_ASSERT_TRUE(kcdata_iter_array_valid(iter), - "checked that array is valid"); - - NSMutableDictionary *array = parseKCDataArray(iter, &error); - T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot"); - T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array"); - break; - } - - case KCDATA_TYPE_CONTAINER_BEGIN: { - T_QUIET; - 
T_ASSERT_TRUE(kcdata_iter_container_valid(iter), - "checked that container is valid"); - - NSDictionary *container = parseKCDataContainer(&iter, &error); - T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot"); - T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container"); - - if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) { - break; - } - int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue]; - if (pid != getpid()) { - break; - } - - T_EXPECT_EQ_STR(current_process_name(), - [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String], - "current process name matches in stackshot"); - - T_QUIET; - T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue], - "unique pid is greater than pid"); - - bool found_main_thread = 0; - for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) { - NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key]; - NSDictionary *thread_snap = thread[@"thread_snapshot"]; - - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0, - "thread ID of thread in current task is valid"); - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0, - "total syscalls of thread in current task is valid"); - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0, - "base priority of thread in current task is valid"); - T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0, - "scheduling priority of thread in current task is valid"); - - NSString *pth_name = thread_snap[@"pth_name"]; - if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) { - found_main_thread = true; - } - } - T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot"); - break; - } - } - } - - T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata"); -} - -static const char * 
-current_process_name(void) -{ - static char name[64]; - - if (!name[0]) { - int ret = proc_name(getpid(), name, sizeof(name)); - T_QUIET; - T_ASSERT_POSIX_ZERO(ret, "proc_pidname failed for current process"); - } - - return name; -} - -static void -initialize_thread(void) -{ - int ret = pthread_setname_np(TEST_THREAD_NAME); - T_QUIET; - T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME); -} diff --git a/tools/tests/personas/Makefile b/tools/tests/personas/Makefile index f3421742f..d2c718f39 100644 --- a/tools/tests/personas/Makefile +++ b/tools/tests/personas/Makefile @@ -16,8 +16,13 @@ ifdef RC_ARCHS endif endif -ARCH_32 := $(filter-out %64, $(ARCHS)) -ARCH_64 := $(filter %64, $(ARCHS)) +# These are convenience functions for filtering based on substrings, as the +# normal filter functions only accept one wildcard. +FILTER_OUT_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),,$(string)))) +FILTER_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),$(string),))) + +ARCH_32:=$(call FILTER_OUT_SUBSTRING,64,$(ARCHS)) +ARCH_64:=$(call FILTER_SUBSTRING,64,$(ARCHS)) ARCH_32_FLAGS := $(patsubst %, -arch %, $(ARCH_32)) ARCH_64_FLAGS := $(patsubst %, -arch %, $(ARCH_64)) @@ -25,7 +30,7 @@ ARCH_FLAGS := $(if $(ARCH_64), $(ARCH_64_FLAGS)) $(if $(ARCH_32), $(ARCH_32_FLAG DSTROOT?=$(shell /bin/pwd) -TARGETS := persona_mgr persona_spawn +TARGETS := persona_mgr persona_spawn persona_test_run.sh all: $(addprefix $(DSTROOT)/, $(TARGETS)) @@ -33,5 +38,9 @@ $(DSTROOT)/persona_%: persona_%.c persona_test.h Makefile ${CC} ${CFLAGS} ${ARCH_FLAGS} -o $(SYMROOT)/$(notdir $@) $< if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi +$(DSTROOT)/persona_test_run.sh: persona_test_run.sh + cp $? 
$@ + chmod +x $@ + clean: rm -rf $(addprefix $(DSTROOT)/,$(TARGETS)) $(addprefix $(SYMROOT)/,$(TARGETS)) $(SYMROOT)/*.dSYM diff --git a/tools/tests/personas/persona_mgr.c b/tools/tests/personas/persona_mgr.c index 5da9f5bff..93692f3d4 100644 --- a/tools/tests/personas/persona_mgr.c +++ b/tools/tests/personas/persona_mgr.c @@ -41,7 +41,8 @@ enum { PERSONA_OP_CREATE = 1, PERSONA_OP_DESTROY = 2, PERSONA_OP_LOOKUP = 3, - PERSONA_OP_MAX = 3, + PERSONA_OP_SUPPORT = 4, + PERSONA_OP_MAX = 4, }; static struct mgr_config { @@ -84,7 +85,7 @@ static int persona_op_lookup(struct kpersona_info *ki, pid_t pid, uid_t uid) { int ret; - info("Looking up persona (pid:%d, uid:%d)", pid, uid); + info("Looking up persona (login:%s, pid:%d, uid:%d)", ki->persona_name, pid, uid); if (pid > 0) { ki->persona_info_version = PERSONA_INFO_V1; ret = kpersona_pidinfo(pid, ki); @@ -118,6 +119,19 @@ static int persona_op_lookup(struct kpersona_info *ki, pid_t pid, uid_t uid) return ret; } +static int persona_op_support(void) +{ + uid_t pna_id = -1; + int ret = kpersona_get(&pna_id); + if (ret == 0 || errno != ENOSYS) { + info("Persona subsystem is supported (id=%d)", pna_id); + return 0; + } + + info("Persona subsystem is not supported"); + return ENOSYS; +} + /* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = * @@ -137,7 +151,7 @@ static void usage_main(const char *progname, const char *msg, int verbose) if (!verbose) exit(1); - printf("\t%-15s\tOne of: create | destroy | lookup\n", "[op]"); + printf("\t%-15s\tOne of: create | destroy | lookup | support\n", "[op]"); printf("\t%-15s\tBe verbose\n", "-v"); printf("\t%-15s\tID of the persona\n", "-i id"); @@ -160,7 +174,7 @@ int main(int argc, char **argv) int ret; const char *op_str = NULL; - int persona_op = 0; + int persona_op = -1; struct kpersona_info kinfo; uid_t uid = (uid_t)-1; pid_t pid = (pid_t)-1; @@ -184,6 +198,8 @@ int main(int argc, char **argv) persona_op = PERSONA_OP_DESTROY; else if (strcmp(op_str, "lookup") == 
0) persona_op = PERSONA_OP_LOOKUP; + else if (strcmp(op_str, "support") == 0) + persona_op = PERSONA_OP_SUPPORT; else if (strcmp(op_str, "help") == 0 || strcmp(op_str, "-h") == 0) usage_main(argv[0], NULL, 1); @@ -201,15 +217,27 @@ int main(int argc, char **argv) switch (ch) { case 'i': ret = atoi(optarg); - if (ret <= 0) - err("Invalid Persona ID: %s", optarg); + if (ret <= 0) { + ret = PERSONA_ID_NONE; + } kinfo.persona_id = (uid_t)ret; break; case 't': - ret = atoi(optarg); - if (ret <= PERSONA_INVALID || ret > PERSONA_TYPE_MAX) - err("Invalid type specification: %s", optarg); - kinfo.persona_type = ret; + if (strncmp(optarg, "guest", 6) == 0) { + kinfo.persona_type = PERSONA_GUEST; + } else if (strncmp(optarg, "managed", 8) == 0) { + kinfo.persona_type = PERSONA_MANAGED; + } else if (strncmp(optarg, "priv", 4) == 0) { /* shortcut... */ + kinfo.persona_type = PERSONA_PRIV; + } else if (strncmp(optarg, "system", 7) == 0) { + kinfo.persona_type = PERSONA_SYSTEM; + } else { + ret = atoi(optarg); + if (ret <= PERSONA_INVALID || ret > PERSONA_TYPE_MAX) { + err("Invalid type specification: %s", optarg); + } + kinfo.persona_type = ret; + } break; case 'p': ret = atoi(optarg); @@ -257,10 +285,11 @@ int main(int argc, char **argv) } } - if (uid == (uid_t)-1 && persona_op != PERSONA_OP_LOOKUP) + if (uid == (uid_t)-1 && persona_op != PERSONA_OP_LOOKUP) { uid = kinfo.persona_id; + } - if (kinfo.persona_gmuid && kinfo.persona_ngroups == 0) { + if (kinfo.persona_gmuid != KAUTH_UID_NONE && kinfo.persona_ngroups == 0) { /* * In order to set the group membership UID, we need to set at * least one group: make it equal to either the GID or UID @@ -285,6 +314,9 @@ int main(int argc, char **argv) case PERSONA_OP_LOOKUP: ret = persona_op_lookup(&kinfo, pid, uid); break; + case PERSONA_OP_SUPPORT: + ret = persona_op_support(); + break; default: err("Invalid persona op: %d", persona_op); } diff --git a/tools/tests/personas/persona_spawn.c b/tools/tests/personas/persona_spawn.c index 
b6e7782ce..521871576 100644 --- a/tools/tests/personas/persona_spawn.c +++ b/tools/tests/personas/persona_spawn.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +73,7 @@ static pid_t spawn_child(int argc, char **argv, struct persona_args *pa) return -ERR_SYSTEM; } - if (!pa->flags & PA_HAS_ID) { + if (!(pa->flags & PA_HAS_ID)) { err_print("No persona ID specified!"); return -ERR_SYSTEM; } @@ -129,6 +130,15 @@ static pid_t spawn_child(int argc, char **argv, struct persona_args *pa) } } + if (pa->flags & PA_HAS_GROUPS) { + ret = posix_spawnattr_set_persona_groups_np(&attr, pa->kinfo.persona_ngroups, pa->kinfo.persona_groups, KAUTH_UID_NONE); + if (ret != 0) { + err_print(""); + ret = -ERR_SPAWN_ATTR; + goto out_err; + } + } + ret = posix_spawn(&child->pid, argv[0], NULL, &attr, argv, environ); if (ret != 0) { err_print("posix_spawn (ret=%d)", ret); @@ -259,6 +269,8 @@ static void usage_main(const char *progname, int verbose) printf("\t%-10s\tVerify persona parameters against existing persona (given by -I)\n", "-V"); printf("\t%-10s\tOverride/verify the user ID of the new process\n", "-u uid"); printf("\t%-10s\tOverride/verify the group ID of the new process\n", "-g gid"); + printf("\t%-15s\tGroups to which the persona will belong\n", "-G {groupspec}"); + printf("\t%-15s\tgroupspec: G1{,G2,G3...}\n", " "); printf("\t%-10s\tBe verbose\n", "-v"); printf("\t%-10s\tDo not wait for the child process\n", "-w"); printf("\n"); @@ -283,7 +295,12 @@ int main(int argc, char **argv) optind = 2; ret = child_main_loop(argc, argv); if (ret != 1) + exit(ret); + if (strcmp(argv[optind], "spawn") != 0) { + printf("child exiting (%s).\n", argv[optind]); exit(0); + } + optind++; /* * If we get here, then the child wants us to continue running @@ -305,18 +322,23 @@ int main(int argc, char **argv) /* * Argument parse for default overrides: */ - while ((ch = getopt(argc, argv, "Vg:I:u:vwh")) != -1) { + while ((ch = getopt(argc, argv, 
"Vg:G:I:u:vwh")) != -1) { switch (ch) { case 'V': pa.flags |= PA_SHOULD_VERIFY; break; case 'g': pa.kinfo.persona_gid = atoi(optarg); - if (pa.kinfo.persona_gid <= 500) - err("Invalid GID: %d", pa.kinfo.persona_gid); pa.flags |= PA_HAS_GID; pa.flags |= PA_OVERRIDE; break; + case 'G': + ret = parse_groupspec(&pa.kinfo, optarg); + if (ret < 0) + err("Invalid groupspec: \"%s\"", optarg); + pa.flags |= PA_HAS_GROUPS; + pa.flags |= PA_OVERRIDE; + break; case 'I': pa.kinfo.persona_id = atoi(optarg); if (pa.kinfo.persona_id == 0) @@ -325,8 +347,6 @@ int main(int argc, char **argv) break; case 'u': pa.override_uid = atoi(optarg); - if (pa.override_uid <= 500) - err("Invalid UID: %d", pa.override_uid); pa.flags |= PA_HAS_UID; pa.flags |= PA_OVERRIDE; break; diff --git a/tools/tests/personas/persona_test_run.sh b/tools/tests/personas/persona_test_run.sh new file mode 100755 index 000000000..77ee923d4 --- /dev/null +++ b/tools/tests/personas/persona_test_run.sh @@ -0,0 +1,569 @@ +#!/bin/bash +# persona_test_run.sh +# +# This file aims to be a comprehensive test suite for the persona subsystem. +# It uses two tools: +# 1. persona_mgr - create, destroy, lookup personas +# 2. persona_spawn - spawn processes into personas with a variety of options +# The script relies heavily on the particular output of these tools, so if you +# are modifying / extending those tools, this file also need to be updated to +# properly capture the new output. Specifically, the get_persona_info function +# needs to be maintained / updated. +# +# NOTE: the function get_persona_info() also needs to be kept up to date with +# the types of personas found in bsd/sys/persona.h + +# be sure to bail on script errors and unepected tool failures +set -e + +PERSONA_MGR="${PWD}/persona_mgr" +PERSONA_SPAWN="${PWD}/persona_spawn" + +if [ ! -d "$TMPDIR" ]; then + echo "Couldn't find temp directory '$TMPDIR': check permissions/environment?" + exit 255 +fi + +if [ ! -e "${PERSONA_MGR}" ] || [ ! 
-x "${PERSONA_MGR}" ]; then + echo "Can't find '${PERSONA_MGR}': skipping test" + exit 0 +fi +if [ ! -e "${PERSONA_SPAWN}" ] || [ ! -x "${PERSONA_SPAWN}" ]; then + echo "Can't find '${PERSONA_SPAWN}': skipping test" + exit 0 +fi + +function check_for_persona_support() { + local errno=0 + ${PERSONA_MGR} support || errno=$? + if [ $errno -eq 78 ]; then + echo "Persona subsystem is not supported - skipping tests" + exit 0 + fi + return 0 +} +check_for_persona_support + + +## bail [failure_msg] +# +# exit the script with an error code that corresponds to the line number +# from which this function was invoked. Because we want to exit with a +# non-zero exit code, we use: 1 + (254 % line). +# +function bail() { + local msg="$1" + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + echo "[$line] ERROR: $msg" 1>&2 + exit $((1 + $line % 254)) +} + +## check_return [message_on_failure] +# +# Check the return value of the previous command or script line. If the +# value of '$?' is not 0, then call bail() with an appropriate message. +# +function check_return() { + local err=$? + local msg=$1 + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + echo "CHECK: $msg" + if [ $err -ne 0 ]; then + bail "e=$err: $msg" $line + fi + + return 0 +} + +## expect_failure [message_on_success] +# +# Check the return value of the previous command or script line. If the +# value of '$?' is 0 (success), then call bail() with a message saying +# that we expected this previous command/line to fail. +# +function expect_failure() { + local err=$? + local msg=$1 + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + if [ $err -eq 0 ]; then + bail "found success, expected failure: $msg" $line + fi + + echo "EXPECT: failure: $msg" + return 0 +} + +## test_num [debug_info] [number] +# +# Check that a variable value is a number, bail() on error. 
+# +function test_num() { + local type=$1 + local num=$2 + local line=$3 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + if [ -z "$num" ]; then + bail "invalid (NULL) $type" $line + fi + [ "$num" -eq "$num" ] 2>/dev/null + if [ $? -ne 0 ]; then + bail "invalid $type: $num" $line + fi + + return 0 +} + +## global variables used to return values to callers +_ID=-1 +_TYPE="invalid" +_LOGIN="" +_UID=-1 +_GID=-1 +_NGROUPS=-1 +_GROUPS="" + +## get_persona_info {persona_id} {persona_login} +# +# Lookup persona info for the given ID/login. At least one of the ID/login +# parameters must be valid +function get_persona_info() { + local pna_id=${1:-1} + local pna_login=${2:- } + local line=$3 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + local largs="-u ${pna_id}" + if [ "${pna_login}" != " " ]; then + largs+=" -l ${pna_login}" + fi + + _ID=-1 + _TYPE=-1 + _LOGIN="" + _UID=-1 + _GID=-1 + _NGROUPS=-1 + _GROUPS=() + + local file="${TMPDIR}/plookup" + + ${PERSONA_MGR} lookup ${largs} > "${file}" + check_return "persona lookup of: ${largs}" $line + + _ID=$(cat "${file}" | grep "+id: " | head -1 | sed 's/.*+id:[ ]*\([0-9][0-9]*\).*/\1/') + test_num "Persona ID lookup:${largs}" "$_ID" + + local type=$(cat "${file}" | grep "+type: " | head -1 | sed 's/.*+type:[ ]*\([0-9][0-9]*\).*/\1/') + test_num "+type lookup:${largs}" "$type" + ## + ## NOTE: keep in sync with bsd/sys/persona.h types! 
+ ## + if [ $type -eq 1 ]; then + _TYPE=guest + elif [ $type -eq 2 ]; then + _TYPE=managed + elif [ $type -eq 3 ]; then + _TYPE=priv + elif [ $type -eq 4 ]; then + _TYPE=system + else + _TYPE=invalid + fi + + _LOGIN=$(cat "${file}" | grep "+login: " | head -1 | sed 's/.*+login:[ ]*"\([^"]*\)".*/\1/') + if [ -z "$_LOGIN" ]; then + bail "invalid login for pna_id:$_ID: '$_LOGIN'" $line + fi + + # these are always the same + _UID=$_ID + + _GID=$(cat "${file}" | grep "+gid: " | head -1 | sed 's/.*+gid:[ ]*\([0-9][0-9]*\).*/\1/') + test_num "GID lookup:${largs}" "$_GID" + + _NGROUPS=$(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*\([0-9][0-9]*\)[ ][ ]*{.*}.*/\1/') + test_num "NGROUPS lookup:${largs}" "$_NGROUPS" + + _GROUPS=( $(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*[0-9][0-9]*[ ][ ]*{[ ]*\([^ ].*\)[ ][ ]*}.*/\1/') ) + if [ $_NGROUPS -gt 0 ]; then + if [ -z "${_GROUPS}" ]; then + bail "lookup:${largs}: missing $_NGROUPS groups" $line + fi + if [ ${#_GROUPS[@]} -ne $_NGROUPS ]; then + bail "lookup:${largs} wrong number of groups ${#_GROUPS[@]} != $_NGROUPS" $line + fi + fi +} + +## validate_child_info [output_file] [persona_id] {uid} {gid} {groups} +# +# Parse the output of the 'persona_spawn' command and validate that +# the new child process is in the correct persona with the correct +# process attributes. 
+# +function validate_child_info() { + local file=$1 + local pna_id=$2 + local uid=${3:--1} + local gid=${4:--1} + local groups=${5:- } + local line=$6 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + local l=( ) + + # get the child's PID + local cpid="$(cat "$file" | grep "Child: PID:" | sed 's/.*Child: PID:\([0-9][0-9]*\).*/\1/')" + test_num "Child PID" "$cpid" $line + + # validate the child's persona + l=( $(cat "$file" | grep "Child: Persona:" | sed 's/.*Child: Persona: \([0-9][0-9]*\) (err:\([0-9][0-9]*\))/\1 \2/') ) + if [ ${#l[@]} -ne 2 ]; then + bail "Invalid Child[$cpid] Persona line" $line + fi + test_num "Child Persona ID" "${l[0]}" $line + test_num "kpersona_info retval" "${l[1]}" $line + + if [ ${l[0]} -ne $pna_id ]; then + bail "Child[$cpid] persona:${l[0]} != specified persona:$pna_id" $line + fi + + # Validate the UID/GID + l=( $(cat "$file" | grep "Child: UID:" | sed 's/.*UID:\([0-9][0-9]*\), GID:\([0-9][0-9]*\).*/\1 \2/') ) + if [ ${#l[@]} -ne 2 ]; then + bail "Invalid Child[$cpid] UID/GID output" $line + fi + if [ $uid -ge 0 ]; then + if [ $uid -ne ${l[0]} ]; then + bail "Child[$cpid] UID:${l[0]} != specified UID:$uid" $line + fi + fi + if [ $gid -ge 0 ]; then + if [ $gid -ne ${l[1]} ]; then + bail "Child[$cpid] GID:${l[1]} != specified GID:$gid" $line + fi + fi + + # TODO: validate / verify groups? + + return 0 +} + + +## spawn_child [persona_id] {uid} {gid} {group_spec} +# +# Create a child process that is spawn'd into the persona given by +# the first argument (pna_id). The new process can have its UID, GID, +# and group membership properties overridden. 
+# +function spawn_child() { + local pna_id=$1 + local uid=${2:--1} + local gid=${3:--1} + local groups=${4:- } + local line=$5 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + local file="child.${pna_id}" + local spawn_args="-I $pna_id" + if [ $uid -ge 0 ]; then + spawn_args+=" -u $uid" + file+=".u$uid" + fi + if [ $gid -ge 0 ]; then + spawn_args+=" -g $gid" + file+=".g$gid" + fi + if [ "$groups" != " " ]; then + spawn_args+=" -G $groups" + file+="._groups" + fi + + echo "SPAWN: $file" + ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E > "${TMPDIR}/$file" + check_return "child info: $file" $line + + # Grab the specified persona's info so we can + # verify the child's info against it. + # This function puts data into global variables, e.g. _ID, _GID, etc. + get_persona_info ${pna_id} " " $line + if [ $uid -lt 0 ]; then + uid=$_UID + fi + if [ $gid -lt 0 ]; then + gid=$_GID + fi + if [ "$groups" == " " ]; then + # convert a bash array into a comma-separated list for validation + local _g="${_GROUPS[@]}" + groups="${_g// /,}" + fi + + validate_child_info "${TMPDIR}/$file" "$pna_id" "$uid" "$gid" "$groups" $line + + # TODO: validate that the first child spawned into a persona *cannot* spawn + # into a different persona... + #if [ $uid -eq 0 ]; then + # ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E -R -v -I 99 /bin/echo "This is running in the system persona" + # expect_failure "Spawned child that re-execs into non-default persona" $line + #fi + return 0 +} + +## get_created_id [output_file] +# +# Parse the output of the 'persona_mgr' command to determine the ID +# of the newly created persona. +# +function get_created_id() { + local file=$1 + local o=$(cat "$file" | grep "Created persona" | sed 's/.*Created persona \([0-9][0-9]*\):/\1/') + echo $o + return 0 +} + +## create_persona [login_name] [persona_type] {persona_id} {gid} {group_spec} +# +# Create a new persona with given parameters. 
+# +# Returns: the newly created persona ID via the global variable, $_ID +# +function create_persona() { + local name=${1} + local type=${2} + local pna_id=${3:--1} + local gid=${4:--1} + local groups=${5:- } + local line=$6 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + if [ -z "$name" -o -z "$type" ]; then + bail "Invalid arguments to create_persona '$name' '$type'" $line + fi + + local file="persona.at${line}" + # persona ID of '-1' is auto-assigned + local spawn_args="-v -l $name -i $pna_id" + if [ $pna_id -eq -1 ]; then + file+=".auto" + else + file+=".${pna_id}" + fi + + spawn_args+=" -t $type" + file+=".$type" + + if [ $gid -ge 0 ]; then + spawn_args+=" -g $gid" + file+=".g$gid" + fi + if [ "$groups" != " " ]; then + spawn_args+=" -G $groups" + file+="._groups" + fi + + echo "CREATE: $file" + ${PERSONA_MGR} create ${spawn_args} > "${TMPDIR}/${file}" + check_return "persona creation: ${file}" $line + # test output should include persona creation output for later debugging + cat "${TMPDIR}/${file}" + + # validate the output of the persona_mgr tool (what we think we created) + _ID=`get_created_id "${TMPDIR}/${file}"` + test_num "persona_id for $file" "$_ID" $line + if [ ${pna_id} -gt 0 ]; then + if [ $_ID -ne ${pna_id} ]; then + bail "Created persona doesn't have expected ID $_ID != ${pna_id}" $line + fi + fi + + # validate the entire persona information (what a kpersona_lookup says we created) + # This function puts data into global variables, e.g. _ID, _LOGIN, _GID, etc. 
+ echo "VALIDATE: ${file}" + get_persona_info ${pna_id} "$name" $line + if [ "$name" != "$_LOGIN" ]; then + bail "${file}: unexpected login '$_LOGIN' != '$name'" $line + fi + if [ "$type" != "$_TYPE" ]; then + bail "${file}: unexpected type '$_TYPE' != '$type'" $line + fi + if [ ${pna_id} -gt 0 ]; then + if [ ${pna_id} -ne $_ID ]; then + bail "${file}: unexpected ID '$_ID' != '${pna_id}'" $line + fi + fi + if [ $gid -ge 0 ]; then + if [ $gid -ne $_GID ]; then + bail "${file}: unexpected GID '$_GID' != '$gid'" $line + fi + fi + if [ "$groups" != " " ]; then + local _g="${_GROUPS[@]}" + if [ "${_g// /,}" != "$groups" ]; then + bail "${file}: unexpected groups '${_g// /,}' != '$groups'" $line + fi + fi + + return 0 +} + +## destroy_persona [persona_id] +# +# Destroy the given persona. +# +function destroy_persona() { + local pna_id=$1 + local line=$2 + if [ -z "$line" ]; then + line=${BASH_LINENO[0]} + fi + + echo "DESTROY: ${pna_id}" + ${PERSONA_MGR} destroy -v -i ${pna_id} + check_return "destruction of ${pna_id}" $line +} + +# +# +# Begin Tests! 
+# +# +echo "Running persona tests [$LINENO] ($TMPDIR)" + +## +## Test Group 0: basic creation + spawn tests +## + +# default group, specific ID +create_persona "test0_1" "guest" 1001 +P0ID=$_ID +spawn_child $P0ID +spawn_child $P0ID 1100 +spawn_child $P0ID 0 +spawn_child $P0ID -1 1101 +spawn_child $P0ID 1100 1101 +spawn_child $P0ID 1100 1101 1000,2000,3000 +spawn_child $P0ID 1100 -1 1000,2000,3000 +spawn_child $P0ID -1 -1 1000,2000,3000 +destroy_persona $P0ID + +# specific ID, non-default group +create_persona "test0_2" "guest" 1002 2000 +P0ID=$_ID +spawn_child $P0ID +spawn_child $P0ID 1100 +spawn_child $P0ID 0 +spawn_child $P0ID -1 1101 +spawn_child $P0ID 1100 1101 +spawn_child $P0ID 1100 1101 1000,2000,3000 +spawn_child $P0ID 1100 -1 1000,2000,3000 +spawn_child $P0ID -1 -1 1000,2000,3000 +destroy_persona $P0ID + +# non-default set of groups +create_persona "test0_3" "guest" 1003 2000 2000,3000,4000 +P0ID=$_ID +spawn_child $P0ID +spawn_child $P0ID 1100 +spawn_child $P0ID 0 +spawn_child $P0ID -1 1101 +spawn_child $P0ID 1100 1101 +spawn_child $P0ID 1100 1101 1111,2222,3333 +spawn_child $P0ID 1100 -1 1111,2222,3333 +spawn_child $P0ID -1 -1 1111,2222,3333 +destroy_persona $P0ID + + +## +## Test Group 1: persona creation / re-creation +## + +# Create 3 personas with auto-assigned IDs +create_persona "test1_1" "guest" +P1ID=$_ID +create_persona "test1_2" "managed" +P2ID=$_ID +create_persona "test1_3" "priv" +P3ID=$_ID +create_persona "test1_4" "system" +P4ID=$_ID + +D1=$(($P2ID - $P1ID)) +D2=$(($P3ID - $P2ID)) +D3=$(($P4ID - $P3ID)) +if [ $D1 -ne $D2 -o $D1 -ne $D3 -o $D2 -ne $D3 ]; then + bail "inconsistent automatic Persona ID increment: $D1,$D2,$D3 ($P1ID,$P2ID,$P3ID,$P4ID)" +fi + +# make sure we can't re-allocate the same name / ID +${PERSONA_MGR} create -v -l test1_1 -t guest -i -1 && expect_failure "re-create same name:test1_1 type:guest" +${PERSONA_MGR} create -v -l test1_1 -t managed -i -1 && expect_failure "re-create same name:test1_1 type:managed" 
+${PERSONA_MGR} create -v -l test1_1_new -t managed -i $P1ID && expect_failure "re-create $P1ID with new name:test1_1_new type:managed" + +## +## Test Group 2: auto-assigned ID tricks +## + +# Notice the difference in IDs, then try to create a persona by +# specifying an ID that will match the next auto-assigned ID +# (should succeed) +P5ID_REQ=$(($P4ID + $D2)) +create_persona "test2_1" "guest" ${P5ID_REQ} +P5ID=$_ID +if [ ! $P5ID -eq ${P5ID_REQ} ]; then + bail "test2_1: ${P5ID_REQ} != $P5ID" +fi + +# try to create a persona with auto-assigned ID +# (resulting persona should have ID != P5ID) +create_persona "test2_2" "guest" +P6ID=$_ID +if [ $P6ID -eq $P5ID ]; then + bail "created duplicate persona IDs: $P6ID == $P5ID" +fi + +## +## Test Group 3: persona destruction +## + +destroy_persona $P1ID +destroy_persona $P2ID +destroy_persona $P3ID +destroy_persona $P4ID +destroy_persona $P5ID +destroy_persona $P6ID + +# try to re-destroy the personas +# (should fail) +${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (1/2) $P1ID" +${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (2/2) $P1ID" +${PERSONA_MGR} destroy -v -i $P2ID && expect_failure "re-destroy $P2ID" +${PERSONA_MGR} destroy -v -i $P3ID && expect_failure "re-destroy $P3ID" +${PERSONA_MGR} destroy -v -i $P4ID && expect_failure "re-destroy $P4ID" +${PERSONA_MGR} destroy -v -i $P5ID && expect_failure "re-destroy $P5ID" +${PERSONA_MGR} destroy -v -i $P6ID && expect_failure "re-destroy $P6ID" + +# cleanup +rm -rf "${TMPDIR}" + +echo "" +echo "${0##/}: SUCCESS" +exit 0 diff --git a/tools/trace/bridgetime.lua b/tools/trace/bridgetime.lua index 5bc38ccd7..42472547b 100755 --- a/tools/trace/bridgetime.lua +++ b/tools/trace/bridgetime.lua @@ -92,7 +92,7 @@ trace_codename("MACH_CLOCK_BRIDGE_RCV_TS", function(buf) prefix, format_timestamp_arm(buf[1]), format_timestamp_intel(buf[2])) else local skip = "" - if buf[3] == 1 then + if buf[1] == 0 then skip = "Int handler" end printf("%s ( %-10s 
%-10s ) %s\n", @@ -121,9 +121,9 @@ trace_codename("MACH_CLOCK_BRIDGE_TS_MISMATCH", function(buf) local diff = (math.abs(buf[2] - buf[3]))/1000000 - printf("%s ( Cur: %-10s Pred: %-10s Diff: %5.6f ms ) @ %-20s\n", + printf("%s ( Cur: %-10s Pred: %-10s Diff: %5.6f ms Count: %d ) @ %-20s\n", prefix, format_timestamp_intel(buf[2]), format_timestamp_intel(buf[3]), - diff, format_timestamp_arm(buf[1])) + diff, buf[4], format_timestamp_arm(buf[1])) end) diff --git a/tools/trace/kqtrace.lua b/tools/trace/kqtrace.lua index bb5b9545f..07cbd9fbd 100755 --- a/tools/trace/kqtrace.lua +++ b/tools/trace/kqtrace.lua @@ -75,9 +75,10 @@ function state_string(strings, state) end kqrequest_state_strings = { - ['PROCESSING'] = 0x1, - ['THREQUESTED'] = 0x2, - ['WAKEUP'] = 0x4 + ['THREQUESTED'] = 0x02, + ['WAKEUP'] = 0x04, + ['BOUND'] = 0x08, + ['DRAIN'] = 0x40, } kqueue_state_strings = { @@ -100,7 +101,7 @@ knote_state_strings = { ['QUEUED'] = 0x0002, ['DISABLED'] = 0x0004, ['DROPPING'] = 0x0008, - ['USEWAIT'] = 0x0010, + ['LOCKED'] = 0x0010, ['ATTACHING'] = 0x0020, ['STAYACTIVE'] = 0x0040, ['DEFERDELETE'] = 0x0080, @@ -108,28 +109,10 @@ knote_state_strings = { ['DISPATCH'] = 0x0200, ['UDATA_SPECIFIC'] = 0x0400, ['SUPPRESSED'] = 0x0800, - ['STOLENDROP'] = 0x1000, + ['MERGE_QOS'] = 0x1000, ['REQVANISH'] = 0x2000, ['VANISHED'] = 0x4000, } -knote_state_strings = { - ['ACTIVE'] = 0x0001, - ['QUEUED'] = 0x0002, - ['DISABLED'] = 0x0004, - ['DROPPING'] = 0x0008, - ['USEWAIT'] = 0x0010, - ['ATTACHING'] = 0x0020, - ['STAYACTIVE'] = 0x0040, - ['DEFERDELETE'] = 0x0080, - ['ATTACHED'] = 0x0100, - ['DISPATCH'] = 0x0200, - ['UDATA_SPECIFIC'] = 0x0400, - ['SUPPRESSED'] = 0x0800, - ['STOLENDROP'] = 0x1000, - ['REQVANISH'] = 0x2000, - ['VANISHED'] = 0x4000, -} - kevent_flags_strings = { ['ADD'] = 0x0001, @@ -272,7 +255,7 @@ trace_eventname("KEVENT_kqwl_bind", function(buf) event_prefix_string(buf, false), buf.arg2, qos_string(qos), kqr_override_qos_delta, state_string(kqrequest_state_strings, 
kqr_state), - duplicate ? ", duplicate" : "") + duplicate and ", duplicate" or "") end) trace_eventname("KEVENT_kqwl_unbind", function(buf)
diff --git a/tools/trace/wqtrace.lua b/tools/trace/wqtrace.lua
new file mode 100755
index 000000000..ae853d433
--- /dev/null
+++ b/tools/trace/wqtrace.lua
@@ -0,0 +1,307 @@
+#!/usr/local/bin/luatrace -s
+
+-- Register `callback` for the kdebug event named `codename`; print a warning
+-- (and register nothing) when the name does not resolve to a debugid.
+trace_codename = function(codename, callback)
+	local debugid = trace.debugid(codename)
+	if debugid ~= 0 then
+		trace.single(debugid,callback)
+	else
+		printf("WARNING: Cannot locate debugid for '%s'\n", codename)
+	end
+end
+
+initial_timestamp = 0 -- timestamp of the first event seen; 0 until then
+pid_map = {}; -- pid -> command name, filled in by get_prefix
+-- Build the common line prefix for an event: direction arrow (start / end /
+-- neither), seconds since the first event, process name, pid.threadid and
+-- event name. The process name comes from buf.command only when
+-- buf.pid == buf[1]; otherwise it is looked up in pid_map, falling back to
+-- "UNKNOWN" when that pid has not been recorded yet.
+get_prefix = function(buf)
+	if initial_timestamp == 0 then
+		initial_timestamp = buf.timestamp
+	end
+	local secs = trace.convert_timestamp_to_nanoseconds(buf.timestamp - initial_timestamp) / 1000000000
+
+	local prefix
+	if trace.debugid_is_start(buf.debugid) then
+		prefix = "→"
+	elseif trace.debugid_is_end(buf.debugid) then
+		prefix = "←"
+	else
+		prefix = "↔"
+	end
+
+	local proc
+	if buf.pid == buf[1] then
+		proc = buf.command
+		if pid_map[buf[1]] == nil then
+			pid_map[buf[1]] = buf.command
+		end
+	elseif pid_map[buf[1]] ~= nil then
+		proc = pid_map[buf[1]]
+	else
+		proc = "UNKNOWN"
+	end
+
+	return string.format("%s %6.9f %-17s [%05d.%06x] %-24s",
+		prefix, secs, proc, buf.pid, buf.threadid, buf.debugname)
+end
+
+-- QoS field of a pthread priority (bits 8..23) -> short name.
+local pthread_qos_names = {
+	[0x20] = "UI", [0x10] = "IN", [0x08] = "DF", [0x04] = "UT",
+	[0x02] = "BG", [0x01] = "MT", [0x00] = "--",
+}
+
+-- Format a pthread priority as "<QoS>[<hex pri>]" ("??" for an unrecognised
+-- QoS field), or "Manager" when the 0x02000000 flag is set.
+parse_pthread_priority = function(pri)
+	pri = pri & 0xffffffff
+	if (pri & 0x02000000) == 0x02000000 then
+		return "Manager"
+	end
+	local qos = (pri & 0x00ffff00) >> 8
+	return string.format("%s[%x]", pthread_qos_names[qos] or "??", pri)
+end
+
+-- Thread QoS index (0..7) -> short name.
+local thread_qos_names = {
+	[0] = "--", "MT", "BG", "UT", "DF", "IN", "UI", "MG",
+}
+
+-- Name a thread QoS index; values outside 0..7 are shown as "??[<hex>]".
+parse_thread_qos = function(pri)
+	return thread_qos_names[pri] or string.format("??[%x]", pri)
+end
+
+-- Like parse_thread_qos, except that a requested QoS of 0 reads as "None".
+parse_thactive_req_qos = function(pri)
+	if pri ~= 0 then
+		return parse_thread_qos(pri)
+	end
+	return "None"
+end
+
+-- Decode the two thactive words: seven 16-bit counters (four in `low`, three
+-- in `high`) plus the requested QoS held in `high` from bit 48 upwards.
+get_thactive = function(low, high)
+	return string.format("req: %s, MG: %d, UI: %d, IN: %d, DE: %d, UT: %d, BG: %d, MT: %d",
+		parse_thactive_req_qos(high >> (16 * 3)), (high >> (2 * 16)) & 0xffff,
+		(high >> (1 * 16)) & 0xffff, (high >> (0 * 16)) & 0xffff,
+		(low >> (3 * 16)) & 0xffff, (low >> (2 * 16)) & 0xffff,
+		(low >> (1 * 16)) & 0xffff, (low >> (0 * 16)) & 0xffff)
+end
+
+-- workqueue lifecycle
+
+trace_codename("wq_pthread_exit", function(buf)
+	local prefix = get_prefix(buf)
+	if trace.debugid_is_start(buf.debugid) then
+		printf("%s\tprocess is exiting\n",prefix)
+	else
+		printf("%s\tworkqueue marked as exiting and timer is complete\n",prefix)
+	end
+end)
+
+trace_codename("wq_workqueue_exit", function(buf)
+	local prefix = get_prefix(buf)
+	if trace.debugid_is_start(buf.debugid) then
+		printf("%s\tall threads have exited, cleaning up\n",prefix)
+	else
+		printf("%s\tclean up complete\n",prefix)
+	end
+end)
+
+trace_codename("wq_start_add_timer", function(buf)
+	local prefix = get_prefix(buf)
+	printf("%s\tarming timer to fire in %d us (flags: %x, reqcount: %d)\n",
+		prefix, buf.arg4, buf.arg3, buf.arg2)
+end)
+
+trace_codename("wq_add_timer", function(buf)
+	local prefix = get_prefix(buf)
+	if trace.debugid_is_start(buf.debugid) then
+		printf("%s\tadd_timer fired (flags: %x, nthreads: %d, thidlecount: %d)\n",
+			prefix, buf.arg2, buf.arg3, buf.arg4)
+	elseif trace.debugid_is_end(buf.debugid) then
+		printf("%s\tadd_timer completed (start_timer: %x, nthreads: %d, thidlecount: %d)\n",
+			prefix, buf.arg2, buf.arg3, buf.arg4)
+	end
+end)
+
+-- buf[2] is the failure reason code; values other than 0..2 print nothing.
+trace_codename("wq_select_threadreq", function(buf)
+	local prefix = get_prefix(buf)
+	if buf[2] == 0 then
+		printf("%s\tSelection failed: process exiting\n", prefix)
+	elseif buf[2] == 1 then
+		printf("%s\tSelection failed: no request\n", prefix)
+	elseif buf[2] == 2 then
+		printf("%s\tSelection failed: throttled\n", prefix)
+	end
+end)
+
+trace_codename("wq_creator_select", function(buf)
+	local prefix = get_prefix(buf)
+	if buf[2] == 1 then
+		printf("%s\t\tcreator %x overridden at %s\n", prefix, buf[3],
+			parse_thread_qos(buf[4]))
+	elseif buf[2] == 2 then
+		printf("%s\t\tcreator %x selected at %s\n", prefix, buf[3],
+			parse_thread_qos(buf[4]))
+	elseif buf[2] == 3 then
+		printf("%s\t\tcreator idled (%d yields)\n", prefix, buf[4])
+	elseif buf[2] == 4 then
+		printf("%s\t\tcreator removed (%d yields)\n", prefix, buf[4])
+	end
+end)
+
+-- buf[2] is the yield reason; unrecognised codes are reported as "unknown".
+trace_codename("wq_creator_yield", function(buf)
+	local prefix = get_prefix(buf)
+	local reason = "unknown"
+	if buf[2] == 1 then
+		reason = "fast steal rate"
+	elseif buf[2] == 2 then
+		reason = "above ncpu scheduled"
+	end
+	printf("%s\t\tcreator yielded (%s, current:%d snapshot:%d)\n",
+		prefix, reason, buf[3], buf[4])
+end)
+
+trace_codename("wq_thread_logical_run", function(buf)
+	local prefix = get_prefix(buf)
+	if trace.debugid_is_start(buf.debugid) then
+		printf("%s\tthread unparking (request %x)\n", prefix, buf[2])
+	else
+		printf("%s\tthread parking\n", prefix)
+	end
+end)
+
+-- Per-thread state for wq_runthread: the event timestamp and the thread CPU
+-- time captured at the start event, keyed by thread id. An entry is present
+-- only between a start event and the next event for the same thread.
+trace.enable_thread_cputime()
+runthread_time_map = {}
+runthread_cputime_map = {}
+trace_codename("wq_runthread", function(buf)
+	local prefix = get_prefix(buf)
+	if trace.debugid_is_start(buf.debugid) then
+		printf("%s\tSTART running thread\n", prefix)
+		runthread_time_map[buf.threadid] = buf.timestamp;
+		runthread_cputime_map[buf.threadid] = trace.cputime_for_thread(buf.threadid);
+	elseif runthread_time_map[buf.threadid] then
+		local time = buf.timestamp - runthread_time_map[buf.threadid]
+		local cputime = trace.cputime_for_thread(buf.threadid) - runthread_cputime_map[buf.threadid]
+
+		local time_ms = trace.convert_timestamp_to_nanoseconds(time) / 1000000
+		local cputime_ms = trace.convert_timestamp_to_nanoseconds(cputime) / 1000000
+
+		printf("%s\tDONE running thread: time = %6.6f ms, cputime = %6.6f ms\n",
+			prefix, time_ms, cputime_ms)
+
+		-- Clear with nil, not 0: 0 is truthy in Lua, so a zeroed entry would
+		-- re-enter this branch on later events and print bogus durations.
+		runthread_time_map[buf.threadid] = nil
+		runthread_cputime_map[buf.threadid] = nil
+	elseif trace.debugid_is_end(buf.debugid) then
+		printf("%s\tDONE running thread\n", prefix)
+	end
+end)
+
+trace_codename("wq_thactive_update", function(buf)
+	local prefix = get_prefix(buf)
+	local thactive = get_thactive(buf[2], buf[3])
+	printf("%s\tthactive updated (%s)\n", prefix, thactive)
+end)
+
+-- buf[3] packs the thread QoS in its low byte and the requested QoS above it;
+-- on the start event buf[4] packs start_timer in bit 0 and reqcount above it.
+trace_codename("wq_thread_block", function(buf)
+	local prefix = get_prefix(buf)
+	local req_pri = parse_thread_qos(buf[3] >> 8)
+	if trace.debugid_is_start(buf.debugid) then
+		printf("%s\tthread blocked (activecount: %d, priority: %s, req_pri: %s, reqcount: %d, start_timer: %d)\n",
+			prefix, buf[2], parse_thread_qos(buf[3] & 0xff), req_pri, buf[4] >> 1, buf[4] & 0x1)
+	else
+		printf("%s\tthread unblocked (activecount: %d, priority: %s, req_pri: %s, threads_scheduled: %d)\n",
+			prefix, buf[2], parse_thread_qos(buf[3] & 0xff), req_pri, buf[4])
+	end
+end)
+
+-- buf[3] selects which creation step failed; buf[2] is the code shown in hex.
+trace_codename("wq_thread_create_failed", function(buf)
+	local prefix = get_prefix(buf)
+	if buf[3] == 0 then
+		printf("%s\tfailed to create new workqueue thread, kern_return: 0x%x\n",
+			prefix, buf[2])
+	elseif buf[3] == 1 then
+		printf("%s\tfailed to vm_map workq thread stack: 0x%x\n", prefix, buf[2])
+	elseif buf[3] == 2 then
+		printf("%s\tfailed to vm_protect workq thread guardsize: 0x%x\n", prefix, buf[2])
+	end
+end)
+
+trace_codename("wq_thread_create", function(buf)
+	printf("%s\tcreated new workqueue thread\n", get_prefix(buf))
+end)
+
+trace_codename("wq_thread_terminate", function(buf)
+	local prefix = get_prefix(buf)
+	local what = trace.debugid_is_start(buf.debugid)
+		and "try to terminate thread" or "terminated thread"
+	printf("%s\t%s: currently idle %d\n", prefix, what, buf[2])
+end)
+
+-- Legacy requests carry a pthread priority, hence parse_pthread_priority.
+trace_codename("wq_wqops_reqthreads", function(buf)
+	local prefix = get_prefix(buf)
+	printf("%s\tlegacy thread request made for %d threads at %s\n", prefix, buf[2], parse_pthread_priority(buf[3]));
+end)
+
+trace_codename("wq_thread_request_initiate", function(buf)
+	local prefix = get_prefix(buf)
+	printf("%s\tthread request %x made at %s (count:%d)\n", prefix, buf[2], parse_thread_qos(buf[3]), buf[4]);
+end)
+
+trace_codename("wq_thread_request_modify", function(buf)
+	local prefix = get_prefix(buf)
+	printf("%s\tthread request %x priority updated to %s\n", prefix, buf[2], parse_thread_qos(buf[3]));
+end)
+
+-- Only the request id is printed; the format has no slot for a QoS.
+trace_codename("wq_thread_request_cancel", function(buf)
+	local prefix = get_prefix(buf)
+	printf("%s\tthread request %x canceled\n", prefix, buf[2]);
+end)
+
+-- buf[2]: 1 = rejected by the constrained-thread limit, 2 = passed the
+-- concurrency check, 3 = failed it. Every line starts with the usual prefix
+-- so the output stays aligned with the other events.
+trace_codename("wq_constrained_admission", function(buf)
+	local prefix = get_prefix(buf)
+	if buf[2] == 1 then
+		printf("%s\tfail\twq_constrained_threads_scheduled=%d >= wq_max_constrained_threads=%d\n",
+			prefix, buf[3], buf[4])
+	elseif (buf[2] == 2) or (buf[2] == 3) then
+		local success = (buf[2] == 2) and "success" or "fail"
+		printf("%s\t%s\tthactive_count=%d + busycount=%d >= wq->wq_max_concurrency\n",
+			prefix, success, buf[3], buf[4])
+	end
+end)
+
+-- Start/end events bracket the death call; any other event is its scheduling.
+trace_codename("wq_death_call", function(buf)
+	local prefix = get_prefix(buf)
+	if trace.debugid_is_start(buf.debugid) then
+		printf("%s\tentering death call\n", prefix);
+	elseif trace.debugid_is_end(buf.debugid) then
+		printf("%s\tleaving death call\n", prefix);
+	else
+		printf("%s\tscheduling death call\n", prefix);
+	end
+end)
+--
+-- vim:ts=4:sw=4:noet: